diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 0000000..5328690 --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,47 @@ +name: VidCast CD — Deploy to EKS + +on: + workflow_run: + workflows: ["VidCast CI — Lint, Scan, Build, Push"] + types: [completed] + branches: [main] + +permissions: + id-token: write # required to request the OIDC token + contents: read + +jobs: + deploy: + if: ${{ github.event.workflow_run.conclusion == 'success' && github.event.workflow_run.event == 'push' }} # deploy only after a successful push-triggered CI run; PR runs never push images + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials (OIDC) + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ secrets.AWS_DEPLOY_ROLE_ARN }} + aws-region: ${{ secrets.AWS_REGION }} + + - name: Update kubeconfig for EKS + run: | + aws eks update-kubeconfig \ + --name ${{ secrets.EKS_CLUSTER_NAME }} \ + --region ${{ secrets.AWS_REGION }} + + - name: Set short SHA from triggering workflow + run: | + echo "SHORT_SHA=$(echo ${{ github.event.workflow_run.head_sha }} | cut -c1-7)" >> $GITHUB_ENV + + - name: Deploy services to EKS + run: | + for svc in auth-service gateway-service converter-service notification-service; do # NOTE(review): CI also builds outbox-relay; confirm whether it should be rolled out here + deploy_name="${svc%-service}" + kubectl set image deployment/${deploy_name} \ + ${deploy_name}=${{ secrets.DOCKERHUB_USERNAME }}/${svc}:${{ env.SHORT_SHA }} + kubectl rollout status deployment/${deploy_name} --timeout=120s # no '|| true': a failed or timed-out rollout must fail the job + done + + - name: Verify all pods running + run: kubectl get pods -o wide diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..4654e05 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,65 @@ +name: VidCast CI — Lint, Scan, Build, Push + +on: + push: + branches: [main] + paths: ['src/**'] + pull_request: + branches: [main] + paths: ['src/**'] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: 
Install ruff + run: pip install ruff + + - name: Lint Python services + run: ruff check src/ --exclude src/frontend + + build-and-scan: + needs: lint + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + service: [auth-service, gateway-service, converter-service, notification-service, outbox-relay] + + steps: + - uses: actions/checkout@v4 + + - name: Set short SHA + run: echo "SHORT_SHA=${GITHUB_SHA::7}" >> $GITHUB_ENV + + - name: Build Docker image + run: | + docker build \ + -t ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }} \ + src/${{ matrix.service }}/ + + - name: Trivy vulnerability scan + uses: aquasecurity/trivy-action@0.28.0 # pinned release tag; a mutable branch ref (@master) can change underneath this job + with: + image-ref: ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }} + severity: CRITICAL,HIGH + exit-code: '1' + ignore-unfixed: true + format: table + + - name: Login to Docker Hub + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Push image to Docker Hub + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..677ceff --- /dev/null +++ b/.gitignore @@ -0,0 +1,75 @@ +# Terraform +terraform.tfvars +terraform.tfvars.json +*.tfstate +*.tfstate.* +.terraform/ +.terraform.lock.hcl +tfplan +*.tfplan +crash.log + +# Kubernetes secrets +**/secret.yaml +# ...except Helm chart secret *templates*, which hold no literal credentials +# (they reference values.yaml via {{ .Values.secret.* }}) and must be tracked +# so a clean `helm install` can render the Secret resource. 
+!Helm_charts/MongoDB/templates/secret.yaml +!Helm_charts/RabbitMQ/templates/secret.yaml +!Helm_charts/Postgres/templates/secret.yaml + +# Deployment-specific files +DEPLOYMENT_CONFIG.md +# DEPLOYMENT_GUIDE.md is now a tracked runbook + newcomer guide (no secrets; +# secrets live in DEPLOYMENT_CONFIG.md / Parameter Store). +DEPLOYMENT_REPORT.md +SESSION_SUMMARY.md +DEPLOYMENT_PROBLEMS.md +deployment-ids.txt +# customise.sh is now tracked: it auto-detects identity and reads new values from +# env vars, so it contains no secrets or personal data (it just repoints the repo +# to your Docker Hub / AWS / GitHub for a fork). + +# Local session artifacts / working notes (may contain account IDs, IPs, secrets). +# Keep on disk, never commit. +[0-9][0-9]_[0-9][0-9]_[0-9][0-9]_*.md +FRONTEND_IMPROVEMENTS.md +VIDCAST_PLAIN_ENGLISH_GUIDE.md +CLAUDE.md +PR_DESCRIPTION.md + +# Build artifacts +*.mp3 +!assets/video.mp4 +output.* + +# Python +__pycache__/ +*.pyc +*.pyo +.env +venv/ +*.egg-info/ + +# Node +node_modules/ +dist/ +build/ +.cache/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log + +# Explanation files (study material, not production) +*_EXPLAINED.md diff --git a/DEPLOYMENT_GUIDE.md b/DEPLOYMENT_GUIDE.md new file mode 100644 index 0000000..4811416 --- /dev/null +++ b/DEPLOYMENT_GUIDE.md @@ -0,0 +1,606 @@ +# VidCast Deployment Guide + +This is the **single canonical guide** for taking VidCast from "cluster torn down, +nothing running" to "everything live and verified," including all Sprint 1–4 upgrades +(Kustomize, ESO, KEDA, Argo CD, Kyverno, NetworkPolicies, outbox/idempotency/DLQ, SLO +alerting, supply-chain, Kubecost). Every command is copy-pasteable; every "wait for X" +has a check command. + +> **No personal data here.** This guide uses **placeholders** (``, +> ``, ``, ``, …). Substitute your own +> values — the easiest way is the two scripts below. + +## ⚡ The fast path (two scripts) + +Most of this guide is reference. 
The actual bring-up is **two commands** once your +infrastructure exists (Terraform applied, node Ready) and your config is in your shell: + +```bash +./customise.sh # rewrites identity (Docker Hub user, AWS account, GitHub repo) + DB + # creds + the bcrypt admin hash across the repo's config files +./deploy.sh # installs datastores → secrets → app → KEDA/Argo/Kyverno/monitoring/ + # Kubecost → NetworkPolicies, then smoke-tests and prints the URLs +./deploy.sh --teardown # when finished: terraform destroy + confirm $0 spend +``` + +Both read their inputs from **environment variables** (so no secret is ever written to +a tracked file). See **§A.2** for what to set and **how to obtain each value**, and the +header comments inside each script for the full list. The sections below explain what +the scripts do, step by step, so you can run them by hand or debug them. + +**This document serves two audiences:** +- **Part A** — for someone forking VidCast onto their **own AWS account** for the first + time: what to install, what each account needs, every value to change (and how to get + it), and an honest cost warning. +- **Part B (§0 onward)** — the concrete bring-up runbook (what `deploy.sh` automates), + with copy-pasteable commands using placeholders. + +> **Footprint decision (signed off):** deploy the **dev overlay** (1-replica +> backends) and run **Kubecost on the dev footprint** — this keeps the single +> 2-vCPU node at ~81% idle. Prod overlay + Kubecost would breach the 90% gate. + +--- + +# PART A — For Newcomers (read this first if you're forking the repo) + +## A.1 Prerequisites — What You Need Before You Start + +You need **four accounts** and **seven tools**. Budget ~30 minutes for first-time +setup before you ever touch the cluster. + +### Accounts + +1. 
**An AWS account** with either **admin access** or, at minimum, permission to + create: VPCs, EKS clusters, EC2 (the node), IAM roles/policies + an OIDC provider, + ECR repositories, and SSM Parameter Store entries. (Admin is simplest for a + learning project; the least-privilege set is the list above.) New AWS accounts get + a free tier, but **EKS itself is not free** — see the Cost Warning (§A.3). +2. **A Docker Hub account** (free) — the five backend images are built by CI and + pushed here, then pulled by the cluster. You'll set your username everywhere the + project currently says ``. +3. **A Gmail account with an "App Password"** — the notification service sends the + "your audio is ready" email via Gmail's SMTP. Normal Gmail passwords won't work for + SMTP; you must generate a 16-character **App Password** (requires 2-factor auth on + the account). Instructions: . Strip the + spaces when you paste it. +4. **A GitHub account with this repo forked.** GitHub is where the code lives, where + CI runs, and — importantly — the identity AWS trusts for keyless deploys (OIDC) and + image signing. Your fork's `owner/repo` name must be wired into a few places (§A.2). 
+ +### Tools (Ubuntu / WSL2 install commands) + +| Tool | What it's for | Install | +|---|---|---| +| **AWS CLI v2** | talk to AWS from the terminal | `curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o awscliv2.zip && unzip awscliv2.zip && sudo ./aws/install` | +| **Terraform ≥ 1.5** | build the AWS infra from code | `sudo apt-get update && sudo apt-get install -y gnupg software-properties-common && wget -O- https://apt.releases.hashicorp.com/gpg \| gpg --dearmor \| sudo tee /usr/share/keyrings/hashicorp-archive-keyring.gpg >/dev/null && echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" \| sudo tee /etc/apt/sources.list.d/hashicorp.list && sudo apt-get update && sudo apt-get install -y terraform` | +| **kubectl** | talk to the Kubernetes cluster | `curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl` | +| **Helm v3** | install off-the-shelf software (DBs, monitoring) | `curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 \| bash` | +| **Docker** | build/run container images | `curl -fsSL https://get.docker.com \| sh && sudo usermod -aG docker $USER` (log out/in after) | +| **git** | clone the repo, push changes | `sudo apt-get install -y git` | +| **psql client** | run the database init script | `sudo apt-get install -y postgresql-client` | + +After installing, configure AWS auth once: `aws configure` (enter your access key, +secret, region `eu-west-2`, output `json`), then verify with +`aws sts get-caller-identity` — it should print *your* account ID. + +--- + +## A.2 Customisation — Making It Your Own + How to Get Each Value + +To run VidCast yourself you supply your **own** values. 
Set them as environment +variables, then **`./customise.sh`** writes the identity/DB values into the repo's +config files and **`./deploy.sh`** uses the secrets at install time. The table below is +the full inventory: each value, **how to obtain it**, and where it's used. + +| Value (env var) | How to get it | Where it's used | +|---|---|---| +| **AWS account ID** (`AWS_ACCOUNT_ID`) | `aws sts get-caller-identity --query Account --output text` | ECR image refs in `k8s/overlays/*/kustomization.yaml`; Terraform | +| **AWS region** (`AWS_REGION`) | Pick a region; default `eu-west-2`. Use one that allows non-T-type EKS nodes. | `terraform.tfvars`, ESO `ClusterSecretStore` | +| **Docker Hub username** (`DOCKER_HUB_USER`) | Sign up free at hub.docker.com — it's your account name. | `k8s/overlays/*` backend image names; GitHub secret `DOCKERHUB_USERNAME` | +| **GitHub org/repo** (`GITHUB_ORG`,`GITHUB_REPO`) | Your fork's URL: `github.com//`. | OIDC trust (Terraform), Argo CD `repoURL`, Kyverno cosign signer identity — all must point at **your** fork | +| **Cluster name** (`CLUSTER_NAME`) | Pick any name **without underscores** (EKS rejects them); e.g. `vidcast-cluster`. | `terraform.tfvars` | +| **ECR repo name** (`ECR_REPO_NAME`) | Pick a name for the frontend image repo; e.g. `vidcast-frontend`. | Terraform `repository_names`; overlay frontend `newName` | +| **PostgreSQL user / password** (`POSTGRES_USERNAME`,`POSTGRES_PASSWORD`) | User: pick one (e.g. `pguser`). Password: `openssl rand -base64 24`. | injected into the Postgres chart by `deploy.sh`; Parameter Store `/vidcast//auth/psql-password` | +| **MongoDB user / password** (`MONGODB_USERNAME`,`MONGODB_PASSWORD`) | User: pick one (e.g. `mongouser`). Password: `openssl rand -base64 24`. | injected into the Mongo chart by `deploy.sh`; embedded in the Mongo URIs in Parameter Store | +| **RabbitMQ user / password** (`RABBITMQ_USERNAME`,`RABBITMQ_PASSWORD`) | User: default `rabbituser`. 
Password: `openssl rand -base64 24`. | injected into the RabbitMQ chart by `deploy.sh` (→ `rabbitmq-secret`) | +| **JWT secret** (`JWT_SECRET`) | `openssl rand -base64 32` — the key that signs login tokens. | Parameter Store `/vidcast//auth/jwt-secret` | +| **Gmail address** (`GMAIL_ADDRESS`) | A Gmail account you control — the "from" address on the notification email. | Parameter Store `/vidcast//notification/gmail-address` | +| **Gmail App Password** (`GMAIL_APP_PASSWORD`) | Enable 2FA, then generate a 16-char app password at (strip spaces). | Parameter Store `/vidcast//notification/gmail-password` | +| **Login email / password** (`APP_LOGIN_EMAIL`,`APP_LOGIN_PASSWORD`) | Pick the admin login. `customise.sh` turns the password into a **bcrypt hash** in `init.sql` (you never store the plaintext). | seeded admin row in `Helm_charts/Postgres/init.sql` | + +> **Where each kind of value lives — and why secrets never touch Git:** +> - **Secrets** (DB passwords, JWT, Gmail password) → **AWS Parameter Store**, seeded by +> `deploy.sh` from your env vars. The chart values carry only `CHANGEME` placeholders; +> `deploy.sh` injects the real passwords with `--set` at install time. **No secret is +> ever written to a tracked file.** +> - **Identity** (Docker Hub user, AWS account, GitHub repo) → tracked config that the +> GitOps engine (Argo CD) and AWS need to *function* — these are inherently public +> (a public Docker Hub user / GitHub repo). `customise.sh` rewrites them to yours. + +> **Parameter Store is your safe-deposit box.** The secrets above aren't written into +> any file — they're put into Parameter Store once (by `deploy.sh`), and the app +> retrieves them at runtime via the External Secrets Operator. The app holds a key (its +> AWS identity) to the box; the contents are never committed anywhere. 
+ +Convenient way to set everything, then customise + deploy: +```bash +# put your values in a LOCAL, gitignored file (never commit it), then: +set -a; source ./my-vidcast.env; set +a # exports all the vars above +./customise.sh # rewrites identity + DB creds + bcrypt admin hash in the repo +./deploy.sh # brings everything up and verifies +``` + +--- + +## A.3 ⚠️ COST WARNING — Read Before You `apply` + +``` +┌──────────────────────────────────────────────────────────────────────────────┐ +│ RUNNING THIS PROJECT COSTS REAL MONEY WHILE THE CLUSTER IS UP. │ +│ │ +│ • EKS control plane (the managed Kubernetes brain) ~ $0.10 / hour (~$73/mo)│ +│ • The node (m7i-flex.large EC2 instance) ~ $0.11 / hour (~$77/mo)│ +│ • EBS / data transfer / etc. (small) a few $ / month │ +│ ───────────────────────────────────────────────────────────────────────────│ +│ ≈ $0.21 / hour while up → ~$150 / month if left running 24×7. │ +│ │ +│ A 1-hour demo costs about 20 cents. Leaving it on all month costs ~$150. │ +│ │ +│ 👉 DESTROY IT WHEN YOU'RE DONE. Standing cost when destroyed = ~$0. │ +│ (Terraform state in S3, the DynamoDB lock table, and Parameter Store │ +│ entries are all free to leave; the frontend ECR images are pennies.) │ +└──────────────────────────────────────────────────────────────────────────────┘ +``` + +**Teardown (the one command that stops the billing):** +```bash +./deploy.sh --teardown # runs terraform destroy + confirms zero spend +# — or manually — +cd terraform/environments/dev && terraform destroy -auto-approve # ~10 min +aws eks list-clusters --region eu-west-2 # expect [] (nothing left billing) +``` +Everything is rebuildable from code in ~20 minutes, so the right habit is: **bring it +up for a session, then tear it down.** Treat "is the cluster on?" as the cost switch. + +> **Tip:** set an **AWS Budgets** alarm (e.g. alert at $20/month) before your first +> `apply`, so a forgotten cluster can't surprise you. AWS Console → Billing → Budgets. 
+ +--- + +# PART B — The Runbook (worked example: original operator's values) + +> Everything below uses the original account/Docker Hub/cluster values as a concrete, +> copy-pasteable example. If you did §A.2, substitute your own values. **`deploy.sh` +> automates §3–§8 of this part;** §0–§2 (prerequisites, Terraform apply) are still +> run by hand because they create the AWS account-level infrastructure. + +## 0. Fixed facts (account / state / preserved resources) + +``` +AWS_ACCOUNT_ID: +AWS_REGION: eu-west-2 +CLUSTER_NAME: vidcast-cluster # Terraform-managed (NOT the old cba-microservices) +NODE_INSTANCE_TYPE: m7i-flex.large (2 vCPU / 8 GiB; NEVER T-type — SCP blocks it) +DOCKER_HUB_USER: +APP_LOGIN_EMAIL: +``` + +**Preserved across teardown (DO NOT delete — they make re-apply one command):** +``` +S3 state bucket: vidcast-tfstate- (key: vidcast/dev/terraform.tfstate) +DynamoDB lock table: vidcast-terraform-locks (ACTIVE) +terraform.tfvars: terraform/environments/dev/terraform.tfvars (gitignored, real inputs) +ECR repo + images: vidcast-frontend (tags incl. d9e4282 — frontend need NOT be rebuilt) +``` + +--- + +## 1. 
Prerequisites (before `terraform apply`) + +### 1.1 Tools +```bash +aws --version # v2.x +terraform version # >= 1.5 +kubectl version --client +helm version # v3.x +git --version +``` + +### 1.2 AWS credentials +```bash +aws sts get-caller-identity # expect account <AWS_ACCOUNT_ID> (user johnadmin / johnsadmin) +``` + +### 1.3 Docker Hub backend images must exist (you — build & push the backend images first) +The dev overlay pins these tags; each must be pullable **before** the app is deployed: +```bash +# Replace <DOCKER_HUB_USER> with your Docker Hub username and <TAG> with the tag the overlay pins (k8s/overlays/dev/kustomization.yaml) +for s in auth-service gateway-service converter-service notification-service outbox-relay; do + docker manifest inspect "<DOCKER_HUB_USER>/$s:<TAG>" >/dev/null 2>&1 \ + && echo "$s ✓" || echo "$s ✗ MISSING — build via CI before deploying"; +done +``` +> If any is ✗, the corresponding pod will `ImagePullBackOff`. The B4 `/metrics` +> endpoints exist ONLY in images rebuilt from Sprint-4 code (push to main → CI). +> The frontend (`vidcast-frontend:d9e4282`) is on ECR and pulled via the node role. + +### 1.4 Parameter Store seeded (you — seed these before installing ESO) +ESO reads these 7 `dev` SecureString parameters. 
Seed from the gitignored +`DEPLOYMENT_CONFIG.md` values (NOT committed anywhere): +```bash +REGION=eu-west-2 +put() { aws ssm put-parameter --region "$REGION" --type SecureString --overwrite --name "$1" --value "$2"; } +put /vidcast/dev/auth/psql-password "$POSTGRES_PASSWORD" +put /vidcast/dev/auth/jwt-secret "$JWT_SECRET" +put /vidcast/dev/gateway/mongodb-videos-uri "mongodb://$MONGODB_USERNAME:$MONGODB_PASSWORD@mongodb:27017/videos?authSource=admin" +put /vidcast/dev/gateway/mongodb-mp3s-uri "mongodb://$MONGODB_USERNAME:$MONGODB_PASSWORD@mongodb:27017/mp3s?authSource=admin" +put /vidcast/dev/converter/mongodb-uri "mongodb://$MONGODB_USERNAME:$MONGODB_PASSWORD@mongodb:27017/mp3s?authSource=admin" +put /vidcast/dev/notification/gmail-address "$GMAIL_ADDRESS" +put /vidcast/dev/notification/gmail-password "$GMAIL_APP_PASSWORD" # 16 chars, NO spaces +# Verify: +aws ssm get-parameters-by-path --region $REGION --path /vidcast/dev --recursive --query 'Parameters[].Name' +``` + +--- + +## 2. Terraform apply (infra: VPC, EKS, node group, VPC-CNI netpol agent, ECR, OIDC) + +```bash +cd terraform/environments/dev +terraform init \ + -backend-config="bucket=vidcast-tfstate-<AWS_ACCOUNT_ID>" \ + -backend-config="key=vidcast/dev/terraform.tfstate" \ + -backend-config="region=eu-west-2" \ + -backend-config="dynamodb_table=vidcast-terraform-locks" +terraform validate +``` + +### 2.1 ECR import (A8 — the existing repo predates the module) +The `vidcast-frontend` ECR repo already exists; import it so `apply` doesn't fail +with "already exists": +```bash +terraform import 'module.ecr.aws_ecr_repository.this["vidcast-frontend"]' vidcast-frontend +``` +> If the GitHub OIDC provider errors `EntityAlreadyExistsException` on apply, import it too: +> `terraform import module.github_oidc.aws_iam_openid_connect_provider.github arn:aws:iam::<AWS_ACCOUNT_ID>:oidc-provider/token.actions.githubusercontent.com` + +### 2.2 Apply (~20 min: EKS control plane) +```bash +terraform plan # review — should show EKS + node group + ECR 
hardening deltas +terraform apply -auto-approve +``` + +### 2.3 Connect + confirm +```bash +aws eks update-kubeconfig --name vidcast-cluster --region eu-west-2 +kubectl get nodes -o wide # WAIT: 1 node Ready (~2-3 min after node group) +kubectl get nodes -o wide | grep -q ' Ready ' && echo "NODE READY ✓" + +# Confirm the VPC-CNI network-policy AGENT is on (A6 — else NetworkPolicies are decorative): +kubectl get ds aws-node -n kube-system -o jsonpath='{.spec.template.spec.containers[*].name}'; echo +# expect to see 'aws-eks-nodeagent' alongside 'aws-node' + +# Capture the deploy role ARN for CD (set this as the GitHub secret AWS_DEPLOY_ROLE_ARN): +terraform output github_actions_role_arn +terraform output external_secrets_irsa_role_arn # used by the ESO ServiceAccount annotation +terraform output ecr_repository_urls +``` + +--- + +## 3. Helm installs — datastores (dependency order) + +Order: **MongoDB → PostgreSQL → RabbitMQ** (the app needs all three; RabbitMQ also +creates `rabbitmq-secret` which gateway/converter/notification consume). + +```bash +cd /home/john/microservices-python-app + +helm install mongodb Helm_charts/MongoDB +kubectl rollout status statefulset/mongodb --timeout=180s + +helm install postgres Helm_charts/Postgres +kubectl rollout status deployment/postgres-deploy --timeout=120s + +helm install rabbitmq Helm_charts/RabbitMQ +kubectl rollout status statefulset/rabbitmq --timeout=180s + +kubectl get pods # WAIT: mongodb-0, postgres-deploy-*, rabbitmq-0 all Running +``` + +### 3.1 PostgreSQL init — schema + admin seed (SKIPPING THIS = login fails) +`init.sql` creates the `auth_user` table and enables pgcrypto, but contains **no +password hash** (nothing secret in the repo). The admin is seeded separately, with +its bcrypt hash generated **inside** PostgreSQL from your env vars — so the plaintext +and the hash never touch a file. 
`deploy.sh` does both steps; by hand: +```bash +NODE_IP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="ExternalIP")].address}') +PSQL="psql -h $NODE_IP -p 30003 -U $POSTGRES_USERNAME -d authdb -v ON_ERROR_STOP=1" + +# 1) schema (table + pgcrypto extension) +PGPASSWORD="$POSTGRES_PASSWORD" $PSQL -f Helm_charts/Postgres/init.sql + +# 2) seed the admin — bcrypt hash generated in-DB via pgcrypto (no hash in any file) +PGPASSWORD="$POSTGRES_PASSWORD" $PSQL -v email="$APP_LOGIN_EMAIL" -v pw="$APP_LOGIN_PASSWORD" <<'SQL' +INSERT INTO auth_user (email, password, role) +VALUES (:'email', crypt(:'pw', gen_salt('bf', 12)), 'admin') +ON CONFLICT (email) DO UPDATE SET password = EXCLUDED.password, role = EXCLUDED.role; +SQL + +PGPASSWORD="$POSTGRES_PASSWORD" $PSQL -c "SELECT email, role FROM auth_user;" # expect your admin row +``` +> The DB/broker admin NodePorts (30003/30004/30005) are reachable until NetworkPolicies +> are applied in §8. Run DB init now, before the lockdown. + +### 3.2 RabbitMQ queues +The converter declares the full retry/DLQ topology (`video`, `video.retry`, +`video.dlq`, `vidcast.dlx`, `mp3`…) on startup (A3), so no manual queue creation is +strictly required. Confirm after consumers are up (§5) via the management UI on +`:30004` or the verification in §7. + +--- + +## 4. External Secrets Operator (A9) — after Parameter Store is seeded (§1.4) + +```bash +helm repo add external-secrets https://charts.external-secrets.io && helm repo update +helm install external-secrets external-secrets/external-secrets \ + -n external-secrets --create-namespace --version 0.14.0 # or later (CRDs serve external-secrets.io/v1) +kubectl rollout status deployment/external-secrets -n external-secrets --timeout=120s + +# The vidcast-eso ServiceAccount must carry the IRSA role annotation. 
Confirm it matches TF: +kubectl apply -k k8s/external-secrets/shared # SA + ClusterSecretStore +kubectl get sa vidcast-eso -n default -o jsonpath='{.metadata.annotations.eks\.amazonaws\.com/role-arn}'; echo +# must equal `terraform output external_secrets_irsa_role_arn` + +kubectl apply -k k8s/external-secrets/dev # the 4 ExternalSecrets + +# WAIT for ESO to materialize the Secrets: +kubectl get externalsecret -n default # all READY=True +kubectl get secret auth-secret gateway-secret converter-secret notification-secret -n default +``` +> `rabbitmq-secret` is created by the RabbitMQ Helm chart (§3), NOT by ESO — by design. + +--- + +## 5. App workloads — Kustomize (dev overlay) + +```bash +kubectl apply -k k8s/overlays/dev +for d in auth gateway converter notification frontend outbox-relay redis; do + kubectl rollout status deployment/$d --timeout=180s +done +kubectl get pods -o wide # all Running, 0 restarts +``` +> KEDA is not installed yet, so `converter` runs at its static floor (1 in dev). +> KEDA takes over the replica count in §6. + +--- + +## 6. 
Platform tooling (in this order) + +### 6.1 KEDA (A7 — scale-to-zero for the converter) +```bash +helm repo add kedacore https://kedacore.github.io/charts && helm repo update +helm install keda kedacore/keda -n keda --create-namespace -f k8s/keda/values.yaml +kubectl rollout status deployment/keda-operator -n keda --timeout=120s +kubectl apply -k k8s/keda # ScaledObject (converter) + HPA (gateway) + TriggerAuth +kubectl get scaledobject -n default # READY=True +``` + +### 6.2 Argo CD (B1 — GitOps) +```bash +helm repo add argo https://argoproj.github.io/argo-helm && helm repo update +helm install argocd argo/argo-cd -n argocd --create-namespace -f k8s/argocd/values.yaml +kubectl rollout status deployment/argocd-server -n argocd --timeout=180s +kubectl apply -k k8s/argocd # Application CRDs (dev auto-sync, prod manual-sync) +kubectl get applications -n argocd +``` +> Argo syncs from the git repo, so the Sprint-1–4 manifests must be pushed to `main` +> (Part 1 #3). Until then the dev Application shows `OutOfSync`/`Unknown` — expected. 
+ +### 6.3 Kyverno (B2/B5 — policy-as-code, ALL Audit) +```bash +helm repo add kyverno https://kyverno.github.io/kyverno && helm repo update +helm install kyverno kyverno/kyverno -n kyverno --create-namespace -f k8s/kyverno/values.yaml +kubectl rollout status deployment/kyverno-admission-controller -n kyverno --timeout=180s +kubectl apply -k k8s/kyverno # 7 ClusterPolicies (0 Enforce) +kubectl get clusterpolicy # all READY=True +``` + +### 6.4 Monitoring (B4 — Prometheus/Grafana/Alertmanager + SLO stack) +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts && helm repo update +helm install monitoring prometheus-community/kube-prometheus-stack \ + -f monitoring/values.yaml -n monitoring --create-namespace +kubectl rollout status deployment/monitoring-grafana -n monitoring --timeout=240s + +kubectl apply -f monitoring/scrape/ # ServiceMonitors + PodMonitors (gateway/rabbitmq/converter/notification/kubecost) +kubectl apply -f monitoring/alerts/vidcast-alerts.yaml +kubectl apply -f monitoring/alerts/vidcast-slo-rules.yaml + +# Load dashboards (sidecar picks up ConfigMaps labelled grafana_dashboard=1): +for d in vidcast-operations vidcast-slo vidcast-finops; do + kubectl create configmap $d -n monitoring --from-file=monitoring/dashboards/$d.json \ + --dry-run=client -o yaml | kubectl label -f - --local -o yaml grafana_dashboard=1 | kubectl apply -f - +done +``` + +### 6.5 Kubecost (B3 — LAST; dev footprint per the sign-off) +```bash +helm repo add kubecost https://kubecost.github.io/cost-analyzer/ && helm repo update +helm install kubecost kubecost/cost-analyzer -n kubecost --create-namespace -f k8s/kubecost/values.yaml +kubectl rollout status deployment/kubecost-cost-analyzer -n kubecost --timeout=240s +# (vidcast-kubecost ServiceMonitor was applied in §6.4) +``` +> If the node shows pressure (Pending pods), park Kubecost and continue: +> `kubectl scale deploy/kubecost-cost-analyzer -n kubecost --replicas=0` + +--- + +## 7. 
Runtime verification checklist + +Run **every** item. Record command → output → PASS/FAIL in `DEPLOYMENT_REPORT.md`. + +```bash +NODE_IP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="ExternalIP")].address}') +``` + +| # | Check | Command | Expected | +|---|-------|---------|----------| +| 1 | Gateway boots under gunicorn | `kubectl exec deploy/gateway -- python -c "import urllib.request as u;print(u.urlopen('http://localhost:8080/healthz').read())"` | `{"status":"ok",...}` 200 | +| 2 | Gateway /metrics (B4) | `kubectl exec deploy/gateway -- python -c "import urllib.request as u;print(b'vidcast_gateway_requests_total' in u.urlopen('http://localhost:8080/metrics').read())"` | `True` | +| 3 | Converter/notification /metrics | `kubectl exec deploy/converter -- python -c "import urllib.request as u;print(u.urlopen('http://localhost:9000/metrics').status)"` | `200` | +| 4 | Outbox relay publishing | seed a row (below), then check the `video` queue depth on `:30004` | published count increments | +| 5 | DLQ topology (A3) | publish a poison msg to `video`; after MAX_RETRIES it lands in `video.dlq` | message in `video.dlq` | +| 6 | Idempotency (A2) | publish the same `video_fid` twice | 2nd logs `[idempotency] duplicate, skipping` | +| 7 | KEDA scale-to-zero | `kubectl get deploy converter -w` with empty queue | replicas → 0; →1+ on new msg | +| 8 | DNS resolves | `kubectl exec deploy/gateway -- python -c "import socket;print(socket.gethostbyname('rabbitmq'))"` | an IP | +| 9 | Prometheus targets UP | port-forward `:9090` → Status▸Targets | gateway/rabbitmq/converter/notification/kubecost UP | +| 10 | SLO rules evaluating | query `slo:availability:burnrate1h` in Prometheus | a series (after some traffic) | +| 11 | Kyverno PolicyReports | `kubectl get clusterpolicyreport` | pass/fail counts present | +| 12 | Argo CD UI | port-forward `argocd-server :8080`, login | app tree visible; dev=Synced | +| 13 | Argo dev auto-sync | edit a dev manifest in 
git, push | Argo auto-syncs the change | +| 14 | Argo prod manual-sync gate | inspect prod Application | `syncPolicy.automated` ABSENT | +| 15 | Kubecost data | port-forward `kubecost :9090` or check `node_total_hourly_cost` in Prometheus | a cost value | +| 16 | NetworkPolicy deny (after §8) | gateway→notification should TIME OUT; gateway→auth should CONNECT | see §8 | + +**Helper — outbox relay test (item 4):** +```bash +kubectl exec -i deploy/gateway -- python - <<'PY' +import os, datetime, pymongo +c = pymongo.MongoClient(os.environ["MONGODB_VIDEOS_URI"]) +c.get_default_database().outbox.insert_one({"event_type":"video.uploaded","routing_key":"video", + "payload":{"video_fid":"test","mp3_fid":None,"username":""}, + "created_at":datetime.datetime.utcnow(),"published_at":None}) +print("seeded outbox row") +PY +# within OUTBOX_POLL_INTERVAL (30s): kubectl logs deploy/outbox-relay -> "published 1 event(s)" +``` + +**Port-forwards for the platform tools (what each one shows you).** +A *port-forward* opens a private tunnel from a port on your laptop to a service +inside the cluster — most of these tools are deliberately **not** exposed publicly +(only the frontend `:30006`, gateway `:30002`, and Grafana `:30007` have NodePorts), +so port-forwarding is how an operator reaches them. Open `http://localhost:<port>` in +your browser after each. (Run with `&` to background them; `kill %1 %2 …` to stop.) + +```bash +# ── PROMETHEUS (the metrics database) → http://localhost:9090 ────────────────── +# What it shows: every raw number the system emits. Use Status ▸ Targets to confirm +# all services are being scraped ("UP"), and the Graph tab to query metrics like +# `vidcast_conversions_total`, `rabbitmq_queue_messages`, or the SLO burn-rate rules. 
+kubectl -n monitoring port-forward svc/monitoring-kube-prometheus-prometheus 9090:9090 & + +# ── ALERTMANAGER (the alert router) → http://localhost:9093 ─────────────────── +# What it shows: which SLO/health alerts are currently FIRING and their grouping/ +# silences. Quiet = healthy. This is where a burn-rate page would surface. +# (Also reachable directly on NodePort :30008 if the security group allows your IP.) +kubectl -n monitoring port-forward svc/monitoring-kube-prometheus-alertmanager 9093:9093 & + +# ── GRAFANA (the dashboards) → http://localhost:3000 (or NodePort :30007) ───── +# What it shows: the human-friendly graphs — the 3 VidCast dashboards (Operations, +# SLO, FinOps/Cost) plus the stock Kubernetes ones. Login: admin / vidcast-demo. +kubectl -n monitoring port-forward svc/monitoring-grafana 3000:80 & + +# ── KUBECOST (the cost breakdown) → http://localhost:9091 ───────────────────── +# What it shows: cost attributed per namespace/pod/label, and the cost-per-conversion +# figure. Remember it's an ESTIMATE (list prices) — use it for trends, the AWS bill +# for absolutes. +kubectl -n kubecost port-forward deploy/kubecost-cost-analyzer 9091:9090 & + +# ── ARGO CD (the GitOps deployer) → https://localhost:8080 ──────────────────── +# What it shows: the live sync state of the dev/prod Applications — Synced vs +# OutOfSync, the resource tree, and the manual "Sync" button that IS the prod gate. 
+kubectl -n argocd port-forward svc/argocd-server 8080:443 & +# Argo CD admin password (user is `admin`): +kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath='{.data.password}' | base64 -d; echo +``` + +**End-to-end app test (the headline check):** +```bash +JWT=$(curl -s -X POST http://$NODE_IP:30002/login -u "$APP_LOGIN_EMAIL:$APP_LOGIN_PASSWORD") +curl -s -X POST http://$NODE_IP:30002/upload -F "file=@assets/video.mp4" -H "Authorization: Bearer $JWT" +# wait ~30-60s for converter; an email is sent if the real Gmail app password is in Parameter Store +sleep 60 +# download (FILE_ID from the email, or from gateway /my-files): +curl -s -X GET "http://$NODE_IP:30002/download?fid=<FILE_ID>" -H "Authorization: Bearer $JWT" -o out.mp3 +file out.mp3 # expect: Audio file / MPEG ADTS +``` + +--- + +## 8. NetworkPolicies — APPLY LAST (after §7 all green) + +Applied last so any unexpected block is unambiguously the policy. Allows first, +default-deny last (the file order already does this). +```bash +kubectl apply -k k8s/network-policies # default ns: allows + default-deny +kubectl apply -f k8s/network-policies/allow-kyverno-sigstore-egress.yaml # kyverno ns (B5) + +# Deny-test (verification item 16): +kubectl exec deploy/gateway -- python -c "import socket; socket.create_connection(('auth',5000),3); print('gateway->auth OK')" # CONNECT +kubectl exec deploy/gateway -- timeout 5 python -c "import socket; socket.create_connection(('notification',9000),3)" ; echo "exit=$? (nonzero = correctly denied)" +kubectl exec deploy/gateway -- python -c "import socket; print(socket.gethostbyname('rabbitmq'))" # DNS still works +``` +> Rollback (fastest in the plan): `kubectl delete networkpolicy default-deny-all -n default`. + +--- + +## 9. Teardown (cost saving) + +```bash +# App + platform (Helm + kustomize) can be left; the destroy removes the cluster anyway. 
+cd terraform/environments/dev && terraform destroy -auto-approve # ~10 min +# Verify zero spend: +aws eks list-clusters --region eu-west-2 # [] +terraform state list # 0 resources +``` +**PRESERVE (never delete):** S3 state bucket, DynamoDB lock table, `terraform.tfvars`, +`.terraform.lock.hcl`, the `vidcast-frontend` ECR repo+images. **Parameter Store** +SecureStrings are free and harmless to leave (they persist; ESO re-reads them next +bring-up) — delete only if rotating secrets. No Secrets Manager is used (cost decision). + +--- + +## 10. Known issues / runtime gaps to watch (collected from all sprint review notes) + +**Genuinely deferred (depend on CI — can't test until merged):** +- **Cosign signing / SBOM / SARIF / SLSA provenance (A8):** not in CI yet → **B5 + `verify-images` Audit report will show our images as "fail: no signature"** — this + is the EXPECTED "not yet signed" state, not a failure. Flip B5 to Enforce only + after signing is live and one image verifies PASS (`k8s/kyverno/README.md` §B5). + +**Verify-on-this-deploy (the point of the bring-up):** +- **Datastore non-root (gap-fix):** RabbitMQ now runs non-root (uid 999 + fsGroup) — + confirm it boots against the existing PVC. mongo/postgres CANNOT run non-root + (documented Kyverno `require-non-root` exception) — confirm they still start. +- **postgres:16.4-alpine** (was implicit `:latest`) — confirm init.sql + `HOST_AUTH_METHOD`. +- **RabbitMQ `/metrics/per-object`** (B4) — confirm `rabbitmq_queue_messages{queue="video"}` + appears (the two RabbitMQ alerts depend on it). +- **gunicorn multiprocess metrics (B4)** — confirm `/metrics` aggregates across both + gateway workers (counts shouldn't halve between scrapes). +- **Kubecost vs external Prometheus** — confirm the FQDN resolves and cost series populate. + +**Carried operational notes:** +- NodePort SG: 30003/30004/30005/30007/30008 should be locked to the operator IP; + 30002 (gateway) + 30006 (frontend) stay public. (SG module / manual.) 
+- Kyverno Audit→Enforce is a deliberate later step: 5/6 policies are clean post + gap-fix; `require-non-root` needs a label-scoped exclude for mongo/postgres first. +- Node budget: dev footprint + all add-ons ≈ **~81% idle**; converter 2nd replica at + peak is best-effort (may stay Pending) — by design on a 2-vCPU node. + +--- + +## 11. History (condensed) + +- **May–Jun 1:** base deploy (hand-built `cba-microservices`, since torn down) → + Terraform IaC (`vidcast-cluster`) + GitHub OIDC + state backend created. +- **Jun 2:** full app live on `vidcast-cluster`; images `<dockerhub-user>/*:16f49a0`; + Mongo 4.0.8→4.2; RBAC + frontend (ECR `vidcast-frontend`) merged (PR #1/#2). +- **Jun 3:** `terraform destroy` (cost saving) — 22 resources destroyed, state + emptied, backend+ECR+tfvars preserved. +- **Jun 6–8 (Sprint 1–4):** Kustomize+ESO+KEDA+Argo+Kyverno+NetworkPolicies+outbox/ + idempotency/DLQ (A-series); B4 SLO alerting, A8 supply-chain, B5 cosign verify, + B3 Kubecost. All config-verified; this runbook brings them up live. +``` diff --git a/Helm_charts/MongoDB/templates/pvc.yaml b/Helm_charts/MongoDB/templates/pvc.yaml index cd90e16..5e678c1 100644 --- a/Helm_charts/MongoDB/templates/pvc.yaml +++ b/Helm_charts/MongoDB/templates/pvc.yaml @@ -6,6 +6,11 @@ spec: accessModes: - ReadWriteOnce resources: + # NOTE: the backing PersistentVolume (templates/pv.yaml) is 10Gi but this + # claim only requests 1Gi. The bind still succeeds (a PVC binds to any PV + # that is >= the request), but ~9Gi of the manual hostPath volume sits + # unused. Raise this to 10Gi to consume the full volume, or shrink the PV to + # match if 1Gi is the real intent. 
requests: storage: 1Gi storageClassName: manual diff --git a/Helm_charts/MongoDB/templates/secret.yaml b/Helm_charts/MongoDB/templates/secret.yaml index 8f280ab..aaf8d6f 100644 --- a/Helm_charts/MongoDB/templates/secret.yaml +++ b/Helm_charts/MongoDB/templates/secret.yaml @@ -4,8 +4,8 @@ metadata: name: mongodb-secret type: Opaque stringData: - MONGO_ROOT_USERNAME: {{ .Values.secret.root_username }} - MONGO_ROOT_PASSWORD: {{ .Values.secret.root_password }} - MONGO_USERNAME: {{ .Values.secret.username }} - MONGO_PASSWORD: {{ .Values.secret.password }} - MONGO_USERS_LIST: {{ .Values.secret.users_list }} + MONGO_ROOT_USERNAME: {{ .Values.secret.root_username | quote }} + MONGO_ROOT_PASSWORD: {{ .Values.secret.root_password | quote }} + MONGO_USERNAME: {{ .Values.secret.username | quote }} + MONGO_PASSWORD: {{ .Values.secret.password | quote }} + MONGO_USERS_LIST: {{ .Values.secret.users_list | quote }} diff --git a/Helm_charts/MongoDB/templates/service.yaml b/Helm_charts/MongoDB/templates/service.yaml index 73dbc69..a76f6fd 100644 --- a/Helm_charts/MongoDB/templates/service.yaml +++ b/Helm_charts/MongoDB/templates/service.yaml @@ -5,9 +5,10 @@ metadata: labels: app: database spec: - type: NodePort + # I2: ClusterIP only. The datastore is reachable in-cluster (and via + # `kubectl port-forward` for admin) but no longer has an external NodePort. 
+ type: ClusterIP ports: - port: 27017 - nodePort: 30005 selector: app: database diff --git a/Helm_charts/MongoDB/templates/statefulset.yaml b/Helm_charts/MongoDB/templates/statefulset.yaml index be88df1..f3391b4 100644 --- a/Helm_charts/MongoDB/templates/statefulset.yaml +++ b/Helm_charts/MongoDB/templates/statefulset.yaml @@ -13,10 +13,22 @@ spec: labels: app: database selector: mongodb + environment: {{ .Values.labels.environment }} + app.kubernetes.io/managed-by: {{ .Values.labels.managedBy }} spec: + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} containers: - name: mongodb - image: mongo:4.0.8 + # mongo:4.2 (wire v8) is the minimum the services' pinned PyMongo + # supports after the CVE dependency bump (commit 5c224a3). mongo:4.0.8 + # (wire v7) was rejected at runtime with PyMongo error + # "requires at least 8 (MongoDB 4.2)", breaking gateway/converter. + image: mongo:4.2 + securityContext: + {{- toYaml .Values.containerSecurityContext | nindent 10 }} + resources: + {{- toYaml .Values.resources | nindent 10 }} env: - name: MONGO_INITDB_ROOT_USERNAME_FILE value: /etc/k8-test/admin/MONGO_ROOT_USERNAME diff --git a/Helm_charts/MongoDB/values.yaml b/Helm_charts/MongoDB/values.yaml index c2677f3..8a1b9ac 100644 --- a/Helm_charts/MongoDB/values.yaml +++ b/Helm_charts/MongoDB/values.yaml @@ -1,6 +1,38 @@ +# Credentials are injected at install time from environment variables by deploy.sh +# (`--set secret.*`) — NEVER commit real passwords here. The CHANGEME placeholders +# are only used if you `helm install` this chart directly without overriding them; +# deploy.sh requires MONGODB_USERNAME / MONGODB_PASSWORD and passes them in, keeping +# secrets out of the (public) repo. 
secret: - root_username: nasi - root_password: nasi1234 - username: nasi - password: nasi1234 - users_list: nasi \ No newline at end of file + root_username: mongouser + root_password: CHANGEME # deploy.sh: --set secret.root_password=$MONGODB_PASSWORD + username: mongouser + password: CHANGEME # deploy.sh: --set secret.password=$MONGODB_PASSWORD + users_list: mongouser + +# B2 gap-fix (require-requests-limits): right-sized for the demo workload. GridFS +# chunk writes are memory-hungry during uploads, so memory headroom matters more +# than CPU here. Review under production load. +resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + +# B2 gap-fix (require-labels): `app: database` already exists on the pod template; +# these add the two remaining Kyverno-required labels. +labels: + environment: prod + managedBy: helm + +# B2 gap-fix (require-seccomp). NOTE: the official mongo image's entrypoint runs +# as root to chown /data/db and run initdb — it CANNOT start with runAsNonRoot. +# That remains a DOCUMENTED require-non-root Audit exception (needs a Kyverno +# exclude rule before Enforce). Safe subset applied: seccomp + no priv-escalation. 
+podSecurityContext: + seccompProfile: + type: RuntimeDefault +containerSecurityContext: + allowPrivilegeEscalation: false diff --git a/Helm_charts/Postgres/init.sql b/Helm_charts/Postgres/init.sql index 8f7b0c7..0cb26b8 100644 --- a/Helm_charts/Postgres/init.sql +++ b/Helm_charts/Postgres/init.sql @@ -1,9 +1,23 @@ -CREATE TABLE auth_user ( +CREATE TABLE IF NOT EXISTS auth_user ( id integer GENERATED ALWAYS AS IDENTITY PRIMARY KEY, - email VARCHAR (255) NOT NULL, - password VARCHAR (255) NOT NULL + email VARCHAR (255) NOT NULL UNIQUE, + password VARCHAR (255) NOT NULL, + role VARCHAR (32) NOT NULL DEFAULT 'user', + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ); ---Add Username and Password for Admin User --- INSERT INTO auth_user (email, password) VALUES ('thomasfookins007helby@gmail.com', '123456'); -INSERT INTO auth_user (email, password) VALUES ('iambatmanthegoat@gmail.com', '123456'); \ No newline at end of file +-- SECURITY: the password column stores a bcrypt hash (NOT plaintext). The auth +-- service verifies logins with bcrypt.checkpw (constant-time) and hashes new +-- sign-ups with bcrypt.hashpw. +-- +-- RBAC: every row has a role. 'admin' unlocks Dashboard/Architecture/Users in the +-- frontend and any admin-gated backend endpoint; 'user' is the default for sign-ups. +-- +-- This file intentionally contains NO admin row and NO password hash — so nothing +-- secret ever lives in the (public) repo. The admin account is seeded at deploy +-- time by deploy.sh, which generates the bcrypt hash IN PostgreSQL via pgcrypto's +-- crypt()/gen_salt('bf') from the APP_LOGIN_EMAIL / APP_LOGIN_PASSWORD env vars. +-- pgcrypto bcrypt ($2a$) hashes are compatible with the auth service's bcrypt.checkpw. + +-- pgcrypto provides crypt()/gen_salt() used by deploy.sh to seed the admin securely. 
+CREATE EXTENSION IF NOT EXISTS pgcrypto; diff --git a/Helm_charts/Postgres/templates/postgres-deploy.yaml b/Helm_charts/Postgres/templates/postgres-deploy.yaml index 8dbce58..712970f 100644 --- a/Helm_charts/Postgres/templates/postgres-deploy.yaml +++ b/Helm_charts/Postgres/templates/postgres-deploy.yaml @@ -7,6 +7,14 @@ metadata: app: auth-app spec: replicas: 1 + {{- if .Values.persistence.enabled }} + # An RWO EBS volume attaches to one node at a time. Recreate (not the default + # RollingUpdate) tears the old pod down BEFORE starting the new one, so a rollout + # doesn't deadlock with the new pod stuck waiting on a still-attached volume. + # A single-replica datastore has no availability to lose by recreating. + strategy: + type: Recreate + {{- end }} selector: matchLabels: name: postgres-pod @@ -17,12 +25,20 @@ spec: labels: name: postgres-pod app: auth-app + environment: {{ .Values.labels.environment }} + app.kubernetes.io/managed-by: {{ .Values.labels.managedBy }} spec: + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} containers: - name: postgres image: {{ .Values.container.image }} ports: - containerPort: 5432 + securityContext: + {{- toYaml .Values.containerSecurityContext | nindent 10 }} + resources: + {{- toYaml .Values.resources | nindent 10 }} env: - name: POSTGRES_USER value: {{ .Values.container.env.user }} @@ -31,5 +47,23 @@ spec: - name: POSTGRES_DB value: {{ .Values.container.env.db }} - name: POSTGRES_HOST_AUTH_METHOD - value: trust + value: {{ .Values.container.env.authMethod }} + {{- if .Values.persistence.enabled }} + # initdb refuses to run in the volume root: a fresh EBS volume contains a + # lost+found directory, which the entrypoint won't treat as an empty data + # dir. Point PGDATA at a subdirectory so first-boot initdb gets a clean path. 
+ - name: PGDATA + value: /var/lib/postgresql/data/pgdata + {{- end }} + {{- if .Values.persistence.enabled }} + volumeMounts: + - name: postgres-data + mountPath: /var/lib/postgresql/data + {{- end }} + {{- if .Values.persistence.enabled }} + volumes: + - name: postgres-data + persistentVolumeClaim: + claimName: {{ .Values.persistence.claimName }} + {{- end }} diff --git a/Helm_charts/Postgres/templates/postgres-pvc.yaml b/Helm_charts/Postgres/templates/postgres-pvc.yaml new file mode 100644 index 0000000..8a6ec06 --- /dev/null +++ b/Helm_charts/Postgres/templates/postgres-pvc.yaml @@ -0,0 +1,20 @@ +{{- if .Values.persistence.enabled }} +# PersistentVolumeClaim for Postgres data. Without this, PGDATA lives in the +# pod's ephemeral filesystem and every registered user (except the deploy.sh seed +# admin) is lost on the first pod restart. ReadWriteOnce is correct for a +# single-replica datastore — exactly one pod mounts it at a time. +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.persistence.claimName }} + labels: + app: auth-app + app.kubernetes.io/part-of: vidcast +spec: + accessModes: + - ReadWriteOnce + storageClassName: {{ .Values.persistence.storageClassName }} + resources: + requests: + storage: {{ .Values.persistence.size }} +{{- end }} diff --git a/Helm_charts/Postgres/templates/postgres-service.yaml b/Helm_charts/Postgres/templates/postgres-service.yaml index bb126db..b3916d1 100644 --- a/Helm_charts/Postgres/templates/postgres-service.yaml +++ b/Helm_charts/Postgres/templates/postgres-service.yaml @@ -6,11 +6,12 @@ metadata: name: postgres-service app: auth-app spec: - type: NodePort + # I2: ClusterIP only. PostgreSQL is reached by the auth service in-cluster (and + # via `kubectl port-forward` for admin) — no external NodePort. 
+ type: ClusterIP ports: - port: {{ .Values.service.port }} targetPort: {{ .Values.service.port }} - nodePort: {{ .Values.service.nodeport }} selector: name: postgres-pod app: auth-app diff --git a/Helm_charts/Postgres/templates/postgres-storageclass.yaml b/Helm_charts/Postgres/templates/postgres-storageclass.yaml new file mode 100644 index 0000000..877c36c --- /dev/null +++ b/Helm_charts/Postgres/templates/postgres-storageclass.yaml @@ -0,0 +1,25 @@ +{{- if and .Values.persistence.enabled .Values.persistence.createStorageClass }} +# gp3 StorageClass backed by the aws-ebs-csi-driver addon (provisioned in +# terraform/modules/eks). The cluster has no default dynamic StorageClass — the +# in-tree kubernetes.io/aws-ebs provisioner is gone in k8s 1.31 — so the Postgres +# PVC needs this to bind. +# +# reclaimPolicy: Retain is deliberate (durability, A11): deleting the PVC must NOT +# delete the underlying EBS volume, so an accidental `helm uninstall` or pod churn +# can't take user accounts with it. Orphaned volumes are cleaned up manually. +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: {{ .Values.persistence.storageClassName }} + labels: + app.kubernetes.io/part-of: vidcast +provisioner: ebs.csi.aws.com +parameters: + type: gp3 + encrypted: "true" +reclaimPolicy: Retain +# WaitForFirstConsumer pins the volume to the AZ where the Postgres pod actually +# lands, avoiding a cross-AZ attach failure on a multi-subnet cluster. +volumeBindingMode: WaitForFirstConsumer +allowVolumeExpansion: true +{{- end }} diff --git a/Helm_charts/Postgres/values.yaml b/Helm_charts/Postgres/values.yaml index fd2d455..5aa2a84 100644 --- a/Helm_charts/Postgres/values.yaml +++ b/Helm_charts/Postgres/values.yaml @@ -4,8 +4,59 @@ service: nodeport: 30003 container: - image: postgres + # B2 gap-fix (disallow-latest-tag): pinned off the implicit :latest. 
16.4-alpine + # chosen for a small, low-CVE base; it honours POSTGRES_HOST_AUTH_METHOD and the + # init.sql bootstrap unchanged. Runtime re-verify owed on next cluster apply. + image: postgres:16.4-alpine env: - user: nasi - password: cnd2023 - db: authdb \ No newline at end of file + user: pguser + # Injected by deploy.sh: --set container.env.password=$POSTGRES_PASSWORD. + # Never commit a real password here (CHANGEME is a no-real-secret placeholder). + password: CHANGEME + # scram-sha-256: passwords are actually enforced (H-2 fix). Requires POSTGRES_PASSWORD + # to be non-empty (which deploy.sh guarantees via --set). Changed from 'trust', which + # accepted any password — access was network-controlled but the credential was cosmetic. + authMethod: scram-sha-256 + db: authdb + +# A11 durability: back PGDATA with an EBS-backed PVC so registered users survive a +# pod restart (without this, only the deploy.sh seed admin survives — everything in +# the ephemeral pod filesystem is lost). Requires the aws-ebs-csi-driver addon +# (terraform/modules/eks). Set persistence.enabled=false to fall back to the old +# ephemeral behaviour (e.g. a local kind cluster with no EBS). +persistence: + enabled: true + # Create the gp3 StorageClass from this chart. Set false if a suitable dynamic + # StorageClass already exists and you reference it via storageClassName below. + createStorageClass: true + storageClassName: vidcast-ebs-gp3 + claimName: postgres-pvc + # Small: the auth_user table is tiny. gp3 can be expanded later (allowVolumeExpansion). + size: 2Gi + +# B2 gap-fix (require-requests-limits): right-sized for the demo workload — small +# auth_user table, low query volume. Review under production load. +resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "250m" + memory: "256Mi" + +# B2 gap-fix (require-labels): `app: auth-app` already exists on the pod template; +# these add the two remaining Kyverno-required labels. 
+labels: + environment: prod + managedBy: helm + +# B2 gap-fix (require-seccomp). NOTE: the official postgres image's entrypoint +# runs as root to initdb and chown $PGDATA, then drops to the `postgres` user via +# gosu — it CANNOT start with runAsNonRoot: true. That remains a DOCUMENTED +# require-non-root Audit exception (needs a Kyverno exclude rule before Enforce). +# Safe subset applied: seccomp RuntimeDefault + no privilege escalation. +podSecurityContext: + seccompProfile: + type: RuntimeDefault +containerSecurityContext: + allowPrivilegeEscalation: false diff --git a/Helm_charts/RabbitMQ/templates/enabled-plugins-configmap.yaml b/Helm_charts/RabbitMQ/templates/enabled-plugins-configmap.yaml new file mode 100644 index 0000000..11a4f9d --- /dev/null +++ b/Helm_charts/RabbitMQ/templates/enabled-plugins-configmap.yaml @@ -0,0 +1,15 @@ +# B4 (M-2 metrics): enable the built-in rabbitmq_prometheus plugin so RabbitMQ +# exposes Prometheus metrics on :15692/metrics (queue depth, connections, etc.). +# This OVERRIDES the image's baked enabled_plugins, so it must also re-list +# rabbitmq_management (the NodePort UI on :15672 the project already relies on). +# +# Kept as a SEPARATE ConfigMap from rabbitmq-configmap on purpose: that one is +# consumed via envFrom (its keys become env vars), and we must NOT turn this file's +# content into an env var. This one is mounted as a file at /etc/rabbitmq. +apiVersion: v1 +kind: ConfigMap +metadata: + name: rabbitmq-enabled-plugins +data: + enabled_plugins: | + [rabbitmq_management,rabbitmq_prometheus]. 
diff --git a/Helm_charts/RabbitMQ/templates/secret.yaml b/Helm_charts/RabbitMQ/templates/secret.yaml index d714599..ed0608b 100644 --- a/Helm_charts/RabbitMQ/templates/secret.yaml +++ b/Helm_charts/RabbitMQ/templates/secret.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: Secret metadata: name: rabbitmq-secret +type: Opaque stringData: - PLACEHOLDER: "NONE" -type: Opaque \ No newline at end of file + RABBITMQ_DEFAULT_USER: {{ .Values.secret.default_user | quote }} + RABBITMQ_DEFAULT_PASS: {{ .Values.secret.default_pass | quote }} diff --git a/Helm_charts/RabbitMQ/templates/service.yaml b/Helm_charts/RabbitMQ/templates/service.yaml index 137f2d7..b15b360 100644 --- a/Helm_charts/RabbitMQ/templates/service.yaml +++ b/Helm_charts/RabbitMQ/templates/service.yaml @@ -2,8 +2,14 @@ apiVersion: v1 kind: Service metadata: name: {{ .Values.service.name }} + labels: + # B4: the ServiceMonitor selects the Service by this label. + app: rabbitmq spec: - type: NodePort + # I2: ClusterIP only. The management UI (15672) loses its NodePort — reach it via + # `kubectl port-forward svc/rabbitmq 15672` for admin. AMQP (5672) and the + # Prometheus port (15692) were already in-cluster only. + type: ClusterIP selector: app: rabbitmq ports: @@ -11,8 +17,13 @@ spec: protocol: TCP port: 15672 targetPort: 15672 - nodePort: 30004 - name: amqp protocol: TCP port: 5672 targetPort: 5672 + # B4: prometheus metrics port (rabbitmq_prometheus plugin). No nodePort — it is + # in-cluster only, scraped by the B4 ServiceMonitor. 
+ - name: prometheus + protocol: TCP + port: 15692 + targetPort: 15692 diff --git a/Helm_charts/RabbitMQ/templates/statefulset.yaml b/Helm_charts/RabbitMQ/templates/statefulset.yaml index dbf1c47..007d518 100644 --- a/Helm_charts/RabbitMQ/templates/statefulset.yaml +++ b/Helm_charts/RabbitMQ/templates/statefulset.yaml @@ -11,10 +11,18 @@ spec: metadata: labels: app: rabbitmq + environment: {{ .Values.labels.environment }} + app.kubernetes.io/managed-by: {{ .Values.labels.managedBy }} spec: + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} containers: - name: rabbitmq image: rabbitmq:3-management + securityContext: + {{- toYaml .Values.containerSecurityContext | nindent 12 }} + resources: + {{- toYaml .Values.resources | nindent 12 }} ports: - name: http protocol: TCP @@ -22,6 +30,10 @@ spec: - name: amqp protocol: TCP containerPort: 5672 + # B4: rabbitmq_prometheus metrics endpoint (scraped by a ServiceMonitor). + - name: prometheus + protocol: TCP + containerPort: 15692 envFrom: - configMapRef: name: rabbitmq-configmap @@ -30,7 +42,16 @@ spec: volumeMounts: - mountPath: /var/lib/rabbitmq name: rabbitmq-volume + # B4: override enabled_plugins to add rabbitmq_prometheus (subPath mounts + # just this one file, leaving the rest of /etc/rabbitmq untouched). 
+ - mountPath: /etc/rabbitmq/enabled_plugins + subPath: enabled_plugins + name: enabled-plugins + readOnly: true volumes: - name: rabbitmq-volume persistentVolumeClaim: - claimName: rabbitmq-pvc \ No newline at end of file + claimName: rabbitmq-pvc + - name: enabled-plugins + configMap: + name: rabbitmq-enabled-plugins \ No newline at end of file diff --git a/Helm_charts/RabbitMQ/values.yaml b/Helm_charts/RabbitMQ/values.yaml index 53003fa..ff61cb3 100644 --- a/Helm_charts/RabbitMQ/values.yaml +++ b/Helm_charts/RabbitMQ/values.yaml @@ -1,3 +1,42 @@ service: name: rabbitmq - port: 15672 \ No newline at end of file + port: 15672 + +secret: + default_user: rabbituser + # Injected by deploy.sh: --set secret.default_pass=$RABBITMQ_PASSWORD. + # Never commit a real password here (CHANGEME is a no-real-secret placeholder). + default_pass: CHANGEME + +# B2 gap-fix (require-requests-limits): right-sized for the demo workload — +# moderate queue depth, two durable queues. Review under production load. +resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + +# B2 gap-fix (require-labels): `app: rabbitmq` already exists on the pod template; +# these add the two remaining Kyverno-required labels. +labels: + environment: prod + managedBy: helm + +# B2 gap-fix (require-non-root + require-seccomp). UNLIKE mongo/postgres, the +# rabbitmq image runs cleanly as the non-root `rabbitmq` user (uid 999) when its +# data dir is group-writable — fsGroup 999 makes the PVC writable by that gid. So +# rabbitmq FULLY satisfies require-non-root (no Enforce exception needed). Runtime +# re-verify owed: confirm the broker boots non-root against the existing PVC. 
+podSecurityContext: + runAsNonRoot: true + runAsUser: 999 + runAsGroup: 999 + fsGroup: 999 + seccompProfile: + type: RuntimeDefault +containerSecurityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000..dadccbb --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,122 @@ +pipeline { + agent any + + environment { + DOCKERHUB = credentials('dockerhub-credentials') + AWS_CREDS = credentials('aws-credentials') + CLUSTER = 'vidcast-cluster' + REGION = 'eu-west-2' + BUILD_TAG = "${env.BUILD_NUMBER}-${env.GIT_COMMIT?.take(7) ?: 'unknown'}" + STAGING_IP = credentials('swarm-staging-ip') + } + + stages { + stage('Checkout') { + steps { + git branch: 'main', url: 'https://github.com/johnnybabs/vidcast.git' + } + } + + stage('Lint') { + steps { + sh 'pip install ruff && ruff check src/ --exclude src/frontend' + } + } + + stage('Build Images') { + parallel { + stage('Build Auth') { + steps { + sh "docker build -t vidcast/auth:${BUILD_TAG} src/auth-service/" + } + } + stage('Build Gateway') { + steps { + sh "docker build -t vidcast/gateway:${BUILD_TAG} src/gateway-service/" + } + } + stage('Build Converter') { + steps { + sh "docker build -t vidcast/converter:${BUILD_TAG} src/converter-service/" + } + } + stage('Build Notification') { + steps { + sh "docker build -t vidcast/notification:${BUILD_TAG} src/notification-service/" + } + } + } + } + + stage('Security Scan') { + steps { + sh """ + for svc in auth gateway converter notification; do + trivy image --severity CRITICAL,HIGH --exit-code 1 \ + --ignore-unfixed vidcast/\${svc}:${BUILD_TAG} + done + """ + } + } + + stage('Push Images') { + steps { + sh "echo \$DOCKERHUB_PSW | docker login -u \$DOCKERHUB_USR --password-stdin" + sh """ + for svc in auth gateway converter notification; do + docker push vidcast/\${svc}:${BUILD_TAG} + done + """ + } + } + + stage('Deploy Staging (Swarm)') { + steps { + sh """ + ssh -o StrictHostKeyChecking=no 
ubuntu@${STAGING_IP} \ + 'docker stack deploy -c docker-compose.swarm.yml vidcast' + """ + sh 'sleep 30' + } + } + + stage('Smoke Test Staging') { + steps { + sh "curl -f http://${STAGING_IP}:8080/healthz || exit 1" + } + } + + stage('Approve Production') { + steps { + input message: 'Staging tests passed. Deploy to Production?', ok: 'Deploy to Production' + } + } + + stage('Deploy Production (EKS)') { + steps { + sh """ + aws eks update-kubeconfig --name ${CLUSTER} --region ${REGION} + for svc in auth gateway converter notification; do + kubectl set image deployment/\${svc} \${svc}=vidcast/\${svc}:${BUILD_TAG} + kubectl rollout status deployment/\${svc} --timeout=120s + done + """ + } + } + } + + post { + failure { + sh """ + aws eks update-kubeconfig --name ${CLUSTER} --region ${REGION} || true + for svc in auth gateway converter notification; do + kubectl rollout undo deployment/\${svc} || true + done + """ + echo "PIPELINE FAILED — automatic rollback executed for all services" + } + success { + echo "Pipeline completed — build ${BUILD_TAG} deployed to production" + } + } +} diff --git a/PHASE_UP_PLAN.md b/PHASE_UP_PLAN.md new file mode 100644 index 0000000..67b7be1 --- /dev/null +++ b/PHASE_UP_PLAN.md @@ -0,0 +1,601 @@ +# PHASE_UP_PLAN.md — VidCast Hardening & Differentiation + +> **Status: Sprint 0 deliverable. PLAN ONLY. No code has been written.** +> This document is the sign-off gate for everything that follows. Nothing in +> Sprints 1–5 starts until the operator explicitly approves (and answers the open +> questions in §6). Honest dissent is in §7 — read it before signing. + +> **Author's framing note.** I read `TECHNICAL_ANALYSIS.md`, the two project +> memories, the live source (`gateway storage/util.py`, `converter consumer.py`, +> `terraform/environments/dev/main.tf`), and CLAUDE.md before writing this. 
The +> plan is grounded in the *actual* current state, not the idealised one: +> **the EKS cluster is currently TORN DOWN** (destroyed 2026-06-03 for cost +> savings; state backend + tfvars + ECR images preserved for a ~20-min +> re-apply). The app is feature-complete and was E2E-verified on `main` at +> `c36b319`. That teardown fact materially changes the cost calculus of Part A5 +> and is the spine of my pushback in §7. + +--- + +## 0. How to read this document + +| Section | What it answers | +|---|---| +| §1 Executive summary | The non-technical "why" and "what" | +| §2 Scope, sequencing, dependency graph | What we build, in what order, and why that order | +| §3 Trade-off matrices | Every non-obvious decision, scored | +| §4 Risk register (per sprint) | What breaks and how we prevent/detect it | +| §5 Rollback strategy (per sprint) | How we undo each change if staging breaks | +| §6 Open questions | What I need from the operator **before** Sprint 1 | +| §7 What I would push back on | Where I think the prompt is wrong/over-scoped | +| §8 Revised readiness table | Where each capability moves, sprint by sprint | +| §9 Per-sprint review-gate checklist | The one-page sign-off ritual | + +--- + +## 1. Executive summary (for a non-technical stakeholder) + +VidCast already works: a user uploads a video, the system pulls the audio out, +and emails them a download link. It already runs on professional cloud +infrastructure (AWS), with automated security scanning, monitoring, and a login +system with user roles. An independent technical review rated it "well above +average" for a portfolio project. + +This phase does two things. + +**First, it closes the known gaps** that separate "great demo" from "could run +a real business." Today, if the messaging system hiccups at the wrong moment, a +user's upload could be silently lost; the databases run *inside* the cluster +(so they vanish if the cluster is rebuilt); and secrets are managed by hand. 
We +fix all of that by adopting the same managed, durable services a real company +would use, and by adding a "transactional outbox" — a safety ledger that +guarantees no upload event is ever dropped, even during an outage. + +**Second, it adds five capabilities that make VidCast genuinely stand out** from +peer projects: automated "GitOps" deployments (the system deploys itself from +git, with an approval gate); automated policy enforcement (the cluster refuses +to run insecure containers); a live cost dashboard answering "what does this +cost to run?"; reliability targets with automatic alerting when we're at risk of +missing them; and cryptographic proof that every running container was built by +us and not tampered with. + +**The honesty commitment:** every feature we claim in the README will be backed +by code that actually does it. Anything partial is labelled "Partial" with the +reason. This matches the standard the project already sets. + +**The one thing the stakeholder must understand about cost:** the "managed +services" upgrade (managed databases, managed message broker) takes the running +cost from roughly **$10/month to a few hundred dollars/month** if left on +permanently. Because this is a portfolio project, the recommended posture is to +build all of it as *code that can be turned on in ~20 minutes for a demo and +turned off again* — not to leave it running. See §7.1. + +--- + +## 2. Scope, sequencing, and dependency graph + +### 2.1 What's in (mapped to the prompt) + +Part A (import from peer): A1 outbox · A2 idempotency · A3 retry/DLQ · A4 +gunicorn (+ FastAPI decision) · A5 managed datastores · A6 NetworkPolicy +default-deny · A7 KEDA+HPA · A8 SBOM/SARIF/ECR hardening · A9 External Secrets +Operator · A10 Kustomize overlays. + +Part B (differentiation): B1 Argo CD GitOps · B2 Kyverno policy-as-code · B3 +Kubecost FinOps · B4 SLO burn-rate alerting · B5 cosign + Kyverno verify. 
+ +### 2.2 What's explicitly out (per prompt §"NOT asking for") — restated so it's on the record + +- **Service mesh** (Linkerd/Istio) — parked. NetworkPolicy + Kyverno cover the + 80%. Documented as a deliberate omission in `SUPPLY_CHAIN.md` / README. +- **Multi-region** — out of scope; documented as deliberate with the trade-off + (single-region eu-west-2 SPOF accepted for a demo; HA would need RDS + cross-region read replica + Route53 failover + DocumentDB global cluster, all + cost-prohibitive here). +- **Switching IaC tool** — Terraform stays. +- **Single-CI consolidation** — both GitHub Actions and Jenkins stay; the + Jenkins manual approval gate is a strength. The Argo CD migration (B1) + *relocates* the gate to a manifest-repo PR rather than removing it (see §2.5). + +### 2.3 Execution split (non-negotiable per prompt §4) + +| I implement directly | the operator writes (I provide diffs + explanation only) | +|---|---| +| Terraform modules (RDS, DocumentDB/Atlas, Amazon MQ, ElastiCache, ECR, ESO IRSA) | `.github/workflows/ci.yml` changes (SBOM, SARIF, cosign sign) | +| Helm values / installs (ESO, Kyverno, Argo CD, Kubecost, KEDA) | `.github/workflows/cd.yml` changes (open-PR-to-manifest-repo flow) | +| Kustomize `base/`+`overlays/` | `Jenkinsfile` changes (gate relocation, smoke-test additions) | +| Kyverno ClusterPolicies, Argo CD `Application` CRDs, PrometheusRules, Grafana dashboards | — | +| Application *code* changes (outbox writer, relay, idempotency lock, DLQ topology, gunicorn entrypoint) | — | +| `ExternalSecret`/`SecretStore` CRDs, NetworkPolicies, KEDA `ScaledObject`, HPA | — | + +**Coupling this creates** (flagged early because it bites in Sprint 4): Kyverno +`verify-images` (mine, B5) is inert until CI actually signs images (the operator's, +B5/A8). We ship the policy in **Audit** mode first so it can't block deploys +before signing exists, then promote to Enforce only after the operator's signing job is +merged and producing signatures. 
Sequencing is in §2.4. + +### 2.4 Dependency graph (why the sprint order is what it is) + +``` +A10 Kustomize ───────────────► B1 Argo CD (Argo needs overlays to sync) +A9 ESO ─────────────────────► A5 cutover (managed DBs need creds in SM) +A5 managed DB Terraform ────► Sprint 5 cutover (build before flip) +A1 outbox ──► A3 DLQ ──► A2 idempotency (outbox feeds queues; idempotency + guards redelivery from DLQ) +A8 SBOM/cosign (CI) ─────────► B5 Kyverno verify (policy verifies what CI signs) +B2 Kyverno (Audit) ──────────► B2 Kyverno (Enforce) +A6 NetworkPolicy ◄── needs VPC CNI network-policy add-on enabled (Terraform, Sprint 1) +B4 SLO alerts ◄── needs RabbitMQ exporter + real /metrics (fixes M-2 first) +``` + +The prompt's Sprint 1→5 ordering respects this graph. I am keeping it. The only +re-ordering I propose: **enable the VPC CNI network-policy agent in the EKS +add-on config in Sprint 1** (Terraform), even though the NetworkPolicies +themselves land in Sprint 2 — because that add-on flag is `ForceNew`-adjacent +(changing add-on config can recycle the agent) and is cheapest to set while the +cluster is being re-applied from scratch anyway. + +### 2.5 The approval-gate migration (B1) — explicit, because the prompt demands honesty here + +Today: Jenkins builds → deploys to Swarm staging → smoke test → **human clicks +"approve"** → `kubectl set image` to EKS. + +After B1: GitHub Actions builds + pushes image → **opens a PR** against the +manifest repo (or `apps/` dir) bumping the image tag in `overlays/prod`. Argo CD +watches that path with **auto-sync OFF for prod**. The deploy *is* the merge of +that PR. + +**Why this is stronger, not weaker:** +- The gate moves from an ephemeral Jenkins button (no durable record, tied to + one CI server's uptime) to a **git PR with reviewers, diff, CI checks, and an + immutable audit trail**. You can see exactly which image SHA went to prod, + who approved it, and when — forever.
+- Rollback becomes `git revert` of the tag bump (Argo re-syncs to the previous + SHA), instead of `kubectl rollout undo` (which is correct but invisible in + git history). +- The Jenkins Swarm smoke-test stage **stays** — it just gates *opening the PR* + rather than gating the kubectl call. Defence in depth, not replacement. + +**Honest caveat:** running two gates (Jenkins smoke-test AND manifest PR) is +arguably redundant for a solo project. I keep both because the prompt says keep +both and because it's a legitimate "I understand the difference between staging +verification and prod authorisation" talking point. If the operator wants to simplify +later, the cleaner end-state is Jenkins→Swarm smoke-test→auto-open-PR, GitHub +review = the single human gate. + +--- + +## 3. Trade-off matrices + +Scoring: **1 = worst, 5 = best** on each axis (higher is always better — e.g. a +high "cost" score means *cheaper*). "Team-fit" = fit for a solo +portfolio/learning context. Weighted columns aren't summed blindly; the +recommendation paragraph states what actually drove the choice. + +### 3.1 MongoDB managed choice (GridFS is the hard constraint) + +| Option | Cost (mo) | Impl time | Ops complexity | Scale ceiling | Team-fit | Compliance | Learning | Notes | +|---|---|---|---|---|---|---|---|---| +| **MongoDB Atlas (M10)** | 3 (~$57/mo) | 5 | 5 | 4 | 5 | 4 | 4 | Real MongoDB → **GridFS works unchanged**. Off-AWS (PrivateLink to VPC). Free M0 tier exists for dev. | +| **Amazon DocumentDB** | 2 (~$200/mo min, t3.medium) | 3 | 3 | 4 | 2 | 5 | 4 | **GridFS partially supported** — DocumentDB emulates the Mongo API and historically had gaps around some GridFS/`fs.chunks` operations. **Must be functionally tested before trusting.** Pricey minimum. | +| **In-cluster StatefulSet (keep, gate dev-only)** | 5 (~$0, on node) | 5 | 2 | 2 | 5 | 1 | 3 | Zero new cost; no durability beyond the PVC; what we have today. 
| + +**Recommendation: Atlas for the managed path (default per prompt), in-cluster +StatefulSet retained as `dev-only` behind `var.use_managed_datastores=false`.** +Driver: GridFS is load-bearing in VidCast (videos *and* mp3s live in GridFS) and +Atlas is genuine MongoDB, so it's zero application risk. DocumentDB's GridFS +support is the single biggest sleeper risk in Part A5 — **I will write an +explicit GridFS smoke test** (put a >255KB file so it chunks, read it back, byte +-compare) and the plan does **not** assume DocumentDB until that test passes. If +the operator prefers all-AWS for the compliance/narrative story, we run that test in +Sprint 1 and only then commit to DocumentDB. Atlas M0 (free) covers dev. + +### 3.2 Broker choice + +| Option | Cost (mo) | Impl time | Ops complexity | Scale ceiling | Team-fit | Compliance | Learning | Notes | +|---|---|---|---|---|---|---|---|---| +| **Amazon MQ for RabbitMQ** | 3 (~$25–30/mo single-instance; more for cluster) | 5 | 4 | 3 | 5 | 5 | 4 | **Drop-in** — same AMQP, Pika unchanged, same management API. Our DLQ/retry topology (A3) ports verbatim. | +| **Amazon MSK (Kafka)** | 1 (~$130+/mo min) | 1 | 2 | 5 | 1 | 5 | 5 | Would require **rewriting every producer/consumer** from Pika→Kafka. Massive scope creep. Huge learning value, wrong phase. | +| **Clustered Helm RabbitMQ (in-cluster)** | 5 (~$0) | 4 | 2 | 3 | 4 | 1 | 3 | Free; clustering on a single node is theatre (no real HA on one node). | + +**Recommendation: Amazon MQ for RabbitMQ (default per prompt), in-cluster Helm +RabbitMQ retained dev-only behind the toggle.** Driver: it's the only managed +option that doesn't force an application rewrite — A1/A2/A3 are designed against +AMQP semantics and Amazon MQ preserves them. MSK is explicitly rejected as +out-of-scope scope-creep (the prompt asks for reliability patterns, not a +messaging-platform migration); I document it as the "if this were event-sourced +at scale" path. 
**Single-instance Amazon MQ for cost**; note that +single-instance is not HA — documented honestly, cluster mode is a one-flag +change if needed for a demo. + +### 3.3 Outbox relay mechanism ⚠️ (the prompt's default is, I believe, wrong — see §7.2) + +> **Terminology fix:** the prompt says "goroutine" — that's Go. VidCast is +> Python. The in-process equivalent is a background **thread** (or +> `APScheduler`). This matters for the conclusion. + +| Option | Cost (mo) | Impl time | Ops complexity | Scale ceiling | Team-fit | Compliance | Learning | Correctness under our topology | +|---|---|---|---|---|---|---|---|---| +| **In-process thread in gateway** | 5 | 5 | 4 | 2 | 4 | 3 | 3 | ❌ **Broken by default.** Gateway runs `gunicorn -w 4` (A4) → 4 worker processes → **4 relay threads** all scanning `outbox` and double/quadruple-publishing. Needs a Mongo-level claim/lock or single-worker carve-out. | +| **Sidecar container in gateway pod** | 5 | 4 | 3 | 2 | 4 | 3 | 4 | Scales with gateway replicas → N relays → same multi-publisher problem unless leader-elected. Shares pod lifecycle. | +| **Separate single-replica Deployment** | 5 (~$0, tiny) | 3 | 4 | 4 | 5 | 4 | 5 | ✅ **Correct by construction.** One replica = one publisher = no double-send. Scales/restarts independently. Idempotent consumers (A2) make even an occasional double-publish during rollover harmless. | + +**Recommendation: separate single-replica Deployment (`outbox-relay`), +overriding the prompt's "default in-process".** Driver: correctness. The outbox +pattern's entire value is "exactly-this-event, eventually." Running the relay +inside a multi-worker gunicorn process re-introduces the duplicate-publish +problem the pattern exists to prevent. A single-replica deployment makes the +invariant structural rather than something we have to defend with a distributed +lock. 
It also reads better in an interview ("I separated the relay because the +app server is multi-process") than explaining a Mongo lock retrofitted onto a +thread. Cost is negligible (it's a 50m/64Mi pod). **Belt-and-braces:** the relay +marks rows `published_at` and the consumers are idempotent (A2), so a duplicate +during a relay pod restart is a no-op, not a double-email. See §7.2. + +### 3.4 Flask → FastAPI (the prompt asks me to propose, with default = stay on Flask + gunicorn now) + +| Option | Cost | Impl time | Ops complexity | Scale ceiling | Team-fit | Compliance | Learning | Notes | +|---|---|---|---|---|---|---|---|---| +| **gunicorn now, FastAPI never** | 5 | 5 | 5 | 3 | 4 | 3 | 2 | Fixes M-1 immediately. Sync framework caps the streaming-upload concurrency story. | +| **gunicorn now, FastAPI as a follow-on phase** | 5 | 4 | 4 | 4 | 5 | 3 | 5 | Get the prod-server win this phase; bank async migration as a clean, self-contained future phase with real before/after load numbers. | +| **FastAPI migration now** | 4 | 1 | 2 | 5 | 2 | 3 | 5 | Rewrites both web services mid-reliability-sprint. High delivery risk; competes for attention with outbox/DLQ which matter more. | + +**Recommendation: gunicorn now (Sprint 2), FastAPI as an explicitly-scoped +follow-on phase (NOT this phase).** Driver: delivery risk vs. value timing. The +production-server fix (gunicorn `-w` workers + a proper WSGI entrypoint) is a +one-file change that closes M-1 today. A Flask→FastAPI rewrite is genuinely +valuable for the upload-streaming path (`async` + `UploadFile` streaming beats +Werkzeug's buffer-to-`/tmp`), and it's a strong learning artifact — but doing it +*during* the reliability sprint dilutes both. I'll write the gunicorn entrypoint +so the eventual FastAPI swap is a contained blast radius (keep `server` importable, +keep route handlers thin). The follow-on phase should produce a load-test +before/after (locust/k6) so the async benefit is *measured*, not asserted. 
+ +### 3.5 Argo CD vs Flux + +| Option | Cost | Impl time | Ops complexity | Scale ceiling | Team-fit | Compliance | Learning | Notes | +|---|---|---|---|---|---|---|---|---| +| **Argo CD** | 5 | 4 | 3 | 4 | 5 | 4 | 5 | Has a UI (huge for demos/screenshots), `Application` CRD model is intuitive, sync-waves, manual-sync gate maps perfectly to the prod approval requirement. Heavier footprint. | +| **Flux** | 5 | 3 | 4 | 4 | 3 | 4 | 4 | Lighter, more "pure GitOps", no first-party UI (needs Weave GUI/CLI). Kustomize-native. Less visual for a portfolio. | +| **Both / neither (keep kubectl CD)** | 5 | 5 | 5 | 2 | 2 | 2 | 1 | Status quo; no GitOps story. | + +**Recommendation: Argo CD (default per prompt).** Driver: it's a *portfolio* +project — the Argo UI gives screenshottable, demoable evidence of sync state, +drift detection, and the manual-sync prod gate, which is exactly the +differentiation B1 is for. Flux is arguably more elegant but invisible. The +manual-sync-for-prod / auto-sync-for-dev split is a built-in first-class concept +in Argo (`syncPolicy.automated` present vs absent). + +### 3.6 Kubecost OSS vs OpenCost vs AWS Cost Explorer + custom exporter + +| Option | Cost | Impl time | Ops complexity | Scale ceiling | Team-fit | Compliance | Learning | Notes | +|---|---|---|---|---|---|---|---|---| +| **Kubecost (free OSS)** | 4 | 4 | 3 | 3 | 5 | 3 | 4 | Turnkey UI + Grafana data source, allocation by namespace/label, AWS spot/on-demand split. Free tier limits: 15-day metric retention, single cluster — fine here. | +| **OpenCost** | 5 | 3 | 2 | 3 | 4 | 3 | 4 | The CNCF core Kubecost is built on; more DIY for dashboards, no polished UI. More "I built it from primitives" cred, more work. | +| **AWS Cost Explorer + custom exporter** | 4 | 2 | 2 | 4 | 2 | 4 | 5 | Billing-accurate (real invoice data) but no per-pod/per-namespace granularity without heavy custom tagging+ETL. Most work. 
| +| **Hybrid (chosen): Kubecost for in-cluster allocation + CE/CUR for ground-truth $** | 4 | 3 | 3 | 4 | 5 | 4 | 5 | Use Kubecost for "cost-per-minute-converted" and per-service breakdown; reconcile the total against the real AWS bill so the README number is *honest*. | + +**Recommendation: Kubecost OSS as the primary (default per prompt), reconciled +against AWS Cost Explorer for the headline number.** Driver: Kubecost gives the +per-service / cost-per-conversion granularity B3 needs out of the box, but its +node-cost model is an *estimate*. To honour the honesty principle, the README's +"What does VidCast cost?" number will be cross-checked against the actual AWS +bill, and the dashboard will label estimated vs. billed. OpenCost is the same +engine with more assembly; not worth it here. + +### 3.7 Cosign keyless vs key-based vs Notary v2 + +| Option | Cost | Impl time | Ops complexity | Scale ceiling | Team-fit | Compliance | Learning | Notes | +|---|---|---|---|---|---|---|---|---| +| **Cosign keyless (GitHub OIDC + Fulcio/Rekor)** | 5 | 4 | 4 | 5 | 5 | 5 | 5 | **No private key to manage** — identity = the GitHub Actions OIDC token, logged in the Rekor transparency log. Kyverno `verify-images` matches on the repo-scoped identity. Modern SLSA-aligned story. | +| **Cosign key-based** | 5 | 4 | 3 | 4 | 3 | 4 | 4 | A keypair you must store (KMS/secret) and rotate — reintroduces the secret-management problem A9 just solved. | +| **Notary v2 / notation** | 5 | 2 | 2 | 4 | 2 | 4 | 3 | Less ubiquitous tooling/docs, weaker Kyverno integration story than cosign. | + +**Recommendation: cosign keyless (default per prompt) using GitHub Actions OIDC.** +Driver: it's the strongest *and* the simplest here — no key to store (consistent +with A9's "get secrets out of files" thesis), and the verifiable chain (Fulcio +cert → Rekor log → Kyverno policy scoped to +`repo:<owner>/vidcast`) is exactly the SLSA narrative B5/ +`SUPPLY_CHAIN.md` is meant to demonstrate.
**Prerequisite I'll flag loudly:** +keyless verification at admission requires the cluster to reach Fulcio/Rekor +(public sigstore) — fine on EKS with egress; would need the NetworkPolicy DNS/ +egress carve-out (A6) to not block it. + +--- + +## 4. Risk register (per sprint) + +Severity: 🔴 high · 🟠 medium · 🟢 low. Each row: risk → mitigation → detection. + +### Sprint 1 — Foundation (A5 Terraform, A9 ESO, A10 Kustomize) + +| # | Sev | Risk | Mitigation | Detection | +|---|---|---|---|---| +| 1.1 | 🔴 | Managed-datastore Terraform applied → **surprise AWS bill** (RDS Multi-AZ + DocumentDB + Amazon MQ + ElastiCache ≈ hundreds/mo) | Build behind `var.use_managed_datastores`, **default false**; do NOT `apply` the managed modules in Sprint 1 — `terraform plan` only, reviewed for cost; a `terraform-cost` note in the review gate | AWS Budgets alert at $50; review the plan's resource list before any apply | +| 1.2 | 🔴 | DocumentDB GridFS incompatibility discovered late | Sprint 1 spike: stand up smallest DocumentDB, run the GridFS chunk test, decide DocumentDB vs Atlas *before* writing the rest of A5 | Test fails → fall back to Atlas (already the default) | +| 1.3 | 🟠 | A10 Kustomize refactor silently changes a rendered manifest (drops a securityContext, env, probe) | `kubectl kustomize overlays/dev > rendered.yaml` and **diff against the current raw manifests**; CI `kustomize build` check | Pre/post render diff must be empty except intended changes | +| 1.4 | 🟠 | ESO misconfig → pods can't get secrets → CrashLoop on next rebuild | Keep gitignored `secret.yaml` working in parallel until ESO is proven; flip per-service | `kubectl describe externalsecret` status `SecretSynced` | +| 1.5 | 🟢 | IRSA role for ESO over-scoped | Scope the Secrets Manager IAM policy to `vidcast/*` ARNs only | `terraform plan` policy review | + +### Sprint 2 — Reliability core (A1, A2, A3, A4, A6, A7) + +| # | Sev | Risk | Mitigation | Detection | +|---|---|---|---|---| +| 2.1 | 🔴 | Outbox relay 
double-publishes (multi-worker) | Separate single-replica relay (§3.3) + idempotent consumers (A2) + `published_at` marker | Duplicate-email count; outbox rows stuck `unpublished` | +| 2.2 | 🔴 | A6 default-deny NetworkPolicy **without** the VPC CNI network-policy agent → policies silently do nothing (declarative-only) | Enable the add-on in Sprint 1 Terraform; **verify enforcement** with a deny test (exec into a pod, `curl auth:5000`, expect timeout) | Negative test: blocked call must hang/fail | +| 2.3 | 🔴 | Default-deny breaks DNS / the app entirely | Land NetworkPolicies in **Audit mindset**: apply allow-rules first, default-deny last; explicit DNS egress carve-out to kube-dns; per-service allow matrix written before deny | Smoke test after each policy; rollback = delete the deny policy | +| 2.4 | 🟠 | KEDA scale-to-zero + HPA both target the **same** Deployment → fighting controllers | Prompt already mandates the fix: KEDA→converter, HPA→gateway (different Deployments). Verify no overlap in `scaleTargetRef` | `kubectl get hpa,scaledobject` — distinct targets | +| 2.5 | 🟠 | DLQ topology misconfigured → messages loop forever (poison) or vanish | Bounded `MAX_RETRIES`; retry queue TTL dead-letters *back* to main; terminal DLQ via `vidcast.dlx`; consumers do **not** consume retry queues | Inspect queue depths; a message with retry-count > MAX lands in DLQ, not main | +| 2.6 | 🟠 | gunicorn worker count starves the 2-vCPU node (converter already at 2 replicas for CPU) | Conservative `-w 2` for gateway/auth; set against resource limits already tuned in U3 | Pod OOM/CPU throttle metrics | +| 2.7 | 🟢 | Redis (A2) becomes a new SPOF | Dev: in-cluster Redis; prod: ElastiCache single-AZ acceptable per prompt; lock TTL short so a Redis outage degrades to "occasional duplicate", not "stuck" | Redis up/down alert | + +### Sprint 3 — Differentiation core (B1 Argo CD, B2 Kyverno) + +| # | Sev | Risk | Mitigation | Detection | +|---|---|---|---|---| +| 3.1 | 🔴 | Argo CD auto-sync 
(dev) fights manual `kubectl` changes → drift war / surprise reverts | Declare Argo the owner of app manifests once cutover; stop hand-`kubectl apply` for synced apps; document the new workflow in GITOPS.md | Argo "OutOfSync" / unexpected self-heal events | +| 3.2 | 🔴 | Kyverno in **Enforce** too early blocks all deploys (e.g. require-non-root catches a stray pod) | Prompt-mandated: **Audit mode for one PR cycle**, fix violations, *then* Enforce; verify-images stays Audit until cosign signing exists | `kubectl get policyreport` shows violations before promotion | +| 3.3 | 🟠 | Argo prod app auto-syncs by accident (gate bypassed) | `syncPolicy.automated` **absent** on prod Application; codify in review checklist; RBAC who can click "Sync" | Inspect prod Application spec; sync history | +| 3.4 | 🟠 | Manifest-repo PR flow (CD change, the operator's) not ready → Argo has nothing to sync | Argo can point at the same repo's `overlays/prod` initially (in-repo), defer separate manifest repo if the operator prefers; decision in §6 | — | +| 3.5 | 🟢 | Kyverno admission webhook latency / availability affects all pod creates | Kyverno HA not needed at this scale; `failurePolicy: Ignore` during Audit, revisit for Enforce | Webhook latency metric | + +### Sprint 4 — Differentiation polish (B3 Kubecost, B4 SLO alerts, B5 cosign, A8 SBOM/SARIF) + +| # | Sev | Risk | Mitigation | Detection | +|---|---|---|---|---| +| 4.1 | 🔴 | Kyverno `verify-images` Enforce blocks deploys because not all images are signed (esp. 
**frontend**, which CI doesn't build) | Add frontend to signing scope (or exempt it explicitly in policy with a documented reason); promote verify-images to Enforce **only** after every deployed image is signed | Audit policyreport: any unsigned deployed image | +| 4.2 | 🟠 | SLO numbers are meaningless on a single-node, frequently-torn-down cluster (teardowns instantly blow a 99.9% budget) | Label SLOs **"demonstrative"** in SLO.md; compute burn rate over *uptime windows*, document the single-node caveat honestly | n/a — documentation honesty | +| 4.3 | 🟠 | B4 requires real metrics; M-2 says gateway has **no /metrics** and **no RabbitMQ exporter** | Fix M-2 first in Sprint 4: re-add a `/metrics` endpoint (request + queue gauges) and deploy the RabbitMQ Prometheus plugin; only then write the burn-rate rules | Prometheus targets all `up`; the old dangling alerts replaced | +| 4.4 | 🟠 | cosign keyless verify can't reach Fulcio/Rekor (egress blocked by A6) | A6 egress carve-out includes sigstore endpoints; test verify in Audit first | Kyverno verify failures with network errors | +| 4.5 | 🟢 | SBOM/SARIF upload needs `security-events: write` + GHAS enabled on the repo | Confirm GitHub Advanced Security availability (public repo = free) in §6 | SARIF tab populates | + +### Sprint 5 — Cutover + README + +| # | Sev | Risk | Mitigation | Detection | +|---|---|---|---|---| +| 5.1 | 🔴 | Flipping `use_managed_datastores=true` in prod = **the big bill** + a real data migration (GridFS dump/restore, Postgres `pg_dump`, queue drain) | See §7.1 — recommend **NOT** leaving it on; cutover only inside a timed demo window then destroy. 
Migration runbook with dump/restore + GridFS chunk verify; bcrypt seed must precede auth image (known hazard from memory) | Post-cutover E2E smoke (login→upload→convert→email→download) | +| 5.2 | 🔴 | Decommissioning in-cluster stateful Helm charts in prod overlay before data is migrated = data loss | Migrate-then-decommission ordering; decommission only in `overlays/prod`, dev keeps Helm charts; snapshot before delete | Data byte-compare post-migration | +| 5.3 | 🟠 | README rewrite over-claims (violates honesty principle) | Every claim cross-checked against shipped code; readiness table audited; "Partial" where partial | Self-review + the §9 gate | + +--- + +## 5. Rollback strategy (per sprint) + +The governing principle: **every change is reversible without touching prod data +until Sprint 5.** Sprints 1–4 add capabilities behind toggles/Audit modes; the +only destructive sprint is 5, which gets a snapshot-first runbook. + +| Sprint | Change | How to undo if staging breaks | +|---|---|---| +| **1** | A5 managed Terraform | It's `plan`-only / behind a `false` toggle — nothing applied, nothing to roll back. If a managed module *was* applied for the GridFS spike: `terraform destroy -target=module.documentdb` (and friends). State backend untouched. | +| **1** | A9 ESO | Per-service flip; the gitignored `secret.yaml` is kept until ESO proven. Roll back = `kubectl apply` the old secret + remove the `ExternalSecret`. `helm uninstall external-secrets`. | +| **1** | A10 Kustomize | The raw manifests stay in git history; `git revert` the overlay commit and `kubectl apply -f src/*/manifest/` as before. Rendered-diff gate means dev knows it's equivalent. | +| **2** | A1 outbox | Feature-flag `OUTBOX_ENABLED`; off = gateway publishes directly (today's path) and the compensating `fs.delete` stays as the fallback. Relay deployment scaled to 0. | +| **2** | A2 idempotency | `IDEMPOTENCY_ENABLED` flag; off = consumers behave as today. 
Redis outage is already a graceful-degrade, not a hard dep. | +| **2** | A3 DLQ | Topology is additive (new exchanges/queues). Roll back = consumers point back at plain `video`/`mp3`; delete the `vidcast.dlx` exchange. Existing messages drain normally. | +| **2** | A4 gunicorn | Dockerfile `CMD` revert to `python server.py`; one-line, one-image rebuild. | +| **2** | A6 NetworkPolicy | `kubectl delete networkpolicy --all -n <namespace>` instantly restores open networking (default-allow). This is *the* fastest rollback in the plan — and why default-deny is applied last. | +| **2** | A7 KEDA/HPA | `kubectl delete scaledobject/hpa`; replicas return to the static manifest count. | +| **3** | B1 Argo CD | Disable auto-sync (`syncPolicy: {}`); Argo stops reconciling; fall back to `kubectl`/CD-as-before. `helm uninstall argocd` removes it entirely (apps keep running — Argo is control-plane only). | +| **3** | B2 Kyverno | Set policy `validationFailureAction: Audit` (un-enforce) or `helm uninstall kyverno`. Audit mode means there's nothing to roll back during the trial cycle. | +| **4** | B3 Kubecost | `helm uninstall kubecost`; pure observability, zero app impact. | +| **4** | B4 SLO alerts | `kubectl delete prometheusrule`; restores prior alerting. The M-2 metrics fixes are additive (new `/metrics`, new exporter) — revert the gateway image + `helm uninstall` the exporter. | +| **4** | B5 cosign verify | Kyverno `verify-images` → Audit or delete; CI signing job is the operator's (revert the workflow commit). | +| **4** | A8 SBOM/SARIF | CI-only (the operator); revert the workflow commit. No cluster impact. | +| **5** | Cutover to managed | **Snapshot first** (RDS snapshot, GridFS `mongodump`, `pg_dump`). Roll back = flip `use_managed_datastores=false`, re-point services at in-cluster charts, restore from dump if needed, `terraform destroy` the managed modules to stop the bill. The in-cluster charts are *not deleted* until a post-cutover soak passes. | + +--- + +## 6.
Open questions for the operator (need answers before Sprint 1) + +1. **Cost posture (blocking — see §7.1).** Do you want managed datastores left + *running* (steady ~$300–400/mo all-in), or built-as-code and only spun up for + timed demos then destroyed? My strong recommendation is the latter. This + changes Sprint 5's "flip to true in prod" from "permanent" to "demo-window." +2. **MongoDB target:** Atlas (default, zero GridFS risk, off-AWS) or DocumentDB + (all-AWS narrative, but gated on the Sprint-1 GridFS compatibility test)? +3. **Manifest repo for B1:** separate dedicated repo (`vidcast-manifests`) or an + `apps/` directory *in this repo*? Separate repo is the textbook GitOps + pattern; same-repo is simpler for a solo project. Affects A10's layout. +4. **GitHub Advanced Security / SARIF:** is the repo public (free GHAS) or + private (needs a license for code-scanning SARIF upload)? +5. **Cluster availability for testing:** the cluster is torn down. Do you want me + to (a) develop everything against a local kind/k3d cluster and only validate + on EKS in batches, or (b) re-apply EKS for the duration of this phase? (a) is + far cheaper; (b) is higher-fidelity. I lean (a) for Sprints 1–4 code/config, + (b) for the Sprint-5 cutover validation only. +6. **Redis for dev (A2):** in-cluster Redis Helm chart, or skip dev Redis and + make idempotency a no-op locally (flag off)? +7. **Amazon MQ sizing:** single-instance (cheap, not HA) or cluster (HA, ~3× + cost)? I default to single-instance with an honest "not HA" note. +8. **Do you want the Jenkins gate to stay as a *second* gate** after B1, or + collapse to Jenkins-smoke-test → auto-open-PR → GitHub-review-as-single-gate + (my preferred simplification, §2.5)? +9. **FastAPI:** confirm you're happy parking it as a *named follow-on phase* + (with a load-test deliverable) rather than doing it now? + +--- + +## 7. 
What I would push back on (honest dissent — required, not optional) + +### 7.1 🔴 The biggest one: managed datastores contradict the project's own cost decision + +The memories record that the **cluster was deliberately torn down on 2026-06-03 +to save money**, preserving everything for a ~20-minute re-apply. That is a +*good* instinct for a portfolio project. Part A5 then proposes RDS **Multi-AZ**, +DocumentDB (or Atlas M10), **Amazon MQ**, and ElastiCache — and Sprint 5 says +"flip `use_managed_datastores` to true in prod." Left running, that's roughly: + +| Service | Cheapest realistic prod-ish | ~$/mo | +|---|---|---| +| RDS PostgreSQL Multi-AZ (db.t3.micro) | Multi-AZ doubles the instance | ~$30–60 | +| DocumentDB (t3.medium min) **or** Atlas M10 | — | ~$200 / ~$57 | +| Amazon MQ RabbitMQ (single mq.t3.micro) | cluster ≈ 3× | ~$25–30 | +| ElastiCache Redis (cache.t3.micro) | — | ~$12–15 | +| **Plus the EKS cluster itself** | already ~$150 | ~$150 | + +That's a **15–40× jump** over today's ~$10 staging cost, on a project that was +just torn down *for $10*. + +**My recommendation:** build A5 in full as Terraform behind the toggle (it's +genuinely valuable code and a strong portfolio artifact — "I can stand up the +managed-services version on demand"), but **do not leave it running, and reframe +Sprint 5** from "permanently flip prod to managed" to "demo-window cutover: +`apply` → migrate → record the screenshots/numbers → `destroy`." This keeps the +honesty (the managed path *works* and is *demonstrated*) without a standing bill. +RDS Multi-AZ specifically: use single-AZ for the demo and *document* that +Multi-AZ is a one-flag change — Multi-AZ on a demo you tear down nightly is pure +cost for zero observed benefit. **This is question 6.1 and I'd like an explicit +decision.** + +### 7.2 🟠 The outbox relay default ("in-process goroutine") is wrong for this stack + +Covered in §3.3. 
Two concrete problems with the prompt's stated default: (1) +"goroutine" is Go — VidCast is Python; (2) more importantly, the gateway will run +multi-process under gunicorn (A4), so an in-process relay = N concurrent +publishers = the duplicate-publish bug the outbox exists to kill. I'm +overriding the default to a **separate single-replica deployment**. If you +specifically want the in-process variant for learning reasons, we *must* add a +Mongo-level claim (findAndModify lease) — say so and I'll do that instead, but +I'd be building a distributed lock to work around a self-inflicted problem. + +### 7.3 🟠 The scope is ~2–4 months of senior work, not a sprint or two + +A1–A10 + B1–B5 is **15 substantial workstreams**, each with code + Terraform/ +Helm + an `_EXPLAINED.md`. Realistically each sprint here is 1–3 weeks of +focused work. That's fine if the goal is a sustained portfolio build — but if +there's a deadline (job application, course submission), I'd **prioritise for +signal-per-effort**: +- **Highest signal, do first:** A1/A2/A3 (reliability story), A6 (security + story), B1 (GitOps story), B2 (policy story). These four are what make + reviewers say "this person operates production systems." +- **High signal, moderate effort:** A9 ESO, A8 supply-chain, B5 cosign. +- **Lower signal-per-effort for a *demo*:** A5 managed datastores (expensive, + and "I used RDS" is less differentiating than "I built a verified supply + chain"), B3 Kubecost (nice, but the number is small and a bit theatrical on a + single node), B4 SLOs (great concept, but the numbers are demonstrative on a + torn-down single node — §4.2). +- If forced to cut: I'd cut **A5's permanent cutover** (keep it as on-demand + code) before anything else. + +I'm not refusing any of it — the prompt is the prompt — but a senior engineer +should tell you where the marginal hour pays off most. If you want the full set, +we do the full set in the given order. 
+ +### 7.4 🟢 SLO targets are aspirational on this topology — say so + +99.9% availability / 5-min conversion / 99% email-success are good *definitions*, +but on a **single-node cluster that gets torn down for cost**, the measured +error budget is fiction (every teardown = 100% of the budget gone). B4 is still +worth doing — the *machinery* (multi-window multi-burn-rate PrometheusRules, the +error-budget dashboard) is the portfolio artifact. SLO.md will label the targets +"demonstrative" and explain the single-node caveat rather than pretending the +numbers are an operated reality. That's the honesty principle applied to SLOs. + +### 7.5 🟢 "Replace the compensating-GridFS-delete with the outbox" — keep both, briefly + +A1 says the outbox "replaces the current compensating-GridFS-delete pattern +(which is good but only half the solution)." I'd **keep the compensating delete +as a belt-and-braces fallback during the transition** (behind the same flag), +not rip it out. Once the outbox is proven in staging over a soak period, then +remove the now-dead compensation path in a clean follow-up commit. Ripping it out +in the same change that introduces the outbox means a single outbox bug can +orphan GridFS objects with no safety net. + +### 7.6 🟢 Two CD gates (Jenkins + Argo manual-sync) is redundant for a solo repo + +Flagged in §2.5/§6.8. I'll keep both because you asked, but the genuinely clean +end-state is one human gate (the manifest PR), with Jenkins demoted to "run the +Swarm smoke test and open the PR on success." Happy either way — just noting the +redundancy so it's a *choice*, not an accident. + +--- + +## 8. Revised readiness table (movement per sprint) + +Legend: ❌ absent · 🟡 Partial · 🟢 Strong-partial (most of the capability done; remainder named in the cell) · ✅ Complete (**only marked ✅ when demonstrably +shipped + verified — in this PLAN everything is a *target*, written as the +status we intend to reach by the end of that sprint**). Baseline column = today, +per `TECHNICAL_ANALYSIS.md`. 
+ +| Capability | Today | S1 | S2 | S3 | S4 | S5 | Refs | +|---|---|---|---|---|---|---|---| +| Event durability (no lost uploads) | 🟡 compensating-delete only | 🟡 | ✅ outbox+relay | ✅ | ✅ | ✅ | A1 | +| Idempotent / retry-safe consumers | ❌ | ❌ | ✅ claim-once+release | ✅ | ✅ | ✅ | A2 | +| Retry/DLQ topology | ❌ NACK-requeue loop | ❌ | ✅ retry+DLQ+max | ✅ | ✅ | ✅ | A3, fixes L-4/poison | +| Production app server | ❌ Werkzeug dev | ❌ | ✅ gunicorn | ✅ | ✅ | ✅ | A4, M-1 | +| Async framework (FastAPI) | ❌ | ❌ | ❌ (deferred) | ❌ | ❌ | ❌ → *named follow-on* | A4 | +| Durable Postgres | ❌ Deployment no-PVC | 🟡 RDS coded (off) | 🟡 | 🟡 | 🟡 | ✅ RDS (demo-window) | A5, M-3 | +| Managed Mongo/GridFS | ❌ in-cluster | 🟡 Atlas/DocDB coded+tested | 🟡 | 🟡 | 🟡 | ✅ (demo-window) | A5 | +| Managed broker | ❌ in-cluster | 🟡 Amazon MQ coded | 🟡 | 🟡 | 🟡 | ✅ (demo-window) | A5 | +| Managed Redis | ❌ none | 🟡 ElastiCache coded | 🟡 | 🟡 | 🟡 | ✅ (demo-window) | A5, A2 | +| NetworkPolicy default-deny (enforced) | ❌ | 🟡 CNI agent on | ✅ deny+allow+DNS | ✅ | ✅ | ✅ | A6, M-5 | +| Autoscaling (KEDA+HPA) | ❌ manual | ❌ | ✅ KEDA(conv)+HPA(gw) | ✅ | ✅ | ✅ | A7, L-1 | +| Supply chain: SBOM + SARIF | 🟡 Trivy gate only | 🟡 | 🟡 | 🟡 | ✅ SBOM+SARIF | ✅ | A8 | +| ECR hardening (immutable/scan/CMK/lifecycle) | 🟡 basic ECR | 🟡 coded | 🟡 | 🟡 | ✅ | ✅ | A8 | +| External secret management | ❌ gitignored files | 🟢 ESO+Parameter Store (strong-partial: app secrets done $0 standing; broker creds pending) | 🟢 | 🟢 | 🟢 | 🟢 | A9, H-4 | +| App manifests as Kustomize | ❌ raw per-svc | ✅ base+overlays | ✅ | ✅ | ✅ | ✅ | A10 | +| GitOps (Argo CD) | ❌ kubectl CD | ❌ | ❌ | ✅ Argo+gate | ✅ | ✅ | B1 | +| Policy-as-code (Kyverno) | ❌ | ❌ | ❌ | ✅ Audit→Enforce | ✅ | ✅ | B2 | +| FinOps cost dashboard | ❌ | ❌ | ❌ | ❌ | ✅ Kubecost+panel | ✅ | B3 | +| SLO burn-rate alerting | ❌ (dangling alerts) | ❌ | ❌ | ❌ | ✅ (demonstrative) | ✅ | B4, M-2 | +| Image signing + admission verify | ❌ | ❌ | ❌ | ❌ | ✅ cosign+Kyverno | ✅ | B5 | +| Monitoring 
reflects reality (no dead alerts) | ❌ M-2 dead scrape/alerts | ❌ | ❌ | ❌ | ✅ /metrics+rabbit exporter | ✅ | M-2 | +| Frontend built+signed by CI | ❌ manual ECR push | ❌ | ❌ | ❌ | 🟡/✅ (q.4.1) | ✅ | M-6 | +| Multi-region | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ *deliberate omission* | out-of-scope | +| Service mesh | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ *deliberately parked* | out-of-scope | + +> **Ratchet rule (honesty principle):** I will only move a cell to ✅ in the +> living version of this table when the capability is shipped *and* I've run the +> verification named in §4/§5. Until then it stays 🟡. Nothing is ✅ on the +> strength of "the code exists." + +--- + +## 9. Per-sprint review-gate checklist (the sign-off ritual) + +After each sprint I produce a **one-page review note** containing exactly: + +1. **What shipped** (files touched, separated into "I implemented" vs "diffs for + the operator to apply to CI/CD/Jenkins"). +2. **Proof it works** — the specific verification command(s) from §4/§5 and + their output (e.g. the NetworkPolicy deny-test hanging; the duplicate-email + count being zero; `kubectl get policyreport`). +3. **Readiness-table delta** — which cells moved and the evidence. +4. **New `_EXPLAINED.md` files** created (one per new code/config file — + line-by-line + 3 interview questions + dependency map, per the existing + convention; kept gitignored as local study material per the project's + established `.gitignore:64` decision). +5. **Cost impact** of anything applied (should be ~$0 until Sprint 5). +6. **Open risks carried forward.** + +The operator signs off → next sprint starts. No sprint starts on an unsigned predecessor. + +--- + +## 10. 
Documentation deliverables tracking + +| Doc | Produced in | Status | +|---|---|---| +| `PHASE_UP_PLAN.md` | Sprint 0 | ✅ this document | +| `_EXPLAINED.md` per new file | every sprint | pending | +| `SUPPLY_CHAIN.md` | Sprint 4 (B5/A8) | pending | +| `SLO.md` | Sprint 4 (B4) | pending | +| `GITOPS.md` | Sprint 3 (B1) | pending | +| Updated `TECHNICAL_ANALYSIS.md` / project summary + "Differentiation" section | Sprint 5 | pending | +| README rewrite (platform-story-first) | Sprint 5 | pending | + +--- + +## 11. Sign-off + +**This plan is complete and awaiting the operator's review.** I have **not** written any +implementation code, Terraform, Helm values, manifests, or workflow changes. + +**Before Sprint 1 begins I need answers to §6 (especially 6.1 cost posture and +6.2 Mongo target), and acknowledgement of the §7 pushbacks — in particular that +Sprint 5 is reframed to a demo-window cutover rather than a permanent +managed-prod, and that the outbox relay is a separate deployment, not +in-process.** + +Stop. Awaiting sign-off. diff --git a/PROJECT_GUIDE.md b/PROJECT_GUIDE.md new file mode 100644 index 0000000..2c11166 --- /dev/null +++ b/PROJECT_GUIDE.md @@ -0,0 +1,824 @@ +# VidCast — The Complete Project Guide + +> A plain-English walkthrough of the entire VidCast platform, written for anyone: +> a bootcamp assessor, an interviewer, a teammate joining on day one, or a curious +> friend who doesn't work in tech. No prior knowledge assumed. Where a technical +> term is unavoidable, it's explained in parentheses the first time it appears. +> +> If you read this end to end, you'll understand not just *what* VidCast does, but +> *why* every piece is built the way it is — enough to discuss it confidently in a +> 30-minute technical interview. + +--- + +## 1. 
What VidCast Is + +**VidCast turns a video recording into a podcast-ready audio file.** You upload an +MP4 (a video file), and a few moments later you get an email with a link to +download the MP3 (just the audio, extracted from the video). That's the whole +user-facing product: "drop in a video, get back the audio." Useful for turning a +recorded talk, webinar, or Zoom call into something you can publish as a podcast. + +But here's the thing to understand before anything else: **the converter is the +demo; the platform is the project.** Extracting audio from a video is a few lines +of code — any developer could write it in an afternoon. That part is deliberately +simple, because it's not the point. The point is *everything around it*: how the +work is queued so it survives a crash, how the system scales itself down to zero +when nobody's using it and back up under load, how secrets are kept out of the +code, how a code change travels safely from a developer's laptop to a live server, +how the whole thing is monitored, cost-tracked, locked down, and rebuildable from +scratch in twenty minutes. VidCast is a small, easy-to-explain application wrapped +in a **production-grade platform** — the kind of infrastructure a real company runs +behind a much more complicated app. + +So when you read this guide, think of the video-to-audio feature as a worked +example — a realistic but simple thing for the platform to *do* — and pay attention +to the machinery underneath. That machinery is what makes this a platform +engineering project rather than a coding exercise: event-driven messaging, +self-healing deployments, zero-trust networking, supply-chain security, autoscaling, +observability, and infrastructure-as-code, all running on Amazon's managed +Kubernetes service. Every one of those is a thing companies hire for, and each is +implemented here honestly — with its real-world trade-offs and limitations written +down rather than hidden. + +--- + +## 2. 
Architecture Overview + +VidCast is built as **microservices** — instead of one big program that does +everything, the work is split into several small programs, each with one job, that +talk to each other. Think of a restaurant: rather than one person taking orders, +cooking, and washing up, you have a host, waiters, chefs, and a dishwasher, each +specialised and each able to be added or replaced independently. + +### The five services (the staff) + +| Service | One-sentence job | Analogy | +|---|---|---| +| **Frontend** | The website you actually click on — login, upload, download, dashboard. | The **shopfront** — the only part customers see. | +| **Gateway** | The front door for all requests; checks you're logged in, takes your upload, hands back your download. | The **receptionist** — everyone goes through them; they direct traffic but don't do the heavy work. | +| **Auth** | Checks your email and password and issues a "you're logged in" token. | The **security guard** at the door checking ID and handing out a wristband. | +| **Converter** | Takes a video off the queue, extracts the audio with ffmpeg, saves the MP3. | The **workshop** out back — where the actual product gets made. | +| **Notification** | Watches for finished MP3s and emails the user a download link. | The **mailroom** that posts the "your order is ready" letter. | + +A note on technology, for the technically minded: auth and gateway are **Flask** +(a Python web framework) apps run under **gunicorn** (a production web server); +converter and notification are Python programs using **Pika** (a RabbitMQ client +library) that sit and wait for messages rather than serving web pages; the frontend +is a **React** app (a popular JavaScript UI framework) served by **nginx** (a web +server). The converter does the audio extraction with **ffmpeg** (the standard +open-source media-processing tool), wrapped by a Python library. 
+ +### The four data stores (the storage rooms) + +| Store | What it holds | Analogy | +|---|---|---| +| **PostgreSQL** | User accounts: email, hashed password, role (admin/user). | The **filing cabinet** of membership records — structured, one row per member. | +| **MongoDB / GridFS** | The actual video and audio files (which are big). | The **warehouse** — built to store large boxes, not index cards. (GridFS is MongoDB's way of storing files too big for a normal record, by splitting them into chunks.) | +| **RabbitMQ** | The to-do lists ("a video needs converting", "an MP3 needs emailing"). | The **internal mail system / pigeonholes** — one department drops a note, another picks it up later. | +| **Redis** | Short-lived "we already handled this job" tickets. | The **coat-check counter** — a tiny ticket that says "this one's taken," thrown away after a few minutes. | + +PostgreSQL, MongoDB, and RabbitMQ are the "three datastores"; Redis is a small +fourth helper used only to prevent duplicate work (explained in §6). + +### How data flows between them + +The key idea is that VidCast is **event-driven and asynchronous** (jobs happen in +the background, not while you wait). When you upload a video, the gateway doesn't +make you sit there while it converts — it stores your file, drops a note in the +mail system, and immediately says "got it." The conversion happens later, and you +find out by email. This is exactly how big systems handle slow work: accept it +fast, do it in the background, notify when done. + +Here's the whole picture as a text diagram. 
Read it top to bottom: + +``` + YOU (browser) + │ click "Login", "Upload", "Download" + ▼ + ┌─────────────┐ + │ FRONTEND │ React website (nginx) NodePort :30006 + └─────┬───────┘ + │ /api/* proxied to ↓ + ▼ + ┌─────────────┐ check password ┌──────────┐ ┌────────────┐ + │ GATEWAY │ ───────────────────────► │ AUTH │ ─► │ PostgreSQL │ (users) + │ (Flask) │ ◄─── "here's a token" ── │ (Flask) │ └────────────┘ + └─────┬───────┘ └──────────┘ + │ store the uploaded video + ▼ + ┌────────────────────┐ + │ MongoDB / GridFS │ (the video file) + └────────────────────┘ + │ write a "job to do" note (the outbox) + ▼ + ┌────────────────────┐ relay ┌──────────────────────────┐ + │ outbox (in Mongo) │ ────────► │ RabbitMQ "video" queue │ + └────────────────────┘ └────────────┬─────────────┘ + │ picked up by + ▼ + ┌─────────────┐ extract audio (ffmpeg) + │ CONVERTER │ ──────────────────────────► MP3 + └──────┬──────┘ │ + │ save MP3 to GridFS, then ◄──────────┘ + ▼ + ┌──────────────────────────┐ + │ RabbitMQ "mp3" queue │ + └────────────┬─────────────┘ + │ picked up by + ▼ + ┌──────────────┐ sends email + │ NOTIFICATION │ ─────────────────► YOU 📧 + └──────────────┘ "your audio is ready" + │ + ▼ later, you click the download link + ┌─────────────┐ + │ GATEWAY │ ── reads MP3 from GridFS ──► streams the file back to your browser + └─────────────┘ +``` + +(Redis isn't drawn because it's a side-helper: the converter and notification each +quickly check Redis — "have I already done this exact job?" — before doing work, so +a job that somehow arrives twice isn't processed twice.) + +--- + +## 3. The User Journey — What Happens When You Upload a Video + +Let's walk the whole thing slowly, one step at a time. Each step names the service +responsible, so you can map it back to the diagram above. + +**Step 1 — You log in.** You open the website (the **frontend**) and type your email +and password. The frontend sends those to the **gateway**, which forwards them to +the **auth** service. 
Auth looks up your email in **PostgreSQL** and checks your +password. Crucially, it doesn't store your actual password — it stores a **bcrypt +hash** (a scrambled, one-way version; explained in §6). It scrambles what you typed +the same way and compares the scrambles. If they match, you're in. + +**Step 2 — You get a token (JWT).** On a successful login, auth issues a **JWT** +(JSON Web Token) — a small, digitally-signed string that proves "this person logged +in successfully and is an admin/user." Think of it as a **festival wristband**: the +guard checks your ID once at the gate and gives you a wristband; after that, you +flash the wristband instead of showing ID again. Your browser holds the token and +attaches it to every later request, so the gateway can trust you without +re-checking your password each time. + +**Step 3 — You upload a video.** You pick an MP4 and hit upload. The browser sends +the file (with your token attached) to the **gateway**. The gateway checks the +token is valid, then needs to store the file. Videos are large, so it puts them in +**MongoDB GridFS** (the warehouse for big files). GridFS chops the file into chunks +and stores them; it hands back an ID (`video_fid`) — like a warehouse shelf +reference for "your video." + +**Step 4 — The gateway records a job in the outbox.** Now the gateway needs to tell +the rest of the system "there's a video to convert." Instead of phoning the message +system directly (which might be down), it writes the job into an **outbox** — a +little to-do note saved *in the same database* as the video, marked "not sent yet." +Then it immediately replies to you: "success!" You're done waiting; the rest happens +in the background. (Why the outbox instead of messaging directly? See §6, +*Transactional outbox* — it's so an upload can never be silently lost.) 
+ +**Step 5 — The relay publishes the job to RabbitMQ.** A separate little program, the +**outbox-relay**, continuously reads the outbox looking for unsent notes. It finds +yours, publishes it as a message onto the **RabbitMQ "video" queue** (drops it in +the right pigeonhole), and marks the note "sent." The job is now officially in the +mail system, waiting for a worker. + +**Step 6 — The converter picks it up.** The **converter** service is always watching +the "video" queue. It takes your message, reads the `video_fid`, and pulls the video +back out of GridFS. Before doing the work, it asks **Redis**: "have I already done +job `video_fid`?" If not, it claims the job and proceeds. + +**Step 7 — ffmpeg extracts the audio.** The converter runs **ffmpeg** to strip the +audio out of the video and produce an MP3. This is the "actual product being made" +step — and the only genuinely CPU-heavy part of the whole system. + +**Step 8 — The MP3 is stored and a new job is queued.** The converter saves the MP3 +back into **MongoDB GridFS** (getting an `mp3_fid`), then publishes a new message +onto the **RabbitMQ "mp3" queue**: "an MP3 is ready, tell the user." + +**Step 9 — Notification sends the email.** The **notification** service watches the +"mp3" queue. It picks up your message and uses **smtplib** (Python's email library) +to send you an email via Gmail, containing the file ID you'll need to download. Like +the converter, it first checks Redis so you never get two emails for one job. + +**Step 10 — You download your audio.** You click the link in the email (or use the +download page). The request goes to the **gateway**, which reads the MP3 back out of +GridFS and streams it to your browser. You now have your podcast-ready audio file. + +The beautiful part: steps 5–9 all happen on their own, in the background, each +service doing one job and handing off to the next via the queues. 
If any service is +briefly busy or restarting, the messages wait patiently in RabbitMQ until it's ready +— nothing is lost, nobody is kept waiting at the front desk. + +--- + +## 4. Where It All Runs — Infrastructure + +So we have these programs. Where do they actually *live*, and how do they stay +running? This is the infrastructure layer, and it's built from a handful of tools +that each solve one problem. + +**Docker — the shipping container.** Before Docker, "it works on my machine" was a +real nightmare: code that ran on a developer's laptop would break on the server +because of slightly different versions of things. Docker fixes this by packing each +service — the code *and* everything it needs to run (Python, libraries, ffmpeg) — +into a **container**: a sealed, standardised box. Just like a shipping container can +go on any truck, train, or ship without anyone repacking it, a Docker container runs +identically on any machine. Each VidCast service is its own container image. + +**Kubernetes — the harbour master.** Once you have lots of containers, something has +to decide where they run, restart them if they crash, replace them during updates, +and connect them to each other. **Kubernetes** (often "K8s") is that orchestrator — +the **harbour master** directing which container goes on which ship, making sure the +right number are running, and rerouting around problems. You tell Kubernetes "I want +two copies of the gateway running, always," and it makes that true and keeps it true, +even if a machine dies. + +**EKS — renting the harbour from Amazon.** Running Kubernetes yourself is a lot of +work. **EKS** (Elastic Kubernetes Service) is Amazon's managed Kubernetes — AWS runs +the complicated "control plane" (the brain of Kubernetes) for you, and you just bring +the machines that run your containers. VidCast runs on EKS in Amazon's **London +region** (`eu-west-2`), on a **single machine** (an `m7i-flex.large`: 2 CPUs, 8 GB +of memory). 
One node keeps costs tiny; it's a deliberate constraint that shapes many +later decisions (you'll see "single-node" mentioned a lot — it's why we scale to +zero, why we skip some redundancy, etc.). + +**Terraform — the self-building blueprint.** Here's the powerful part: none of the +AWS infrastructure (the network, the Kubernetes cluster, the machine, the +permissions, the container registry) is created by clicking around in the AWS +console. It's all described in code using **Terraform**. Terraform is like an +architect's blueprint that *builds itself*: you write "I want a network, a cluster, +one node, these permissions," run one command, and Terraform creates it all in the +right order. Run a different command and it tears it all back down. This means the +**entire infrastructure can be destroyed and recreated from scratch in about 20 +minutes** — which is exactly what VidCast does to save money (destroy it overnight, +rebuild it when needed). Infrastructure-as-code also means the setup is versioned, +reviewable, and repeatable, instead of a pile of forgotten manual clicks. + +**Helm — the app installer.** Some things you run on Kubernetes are standard, +off-the-shelf software (the databases, the monitoring stack). **Helm** is the +"app store" for Kubernetes — it packages complex software into installable +**charts** so you can install MongoDB or Prometheus with one command and some +settings, instead of hand-writing hundreds of lines of configuration. VidCast uses +Helm to install its datastores and most of its platform tools. + +**Kustomize — one recipe, two kitchens.** VidCast runs in more than one environment +(a lighter "dev" setup and a heavier "prod" setup). Rather than duplicate all the +configuration, it uses **Kustomize**: a **base recipe** of the core setup, plus small +**overlays** that tweak it per environment ("dev runs one copy of each service; prod +runs more"). Same base, two variations — no copy-paste, no drift between them. 
+ +Put together: Terraform builds the AWS foundation and the Kubernetes cluster; Helm +installs the off-the-shelf software onto it; Kustomize lays down VidCast's own +services in the right shape for the environment; and Kubernetes keeps the whole thing +running and self-healing on top of Docker containers. Destroy it all, run two +commands, and twenty minutes later it's back. + +--- + +## 5. How Code Gets to Production — CI/CD Pipeline + +A developer changes some code on their laptop. How does that change safely become +part of the live, running system without anyone manually copying files onto a +server? That's **CI/CD** (Continuous Integration / Continuous Delivery), and +VidCast's pipeline is worth understanding step by step because each step catches a +specific kind of problem. + +**Step 1 — Push to GitHub.** The developer commits their change and pushes it to +**GitHub** (where the code lives). This automatically triggers the pipeline — a +series of automated checks and actions defined in a file in the repo +(`.github/workflows/ci.yml`), run by **GitHub Actions** (GitHub's built-in automation +that runs your steps on fresh, throwaway machines). + +**Step 2 — Lint.** First, the code is **linted** with a tool called `ruff` — an +automated style-and-correctness checker that catches obvious mistakes (unused +variables, syntax slips, bad imports) in seconds. This runs first and fast, so a +trivial typo fails the build before wasting time building anything. Think of it as +spell-check before you print. + +**Step 3 — Build the images.** For each of the five backend services *in parallel* +(all at once, to save time), the pipeline runs `docker build` to package the code +into a container image, tagged with the short git commit hash (so every build is +uniquely traceable back to the exact code it came from). 
+ +**Step 4 — Scan for vulnerabilities (Trivy).** Each freshly-built image is scanned by +**Trivy**, a security scanner that checks every package inside the image against +databases of known vulnerabilities. If it finds anything rated **CRITICAL or HIGH**, +the build **fails** (`exit-code: 1`) — the bad image never ships. This is the +quality inspector on the assembly line who can stop the whole line. (`ignore-unfixed` +means it won't fail you for vulnerabilities that have no patch available yet — you +can't fix what the upstream maintainers haven't.) + +**Step 5 — Push to Docker Hub.** If linting and scanning pass *and* this is the main +branch, the images are pushed to **Docker Hub** (a public registry of container +images), where the cluster can later pull them. Only main-branch pushes publish — +pull requests get tested but don't ship. + +**Step 6 — OIDC federation: the day pass, not the permanent keycard.** When the +pipeline needs to talk to AWS, it faces a classic security problem: how do you give +an automated job AWS permissions without storing long-lived AWS keys somewhere they +could leak? The old way was to paste a permanent secret key into the pipeline — a +**permanent keycard** that, if stolen, works forever. VidCast uses **OIDC +federation** instead: GitHub vouches for the workflow's identity ("this really is the +`ci.yml` job on the main branch of this repo"), and AWS hands back a **temporary, +short-lived credential** — a **day pass** that expires in minutes and only works for +that specific job. There's no long-lived secret to steal. (The login email and +trust setup for this is the GitHub OIDC provider configured in Terraform.) + +**Step 7 — Deployment, the GitOps way (Argo CD).** Now the new image exists — how +does it get onto the cluster? Here VidCast uses a modern, safer model called +**GitOps**, run by a tool called **Argo CD**. The old way ("push") had the pipeline +hold cluster credentials and shove changes in (`kubectl set image`). 
The new way +("pull") flips it: **Argo CD lives *inside* the cluster and continuously pulls the +desired setup from Git**, making the cluster match what's described in the repo. Git +becomes the single source of truth for "what should be running." + +Picture Argo CD as a **diligent gardener** who has a copy of the garden's master +plan (Git) and constantly walks the garden making the real plants match the plan. If +someone sneaks in and moves a plant (a manual change to the cluster), the gardener +quietly puts it back. If the plan changes (you merge a new image tag), the gardener +plants the new thing. The benefits are real: the pipeline no longer needs cluster +keys (smaller blast radius if it's ever compromised), every deployment is a +reviewable Git commit (full audit trail; roll back with `git revert`), and any drift +between "what's running" and "what should be running" is detected and corrected +automatically. + +**Step 8 — Dev auto-sync vs prod manual gate.** VidCast has two Argo CD +"Applications": **dev** and **prod**. Dev is set to **auto-sync** — the moment the +plan changes in Git, the gardener applies it automatically. Prod is deliberately +**not** auto-sync — Argo CD notices the change and shows "out of sync," but it +**waits for a human to click Sync**. That pause *is* the production approval gate. +The clever detail: the gate isn't a special "if approved" step in the code — it's the +*absence* of the auto-sync setting on the prod Application. The most important line +in the prod config is the one that isn't there. + +There's also a **Jenkinsfile** in the repo, which expresses the same pipeline in a +different tool (Jenkins) and adds a Docker Swarm staging environment plus an explicit +"Deploy to Production?" approval button — demonstrating that the same CI/CD concepts +translate across tools, and connecting the Docker Swarm learning module to the +Kubernetes production deployment. + +--- + +## 6. 
Platform Capabilities + +This is the heart of the project — the production-grade features that turn a simple +app into a real platform. They were built across four "sprints" and are grouped here +by what problem they solve. For each, here's *what it does, what problem it solves, +and why it matters* (with the interview-relevant detail). + +### Reliability & Messaging + +**Transactional outbox (A1) — never lose an upload.** +The problem: when you upload a video, two things must both happen — store the file, +*and* tell the system to convert it. If the message system (RabbitMQ) is down for the +split second between those two steps, you'd have a stored video that nobody knows to +convert: a silently lost upload. The outbox pattern fixes this by writing the "please +convert this" instruction as a row *in the same database as the video*, marked "not +sent." A separate program (the relay) reads those rows and publishes them to RabbitMQ +later, retrying until it succeeds. The instruction can't be lost because it's sitting +durably in the database until it's confirmed sent. The analogy: instead of phoning in +an order the instant a customer walks out (and losing it if the line's busy), you +write every order in your own ledger first, then work through the ledger calling them +in — the ledger is the safety net. + +Why it matters / the interview detail: the relay runs as a **separate deployment with +exactly one copy**, *not* as a background thread inside the gateway. Why? The gateway +runs as multiple processes (under gunicorn — see A4), so a thread inside it would run +once *per process*, and you'd get several relays all publishing the same row multiple +times — the exact duplicate-send bug the outbox exists to prevent. Making it a +single-replica deployment makes "exactly one publisher" a structural guarantee rather +than something you have to police. 
Honest limitation worth stating: the file-write and +the outbox-write aren't a single atomic transaction (true atomicity needs a MongoDB +replica set, which the single in-cluster Mongo isn't), so a crash in the tiny window +between them could still orphan a file — but that's the *same* small window the +original code had, and the outbox eliminates the much *larger* "broker down = lost +event" window. + +**Retry / Dead-Letter Queue topology (A3) — handle poison messages.** +The problem: what if a message can *never* succeed — a corrupt video ffmpeg can't +read, or a permanently invalid email address? The naïve approach (put it back on the +queue and try again) loops it **forever**, pinning a worker and blocking everyone +behind it (a "poison message"). The fix is **bounded retries plus a dead-letter +queue**: try a few times with a delay, and if it still fails, move it to a special +**dead-letter queue** (the "problem pile") where a human can inspect it later, and get +on with the rest of the work. VidCast builds three queues per pipeline — the main +queue, a `.retry` queue, and a terminal `.dlq` queue — plus a shared dead-letter +exchange. + +Why it matters / the interview detail: the *delay* between retries has no timer in the +code at all — the `.retry` queue is given a **time-to-live** and *no consumer*, so a +message simply expires after the delay, and RabbitMQ's expiry machinery routes it back +to the main queue for another attempt. The broker's own TTL-and-dead-letter feature +*is* the delay mechanism. An explicit `x-retry-count` header (rather than RabbitMQ's +built-in `x-death`) tracks attempts, so the behaviour is identical across broker +versions. After the retry limit (default 3, so 4 total attempts), the message goes to +the terminal dead-letter queue, which nothing consumes — it stops and waits for a +human. This also fixed a real crash: a bad video used to throw an error that killed +the converter pod; now it's caught and dead-lettered. 
+ +**Idempotent consumers (A2) — duplicates become no-ops.** +The problem: the outbox and the retry system both deliberately deliver "at least +once" — meaning a message could occasionally arrive twice (e.g. the relay publishes, +then crashes before marking it sent, so it publishes again on restart). Without a +guard, a duplicate means converting the same video twice and sending two emails. +**Idempotency** makes "process this job twice" have the same effect as processing it +once. The mechanism is a single atomic Redis command (`SET NX EX`): the first +delivery sets a key for that job ID and proceeds; any later delivery finds the key +already there and skips. The key auto-expires after a few minutes (so a crashed worker +can't wedge a job forever). The analogy: a coat-check ticket — the first person to +claim a job gets the ticket; anyone else who shows up with the same job sees it's +already taken and walks away. + +Why it matters / the interview detail: there's a subtle, much-tested rule about *when +to release the ticket*. On **success**, keep the key (so a genuine duplicate is +suppressed). On a **retryable failure**, *delete* the key — because the retry will +redeliver the same job, and if the key were still there the retry would be skipped +forever and the job would silently never complete. On a **permanent (dead-letter) +failure**, keep the key (the job is unfixable; don't reprocess it). Getting this +backwards turns a transient error into a permanent silent loss. Also: if Redis itself +is down, the system **fails open** (processes anyway) — the worst case is a rare +duplicate, which is far better than halting the whole pipeline every time Redis blips. + +**Gunicorn production server (A4) — a real web server, not the toy one.** +The problem: Flask ships with a built-in development web server that even *prints a +warning telling you not to use it in production* — it handles one request at a time +and has no worker model. 
VidCast swaps it for **gunicorn**, a proper production web +server that runs the app as several worker processes, so one slow request no longer +blocks everyone. No application code changed — gunicorn just imports the existing app +and serves it better. + +Why it matters / the interview detail: gunicorn running the gateway as *multiple +processes* is precisely why the outbox relay (A1) had to become a separate single-copy +deployment — this is the dependency that orders the whole reliability sprint. The +worker count is deliberately kept low (2, not the textbook "2×cores+1") because on a +single 2-CPU node already running a dozen pods, the textbook number would +oversubscribe the machine — and the CPU-heavy work lives in the converters, not the +web tier. Horizontal scaling is handled by adding *pods* (HPA, below), not cramming +in more workers. + +**KEDA autoscaling + HPA (A7) — right-size automatically, even to zero.** +The problem: the converter is idle most of the time (nobody's uploading), but bursts +hard when work arrives. Keeping it always-on wastes resources; keeping it too small +makes uploads slow. VidCast uses **two autoscalers, each matched to its workload**. +The converter is scaled by **KEDA** (Kubernetes Event-Driven Autoscaler) on **queue +depth** — how many videos are waiting — and KEDA can scale it all the way to **zero** +when the queue's empty, then back up to 3 as work piles in. The gateway is scaled by +the standard **HPA** (Horizontal Pod Autoscaler) on **CPU usage**, staying at least 1 +(it's user-facing and must always answer). + +Why it matters / the interview detail: match the signal to the workload — a queue +worker should scale on *how much work is queued* (a leading signal; you know work is +coming before CPU even rises), and a web server on *how busy it is*. A plain HPA +*can't* scale to zero (minimum 1) and reacts to CPU only *after* the backlog builds. 
+The footgun avoided: if KEDA and an HPA both target the *same* deployment they fight +over the replica count and oscillate — so they're kept on *different* deployments +(converter vs gateway), which never conflict. (One real-world wrinkle that bit us: +because KEDA now owns the converter's replica count, the GitOps tool Argo CD must be +told to *ignore* that field, or the two controllers tug-of-war over it.) + +### Security & Access Control + +**External Secrets Operator + Parameter Store (A9) — no secrets in the code.** +The problem: passwords, API keys, and database URIs must never sit in the Git repo +(public, forever, searchable). VidCast stores them in **AWS Parameter Store** (a +secure, encrypted key-value store) and uses the **External Secrets Operator (ESO)** — +a cluster add-on that pulls those secrets into Kubernetes at runtime, authenticating +via the cluster's own AWS identity (no long-lived keys). The analogy: Parameter Store +is a **safe-deposit box at the bank** — the app has a key that lets it retrieve the +contents at runtime, but the contents are never written down in the code. + +Why it matters / the interview detail: it's **Parameter Store, not Secrets Manager**, +deliberately — Secrets Manager charges $0.40 per secret per month and *keeps billing +even after the cluster is destroyed*, while standard Parameter Store entries (and the +AWS-managed encryption key) are **free**. For seven secrets that's ~$3/month saved, +and it preserves the project's "$0 when the cluster is off" rule. One honest exception: +the RabbitMQ password is still created by RabbitMQ's own Helm chart (because that same +secret sets up the broker), so it isn't ESO-managed — that's documented, not hidden. + +**NetworkPolicy default-deny (A6) — zero-trust networking.** +The problem: by default, every pod in a Kubernetes namespace can talk to every other +pod — a flat, open office where anyone can walk into any room. 
If one service is +compromised, the attacker can reach everything. VidCast flips this to **default-deny**: +every pod is blocked from all network traffic *except* the specific connections +explicitly allowed (gateway→auth, gateway→Mongo, converter→RabbitMQ, etc.). The +analogy: an office where **every door is locked by default** and you only get +key-card access to the specific rooms your job needs. + +Why it matters / the interview detail: the **number-one mistake** here is that a +NetworkPolicy is *just a piece of paper* — something has to *enforce* it. On EKS the +default network plugin doesn't enforce policies unless you explicitly turn on the +enforcement agent (done in Terraform). Apply a default-deny without it and the API +accepts the policy, it *looks* applied, and nothing actually changes — you think +you're secure and you're not. The **second** classic mistake: the very first thing you +must allow is **DNS** (name lookups), because every service is reached by name; block +DNS and the whole app dies in a way that looks like total breakage rather than "DNS is +blocked." A real-world wrinkle we hit live: the policy for Kyverno's namespace had to +allow it to reach the cloud metadata service on port 80 to authenticate to the private +image registry — miss that and image-verification calls time out and block deployments. +Networking lockdowns are full of these "you forgot one allow" lessons, and they're +documented honestly. + +**Kyverno policy-as-code (B2) — rules that enforce themselves.** +The problem: you can *write* rules like "no container may run as root" or "every +image must have a real version tag, not `latest`," but humans forget. **Kyverno** is +an **admission controller** — it sits in front of the Kubernetes API and inspects +every deployment *before* it's allowed to run, checking it against policies written as +code (YAML in Git). The analogy: a **building inspector** who checks every new +structure against the code before it's allowed to open. 
VidCast ships seven policies: +no `:latest` tags, must declare resource limits, must run non-root, must use a seccomp +profile (restricts dangerous system calls), must carry standard labels, no privileged +containers, and verify image signatures (the last one ties into supply chain, §6.5). + +Why it matters / the interview detail: every policy starts in **Audit** mode, not +**Enforce**. Audit *records* violations without blocking; Enforce *rejects* them. If +you ship Enforce on day one, the first existing resource that violates a rule (and +several do) blocks deployments immediately — possibly including the very fix you're +trying to deploy. The disciplined path is Audit → read the violation reports → fix +everything → promote to Enforce only when clean. One honest residual: MongoDB and +PostgreSQL *can't* run fully non-root (their official startup scripts need root to +initialise, then drop privileges), so that one policy keeps a documented exception +for the two databases. + +**Bcrypt password hashing + RBAC.** +The problem: storing passwords as plain text is catastrophic — one database leak and +every account is compromised. VidCast hashes passwords with **bcrypt**, a one-way +scrambling function deliberately designed to be *slow* (so attackers can't rapidly +guess billions of passwords) and salted (so identical passwords don't produce +identical hashes). At login, the typed password is hashed and compared to the stored +hash; the real password is never stored or recoverable. On top of this, **RBAC** +(Role-Based Access Control) gives each user a role (`admin` or `user`) carried in their +JWT, so admin-only pages and actions can be gated. The analogy: bcrypt is a **one-way +blender** — you can blend the fruit but never un-blend the smoothie back into fruit; +you just blend the next fruit and check if the smoothies match. 
Interview-relevant +gotcha we hit: the database and the auth *image* must be upgraded together — a +bcrypt-storing database with an old plain-text-comparing app (or vice versa) rejects +every login, because it's comparing a typed password against a scrambled hash. + +**Pod security contexts (read-only rootfs, non-root, seccomp).** +The problem: if an attacker breaks into a container, you want them to find as little +power as possible. VidCast hardens every pod with a **security context**: run as a +**non-root** user (so a breakout doesn't own the host), a **read-only root filesystem** +(the attacker can't modify the running container or drop in tools), **drop all Linux +capabilities** (no special kernel powers), and a **seccomp profile** (block dangerous +system calls). This is **least privilege** applied to the container. Interview detail: +read-only-rootfs interacts with gunicorn, which needs to write a couple of temp files — +so exactly *one* writable scratch directory (`/tmp`) is mounted while everything else +stays read-only. Least privilege means "exactly the access needed, nothing more." + +### GitOps & Deployment + +**Kustomize overlays (A10) — one base, environment variations.** +The problem: dev and prod need *almost* the same configuration, differing only in a few +places (replica counts, image tags). Copy-pasting two full sets of config guarantees +they'll drift apart. **Kustomize** keeps a single **base** definition and small +**overlays** that patch it per environment. Dev runs one replica of each backend; prod +runs more — expressed as a tiny diff on top of the shared base, not a fork. The +analogy: a base recipe with "for the spicy version, add chilli" written in the margin, +rather than two entire cookbooks. + +**Argo CD (B1) — the cluster pulls from Git.** +Covered in §5, but to restate as a capability: Argo CD is the engine that makes +**Git the source of truth** for what runs in the cluster. 
It continuously reconciles +the live cluster to match the repo, auto-correcting drift. **Dev auto-syncs** +(every merged change deploys itself); **prod waits for a human to click Sync** (the +approval gate). This replaces the old, riskier model where the CI pipeline held +cluster keys and pushed changes in. Every deployment becomes a reviewable, revertible +Git commit. + +**The approval-gate migration story.** +Worth telling as a narrative: VidCast *started* with a "push" pipeline (CI ran +`kubectl set image` against the cluster using stored credentials). Moving to Argo CD +meant retiring that push step and replacing it with "merge a tag-bump commit, then +sync." Dev's gate became fully automatic; prod's gate became the deliberate *absence* +of auto-sync — a human reviews the diff in the Argo CD UI and clicks Sync. The lesson +for interviews: the safest production gate isn't a clever pipeline step you can +accidentally bypass; it's a structural property (no auto-sync) that *requires* a human +by construction. + +### Observability & Cost + +**SLO burn-rate alerting (B4) — alert on what users feel, not noise.** +The problem: naïve alerts are either too noisy (page someone at 3 a.m. for a harmless +30-second blip) or too slow (a steady tiny error leak silently drains reliability for +weeks without ever crossing a threshold). VidCast uses **SLOs** (Service Level +Objectives — explicit reliability targets like "99.9% of requests succeed") and the +matching idea of an **error budget**: the allowed amount of failure (for 99.9%, that's +0.1%, which over 30 days is about **43 minutes** of badness you're permitted to spend). +The mental flip: reliability isn't "100% or bust," it's a *budget* you deliberately +spend on shipping features — budget left, ship; budget gone, stop and stabilise. + +The alerting technique is **multi-window, multi-burn-rate**, which sounds scary but is +intuitive. 
**Burn rate** = how fast you're spending the budget relative to +sustainable: burn rate 1 means you'll spend exactly 100% of the month's budget right +at month-end; burn rate 14 means you'll be empty in about a fourteenth of the time — +something is badly wrong *now*. **Multi-window** means an alert only fires if *both* a +**long** window (say 1 hour — confirms it's a real, sustained problem, not a blip) and +a **short** window (say 5 minutes — so the alert clears quickly once the problem ends) +are burning fast. The result: pages only on real, ongoing problems, and they +self-clear soon after recovery. Interview detail: one tricky bit is measuring an SLI +across the gateway's *two* gunicorn worker processes — each keeps its own counters, so +a scrape would read a random half; the fix is Prometheus "multiprocess mode" where the +workers write to a shared directory and the metrics endpoint sums across them. + +**Prometheus + Grafana dashboards.** +**Prometheus** is the monitoring system that continuously collects numbers (metrics) +from every service — request counts, queue depths, conversion times, CPU. **Grafana** +turns those numbers into **dashboards** — live graphs of the system's health. VidCast +ships three custom dashboards (operations, SLO, cost), and the frontend's Dashboard +page even embeds the Grafana operations view directly. The analogy: Prometheus is the +**car's sensors** constantly reading speed, fuel, temperature; Grafana is the +**dashboard** that displays them so the driver can see at a glance. + +**Kubecost FinOps (B3) — what does a conversion actually cost?** +The problem: the cloud makes it trivially easy to spend money and very hard to see +*who or what inside your cluster* caused the bill. AWS bills you for a *machine*; it +has no idea that machine ran twelve pods for four different features. 
**Kubecost** +reads how much CPU and memory each pod uses and multiplies by the machine's price to +**attribute** cost down to individual services — turning "the cluster costs ~$150/mo" +into the unit-economics number a business actually cares about: **"each conversion +costs $X."** That number literally joins a Kubecost metric (node hourly cost) with a +monitoring metric (conversions per hour) — a neat demonstration that the cost +instrumentation and the reliability instrumentation reinforce each other. + +Why it matters / the interview detail: there's a lovely irony — on a tiny 2-CPU node, +Kubecost's *default* install (which bundles its own monitoring stack) would burn +roughly a whole CPU just to *measure* cost. The fix — point it at the Prometheus +already running and strip it to one small pod — is *itself* a FinOps decision: the cost +of measuring cost must be smaller than what it saves. Also worth knowing: Kubecost is +an *estimate* (list prices, can't see your Reserved-Instance discounts), so you use it +for *relative* answers ("the converter costs 3× the gateway", "cost per conversion rose +20% this week") and the actual AWS bill for *absolute* answers. + +**The dangling-alert fix (M-2).** +A small but honest detail worth mentioning: an early version had alert rules that +referenced metrics the app didn't actually emit yet (the gateway's `/metrics` endpoint +had been removed during an earlier cleanup) — "dangling" alerts that could never fire +correctly. The fix was to re-add the proper metrics instrumentation so the SLO rules +have real data to evaluate. It's the kind of subtle gap that only shows up when you +wire monitoring end-to-end, and it's recorded rather than quietly papered over. + +### Supply Chain + +The overarching question this whole category answers: *"You pulled an image and ran +it. Prove it's really your code, built by your CI, and not tampered with."* Without +controls you can't — a tag is mutable and the contents are opaque. 
VidCast adds four +independent proofs. + +**SBOM generation (A8).** An **SBOM** (Software Bill of Materials) is a complete, +machine-readable **ingredients list** of everything inside an image — every OS package +and library, with versions. Why it matters: when the next big vulnerability drops (the +next Log4Shell), "are we affected?" becomes a quick *lookup* against stored SBOMs +instead of a frantic rebuild-and-rescan of everything. It's the difference between +"what's in production?" being a guess versus a query. + +**Trivy scanning + SARIF (A8).** Trivy (the vulnerability scanner from the CI pipeline) +can output its findings in **SARIF**, a standard format GitHub understands natively — +so the results show up right in the repo's *Security ▸ Code scanning* tab, inline and +deduplicated with history, instead of being buried in build logs. The pattern is two +Trivy runs: one **gate** that fails the build on CRITICAL/HIGH, and one **report** that +always uploads the SARIF (even when the gate fails) so you can see *why* it failed. + +**Cosign image signing (A8/B5).** **Cosign** cryptographically **signs** each image so +its integrity and origin can be verified — like a **tamper-evident wax seal**. VidCast +uses **keyless** signing, which is elegant: instead of a long-lived private key you +must guard, the CI job presents its short-lived OIDC identity ("I am the `ci.yml` +workflow on main"), a service called **Fulcio** issues a certificate valid for ~10 +minutes binding *that identity* to a one-time signing key, and the signature is +recorded in **Rekor**, a public append-only **transparency log** (tamper-evident +forever). The certificate expires in minutes and the one-time key is discarded — +**there's no long-lived secret to leak**. The trust is rooted in +*identity*, not a stored key. 
+ +**Kyverno verify-images (B5).** This closes the loop: the Kyverno policy from §6.2 can +**verify those signatures at deploy time** — "is there a signature whose certificate +says it was made by *our* CI workflow, recorded in Rekor? If not, don't admit the pod." +Currently it runs in Audit mode and honestly reports our images as "not yet signed," +because the signing step isn't wired into CI yet — that's the expected "supply chain +not yet closed" signal, flipped to enforcing the moment CI starts signing. + +**The full chain: commit → build → sign → verify → admit.** Putting it together, every +hop adds a verifiable property: a developer **commits** code → CI **builds** the image +and the Trivy gate blocks CRITICAL/HIGH while an SBOM and SARIF are generated → the +image is **pushed by digest** to an immutable, scan-on-push registry → cosign +**keyless-signs** the digest and logs the signature in Rekor, attaching the SBOM as a +signed attestation → at deploy, Kyverno **verifies** the signature and the exact CI +identity before **admitting** the pod. From commit to running container, every step is +provable. (There's also **SLSA provenance**, a graded standard for how trustworthy the +*build* itself is — a signed statement of "image X was built from commit Y by workflow +Z" — documented with a recommendation to use the hardened reusable builder for the +highest level.) + +--- + +## 7. Cost Story + +A recurring theme you've seen throughout: VidCast is obsessive about cost, on purpose. +The whole platform is engineered so its **standing cost is $0 when the cluster is +off**, and the decisions reflect real, defensible trade-offs rather than reflexively +reaching for the most "production" option. + +**Why managed datastores were skipped.** The biggest single cost decision. The +"proper production" move is to replace the in-cluster databases with managed ones +(RDS for PostgreSQL, MongoDB Atlas, Amazon MQ for RabbitMQ, ElastiCache for Redis). 
+VidCast deliberately **didn't**, and the deciding number is **Amazon MQ for RabbitMQ**: +its *smallest possible* broker is ~**$183/month** (there is no cheap tier, and no +"pause"). That single service costs more than the entire rest of the platform combined, +and more than the EKS control plane itself — on a project whose whole point is $0 +when off. The all-managed version would run ~$262–273/month standing. So the managed +path is **documented and costed as the production migration story**, but the +in-cluster Helm charts stay — and critically, the *reliability patterns* that managed +services usually provide (no lost events, idempotent retries, dead-lettering) are +delivered **in code** (A1/A2/A3) against the in-cluster brokers instead. You get the +reliability story without the bill. + +**Why Parameter Store over Secrets Manager.** As covered in §6: Secrets Manager bills +$0.40 per secret per month and persists after teardown; Parameter Store (standard tier, +AWS-managed encryption key) is **free**. Same security outcome, ~$3/month saved, $0 +standing cost. + +**The "$0 when off" target.** The cluster is genuinely **torn down to save money** and +rebuilt on demand in ~20 minutes via Terraform — preserving only free-to-keep things +(the Terraform state, the configuration file, the container images). This is why so +much of the design (infrastructure-as-code, scale-to-zero, no managed datastores) bends +toward "destroy and recreate cheaply." + +**Node-budget tracking discipline.** Because everything runs on a *single* 2-CPU node, +there's a running discipline of tracking how much of that node each tool consumes — and +a self-imposed "~90% idle budget" gate. This is why the converter scales to zero (frees +the node when idle), why gunicorn uses few workers, why Kubecost is stripped to one +small pod and run on the lighter dev footprint, and why the monitoring stack is tuned +down. Every add-on has to justify its slice of two CPUs. 
+ +**What it costs when running.** While up, the dominant costs are the **EKS control +plane** (~$0.10/hour, about $73/month if left on — roughly half of the ~$150/month +all-in figure for a continuously-running small cluster) and the **node itself** +(`m7i-flex.large` ≈ **$0.11/hour**). Run it for a demo and destroy it, and the bill is +a few cents to a couple of dollars. Leave it on all month and it's roughly $150. The +discipline is to treat "is the cluster on?" as the main cost lever. + +--- + +## 8. Honest Gaps + +A core value of this project is **honesty about what's incomplete** — the same standard +applied throughout the docs. Nothing below is hidden; each is a deliberate, +understood trade-off appropriate to a single-node portfolio cluster, with the +"proper" fix noted. + +- **MongoDB and PostgreSQL require root to start.** Their official container + entrypoints need root to initialise the database and fix file ownership, then drop + privileges. So they can't satisfy the "run as non-root" policy, which keeps a + *documented exception* for those two pods. Everything else runs non-root. + +- **The single-node constraint shapes everything.** One `m7i-flex.large` is the whole + cluster. That caps how much can run at once (there's even a hard ~29-pods-per-node + limit from the networking layer that we hit when adding the monitoring stack — the + fix was a temporary second node), means no real high-availability (the node *is* the + failure boundary), and is why a single-instance in-cluster database is "acceptable + here" — because nothing else is redundant either. Real HA needs multiple nodes and + managed datastores, which is the documented (costed) production path. + +- **The frontend's Grafana embed is IP-dependent.** The Dashboard page embeds the live + Grafana view, but the Grafana address is baked into the frontend *at build time* + (it's a `VITE_` variable). 
Because the node's public IP changes when the + infrastructure is recreated, the frontend image has to be rebuilt with the new + address each time the cluster is rebuilt. A more robust fix (runtime configuration or + an ingress with a stable hostname) is noted but not built — fine for a demo, a real + gap for a permanently-running site. + +- **Metrics don't survive a pod restart.** The monitoring stack (Prometheus, Grafana) + runs on **emptyDir** storage — ephemeral scratch space — because the cluster has no + dynamic disk-provisioning driver installed and the design avoids billable, + orphan-prone EBS volumes. The trade-off: if the Prometheus pod restarts, its history is + gone. Acceptable on a transient demo cluster that's torn down nightly; a real + deployment would use persistent volumes (or remote storage like Thanos/Mimir, which + is also what true 30-day error-budget accounting would need — Prometheus here keeps + only 7 days, so the *alerts* are fully correct but the dashboard's "budget remaining" + panels are labelled as a 7-day view). + +- **The SLO targets are demonstrative, not battle-tested.** The 99.9%-style objectives + are reasonable and the burn-rate math is the standard Google SRE approach, but on a + single-node demo cluster with synthetic traffic they're there to *demonstrate the + technique* rather than to reflect hard-won production numbers. The end-to-end success + SLI in particular spans two services minutes apart, so it's only trustworthy over + long windows — which is documented. + +- **Supply-chain signing isn't wired into CI yet.** The verification policy (B5) and + all the signing concepts are in place, but the cosign-signing *step* isn't in the CI + pipeline yet, so images are honestly reported as "not yet signed" (in Audit mode, so + it never blocks). It flips to fully enforced the moment CI signs — by design, not by + omission. 
+ +--- + +> **In one breath:** VidCast is a simple video-to-audio app deliberately wrapped in a +> production-grade platform — event-driven and crash-safe (outbox, retries, +> dead-letter queues, idempotency), self-scaling (KEDA to zero, HPA on load), +> locked-down (default-deny networking, policy-as-code, non-root hardened pods, +> secrets out of code), GitOps-deployed (Argo CD pulls from Git; prod gated by a +> human), fully observed (SLO burn-rate alerts, Grafana, per-conversion cost), and +> supply-chain-aware (SBOM, scanning, keyless signing, admission verification) — all +> on a single cheap EKS node that costs $0 when off and rebuilds from code in twenty +> minutes, with every limitation written down rather than hidden. The converter is +> the demo; the platform is the project. diff --git a/README.md b/README.md index 0ac9c72..76a29be 100644 --- a/README.md +++ b/README.md @@ -1,274 +1,359 @@ -# Devops Project: video-converter -Converting mp4 videos to mp3 in a microservices architecture. +# VidCast — Video-to-Audio Microservices Platform -## Architecture - -

- Architecture -

- -## Deploying a Python-based Microservice Application on AWS EKS - -### Introduction - -This document provides a step-by-step guide for deploying a Python-based microservice application on AWS Elastic Kubernetes Service (EKS). The application comprises four major microservices: `auth-server`, `converter-module`, `database-server` (PostgreSQL and MongoDB), and `notification-server`. - -### Prerequisites - -Before you begin, ensure that the following prerequisites are met: - -1. **Create an AWS Account:** If you do not have an AWS account, create one by following the steps [here](https://docs.aws.amazon.com/streams/latest/dev/setting-up.html). - -2. **Install Helm:** Helm is a Kubernetes package manager. Install Helm by following the instructions provided [here](https://helm.sh/docs/intro/install/). - -3. **Python:** Ensure that Python is installed on your system. You can download it from the [official Python website](https://www.python.org/downloads/). - -4. **AWS CLI:** Install the AWS Command Line Interface (CLI) following the official [installation guide](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html). - -5. **Install kubectl:** Install the latest stable version of `kubectl` on your system. You can find installation instructions [here](https://kubernetes.io/docs/tasks/tools/). - -6. **Databases:** Set up PostgreSQL and MongoDB for your application. - -### High Level Flow of Application Deployment - -Follow these steps to deploy your microservice application: - -1. **MongoDB and PostgreSQL Setup:** Create databases and enable automatic connections to them. - -2. **RabbitMQ Deployment:** Deploy RabbitMQ for message queuing, which is required for the `converter-module`. - -3. **Create Queues in RabbitMQ:** Before deploying the `converter-module`, create two queues in RabbitMQ: `mp3` and `video`. - -4. **Deploy Microservices:** - - **auth-server:** Navigate to the `auth-server` manifest folder and apply the configuration. 
- - **gateway-server:** Deploy the `gateway-server`. - - **converter-module:** Deploy the `converter-module`. Make sure to provide your email and password in `converter/manifest/secret.yaml`. - - **notification-server:** Configure email for notifications and two-factor authentication (2FA). - -5. **Application Validation:** Verify the status of all components by running: - ```bash - kubectl get all - ``` - -6. **Destroying the Infrastructure** - - -### Low Level Steps - -#### Cluster Creation - -1. **Log in to AWS Console:** - - Access the AWS Management Console with your AWS account credentials. - -2. **Create eksCluster IAM Role** - - Follow the steps mentioned in [this](https://docs.aws.amazon.com/eks/latest/userguide/service_IAM_role.html) documentation using root user - - After creating it will look like this: +**Turn video recordings into podcast-ready audio.** -

- ekscluster_role -

+VidCast is a production-grade Python microservices platform running on AWS EKS. Upload an MP4, and the platform converts it to MP3 asynchronously — then emails you a download link. Built to demonstrate event-driven architecture, container security, CI/CD automation, and infrastructure as code. - - Please attach `AmazonEKS_CNI_Policy` explicitly if it is not attached by default +--- -3. **Create Node Role - AmazonEKSNodeRole** - - Follow the steps mentioned in [this](https://docs.aws.amazon.com/eks/latest/userguide/create-node-role.html#create-worker-node-role) documentation using root user - - Please note that you do NOT need to configure any VPC CNI policy mentioned after step 5.e under Creating the Amazon EKS node IAM role - - Simply attach the following policies to your role once you have created `AmazonEKS_CNI_Policy` , `AmazonEBSCSIDriverPolicy` , `AmazonEC2ContainerRegistryReadOnly` - incase it is not attached by default - - Your AmazonEKSNodeRole will look like this: +## What's Inside -

- Node_IAM -

+| Component | Technology | What it does | +|-----------|-----------|--------------| +| Frontend | React 18 + nginx | Web interface — login, upload, download, monitoring dashboard | +| Gateway API | Flask + GridFS + Pika | Entry point — handles uploads, downloads, JWT validation | +| Auth Service | Flask + PyJWT + psycopg2 | Issues and validates JWT tokens against PostgreSQL | +| Converter | Pika + MoviePy + ffmpeg | 4 worker pods consuming RabbitMQ, converting MP4 → MP3 | +| Notification | Pika + smtplib | 2 worker pods sending email with download link | +| MongoDB | mongo:4.0.8 StatefulSet | Stores video and MP3 files via GridFS | +| PostgreSQL | postgres Deployment | User credentials for auth | +| RabbitMQ | rabbitmq:3-management | Message broker — video queue and mp3 queue | -4. **Open EKS Dashboard:** - - Navigate to the Amazon EKS service from the AWS Console dashboard. - -5. **Create EKS Cluster:** - - Click "Create cluster." - - Choose a name for your cluster. - - Configure networking settings (VPC, subnets). - - Choose the `eksCluster` IAM role that was created above - - Review and create the cluster. - -6. **Cluster Creation:** - - Wait for the cluster to provision, which may take several minutes. - -7. **Cluster Ready:** - - Once the cluster status shows as "Active," you can now create node groups. - -#### Node Group Creation +## Architecture -1. In the "Compute" section, click on "Add node group." +``` +Browser + │ + ▼ +Frontend (React, NodePort :30006) + │ + ▼ +Gateway (Flask :8080, NodePort :30002) + ├── /login ──► Auth Service (:5000) ──► PostgreSQL (:5432) + ├── /upload ──► MongoDB GridFS ──► RabbitMQ "video" queue + └── /download ◄── MongoDB GridFS + │ + RabbitMQ "video" queue + │ + Converter ×4 (ffmpeg) + ├── fetch video from MongoDB + ├── convert to MP3 + ├── store MP3 in MongoDB + └── publish to RabbitMQ "mp3" queue + │ + Notification ×2 (smtplib) + └── email file ID to user +``` -2. 
Choose the AMI (default), instance type (e.g., t3.medium), and the number of nodes (attach a screenshot here). +--- -3. Click "Create node group." +## Infrastructure -#### Adding inbound rules in Security Group of Nodes +- **Platform:** AWS EKS eu-west-2 (London) +- **Node type:** m7i-flex.large — 2 vCPU / 8 GB RAM +- **IaC:** Terraform modules for VPC, IAM, EKS, security groups +- **Helm charts:** MongoDB, PostgreSQL, RabbitMQ +- **CI/CD:** GitHub Actions (lint → build → Trivy scan → push → EKS deploy) +- **Staging:** Docker Swarm on EC2 t2.micro (97% cheaper than a second EKS cluster) +- **Monitoring:** kube-prometheus-stack — Grafana :30007, Alertmanager :30008 -**NOTE:** Ensure that all the necessary ports are open in the node security group. +--- -

- Inbound_rules_sg -

+## Quick Start — Deploy to AWS -#### Enable EBS CSI Addon -1. enable addon `ebs csi` this is for enabling pvcs once cluster is created +> **New here?** For the full, narrated walkthrough from cloning the repo all the way to +> teardown — including configuration, seeding, CI/CD secrets, and troubleshooting — follow +> **[`docs/GETTING_STARTED.md`](docs/GETTING_STARTED.md)**. The steps below are the +> condensed version. -

- ebs_addon -

+### Prerequisites -#### Deploying your application on EKS Cluster +```bash +# Tools required +aws --version # AWS CLI v2 +kubectl version # kubectl 1.31+ +helm version # Helm 3.x +terraform version # Terraform 1.5+ +``` -1. Clone the code from this repository. +### 1 — Provision infrastructure with Terraform -2. Set the cluster context: - ``` - aws eks update-kubeconfig --name --region - ``` +```bash +cd terraform/environments/dev -### Commands +# Copy and fill in your values +cp terraform.tfvars.example terraform.tfvars +# Edit terraform.tfvars with your state bucket name etc. -Here are some essential Kubernetes commands for managing your deployment: +terraform init \ + -backend-config="bucket=YOUR_STATE_BUCKET" \ + -backend-config="key=vidcast/dev/terraform.tfstate" \ + -backend-config="region=eu-west-2" \ + -backend-config="dynamodb_table=vidcast-terraform-locks" +terraform plan +terraform apply +``` -### MongoDB +### 2 — Deploy infrastructure services -To install MongoDB, set the database username and password in `values.yaml`, then navigate to the MongoDB Helm chart folder and run: +```bash +# Connect kubectl to the new cluster +aws eks update-kubeconfig --name vidcast-cluster --region eu-west-2 -``` -cd Helm_charts/MongoDB -helm install mongo . +# Deploy MongoDB, PostgreSQL, RabbitMQ +cd Helm_charts/MongoDB && helm install mongodb . && cd ../.. +kubectl wait --for=condition=ready pod/mongodb-0 --timeout=120s +cd Helm_charts/Postgres && helm install postgres . && cd ../.. +cd Helm_charts/RabbitMQ && helm install rabbitmq . && cd ../.. 
``` -Connect to the MongoDB instance using: +### 3 — Initialise PostgreSQL +```bash +NODE_IP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="ExternalIP")].address}') +PGPASSWORD=YOUR_POSTGRES_PASSWORD psql -h $NODE_IP -p 30003 \ + -U YOUR_POSTGRES_USERNAME -d authdb -f Helm_charts/Postgres/init.sql ``` -mongosh mongodb://:@:30005/mp3s?authSource=admin -``` - -### PostgreSQL -Set the database username and password in `values.yaml`. Install PostgreSQL from the PostgreSQL Helm chart folder and initialize it with the queries in `init.sql`. For PowerShell users: +### 4 — Create RabbitMQ queues -``` -cd .. -cd Postgres -helm install postgres . +```bash +curl -u guest:guest -X PUT http://$NODE_IP:30004/api/queues/%2F/video \ + -H "Content-Type: application/json" -d '{"durable":true}' +curl -u guest:guest -X PUT http://$NODE_IP:30004/api/queues/%2F/mp3 \ + -H "Content-Type: application/json" -d '{"durable":true}' ``` -Connect to the Postgres database and copy all the queries from the "init.sql" file. -``` -psql 'postgres://:@:30003/authdb' -``` +### 5 — Deploy microservices -### RabbitMQ +Application manifests are managed with **Kustomize** (`k8s/base` + per-environment +overlays in `k8s/overlays/{dev,prod}`). Secrets are *not* in the Kustomize tree — +apply them first (from the gitignored `secret.yaml` files, or via External +Secrets Operator), then apply the overlay. -Deploy RabbitMQ by running: +```bash +# 1. Create the per-service Secrets (gitignored; rabbitmq-secret comes from the +# RabbitMQ Helm chart): +kubectl apply -f src/auth-service/manifest/secret.yaml +kubectl apply -f src/gateway-service/manifest/secret.yaml +kubectl apply -f src/converter-service/manifest/secret.yaml +kubectl apply -f src/notification-service/manifest/secret.yaml -``` -helm install rabbitmq . +# 2. 
Deploy all services via Kustomize (use overlays/dev for the lighter dev env): +kubectl apply -k k8s/overlays/prod +kubectl get pods # all should reach Running ``` -Ensure you have created two queues in RabbitMQ named `mp3` and `video`. To create queues, visit `:30004>` and use default username `guest` and password `guest` +### 6 — Test end-to-end -**NOTE:** Ensure that all the necessary ports are open in the node security group. +```bash +# Login +TOKEN=$(curl -s -X POST http://$NODE_IP:30002/login -u "EMAIL:PASSWORD") -### Apply the manifest file for each microservice: +# Upload +curl -X POST http://$NODE_IP:30002/upload \ + -F "file=@assets/video.mp4" -H "Authorization: Bearer $TOKEN" -- **Auth Service:** - ``` - cd auth-service/manifest - kubectl apply -f . - ``` - -- **Gateway Service:** - ``` - cd gateway-service/manifest - kubectl apply -f . - ``` - -- **Converter Service:** - ``` - cd converter-service/manifest - kubectl apply -f . - ``` +# Download (use file_id from notification email) +curl -X GET "http://$NODE_IP:30002/download?fid=FILE_ID" \ + -H "Authorization: Bearer $TOKEN" -o output.mp3 +``` -- **Notification Service:** - ``` - cd notification-service/manifest - kubectl apply -f . - ``` +--- -### Application Validation +## CI/CD Pipeline -After deploying the microservices, verify the status of all components by running: +Push to `main` triggers the pipeline automatically: ``` -kubectl get all +push to main + └── GitHub Actions ci.yml + ├── ruff lint (Python) + ├── Docker build × 5 services (matrix) + ├── Trivy scan (CRITICAL + HIGH — fails build if found) + └── Push to Docker Hub (tagged with short git SHA) + └── GitHub Actions cd.yml + ├── aws eks update-kubeconfig + └── kubectl set image × 4 deployments ``` -### Notification Configuration +Jenkins pipeline (`Jenkinsfile`) mirrors the same stages for enterprise environments, adding a Docker Swarm staging deploy and a manual approval gate before production.
+See [`docs/GETTING_STARTED.md` → CI/CD secrets](docs/GETTING_STARTED.md#10-cicd-secrets) for the secrets to configure (none are stored in this repo). +--- -For configuring email notifications and two-factor authentication (2FA), follow these steps: +## Monitoring -1. Go to your Gmail account and click on your profile. +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +helm install monitoring prometheus-community/kube-prometheus-stack \ + -f monitoring/values.yaml -n monitoring --create-namespace -2. Click on "Manage Your Google Account." +kubectl apply -f monitoring/alerts/vidcast-alerts.yaml +``` -3. Navigate to the "Security" tab on the left side panel. +| Dashboard | URL | Credentials | +|-----------|-----|-------------| +| Grafana — VidCast Operations | `http://NODE_IP:30007` | admin / vidcast-demo | +| Grafana — SLO / Error Budget (B4) | `http://NODE_IP:30007` (uid `vidcast-slo`) | admin / vidcast-demo | +| Grafana — FinOps / Cost (B3) | `http://NODE_IP:30007` (uid `vidcast-finops`) | admin / vidcast-demo | +| Alertmanager | `http://NODE_IP:30008` | — | -4. Enable "2-Step Verification." +--- -5. Search for the application-specific passwords. You will find it in the settings. +## What does VidCast cost? -6. Click on "Other" and provide your name. +Cost visibility via **Kubecost** (OSS/OpenCost core, no license key) — see +`k8s/kubecost/` and `FINOPS_EXPLAINED.md`. -7. Click on "Generate" and copy the generated password. +**Headline: cost per conversion.** -8. Paste this generated password in `notification-service/manifest/secret.yaml` along with your email. +``` +cost_per_conversion = cluster_$/hr ÷ conversions/hr + = sum(node_total_hourly_cost) ÷ (rate(vidcast_conversions_total{status="success"}[1h]) × 3600) +``` -Run the application through the following API calls: +It joins Kubecost's `node_total_hourly_cost` with the B4 SLO counter +`vidcast_conversions_total`. 
_(Screenshot placeholder — fill from the live FinOps +dashboard.)_ + +**Accuracy caveat:** Kubecost **estimates** from instance list pricing — +m7i-flex.large ≈ **$0.106/hr** (eu-west-2 on-demand; verify current pricing), so the +node is ~$77/mo + ~$73/mo EKS control-plane ≈ the ~$150/mo figure. The **AWS Cost +Explorer bill is ground truth**; Kubecost is for *attribution and trends* (who/what, +relative change), not the absolute invoice. + +**The node-sizing story:** on a 2-vCPU node, Kubecost is the **largest single +observability cost** — its default bundled Prometheus would eat ~1 of 2 CPUs. We +strip it to one ~175m pod pointed at the existing Prometheus; even so it tips the +prod footprint past the 90% idle budget gate, so it runs against the dev footprint +or scales to zero between analyses. _The cost of measuring cost must be smaller than +what it saves._ + +--- + +## Security + +- All pods run as non-root (uid 1000), read-only root filesystem, capabilities dropped +- Resource limits on every container — converters can't starve gateway/auth +- HTTP health probes on auth + gateway; exec probes on converter + notification +- Secrets gitignored — never committed +- Images scanned with Trivy before push; tagged with git SHA (no `:latest` in production) + +--- + +## Reliability + +**Transactional outbox (no lost uploads).** When `OUTBOX_ENABLED=true`, the +gateway records each upload event as a row in a MongoDB `outbox` collection +(durable, in the same database as the video) instead of publishing straight to +RabbitMQ. A dedicated single-replica `outbox-relay` deployment polls the +collection and publishes pending rows to the `video` queue, marking each +`published_at` on success. If RabbitMQ is down at upload time the event is **not +lost** — it publishes once the broker recovers. 
+ +The relay is a separate `replicas: 1` deployment (not an in-process thread) +because the gateway runs multi-process under gunicorn — one publisher by +construction avoids duplicate sends. Roll out with the flag off (relay idle), +then flip to `true`. See `OUTBOX_EXPLAINED.md` for the full design and the +single-node consistency caveat. + +**Retry / dead-letter topology.** Each pipeline (`video`, `mp3`) has a delayed +retry queue and a terminal dead-letter queue. A failed message is retried +`MAX_RETRIES` times (with a `RETRY_TTL_MS` delay between attempts) and then parked +in `<queue>.dlq` via the `vidcast.dlx` exchange — replacing the old infinite +NACK-requeue loop on poison messages. The topology is declared from code at consumer startup. + +**Idempotent consumers.** With `IDEMPOTENCY_ENABLED=true`, the converter and +notification consumers claim each job once (Redis `SET NX EX`, keyed on +`video_fid`/`mp3_fid`) so an at-least-once redelivery isn't converted/emailed +twice. Redis runs in-cluster; `claim_once` fails open if Redis is unavailable +(degrades to a possible duplicate, never a stuck pipeline). + +| Flag | Where | Default | Effect | +|------|-------|---------|--------| +| `OUTBOX_ENABLED` | `gateway-configmap` | `false` | `false` = gateway publishes directly to RabbitMQ (legacy path, unchanged). `true` = uploads routed through the outbox + relay. | +| `IDEMPOTENCY_ENABLED` | `converter`/`notification` configmaps | `false` | `false` = consumers behave as before. `true` = claim-once dedup via Redis. | +| `MAX_RETRIES` / `RETRY_TTL_MS` | `converter`/`notification` configmaps | `3` / `30000` | Retry count and inter-attempt delay before a message is dead-lettered. | + +See `OUTBOX_EXPLAINED.md`, `DLQ_TOPOLOGY_EXPLAINED.md`, `IDEMPOTENCY_EXPLAINED.md` +for the full designs.
+ +--- + +## Teardown + +```bash +# Microservices (Kustomize — match the overlay you deployed) +kubectl delete -k k8s/overlays/prod + +# Helm +helm uninstall mongodb postgres rabbitmq +helm uninstall monitoring -n monitoring + +# Infrastructure +cd terraform/environments/dev +terraform destroy +``` -# API Definition +--- -- **Login Endpoint** - ```http request - POST http://nodeIP:30002/login - ``` +## Bugs Fixed - ```console - curl -X POST http://nodeIP:30002/login -u : - ``` - Expected output: success! +| # | Severity | Issue | Fix | +|---|----------|-------|-----| +| 1 | High | `unauth_count.inc()` NameError in gateway service crashes pod on any 401 response | Removed 2 stale Prometheus stub lines | +| 2 | High | JWT secret was `"sarcasm"` (base64) — trivially guessable | Replaced with 34-char random string | -- **Upload Endpoint** - ```http request - POST http://nodeIP:30002/upload - ``` +--- - ```console - curl -X POST -F 'file=@./video.mp4' -H 'Authorization: Bearer ' http://nodeIP:30002/upload - ``` - - Check if you received the ID on your email. 
+## Repository Structure -- **Download Endpoint** - ```http request - GET http://nodeIP:30002/download?fid= - ``` - ```console - curl --output video.mp3 -X GET -H 'Authorization: Bearer ' "http://nodeIP:30002/download?fid=" - ``` +``` +├── README.md # You are here — overview + condensed quick start +├── CLAUDE.md # Operating instructions for AI assistants (build/deploy playbook) +├── VIDCAST_UPGRADE_PLAN.md # The plan that took the base project to production-grade +├── Jenkinsfile # Enterprise CI/CD pipeline with Swarm staging + approval gate +├── docker-compose.swarm.yml # Docker Swarm staging environment +├── install_prerequisites.sh # Installs kubectl, Helm, Terraform, Python, psql, mongosh +├── .github/workflows/ # CI (lint+scan+build+push) and CD (OIDC → EKS deploy) +├── Helm_charts/ # MongoDB, PostgreSQL, RabbitMQ Helm charts +├── monitoring/ # kube-prometheus-stack values, dashboard, alerts +├── assets/ # Sample video.mp4 for end-to-end testing +├── docs/ # All project documentation — see docs/README.md +│ ├── README.md # Index: which doc to read for what +│ ├── GETTING_STARTED.md# Full clone → run → teardown walkthrough +│ ├── PROJECT_GUIDE.md # Comprehensive guide (technical + plain English) +│ ├── architecture.md # Service inventory, ports, data flow reference +│ ├── deployment-guide.md # Phase-by-phase operations reference +│ ├── presentation-notes.md # Timed demo script +│ ├── DECISIONS_MADE.md # Architectural decision records +│ └── MERGE_RUNBOOK_RBAC.md # RBAC/bcrypt merge runbook +├── src/ +│ ├── auth-service/ +│ ├── converter-service/ +│ ├── frontend/ # React web app + nginx + Kubernetes manifests +│ ├── gateway-service/ +│ └── notification-service/ +└── terraform/ + ├── environments/dev/ # Root module (main, variables, outputs, backend) + └── modules/ # vpc, iam, eks, security-groups, github-oidc +``` -## Destroying the Infrastructure +## Documentation -To clean up the infrastructure, follow these steps: +Full documentation lives in 
**[`docs/`](docs/)** — start with +**[`docs/README.md`](docs/README.md)**, which points you to the right document: -1. **Delete the Node Group:** Delete the node group associated with your EKS cluster. +- **Run it** → [`docs/GETTING_STARTED.md`](docs/GETTING_STARTED.md) +- **Understand it** → [`docs/PROJECT_GUIDE.md`](docs/PROJECT_GUIDE.md) +- **Look something up** → [`docs/architecture.md`](docs/architecture.md) +- **Present it** → [`docs/presentation-notes.md`](docs/presentation-notes.md) -2. **Delete the EKS Cluster:** Once the nodes are deleted, you can proceed to delete the EKS cluster itself. +> **Security note:** no real credentials are committed to this repo. Account-specific +> values appear as placeholders (``, `YOUR_STATE_BUCKET`, +> `admin@example.com`, ``). Supply your own via the gitignored +> `terraform.tfvars` / `DEPLOYMENT_CONFIG.md` and your CI/CD secret store. diff --git a/VIDCAST_UPGRADE_PLAN.md b/VIDCAST_UPGRADE_PLAN.md new file mode 100644 index 0000000..5119e8d --- /dev/null +++ b/VIDCAST_UPGRADE_PLAN.md @@ -0,0 +1,634 @@ +# VidCast — Production Upgrade Plan + +**Project:** Video-to-Audio Microservices Platform on AWS EKS +**Product Name:** VidCast — "Turn video recordings into podcast-ready audio" +**Date:** May 2026 +**Status:** Base platform deployed and passing end-to-end tests. This document covers planned improvements. + +--- + +## How to Read This Document + +This document is for the team. It explains every improvement we plan to make, why it matters, what it costs (in time and money), and what the alternatives were. If you're picking up a phase to work on, read the relevant section fully before writing any code. If something isn't clear, ask — don't guess. + +Every improvement falls into one of three categories: + +- **Build It** — We will implement this. It goes into the repo and the demo. +- **Talk About It** — We understand this and can explain it in the presentation, but we're not implementing it. 
+- **Skip It** — Not relevant for this project at this stage. + +--- + +## Table of Contents + +1. [Current State — What We Have](#1-current-state--what-we-have) +2. [Product Concept — VidCast](#2-product-concept--vidcast) +3. [Phase 1 — Terraform Infrastructure as Code](#3-phase-1--terraform-infrastructure-as-code) +4. [Phase 2 — CI/CD Pipeline](#4-phase-2--cicd-pipeline) +5. [Phase 3 — Security Hardening](#5-phase-3--security-hardening) +6. [Phase 4 — Monitoring and Observability](#6-phase-4--monitoring-and-observability) +7. [Phase 5 — Frontend Web Application](#7-phase-5--frontend-web-application) +8. [Phase 6 — Documentation and Presentation](#8-phase-6--documentation-and-presentation) +9. [Things We Talk About But Don't Build](#9-things-we-talk-about-but-dont-build) +10. [Repository Structure](#10-repository-structure) +11. [Branch Strategy](#11-branch-strategy) +12. [Cost Breakdown](#12-cost-breakdown) +13. [Real-World Use Cases](#13-real-world-use-cases) +14. [Presentation Strategy](#14-presentation-strategy) + +--- + +## 1. Current State — What We Have + +The base platform is deployed on AWS EKS in eu-west-2. It consists of four Python microservices (auth, gateway, converter, notification) and three infrastructure services (MongoDB, PostgreSQL, RabbitMQ) deployed via Helm charts. The application accepts video uploads via HTTP, converts them to MP3 asynchronously using RabbitMQ as a message broker, and emails the user when the audio file is ready for download. + +What works: end-to-end flow (login, upload, convert, notify, download), JWT authentication, event-driven async processing, Helm-managed infrastructure services, multi-replica deployments. 
+ +What's missing: no infrastructure as code (cluster built manually via console), no CI/CD pipeline (images built and deployed manually), no health checks or resource limits on pods, no monitoring or alerting, credentials stored in plaintext YAML committed to the repo, no web interface (API-only via curl), no documentation beyond the deployment guide. + +These gaps are normal for a first-pass learning project. The purpose of this upgrade plan is to close them systematically. + +--- + +## 2. Product Concept — VidCast + +Instead of presenting this as "a Kubernetes exercise," we're framing it as a product that solves a real problem. This makes the demo accessible to non-technical audiences and gives the architecture a business context. + +**The product story:** Content creators record video — Zoom interviews, webinars, conference talks. They need the audio as a standalone podcast episode. VidCast lets them upload the video, converts it automatically, and emails them when the MP3 is ready to download. + +**Why this framing matters:** Every architectural decision now has a business justification. "Why do we use a message queue?" becomes "Because the creator shouldn't have to wait 5 minutes staring at a loading screen — they upload and walk away." "Why do we have 4 converter replicas?" becomes "Because if 20 creators upload at once, we need parallel processing capacity." + +**Why not YouTube downloads:** Downloading from YouTube violates their Terms of Service, yt-dlp breaks regularly as YouTube fights it, and a failed download during a live demo would derail the presentation. Our demo uses locally-stored video files that we control. + +--- + +## 3. Phase 1 — Terraform Infrastructure as Code + +### What We're Building + +Terraform modules that create and manage all AWS infrastructure: VPC, subnets, internet gateway, route tables, security groups, IAM roles, EKS cluster, and managed node group. 
After this phase, the entire platform can be destroyed and recreated from a single `terraform apply` command. + +### Why This Matters + +Right now, if someone deletes the EKS cluster, we'd need to click through the AWS Console for 30-60 minutes to rebuild it, hoping we remember every setting. With Terraform, the infrastructure is version-controlled, reviewable, and repeatable. This is the single most impactful improvement for the CV and the demo. + +In industry, this is non-negotiable. Every company running cloud infrastructure uses some form of IaC — Terraform, CloudFormation, Pulumi, or CDK. "I can destroy and recreate this entire platform from scratch with one command" is a sentence that separates you from most bootcamp graduates. + +### What the Industry Calls This + +Infrastructure as Code (IaC). The practice comes from the DevOps principle that infrastructure should be treated like application code: version-controlled, peer-reviewed, tested, and reproducible. The term was popularised by tools like Chef and Puppet in the 2010s, and Terraform (by HashiCorp, now part of IBM) became the dominant multi-cloud IaC tool. + +### Trade-off Analysis + +| Dimension | Terraform (Chosen) | AWS CloudFormation | Pulumi | +|---|---|---|---| +| Multi-cloud support | Yes — works with AWS, Azure, GCP | AWS only | Yes | +| Language | HCL (domain-specific) | JSON/YAML | Python, TypeScript, Go | +| Industry adoption | Dominant in multi-cloud shops | Dominant in AWS-only shops | Growing but smaller | +| Learning curve | Moderate — HCL is readable | Low for simple stacks | Low if you know the language | +| State management | Remote state in S3 + DynamoDB lock | Managed by AWS automatically | Managed by Pulumi Cloud or self-hosted | +| Bootcamp relevance | Taught in most DevOps curricula | Less commonly taught | Rarely taught in bootcamps | + +**Why Terraform:** It's what we learned, it's what most job postings list, and it works across cloud providers. 
CloudFormation would also be fine for an AWS-only project, but Terraform demonstrates a transferable skill. + +### What We're Creating + +``` +terraform/ +├── environments/ +│ └── dev/ +│ ├── main.tf # Root module — calls all child modules +│ ├── variables.tf # Input variables (region, instance type, etc.) +│ ├── outputs.tf # Cluster endpoint, node IP, kubeconfig command +│ └── terraform.tfvars # Actual values (gitignored — never committed) +└── modules/ + ├── vpc/ # VPC, subnets, IGW, route tables, NAT + ├── eks/ # EKS cluster, node group, OIDC provider + ├── iam/ # Cluster role, node role, policies + └── security-groups/ # NodePort rules (30002-30005) +``` + +### Key Decisions + +**Remote state in S3 with DynamoDB locking.** Local state files are not acceptable for any shared project. If two people run `terraform apply` simultaneously with local state, one of them will corrupt the infrastructure. S3 stores the state file, and DynamoDB prevents concurrent modifications. This is standard practice. + +**Module structure instead of a single flat file.** Each concern (networking, compute, identity) is a separate module with its own inputs and outputs. This means one person can modify the security groups without touching the VPC configuration. It also means modules can be reused across environments (dev, staging, prod) with different variable values. + +**terraform.tfvars is gitignored.** This file contains the actual values for your deployment — AWS account ID, region, instance type. It's environment-specific and must never be committed to the repo. Each team member creates their own from a template. + +### Estimated Effort + +4-6 hours to write and test all modules. Most of the time is in the EKS module (cluster creation takes 15 minutes per attempt, so iteration is slow). + +--- + +## 4. Phase 2 — CI/CD Pipeline + +### What We're Building + +A GitHub Actions workflow that automatically lints, scans, builds, and deploys the application whenever code is pushed. 
A Jenkinsfile that achieves the same pipeline for teams using Jenkins. + +### Why This Matters + +Right now, deploying a code change means: manually build a Docker image on your laptop, manually push it to Docker Hub, manually run `kubectl apply` against the cluster, and hope you didn't forget a step. This is error-prone, unreviewable, and unauditable. Nobody knows who deployed what, when, or from which commit. + +A CI/CD pipeline enforces a consistent process: every change goes through the same steps, every deployment is traceable to a specific commit, and security scanning happens automatically before any image reaches the cluster. + +### What the Industry Calls This + +Continuous Integration (CI) — automatically building and testing every change. Continuous Delivery/Deployment (CD) — automatically deploying validated changes to environments. Together, CI/CD. The practice originated in the early 2000s with tools like CruiseControl and Hudson (which became Jenkins). Modern implementations use GitHub Actions, GitLab CI, CircleCI, or Jenkins. + +### Trade-off Analysis + +| Dimension | GitHub Actions (Chosen) | Jenkins | GitLab CI | +|---|---|---|---| +| Infrastructure cost | Free for public repos, generous free tier | Must host and maintain Jenkins server | Free for public repos | +| Setup complexity | Zero — lives in the repo | High — needs a server, plugins, configuration | Low if using GitLab.com | +| Plugin ecosystem | Growing (Actions marketplace) | Massive (1800+ plugins) | Built-in features | +| Enterprise adoption | High and growing | Very high (legacy and current) | High in European companies | +| Pipeline as code | YAML in .github/workflows/ | Jenkinsfile in repo root | .gitlab-ci.yml in repo root | +| Demo-ability | Excellent — visible in GitHub UI | Requires Jenkins server running | Requires GitLab instance | + +**Why both:** GitHub Actions for the actual pipeline (easy to demo, no infrastructure needed). 
Jenkinsfile in the repo to show we can work in enterprise environments. During the presentation, we show GitHub Actions running; we mention Jenkins as "the enterprise alternative I also wrote." + +### Pipeline Stages + +``` +Push to any branch + │ + ├── Lint (ruff for Python) + ├── Trivy Scan (container vulnerability scanning) + │ + └── If main branch: + ├── Build Docker Image + ├── Tag with Git SHA (never :latest) + ├── Push to Docker Hub + ├── Configure kubectl for EKS + └── Deploy to cluster (kubectl apply or helm upgrade) +``` + +### Security Scanning — Where Trivy Fits + +Trivy is an open-source vulnerability scanner by Aqua Security. It scans container images for known CVEs (Common Vulnerabilities and Exposures) in OS packages and application dependencies. In our pipeline, Trivy runs after the Docker image is built but before it's pushed to the registry. If Trivy finds a CRITICAL or HIGH severity CVE, the pipeline fails and the image never reaches the cluster. + +This is the same concept as Docker Content Trust from Docker Swarm — ensuring that only verified, safe images run in your cluster. Trivy is the scanning step; Docker Content Trust (or Cosign/Sigstore in Kubernetes) is the signing step. We implement scanning; we talk about signing. + +In industry, this is called "shift-left security" — catching security issues early in the development process rather than discovering them in production. Most companies run Trivy, Snyk, or Grype as a CI pipeline gate. + +### Jenkins Pipeline + +The Jenkinsfile mirrors the GitHub Actions workflow exactly. Same stages, same tools, different syntax. This demonstrates that the pipeline logic is tool-agnostic — the stages (lint, scan, build, push, deploy) are the same regardless of whether you're using GitHub Actions, Jenkins, GitLab CI, or CircleCI. 
+ +```groovy +// Jenkinsfile — same pipeline, different syntax +pipeline { + agent any + stages { + stage('Lint') { steps { sh 'ruff check src/' } } + stage('Scan') { steps { sh 'trivy image ...' } } + stage('Build') { steps { sh 'docker build ...' } } + stage('Push') { steps { sh 'docker push ...' } } + stage('Deploy') { steps { sh 'kubectl apply ...' } } + } +} +``` + +### Estimated Effort + +3-4 hours. The workflow files are straightforward; most time goes into configuring GitHub Secrets (Docker Hub credentials, AWS credentials, kubeconfig) and testing the pipeline end-to-end. + +--- + +## 5. Phase 3 — Security Hardening + +### What We're Building + +Four categories of security improvements applied to every Kubernetes deployment manifest. + +### 5a. Liveness and Readiness Probes + +**What they are:** Health checks that Kubernetes runs continuously to determine if a pod is alive (liveness) and ready to receive traffic (readiness). If a liveness probe fails, Kubernetes restarts the pod. If a readiness probe fails, Kubernetes stops sending traffic to that pod but doesn't restart it. + +**Why they matter:** Right now, Kubernetes has no way to know if our pods are actually healthy. It only knows they're running. If the Gateway loses its RabbitMQ connection, Kubernetes keeps routing traffic to it, and every upload silently fails. With probes, Kubernetes detects the failure and either restarts the pod or routes traffic to a healthy replica. + +**Where this concept comes from:** Health checks are a core Kubernetes primitive, inspired by process monitoring in traditional infrastructure (like systemd watchdog timers or Nagios checks). The distinction between liveness and readiness was introduced by Kubernetes to handle the common case where a service is alive but temporarily unable to serve (e.g., during startup or when a dependency is down). 
+ +**What we're adding:** + +| Service | Check Method | Probe Type | What It Checks | +|---|---|---|---| +| Auth | HTTP GET /healthz | Liveness + Readiness | Flask is responding, PostgreSQL is reachable | +| Gateway | HTTP GET /healthz | Liveness + Readiness | Flask is responding, MongoDB and RabbitMQ are reachable | +| Converter | Exec command | Liveness | Process is alive, RabbitMQ connection is active | +| Notification | Exec command | Liveness | Process is alive, RabbitMQ connection is active | + +This requires adding a small `/healthz` endpoint to the Flask services (auth and gateway) — about 10 lines of Python each. + +### 5b. Resource Requests and Limits + +**What they are:** CPU and memory boundaries set on each pod. Requests are the guaranteed minimum — Kubernetes uses these for scheduling decisions. Limits are the hard ceiling — if a pod exceeds its memory limit, it gets killed (OOMKilled). + +**Why they matter:** The converter service runs ffmpeg, which is CPU-intensive. Without limits, four converter replicas could consume all 2 vCPUs on our m7i-flex.large node, starving the gateway and auth services. Users would be able to upload files but never log in, because the auth service can't get CPU time to process JWT validation. + +**What we're setting:** + +| Service | CPU Request | CPU Limit | Memory Request | Memory Limit | Rationale | +|---|---|---|---|---|---| +| Auth | 50m | 200m | 64Mi | 128Mi | Lightweight Flask app, small queries | +| Gateway | 100m | 300m | 128Mi | 256Mi | HTTP handling + GridFS uploads | +| Converter | 250m | 500m | 256Mi | 512Mi | ffmpeg is CPU and memory hungry | +| Notification | 50m | 100m | 64Mi | 128Mi | Sends emails — minimal resources | + +Total request across all replicas: approximately 1.5 vCPU and 1.5GB RAM, which fits comfortably on a 2 vCPU / 8GB node. + +### 5c. Security Contexts (Runtime Hardening) + +**What they are:** Linux-level security constraints applied to the container process.
This is the direct Kubernetes equivalent of the Docker Swarm runtime hardening we learned in class. + +**Where this concept comes from:** The principle of least privilege — a container should have only the permissions it needs to do its job, nothing more. In Docker Swarm, we configured this through service spec options. In Kubernetes, the same concepts exist in the `securityContext` block of the pod spec. + +**What we're adding to every pod:** + +```yaml +securityContext: + runAsNonRoot: true # Container cannot run as root user + runAsUser: 1000 # Run as a non-privileged user + readOnlyRootFilesystem: true # Filesystem is read-only (prevents malware writing to disk) + allowPrivilegeEscalation: false # Cannot gain more privileges than it started with + capabilities: + drop: ["ALL"] # Drop all Linux capabilities (network raw, sys admin, etc.) +``` + +**Special case — Converter service:** The converter needs to write temporary files (the video input and MP3 output during conversion). We set `readOnlyRootFilesystem: true` but mount a writable `emptyDir` volume at `/tmp`. This means the converter can write temp files but cannot modify its own binaries, configuration, or any other part of the filesystem. If an attacker compromises the converter, they can write to /tmp but cannot install tools, modify the application, or persist across pod restarts. 
+ +**Mapping from Docker Swarm to Kubernetes:** + +| Swarm Concept | Kubernetes Equivalent | +|---|---| +| `--user` flag | `securityContext.runAsUser` | +| `--read-only` flag | `securityContext.readOnlyRootFilesystem` | +| `--cap-drop ALL` | `securityContext.capabilities.drop: ["ALL"]` | +| `--no-new-privileges` | `securityContext.allowPrivilegeEscalation: false` | +| mTLS between services | Requires a service mesh (Istio/Linkerd) — Talk About It, don't build | +| Rotating join tokens | Managed by EKS automatically — Talk About It | +| Certificate management | ACM for external certs, EKS manages internal — Talk About It | + +### 5d. .gitignore and Secrets Audit + +**What we're adding:** A comprehensive .gitignore that prevents credentials, state files, and generated artifacts from being committed. We're also auditing every file in the repo for hardcoded secrets and documenting which files contain sensitive values. + +**Files that must never be committed:** + +``` +# Terraform +terraform.tfvars +*.tfstate +*.tfstate.backup +.terraform/ + +# Kubernetes secrets (generated by customise.sh) +**/secret.yaml + +# Credentials and state +deployment-ids.txt +DEPLOYMENT_CONFIG.md +DEPLOYMENT_GUIDE.md +customise.sh + +# Build artifacts +*.mp3 +*.mp4 +node_modules/ +__pycache__/ +.env +``` + +### Estimated Effort + +2-3 hours for all four categories. Most of the work is YAML editing and adding small health endpoints to the Python services. + +--- + +## 6. Phase 4 — Monitoring and Observability + +### What We're Building + +A Prometheus + Grafana + Alertmanager monitoring stack deployed via the kube-prometheus-stack Helm chart, with one custom Grafana dashboard for the demo. + +### Why This Matters + +Right now, if the converter pods crash, if RabbitMQ fills up, if MongoDB runs out of disk — nobody knows until a user complains (or, more likely, until we notice during a demo that nothing is working). In industry, this is unacceptable for anything beyond a personal experiment. 
+ +Monitoring answers three questions: Is the system healthy right now? Was it healthy over the past hour/day/week? When did it stop being healthy, and what changed? + +### What the Industry Calls This + +Observability — the ability to understand the internal state of a system by examining its outputs. The "three pillars of observability" are metrics (numerical measurements over time), logs (structured event records), and traces (request paths across services). We're implementing metrics and dashboards. We'll discuss logs and traces in the presentation. + +### Trade-off Analysis + +| Dimension | kube-prometheus-stack (Chosen) | AWS CloudWatch | Datadog | +|---|---|---|---| +| Cost | Free (self-hosted) | Pay per metric/log/alarm | $15-23/host/month | +| Setup complexity | One Helm install | Requires CloudWatch agent, IAM roles | Agent install + SaaS config | +| Kubernetes integration | Native — built for K8s | Good but requires extra config | Excellent | +| Dashboard quality | Grafana — highly customisable | Basic but functional | Excellent out of the box | +| Industry relevance | Prometheus is the CNCF standard | Common in AWS-heavy shops | Common in well-funded startups | +| Demo impact | High — Grafana looks impressive | Medium | High but costs money | + +**Why kube-prometheus-stack:** One Helm install gives us Prometheus (metrics collection), Grafana (dashboards), Alertmanager (alerts), kube-state-metrics (Kubernetes object metrics), and node-exporter (host-level metrics). It's free, it's the CNCF standard, and Grafana dashboards look professional in a demo. + +### What We Get + +**Out of the box (no extra configuration):** CPU and memory usage per pod, per node, and cluster-wide. Pod restart counts and crash loop detection. Network I/O. Disk usage. Kubernetes object status (deployments, statefulsets, pods). 
+ +**Custom dashboard for the demo ("VidCast Operations"):** RabbitMQ queue depth (video queue and mp3 queue) — this is the most compelling visual during a demo. Pod status for all four microservices. Node resource utilisation. Converter processing rate (if we add custom metrics to the Python code). + +**Alerts:** + +| Alert | Condition | Severity | Why | +|---|---|---|---| +| Pod CrashLoopBackOff | Pod restarted 3+ times in 10 minutes | Critical | Service is broken | +| High Node Memory | Node memory > 85% for 5 minutes | Warning | Risk of OOMKill | +| RabbitMQ Queue Backlog | Video queue depth > 10 for 5 minutes | Warning | Conversions are backing up | +| RabbitMQ Unavailable | RabbitMQ pod not ready for 2 minutes | Critical | Entire pipeline is blocked | + +### Estimated Effort + +3-4 hours. The Helm install takes 5 minutes; building a good custom dashboard takes iteration. + +--- + +## 7. Phase 5 — Frontend Web Application + +### What We're Building + +A React web application that serves as the VidCast product interface. It communicates with the existing Gateway API and provides a visual way to interact with the platform during the demo. + +### Why This Matters + +Right now, the demo involves running curl commands in a terminal. This is fine for a technical audience, but for a bootcamp presentation where we need to explain the system to non-technical people, a visual interface makes the flow immediately understandable. The frontend also gives us a place to show the monitoring dashboard and the architecture diagram during the presentation. + +### Pages + +**Login Page:** Email and password form. Calls `/login` on the Gateway, stores the JWT in React state (not localStorage — that's not supported in artifacts/sandboxed environments, and it's a security consideration worth mentioning). Clean VidCast branding. + +**Upload Page:** Drag-and-drop file upload. Sends the video to `/upload` with the JWT. Shows a success confirmation: "Your file is being processed. 
You'll receive an email when it's ready." + +**Download Page:** Text input for the file ID (from the email notification). Calls `/download` with the JWT and file ID. Triggers a browser download of the MP3. + +**Dashboard Page:** Embedded Grafana panels showing RabbitMQ queue depth and pod health, or a simplified custom view. This is the "behind the scenes" view for the presentation. + +**Architecture Page:** An interactive system diagram showing the microservices and data flow. During the demo, this helps explain what happens when you upload a file — "the request hits the Gateway here, then the video goes into the queue here, then a converter worker picks it up here..." + +### Deployment + +The frontend gets its own Dockerfile (Node.js, nginx to serve the built React app), its own Kubernetes Deployment and Service (NodePort or Ingress), and its own entry in the CI/CD pipeline. It becomes the fifth microservice in the cluster. + +### Trade-off Analysis + +| Dimension | React SPA (Chosen) | Plain HTML/CSS/JS | Next.js | +|---|---|---|---| +| Complexity | Moderate | Low | High | +| State management | React hooks (useState) | Manual DOM manipulation | React + SSR complexity | +| Component reuse | Excellent | Poor | Excellent | +| Build step required | Yes (npm build) | No | Yes | +| Team familiarity | Depends | Everyone knows HTML | Fewer people know Next.js | +| Demo appearance | Professional | Can look professional | Professional | + +**Why React:** Component-based architecture makes the dashboard and architecture views easier to build. Tailwind CSS keeps styling consistent without custom CSS. The built app is served as static files by nginx, so it's lightweight and fast. + +### Estimated Effort + +6-8 hours. This is the most visible piece but not the most complex — the backend already works, so the frontend is mostly API calls and UI design. + +--- + +## 8. 
Phase 6 — Documentation and Presentation + +### What We're Producing + +An updated README.md that explains the project from the perspective of someone finding it on GitHub — what it does, how to deploy it, how to destroy it. Architecture diagrams. Presentation notes with talking points and analogies for non-technical audiences. + +### Analogies for Non-Technical Audiences + +**Microservices → Restaurant:** A monolith is one chef doing everything. Microservices are specialised roles: host, cook, runner, cashier. Each can be scaled independently. + +**Message Queue → Post Office:** You don't wait at the counter for your letter to be delivered. You drop it off, and the postal workers process it on their own schedule. + +**JWT Authentication → Security Badge:** You show your ID at reception once (login), get a badge (token), and swipe it for access to different rooms (upload, download) without going back to reception. + +**Containers → Shipping Containers:** Standardised boxes that work the same everywhere — your laptop, a data centre, the cloud. + +**Kubernetes → Port Authority:** Manages where containers go, replaces ones that fall off the ship, and adds more when demand increases. + +**Infrastructure as Code → Building Blueprints:** Instead of telling builders "make it like the last one," you hand them exact blueprints. Anyone can build the same building from the same plans. + +**CI/CD Pipeline → Factory Assembly Line:** Raw materials (code) go in one end, pass through quality checks, and a finished product (deployed application) comes out the other end. Every step is automated and inspected. + +--- + +## 9. Things We Talk About But Don't Build + +These are concepts we understand and can discuss in the presentation or interviews, but we're not implementing them in this project. For each one, the reason for not building it is included. + +### ArgoCD / GitOps + +**What it is:** A deployment model where Git is the single source of truth. 
Instead of running `kubectl apply` from a pipeline, ArgoCD watches the Git repo and automatically syncs the cluster state to match what's in Git. If someone manually changes something in the cluster, ArgoCD detects the drift and reverts it. + +**Why we're not building it:** ArgoCD adds significant operational complexity (it needs its own deployment, RBAC, and repository credentials). For a single-developer project, the CI/CD pipeline with `kubectl apply` achieves the same outcome. ArgoCD shines in multi-team environments where drift detection and audit trails matter. + +**What to say in an interview:** "For a single-developer project, I used direct deployment from the CI/CD pipeline. In a team environment, I'd introduce ArgoCD for drift detection and to enforce that all changes go through Git." + +### KEDA / Queue-Based Autoscaling + +**What it is:** Kubernetes Event-Driven Autoscaling. Instead of scaling based on CPU (which HPA does), KEDA scales based on external metrics — in our case, RabbitMQ queue depth. If 50 videos are in the queue, KEDA would scale the converter from 4 replicas to 20. When the queue drains, it scales back down. + +**Why we're not building it:** Our demo processes one video at a time. KEDA is impressive but meaningless without a load-testing scenario to demonstrate it. Implementing it without a visible demo adds complexity without presentation value. + +**What to say in an interview:** "The converter service would benefit from queue-based autoscaling with KEDA. Instead of a fixed 4 replicas, KEDA would watch the RabbitMQ queue depth and scale converter workers dynamically. This means we pay for compute only when there's work to do." + +### Service Mesh / mTLS + +**What it is:** A service mesh (Istio, Linkerd) adds a sidecar proxy to every pod that handles service-to-service communication. This enables mutual TLS (mTLS) — every connection between services is encrypted and both sides verify each other's identity. 
In Docker Swarm, mTLS is built in. In Kubernetes, it requires a service mesh. + +**Why we're not building it:** Installing Istio would triple the resource consumption on our single node and add significant operational complexity. For a four-service demo with no sensitive data, it's overkill. + +**What to say in an interview:** "In production, I'd add a service mesh like Istio or Linkerd for mTLS between services. Even if an attacker gets inside the cluster network, they can't intercept or modify traffic between the gateway and auth service. The same encryption that Docker Swarm provides built-in requires a service mesh in Kubernetes." + +### Managed Database Services (RDS, DocumentDB, Amazon MQ) + +**What it is:** Instead of running MongoDB, PostgreSQL, and RabbitMQ as containers in the cluster, use AWS managed services: RDS for PostgreSQL, DocumentDB or MongoDB Atlas for MongoDB, and Amazon MQ for RabbitMQ. AWS handles backups, patching, replication, and failover. + +**Why we're not building it:** Managed services cost $200-400/month for a project we run for demos. They also remove the Kubernetes operational experience (running StatefulSets, Helm charts) that makes the project valuable. The in-cluster approach demonstrates more skills. + +**What to say in an interview:** "In production, I'd migrate PostgreSQL to RDS and RabbitMQ to Amazon MQ. Managed services handle backups, patching, and replication — operational burden the platform team shouldn't own. I kept them as StatefulSets in this project to demonstrate Kubernetes data service management." + +### External Secrets Operator / AWS Secrets Manager + +**What it is:** Instead of storing secrets in Kubernetes Secret objects (which are just base64-encoded, not encrypted), store them in AWS Secrets Manager and use the External Secrets Operator to sync them into the cluster at runtime. 
+ +**Why we might not build it:** It requires an OIDC provider configured on the EKS cluster and IRSA (IAM Roles for Service Accounts). This is achievable but adds 2-3 hours of work. If time permits, we'll add it. If not, we document the approach and explain it. + +**What to say in an interview:** "Credentials are currently in Kubernetes Secrets, which are base64-encoded but not encrypted at rest unless you enable EKS envelope encryption. In production, I'd use AWS Secrets Manager with the External Secrets Operator. Secrets are stored in Secrets Manager, retrieved at runtime via IRSA, and never exist in Git." + +### Network Policies + +**What it is:** Kubernetes NetworkPolicy resources that restrict which pods can communicate with each other. By default, every pod in a Kubernetes cluster can talk to every other pod. Network Policies implement the principle of least privilege at the network level. + +**Why we should try to build it (stretch goal):** It's a 20-minute task that demonstrates security awareness. The auth service should only accept traffic from the gateway. MongoDB should only accept traffic from the gateway and converter. + +**What to say in an interview:** "I implemented Network Policies to restrict east-west traffic. The auth service only accepts connections from the gateway — even if an attacker compromises the converter, they can't directly access the auth database." + +--- + +## 10. 
Repository Structure + +``` +vidcast/ (repo root) +│ +├── README.md # Public-facing: what, why, how to deploy, how to destroy +├── VIDCAST_UPGRADE_PLAN.md # This document +├── .gitignore # Comprehensive — secrets, state, artifacts +├── Jenkinsfile # Enterprise CI/CD alternative +│ +├── .github/ +│ └── workflows/ +│ ├── ci.yml # Lint + scan + build + push +│ └── cd.yml # Deploy to EKS +│ +├── terraform/ +│ ├── environments/ +│ │ └── dev/ +│ │ ├── main.tf +│ │ ├── variables.tf +│ │ ├── outputs.tf +│ │ ├── backend.tf # S3 + DynamoDB state config +│ │ └── terraform.tfvars # GITIGNORED — actual values +│ └── modules/ +│ ├── vpc/ +│ ├── eks/ +│ ├── iam/ +│ └── security-groups/ +│ +├── Helm_charts/ # Existing — unchanged +│ ├── MongoDB/ +│ ├── Postgres/ +│ └── RabbitMQ/ +│ +├── src/ +│ ├── auth-service/ # Existing + health endpoint + security context +│ ├── gateway-service/ # Existing + health endpoint + security context +│ ├── converter-service/ # Existing + security context + resource limits +│ ├── notification-service/ # Existing + security context +│ └── frontend/ # NEW — React web application +│ ├── Dockerfile +│ ├── nginx.conf +│ ├── package.json +│ ├── src/ +│ │ ├── App.jsx +│ │ ├── pages/ +│ │ │ ├── Login.jsx +│ │ │ ├── Upload.jsx +│ │ │ ├── Download.jsx +│ │ │ ├── Dashboard.jsx +│ │ │ └── Architecture.jsx +│ │ └── components/ +│ └── manifest/ +│ ├── deployment.yaml +│ ├── service.yaml +│ └── configmap.yaml +│ +├── monitoring/ +│ ├── values.yaml # Custom values for kube-prometheus-stack +│ ├── dashboards/ +│ │ └── vidcast-operations.json # Custom Grafana dashboard +│ └── alerts/ +│ └── vidcast-alerts.yaml # Custom alert rules +│ +├── docs/ +│ ├── architecture.md +│ ├── deployment-guide.md +│ └── presentation-notes.md +│ +└── assets/ + └── video.mp4 # Test video +``` + +--- + +## 11. 
Branch Strategy + +``` +main ← current working state (base project) + │ + ├── feature/terraform-infra ← Phase 1: all Terraform code + ├── feature/ci-cd-pipeline ← Phase 2: GitHub Actions + Jenkinsfile + ├── feature/security-harden ← Phase 3: probes, limits, security contexts, .gitignore + ├── feature/monitoring ← Phase 4: kube-prometheus-stack + dashboard + ├── feature/frontend ← Phase 5: React web application + └── feature/documentation ← Phase 6: README, arch docs, presentation notes +``` + +Each branch is merged to main via a Pull Request when complete and tested. This gives us a clean Git history where each PR represents a meaningful improvement. The PR descriptions become talking points: "Here's the PR where I added infrastructure as code. Here's where I introduced container security scanning." + +**Rules:** +- Never push directly to main. Always use a feature branch and PR. +- Each PR should have a description explaining what changed and why. +- Merge in order: Phase 1 → 2 → 3 → 4 → 5 → 6 (though 2 and 3 can be parallel). + +--- + +## 12. Cost Breakdown + +| Component | Monthly Cost | Notes | +|---|---|---| +| EKS cluster | ~$73 | $0.10/hour for the control plane | +| EC2 node (m7i-flex.large) | ~$70 on-demand | Could reduce with Spot (~$25) but not for a demo | +| EBS storage (30GB gp3) | ~$2.40 | Root volume for the node | +| S3 (Terraform state) | <$0.10 | A few KB of state files | +| DynamoDB (state lock) | <$0.10 | On-demand pricing, minimal usage | +| Data transfer | ~$5 | Minimal for a demo | +| Docker Hub | Free | Public repos, free tier | +| **Total (running 24/7)** | **~$150/month** | | +| **Total (8 hours/day, weekdays only)** | **~$40/month** | Requires destroying the whole cluster (`terraform destroy`) outside working hours — removing only the node group still leaves the ~$73/month control plane running | + +**Cost-saving tip:** The EC2 node is roughly half of the bill. If you're not actively using the cluster, delete the node group (`aws eks delete-nodegroup`) and recreate it when you need it.
The EKS control plane still costs $73/month even with no nodes, so for extended breaks, destroy the whole cluster and recreate it from Terraform. + +--- + +## 13. Real-World Use Cases + +This architecture pattern — API gateway, async processing queue, worker services, notification — is used everywhere in industry. Here are concrete examples to reference during the presentation: + +**Media processing (YouTube, TikTok, Spotify):** When you upload a video, it goes through a processing pipeline: transcoding to multiple resolutions, thumbnail generation, audio extraction for captions, content moderation. Each step is a separate service consuming from a queue. Our project does the same thing at a smaller scale. + +**E-commerce order processing (Amazon, ASOS):** When you place an order, separate services handle payment, inventory, warehouse notification, shipping labels, and confirmation email. The queue absorbs traffic spikes (Black Friday) without dropping orders. + +**Banking document processing:** Mortgage applications, bank statements, and identity documents go through OCR, data extraction, fraud checks, and compliance verification — each as a separate service. + +**Healthcare imaging:** MRI and X-ray images are uploaded, converted to standard formats, analysed by AI, stored in archives, and the referring doctor is notified. Upload, queue, process, store, notify — same pattern. + +--- + +## 14. Presentation Strategy + +### Flow (12-15 minutes) + +**Open with the product (2 min):** "This is VidCast — a platform that converts video recordings into podcast-ready audio." Demo the upload through the web interface. Everyone understands what the system does. + +**Explain the architecture (3 min):** Switch to the architecture view. Use the restaurant analogy for microservices, the post office analogy for queues. Walk through the data flow. + +**Show the platform engineering (5 min):** Show Terraform creating infrastructure. Show the CI/CD pipeline deploying a change. 
Show the Grafana dashboard. Show the security contexts. Explain each in terms the audience can follow. + +**Talk about what you'd do next (2 min):** Managed databases, service mesh, KEDA, GitOps. Shows you see beyond what you built. + +**Close with real-world connection (1 min):** "This is the same pattern used by YouTube, Spotify, and every media processing platform. The scale is different, but the principles are identical." + +### Teaching Tips + +- Start with the problem, not the technology. +- One analogy per concept. Don't stack metaphors. +- If you're about to say a technical term, explain it immediately: "RabbitMQ — that's our post office sorting room — was showing a backlog." +- Show, don't tell. A live demo is worth ten slides. +- End each section with "and this is why it matters" before moving on. diff --git a/customise.sh b/customise.sh new file mode 100755 index 0000000..4ff6195 --- /dev/null +++ b/customise.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# ============================================================================= +# customise.sh — point VidCast at YOUR identity (Docker Hub / AWS / GitHub) +# ============================================================================= +# Run this ONCE after forking, BEFORE ./deploy.sh. It rewrites the *identity* +# values in the repo's GitOps config so the cluster pulls YOUR images and Argo CD / +# AWS / Kyverno trust YOUR GitHub repo. It does NOT write any secret to a file — +# database passwords, the JWT secret, the Gmail password, and the admin's bcrypt +# hash are all handled at install time by deploy.sh (via `--set`, Parameter Store, +# and an in-database pgcrypto hash respectively). +# +# It does not hard-code anyone's values: it AUTO-DETECTS whatever identity is +# currently in the repo and replaces it with yours (from the env vars below). 
+# +# ── HOW TO GET EACH VALUE ──────────────────────────────────────────────────── +# DOCKER_HUB_USER Your Docker Hub username — sign up free at hub.docker.com. +# The cluster pulls <user>/auth-service:<tag> etc. +# AWS_ACCOUNT_ID Run: aws sts get-caller-identity --query Account --output text +# GITHUB_ORG Your GitHub username/org that owns the fork (github.com/<org>/<repo>). +# GITHUB_REPO Your fork's repository name. +# AWS_REGION Your AWS region (default eu-west-2). Use one that allows +# non-T-type EKS nodes. +# CLUSTER_NAME Any name WITHOUT underscores (EKS rejects them); e.g. vidcast-cluster. +# ECR_REPO_NAME Name for the frontend image's ECR repo; e.g. vidcast-frontend. +# +# Anything left unset keeps the current value (a no-op for that field). +# +# USAGE: +# export DOCKER_HUB_USER=... AWS_ACCOUNT_ID=... GITHUB_ORG=... GITHUB_REPO=... +# ./customise.sh +# (Secrets for deploy.sh — POSTGRES_PASSWORD, MONGODB_PASSWORD, RABBITMQ_PASSWORD, +# JWT_SECRET, GMAIL_ADDRESS, GMAIL_APP_PASSWORD, APP_LOGIN_EMAIL, APP_LOGIN_PASSWORD — +# are NOT used here; set them in your shell before running ./deploy.sh.)
+# ============================================================================= +set -euo pipefail +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$REPO_ROOT" + +green=$'\e[32m'; yellow=$'\e[33m'; red=$'\e[31m'; reset=$'\e[0m' +upd() { echo " ${green}✓${reset} $*"; } +note() { echo " ${yellow}!${reset} $*"; } + +# ── Auto-detect the identity currently in the repo (no hard-coded values) ──── +DEV_OVERLAY="k8s/overlays/dev/kustomization.yaml" +ARGO_APP="k8s/argocd/application-dev.yaml" + +CUR_DOCKER_USER="$(grep -oE '[a-z0-9._-]+/auth-service' "$DEV_OVERLAY" 2>/dev/null | head -1 | cut -d/ -f1 || true)" +CUR_ACCOUNT_ID="$(grep -oE '[0-9]{12}' "$DEV_OVERLAY" 2>/dev/null | head -1 || true)" +CUR_ORG_REPO="$(grep -oE 'github\.com/[^/]+/[^/.]+' "$ARGO_APP" 2>/dev/null | head -1 | sed 's#github.com/##' || true)" +CUR_GITHUB_ORG="${CUR_ORG_REPO%%/*}" +CUR_GITHUB_REPO="${CUR_ORG_REPO##*/}" +CUR_REGION="$(grep -oE 'dkr\.ecr\.[a-z0-9-]+\.amazonaws' "$DEV_OVERLAY" 2>/dev/null | head -1 | sed -E 's/dkr\.ecr\.([a-z0-9-]+)\.amazonaws/\1/' || true)" +CUR_ECR_REPO="$(grep -oE 'amazonaws\.com/[a-z0-9-]+' "$DEV_OVERLAY" 2>/dev/null | head -1 | sed 's#amazonaws.com/##' || true)" +CUR_CLUSTER="$(grep -oE 'cluster_name[[:space:]]*=[[:space:]]*"[^"]+"' terraform/environments/dev/terraform.tfvars 2>/dev/null | sed -E 's/.*"([^"]+)".*/\1/' || true)" +: "${CUR_REGION:=eu-west-2}"; : "${CUR_CLUSTER:=vidcast-cluster}"; : "${CUR_ECR_REPO:=vidcast-frontend}" + +# ── New values from env (default to current = no-op if unset) ──────────────── +NEW_DOCKER_USER="${DOCKER_HUB_USER:-$CUR_DOCKER_USER}" +NEW_ACCOUNT_ID="${AWS_ACCOUNT_ID:-$CUR_ACCOUNT_ID}" +NEW_GITHUB_ORG="${GITHUB_ORG:-$CUR_GITHUB_ORG}" +NEW_GITHUB_REPO="${GITHUB_REPO:-$CUR_GITHUB_REPO}" +NEW_REGION="${AWS_REGION:-$CUR_REGION}" +NEW_CLUSTER="${CLUSTER_NAME:-$CUR_CLUSTER}" +NEW_ECR_REPO="${ECR_REPO_NAME:-$CUR_ECR_REPO}" + +echo "===== customise.sh — repointing identity to yours =====" +echo " Docker Hub : 
${CUR_DOCKER_USER:-?} -> $NEW_DOCKER_USER"
echo " AWS acct : ${CUR_ACCOUNT_ID:-?} -> $NEW_ACCOUNT_ID"
echo " GitHub : ${CUR_GITHUB_ORG:-?}/${CUR_GITHUB_REPO:-?} -> $NEW_GITHUB_ORG/$NEW_GITHUB_REPO"
echo " Region : ${CUR_REGION} -> $NEW_REGION Cluster: ${CUR_CLUSTER} -> $NEW_CLUSTER ECR: ${CUR_ECR_REPO} -> $NEW_ECR_REPO"
echo

# repl FILE FROM TO — replace every LITERAL occurrence of FROM with TO in FILE.
#   $1 = file to edit in place
#   $2 = text to find (treated as a fixed string, not a regex)
#   $3 = replacement text
# No-op (returns 0) if the file is missing, FROM is empty, or FROM equals TO.
repl() {
  [ -f "$1" ] || return 0
  [ -n "$2" ] || return 0
  [ "$2" = "$3" ] && return 0
  # Escape regex metacharacters in FROM so '.' in hostnames/repo names matches
  # only a dot, and escape '\', '&' and the '|' delimiter in TO so the
  # replacement is inserted verbatim.
  _repl_from="$(printf '%s' "$2" | sed -e 's/[]\|$*.^[]/\\&/g')"
  _repl_to="$(printf '%s' "$3" | sed -e 's/[\|&]/\\&/g')"
  sed -i "s|${_repl_from}|${_repl_to}|g" "$1"
}

# ── 1. Kustomize overlays — backend image names + frontend ECR ref ───────────
for ov in dev prod; do
  F="k8s/overlays/$ov/kustomization.yaml"
  repl "$F" "$CUR_DOCKER_USER/" "$NEW_DOCKER_USER/"
  repl "$F" "$CUR_ACCOUNT_ID.dkr.ecr.$CUR_REGION.amazonaws.com/$CUR_ECR_REPO" \
    "$NEW_ACCOUNT_ID.dkr.ecr.$NEW_REGION.amazonaws.com/$NEW_ECR_REPO"
  [ -f "$F" ] && upd "overlay $ov: image names + ECR ref"
done

# ── 2. Terraform variables (identity AWS trusts + builds) ────────────────────
F="terraform/environments/dev/terraform.tfvars"
if [ -f "$F" ]; then
  # Values are matched including their surrounding quotes so only whole
  # tfvars string values are rewritten.
  repl "$F" "\"$CUR_GITHUB_ORG\"" "\"$NEW_GITHUB_ORG\""
  repl "$F" "\"$CUR_GITHUB_REPO\"" "\"$NEW_GITHUB_REPO\""
  repl "$F" "\"$CUR_CLUSTER\"" "\"$NEW_CLUSTER\""
  repl "$F" "\"$CUR_REGION\"" "\"$NEW_REGION\""
  upd "terraform.tfvars: github_org/repo, cluster, region"
else
  note "terraform.tfvars not found (gitignored) — set github_org/github_repo/cluster_name/aws_region yourself."
fi

# ── 3. Argo CD Applications — the source repo Argo pulls from ─────────────────
for app in dev prod; do
  F="k8s/argocd/application-$app.yaml"
  repl "$F" "github.com/$CUR_GITHUB_ORG/$CUR_GITHUB_REPO" "github.com/$NEW_GITHUB_ORG/$NEW_GITHUB_REPO"
  [ -f "$F" ] && upd "argocd application-$app: repoURL"
done

# ── 4. Kyverno verify-images — the keyless cosign signer identity (B5) ───────
F="k8s/kyverno/verify-images.yaml"
repl "$F" "github.com/$CUR_GITHUB_ORG/$CUR_GITHUB_REPO/" "github.com/$NEW_GITHUB_ORG/$NEW_GITHUB_REPO/"
[ -f "$F" ] && upd "kyverno verify-images: cosign subject identity"

# ── Validation ───────────────────────────────────────────────────────────────
echo
echo "===== Validation ====="
if [ "$NEW_DOCKER_USER" = "$CUR_DOCKER_USER" ] && [ "$NEW_GITHUB_ORG" = "$CUR_GITHUB_ORG" ] \
  && [ "$NEW_GITHUB_REPO" = "$CUR_GITHUB_REPO" ] && [ "$NEW_ACCOUNT_ID" = "$CUR_ACCOUNT_ID" ]; then
  note "No identity env vars set — nothing changed. Set DOCKER_HUB_USER / AWS_ACCOUNT_ID / GITHUB_ORG / GITHUB_REPO and re-run."
else
  # Build a newline-separated list of fixed-string patterns, one per OLD value
  # that was actually meant to change. Empty or unchanged values are skipped:
  # an empty pattern would match every line, and an unchanged value is not stale.
  nl='
'
  stale_patterns=""
  if [ -n "$CUR_DOCKER_USER" ] && [ "$CUR_DOCKER_USER" != "$NEW_DOCKER_USER" ]; then
    stale_patterns="$CUR_DOCKER_USER/"
  fi
  if [ -n "$CUR_ACCOUNT_ID" ] && [ "$CUR_ACCOUNT_ID" != "$NEW_ACCOUNT_ID" ]; then
    stale_patterns="${stale_patterns:+$stale_patterns$nl}$CUR_ACCOUNT_ID"
  fi
  if [ -n "$CUR_GITHUB_ORG" ] && [ -n "$CUR_GITHUB_REPO" ]; then
    if [ "$CUR_GITHUB_ORG" != "$NEW_GITHUB_ORG" ] || [ "$CUR_GITHUB_REPO" != "$NEW_GITHUB_REPO" ]; then
      stale_patterns="${stale_patterns:+$stale_patterns$nl}github.com/$CUR_GITHUB_ORG/$CUR_GITHUB_REPO"
    fi
  fi
  LEFT=""
  if [ -n "$stale_patterns" ]; then
    # -F: patterns are literal strings; grep exits 1 on "no match", hence || true.
    LEFT="$(grep -rnF -e "$stale_patterns" \
      k8s/overlays k8s/argocd k8s/kyverno terraform/environments/dev/terraform.tfvars 2>/dev/null || true)"
  fi
  if [ -n "$LEFT" ]; then
    note "Some old identity values remain (review):"
    echo "$LEFT" | sed 's/^/ /'
  else
    upd "no old identity values remain in the GitOps config"
  fi
fi
echo
echo "Next: set your secrets in the shell, then run ./deploy.sh (see DEPLOYMENT_GUIDE.md §A.2)."
echo "===== customise.sh complete ====="
diff --git a/deploy.sh b/deploy.sh
new file mode 100755
index 0000000..959962f
--- /dev/null
+++ b/deploy.sh
@@ -0,0 +1,406 @@
#!/usr/bin/env bash
# =============================================================================
# deploy.sh — VidCast automated bring-up
# =============================================================================
# Takes the cluster from "Terraform applied, node Ready" to "everything live and
# verified". This automates §3–§8 of DEPLOYMENT_GUIDE.md so you don't have to
# copy-paste the runbook. (§0–§2 — AWS prerequisites + `terraform apply` — are
# still run by hand because they create account-level infrastructure.)
#
# WHAT IT DOES, IN ORDER (each step waits for readiness before the next):
# 1. Validate prerequisites (cluster reachable, tools present, env vars set)
# 2. Datastores via Helm: MongoDB -> PostgreSQL -> RabbitMQ
# 3. PostgreSQL init.sql (RBAC schema + bcrypt admin seed)
# 4. Seed AWS Parameter Store (the 7 SecureString secrets)
# 5. External Secrets Operator + the 4 ExternalSecrets (pull secrets into the cluster)
# 6. App workloads (kubectl apply -k <overlay>)
# 7. KEDA (converter scale-to-zero) + gateway HPA + metrics-server
# 8. Argo CD (GitOps; dev auto-sync / prod manual gate)
# 9. Kyverno (policy-as-code, all Audit)
# 10. Monitoring (kube-prometheus-stack + scrape configs + alerts + SLO rules + dashboards)
# 11. Kubecost (FinOps; pinned to a stable chart)
# 12. NetworkPolicies (allows first, default-deny LAST)
# 13. Smoke test + print access URLs
#
# IDEMPOTENT: uses `helm upgrade --install` and `kubectl apply`, so re-running is
# safe and just reconciles to the desired state.
#
# USAGE:
# ./deploy.sh # bring up (reads config from env vars / DEPLOYMENT_CONFIG)
# ./deploy.sh --teardown # terraform destroy + confirm zero spend
# ./deploy.sh --help
#
# CONFIG (env vars; required ones are validated up front):
# POSTGRES_USERNAME POSTGRES_PASSWORD
# MONGODB_USERNAME MONGODB_PASSWORD
# RABBITMQ_PASSWORD (RABBITMQ_USERNAME optional, default 'rabbituser')
# JWT_SECRET
# GMAIL_ADDRESS GMAIL_APP_PASSWORD
# APP_LOGIN_EMAIL APP_LOGIN_PASSWORD (used to seed the admin login + the login smoke test)
# DOCKER_HUB_USER (informational; image names live in the overlay)
# ENVIRONMENT (dev|prod, default: dev)
# AWS_REGION (default: eu-west-2)
# NODE_IP (optional — auto-detected from the node's ExternalIP)
#
# Secrets are NOT stored in any tracked file: DB passwords are injected into the
# Helm charts here via `--set` (the chart values hold CHANGEME placeholders), the
# admin's bcrypt hash is generated in-DB, and JWT/Gmail go to Parameter Store.
# Run ./customise.sh first to set identity (Docker Hub user / AWS account / GitHub
# repo), then source your config into the shell and run this.
# =============================================================================

set -euo pipefail

# ── Locate the repo root (so the script works from anywhere) ─────────────────
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$REPO_ROOT"

# ── Defaults ─────────────────────────────────────────────────────────────────
ENVIRONMENT="${ENVIRONMENT:-dev}"
AWS_REGION="${AWS_REGION:-eu-west-2}"
OVERLAY="k8s/overlays/${ENVIRONMENT}"
KUBECOST_CHART_VERSION="2.8.6" # 2.9.x is a broken transitional chart — pin stable

# ── Pretty output helpers ────────────────────────────────────────────────────
c_reset=$'\e[0m'; c_blue=$'\e[34m'; c_green=$'\e[32m'; c_yellow=$'\e[33m'; c_red=$'\e[31m'; c_bold=$'\e[1m'
step() { echo; echo "${c_blue}${c_bold}▶ $*${c_reset}"; }   # section heading
ok()   { echo " ${c_green}✓${c_reset} $*"; }                 # success line
warn() { echo " ${c_yellow}!${c_reset} $*"; }                # non-fatal problem
die()  { echo "${c_red}${c_bold}✗ $*${c_reset}" >&2; exit 1; } # fatal: message to stderr, exit 1

# usage — print the leading comment header of this script as help text.
# Reads from line 2 (after the shebang) up to the first non-comment line, so the
# help can never be truncated when the header grows or shrinks.
usage() {
  awk 'NR == 1 { next } !/^#/ { exit } { sub(/^# ?/, ""); print }' "$0"
}

# =============================================================================
# TEARDOWN (./deploy.sh --teardown)
# =============================================================================
# Destroys the Terraform-managed infrastructure after an explicit typed
# confirmation, then checks that no EKS clusters are listed in $AWS_REGION.
# Always exits the script (status 0 on completion, 1 if not confirmed).
teardown() {
  local confirm=""
  step "TEARDOWN — destroying all AWS infrastructure (this stops the billing)"
  warn "This runs 'terraform destroy'. The EKS cluster, node, VPC, etc. are deleted."
  # On EOF (no TTY / closed stdin) read fails; treat that as "not confirmed"
  # instead of letting `set -e` kill the script without a message.
  read -r -p " Type 'destroy' to confirm: " confirm || confirm=""
  [ "$confirm" = "destroy" ] || die "Aborted (you did not type 'destroy')."
  ( cd terraform/environments/dev && terraform destroy -auto-approve )
  step "Verifying zero spend"
  if [ "$(aws eks list-clusters --region "$AWS_REGION" --query 'length(clusters)' --output text 2>/dev/null || echo '?')" = "0" ]; then
    ok "No EKS clusters remain — standing cost is now ~\$0."
  else
    warn "EKS clusters still listed — check 'aws eks list-clusters --region $AWS_REGION'."
  fi
  echo
  echo "Preserved (free to keep; makes the next bring-up one command):"
  echo " • S3 Terraform state bucket + DynamoDB lock table"
  echo " • terraform.tfvars, .terraform.lock.hcl"
  echo " • Parameter Store SecureStrings, frontend ECR images"
  exit 0
}

# ── Argument dispatch ────────────────────────────────────────────────────────
case "${1:-}" in
  --help|-h) usage; exit 0 ;;
  --teardown) teardown ;;
esac

# =============================================================================
# STEP 1 — VALIDATE PREREQUISITES (fail early, with a clear list)
# =============================================================================
step "1/13 Validating prerequisites"

# 1a. tools on PATH
for t in kubectl helm aws psql; do
  command -v "$t" >/dev/null 2>&1 || die "Required tool not found on PATH: $t (see DEPLOYMENT_GUIDE.md §A.1)"
done
ok "tools present: kubectl, helm, aws, psql"

# 1b. cluster reachable
kubectl cluster-info >/dev/null 2>&1 || die "kubectl cannot reach a cluster. Run: aws eks update-kubeconfig --name <cluster-name> --region $AWS_REGION"
# awk consumes the whole listing (no early exit), so `pipefail` cannot trip on a
# SIGPIPE'd kubectl; a node counts only when its STATUS column is exactly Ready.
if ! kubectl get nodes --no-headers 2>/dev/null | awk '$2 == "Ready" { ready = 1 } END { exit !ready }'; then
  die "No node is Ready yet. Wait for the node group, then re-run. (kubectl get nodes)"
fi
ok "cluster reachable; at least one node Ready"

# RabbitMQ creds (the chart provisions the broker with these; the app reads them
# from the chart-created rabbitmq-secret). Username defaults; password is required.
RABBITMQ_USERNAME="${RABBITMQ_USERNAME:-rabbituser}"

# 1c. required env vars (collect ALL missing, then fail once with the full list).
# NOTE: for an EXISTING cluster these must match the passwords the databases were
# first created with — Mongo/Postgres set the root password at init only, so a
# changed value would leave the app unable to authenticate. For a fresh cluster,
# any strong values work (e.g. `openssl rand -base64 24`).
REQUIRED=(POSTGRES_USERNAME POSTGRES_PASSWORD MONGODB_USERNAME MONGODB_PASSWORD RABBITMQ_PASSWORD JWT_SECRET GMAIL_ADDRESS GMAIL_APP_PASSWORD)
missing=()
# ${!v:-} is indirect expansion: the value of the variable whose NAME is in $v
# (empty if unset, so `set -u` does not trip).
for v in "${REQUIRED[@]}"; do [ -n "${!v:-}" ] || missing+=("$v"); done
if [ "${#missing[@]}" -gt 0 ]; then
  echo "${c_red} Missing required environment variables:${c_reset}"
  for m in "${missing[@]}"; do echo " - $m"; done
  echo " Set them (e.g. source the values from DEPLOYMENT_CONFIG.md) and re-run."
  die "Cannot continue without the secrets above."
fi
ok "all required secrets are set"

# 1d. auto-detect NODE_IP if not provided.
# Detection is best-effort: `|| true` keeps a failing kubectl from aborting the
# script under `set -e` (an assignment inherits the exit status of its command
# substitution). All nodes are inspected and the first ExternalIP found is used,
# so a first node without a public address does not hide a later one.
if [ -z "${NODE_IP:-}" ]; then
  node_ips="$(kubectl get nodes -o jsonpath='{.items[*].status.addresses[?(@.type=="ExternalIP")].address}' 2>/dev/null || true)"
  NODE_IP="${node_ips%% *}"
fi
if [ -n "$NODE_IP" ]; then
  ok "NODE_IP = $NODE_IP"
else
  warn "could not auto-detect NODE_IP (NodePort smoke tests will be skipped)"
fi

# 1e. the overlay must exist — otherwise a mistyped ENVIRONMENT would only fail at
# step 6, after datastores were installed and Parameter Store was seeded under
# the wrong path.
[ -d "$OVERLAY" ] || die "Overlay directory not found: $OVERLAY (ENVIRONMENT must be dev or prod)"

ok "environment = ${ENVIRONMENT} region = ${AWS_REGION} overlay = ${OVERLAY}"

# wait_rollout KIND/NAME [NAMESPACE] [TIMEOUT]
# Waits for a rollout to finish. Never fails the script: a timeout or a
# not-yet-existing resource only produces a warning (always returns 0).
#   $1 = kind/name (e.g. deployment/gateway)
#   $2 = namespace (default: default)
#   $3 = timeout   (default: 180s)
wait_rollout() {
  kubectl rollout status "$1" -n "${2:-default}" --timeout="${3:-180s}" 2>/dev/null \
    || warn "rollout wait for $1 timed out or not found (continuing — check manually)"
}

# =============================================================================
# STEP 2 — DATASTORES (Helm, in dependency order: Mongo -> Postgres -> RabbitMQ)
# =============================================================================
# Order matters: the app needs all three, and RabbitMQ's chart creates the
# 'rabbitmq-secret' that gateway/converter/notification consume.
step "2/13 Installing datastores (MongoDB → PostgreSQL → RabbitMQ)"
# Credentials are injected here via --set from env vars, NOT stored in the chart
# values (which carry CHANGEME placeholders) — so no DB password lives in the repo.

# datastore_ready KIND/NAME LABEL TIMEOUT
# Waits for a datastore rollout in the default namespace and reports the REAL
# outcome: "<LABEL> ready" only when the rollout completed, a warning otherwise.
# Never fails the script (best-effort, same as the other rollout waits).
datastore_ready() {
  if kubectl rollout status "$1" -n default --timeout="$3" 2>/dev/null; then
    ok "$2 ready"
  else
    warn "$2 rollout timed out or not found (continuing — check manually)"
  fi
}

helm upgrade --install mongodb Helm_charts/MongoDB \
  --set secret.root_username="$MONGODB_USERNAME" --set secret.username="$MONGODB_USERNAME" \
  --set secret.users_list="$MONGODB_USERNAME" \
  --set secret.root_password="$MONGODB_PASSWORD" --set secret.password="$MONGODB_PASSWORD" >/dev/null
datastore_ready statefulset/mongodb "MongoDB" 180s

helm upgrade --install postgres Helm_charts/Postgres \
  --set container.env.user="$POSTGRES_USERNAME" --set container.env.password="$POSTGRES_PASSWORD" >/dev/null
datastore_ready deployment/postgres-deploy "PostgreSQL" 120s

helm upgrade --install rabbitmq Helm_charts/RabbitMQ \
  --set secret.default_user="$RABBITMQ_USERNAME" --set secret.default_pass="$RABBITMQ_PASSWORD" >/dev/null
datastore_ready statefulset/rabbitmq "RabbitMQ" 180s

# =============================================================================
# STEP 3 — PostgreSQL init (RBAC schema + bcrypt admin seed)
# =============================================================================
# Skipping this = every login 500s (no auth_user table / no admin row). The DB
# admin NodePort :30003 is still open here (NetworkPolicies are applied last).
step "3/13 Initialising PostgreSQL (schema, then admin seed)"
if [ -n "$NODE_IP" ]; then
  # ON_ERROR_STOP=1 makes psql exit non-zero on the first SQL error, so the
  # `if` branches below reflect whether the SQL actually succeeded.
  PSQL=(psql -h "$NODE_IP" -p 30003 -U "$POSTGRES_USERNAME" -d authdb -v ON_ERROR_STOP=1)
  if PGPASSWORD="$POSTGRES_PASSWORD" "${PSQL[@]}" -f Helm_charts/Postgres/init.sql >/dev/null 2>&1; then
    ok "schema applied (auth_user table + pgcrypto)"
    # Seed the admin with a bcrypt hash generated IN the database (pgcrypto), so no
    # password or hash is ever written to a file. Needs APP_LOGIN_EMAIL + _PASSWORD.
    if [ -n "${APP_LOGIN_EMAIL:-}" ] && [ -n "${APP_LOGIN_PASSWORD:-}" ]; then
      # :'email' / :'pw' are psql variables interpolated as quoted SQL literals;
      # the quoted heredoc delimiter keeps the shell from expanding anything.
      if PGPASSWORD="$POSTGRES_PASSWORD" "${PSQL[@]}" \
        -v email="$APP_LOGIN_EMAIL" -v pw="$APP_LOGIN_PASSWORD" >/dev/null 2>&1 <<'SQL'
INSERT INTO auth_user (email, password, role)
VALUES (:'email', crypt(:'pw', gen_salt('bf', 12)), 'admin')
ON CONFLICT (email) DO UPDATE SET password = EXCLUDED.password, role = EXCLUDED.role;
SQL
      then ok "admin seeded: ${APP_LOGIN_EMAIL} (bcrypt hash generated in-DB)"
      else warn "admin seed failed (is pgcrypto available in this postgres image?)."
      fi
    else
      warn "APP_LOGIN_EMAIL/APP_LOGIN_PASSWORD not set — no admin seeded, login won't work until you seed one."
    fi
  else
    warn "schema init failed (port 30003 reachable? credentials match?). Re-run by hand if needed."
  fi
else
  warn "NODE_IP unknown — skipping DB init. Run it manually per DEPLOYMENT_GUIDE.md §3.1."
fi

# =============================================================================
# STEP 4 — SEED AWS PARAMETER STORE (the 7 SecureString secrets)
# =============================================================================
# The app never reads these from a file — ESO (step 5) pulls them at runtime.
# Parameter Store = the safe-deposit box; the app holds the key (its AWS identity).
step "4/13 Seeding Parameter Store (/vidcast/${ENVIRONMENT}/*)"

# put NAME VALUE — write (or overwrite) one SecureString parameter.
put() { aws ssm put-parameter --region "$AWS_REGION" --type SecureString --overwrite --name "$1" --value "$2" >/dev/null; }

# urlencode STRING — percent-encode everything except RFC 3986 unreserved
# characters. Credentials embedded in a connection URI must be encoded: a raw
# '/', '+', '@' or ':' (all common in `openssl rand -base64` output) makes the
# URI invalid or changes the password the driver decodes. Values made only of
# unreserved characters are returned unchanged.
urlencode() {
  local LC_ALL=C raw="$1" out="" ch i
  for ((i = 0; i < ${#raw}; i++)); do
    ch="${raw:i:1}"
    case "$ch" in
      [a-zA-Z0-9.~_-]) out+="$ch" ;;
      *) printf -v ch '%%%02X' "'$ch"; out+="$ch" ;;
    esac
  done
  printf '%s' "$out"
}

P="/vidcast/${ENVIRONMENT}"
MONGO_AUTH="$(urlencode "$MONGODB_USERNAME"):$(urlencode "$MONGODB_PASSWORD")"
put "$P/auth/psql-password" "$POSTGRES_PASSWORD"
put "$P/auth/jwt-secret" "$JWT_SECRET"
put "$P/gateway/mongodb-videos-uri" "mongodb://${MONGO_AUTH}@mongodb:27017/videos?authSource=admin"
put "$P/gateway/mongodb-mp3s-uri" "mongodb://${MONGO_AUTH}@mongodb:27017/mp3s?authSource=admin"
put "$P/converter/mongodb-uri" "mongodb://${MONGO_AUTH}@mongodb:27017/mp3s?authSource=admin"
put "$P/notification/gmail-address" "$GMAIL_ADDRESS"
put "$P/notification/gmail-password" "${GMAIL_APP_PASSWORD// /}" # strip any spaces from the app password
ok "7 SecureString parameters written under $P"

# =============================================================================
# STEP 5 — EXTERNAL SECRETS OPERATOR + the 4 ExternalSecrets
# =============================================================================
step "5/13 Installing External Secrets Operator + ExternalSecrets"
# `|| true`: adding a repo that already exists, or a transient repo-update
# failure, must not abort the run — the install below fails loudly if it matters.
helm repo add external-secrets https://charts.external-secrets.io >/dev/null 2>&1 || true
helm repo update external-secrets >/dev/null 2>&1 || true
# 0.18.2+ serves the external-secrets.io/v1 API the manifests use.
helm upgrade --install external-secrets external-secrets/external-secrets \
  -n external-secrets --create-namespace --version 0.18.2 >/dev/null
wait_rollout deployment/external-secrets external-secrets 150s
ok "ESO installed"

# Best-effort: stamp the IRSA role ARN onto the ESO ServiceAccount from terraform output.
# Look up the ESO IRSA role ARN in the terraform outputs. When the lookup works
# and yields a value, print it so the operator can compare it with the
# ServiceAccount annotation; any failure is silently ignored.
if IRSA_ARN="$(cd terraform/environments/dev 2>/dev/null && terraform output -raw external_secrets_irsa_role_arn 2>/dev/null)" \
  && [ -n "$IRSA_ARN" ]; then
  warn "ESO IRSA role: $IRSA_ARN (ensure shared/serviceaccount.yaml matches)"
fi

# Shared pieces first (SA + ClusterSecretStore), then the per-environment
# ExternalSecrets (the 4 of them).
kubectl apply -k k8s/external-secrets/shared >/dev/null
kubectl apply -k "k8s/external-secrets/${ENVIRONMENT}" >/dev/null
ok "applied ClusterSecretStore + ExternalSecrets"

# Block until ESO reports READY=True on each ExternalSecret, i.e. the backing
# Kubernetes Secret has been materialised. A miss is reported but not fatal.
step " waiting for ExternalSecrets to sync (auth/gateway/converter/notification)"
external_secrets=(auth-secret gateway-secret converter-secret notification-secret)
for secret_name in "${external_secrets[@]}"; do
  if ! kubectl wait --for=condition=Ready "externalsecret/$secret_name" -n default --timeout=120s >/dev/null 2>&1; then
    warn "$secret_name NOT ready — check the IRSA annotation on sa/vidcast-eso and the parameter paths."
    continue
  fi
  ok "$secret_name synced"
done

# =============================================================================
# STEP 6 — APP WORKLOADS (Kustomize overlay)
# =============================================================================
step "6/13 Deploying app workloads (kubectl apply -k ${OVERLAY})"
kubectl apply -k "$OVERLAY" >/dev/null
app_deployments=(auth gateway converter notification frontend outbox-relay redis)
for workload in "${app_deployments[@]}"; do
  wait_rollout "deployment/$workload" default 180s
done
ok "app workloads applied"

# =============================================================================
# STEP 7 — KEDA + HPA + metrics-server
# =============================================================================
# KEDA scales the converter on queue depth (to zero when idle). The gateway HPA
# scales on CPU, which needs metrics-server (EKS doesn't bundle it).
step "7/13 Installing KEDA + metrics-server + autoscalers"
helm repo add kedacore https://kedacore.github.io/charts >/dev/null 2>&1 || true
helm repo update kedacore >/dev/null 2>&1 || true
helm upgrade --install keda kedacore/keda -n keda --create-namespace -f k8s/keda/values.yaml >/dev/null
wait_rollout deployment/keda-operator keda 150s

# metrics-server (idempotent apply of the upstream manifest)
kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml >/dev/null 2>&1 || true

# urlencode STRING — percent-encode everything except RFC 3986 unreserved
# characters, so credentials can be embedded safely in a connection URI.
# Values made only of unreserved characters are returned unchanged.
urlencode() {
  local LC_ALL=C raw="$1" out="" ch i
  for ((i = 0; i < ${#raw}; i++)); do
    ch="${raw:i:1}"
    case "$ch" in
      [a-zA-Z0-9.~_-]) out+="$ch" ;;
      *) printf -v ch '%%%02X' "'$ch"; out+="$ch" ;;
    esac
  done
  printf '%s' "$out"
}

# KEDA's RabbitMQ scaler needs a connection-string Secret. It dials from the 'keda'
# namespace, so the host MUST be the FQDN (the short name 'rabbitmq' won't resolve
# cross-namespace). Build it from the RabbitMQ chart's credentials.
# `|| true` keeps a missing secret from aborting the run (pipefail + set -e);
# the empty values are handled by the else-branch below.
RMQ_USER="$(kubectl get secret rabbitmq-secret -n default -o jsonpath='{.data.RABBITMQ_DEFAULT_USER}' 2>/dev/null | base64 -d || true)"
RMQ_PASS="$(kubectl get secret rabbitmq-secret -n default -o jsonpath='{.data.RABBITMQ_DEFAULT_PASS}' 2>/dev/null | base64 -d || true)"
if [ -n "$RMQ_USER" ] && [ -n "$RMQ_PASS" ]; then
  # Credentials are percent-encoded: a raw '/', '@' or ':' in the password would
  # otherwise be parsed as part of the URI structure and break the scaler's login.
  # --dry-run=client | apply makes the create idempotent on re-runs.
  kubectl create secret generic keda-rabbitmq-secret -n default \
    --from-literal=host="amqp://$(urlencode "$RMQ_USER"):$(urlencode "$RMQ_PASS")@rabbitmq.default.svc.cluster.local:5672/" \
    --dry-run=client -o yaml | kubectl apply -f - >/dev/null
  ok "keda-rabbitmq-secret created (FQDN host)"
else
  warn "could not read rabbitmq-secret — apply k8s/keda/secret.yaml manually before the ScaledObject works."
fi
kubectl apply -k k8s/keda >/dev/null # ScaledObject + HPA + TriggerAuthentication
ok "KEDA ScaledObject + gateway HPA applied"

# =============================================================================
# STEP 8 — ARGO CD (GitOps)
# =============================================================================
step "8/13 Installing Argo CD + Applications"
helm repo add argo https://argoproj.github.io/argo-helm >/dev/null 2>&1 || true
helm repo update argo >/dev/null 2>&1 || true
helm upgrade --install argocd argo/argo-cd -n argocd --create-namespace -f k8s/argocd/values.yaml >/dev/null
wait_rollout deployment/argocd-server argocd 180s
kubectl apply -k k8s/argocd >/dev/null # dev (auto-sync) + prod (manual gate) Applications
ok "Argo CD installed; dev auto-syncs, prod waits for manual Sync"

# =============================================================================
# STEP 9 — KYVERNO (policy-as-code, all Audit)
# =============================================================================
step "9/13 Installing Kyverno + ClusterPolicies (Audit)"
helm repo add kyverno https://kyverno.github.io/kyverno >/dev/null 2>&1 || true
helm repo update kyverno >/dev/null 2>&1 || true
helm upgrade --install kyverno kyverno/kyverno -n kyverno --create-namespace -f k8s/kyverno/values.yaml >/dev/null
wait_rollout deployment/kyverno-admission-controller kyverno 180s
kubectl apply -k k8s/kyverno >/dev/null
ok "7 ClusterPolicies applied (all Audit)"

# =============================================================================
# STEP 10 — MONITORING (Prometheus / Grafana / Alertmanager + SLO stack)
# =============================================================================
# Uses an emptyDir override because this cluster has no dynamic EBS provisioner.
step "10/13 Installing monitoring stack + dashboards"
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts >/dev/null 2>&1 || true
helm repo update prometheus-community >/dev/null 2>&1 || true
# Values files as an array: the optional emptyDir override is appended only when
# the file exists, with no reliance on unquoted word-splitting.
monitoring_values=(-f monitoring/values.yaml)
[ -f monitoring/values-emptydir.yaml ] && monitoring_values+=(-f monitoring/values-emptydir.yaml)
helm upgrade --install monitoring prometheus-community/kube-prometheus-stack \
  "${monitoring_values[@]}" -n monitoring --create-namespace >/dev/null
wait_rollout deployment/monitoring-grafana monitoring 240s

kubectl apply -f monitoring/scrape/ >/dev/null 2>&1 || true # ServiceMonitors + PodMonitors
kubectl apply -f monitoring/alerts/vidcast-alerts.yaml >/dev/null 2>&1 || true
kubectl apply -f monitoring/alerts/vidcast-slo-rules.yaml >/dev/null 2>&1 || true
for dash in vidcast-operations vidcast-slo vidcast-finops; do
  [ -f "monitoring/dashboards/$dash.json" ] || continue
  # Render the ConfigMap client-side, add the grafana_dashboard=1 label locally,
  # then apply — one idempotent pipeline per dashboard.
  kubectl create configmap "$dash" -n monitoring --from-file="monitoring/dashboards/$dash.json" \
    --dry-run=client -o yaml | kubectl label -f - --local -o yaml grafana_dashboard=1 | kubectl apply -f - >/dev/null
done
ok "Prometheus + Grafana + Alertmanager + SLO rules + 3 dashboards"

# =============================================================================
# STEP 11 — KUBECOST (FinOps) — installed LAST (heaviest add-on; watch node pressure)
# =============================================================================
step "11/13 Installing Kubecost (FinOps)"
helm repo add kubecost https://kubecost.github.io/cost-analyzer/ >/dev/null 2>&1 || true
helm repo update kubecost >/dev/null 2>&1 || true
kubecost_values=(-f k8s/kubecost/values.yaml)
[ -f k8s/kubecost/values-local.yaml ] && kubecost_values+=(-f k8s/kubecost/values-local.yaml)
helm upgrade --install kubecost kubecost/cost-analyzer --version "$KUBECOST_CHART_VERSION" \
  -n kubecost --create-namespace "${kubecost_values[@]}" >/dev/null
wait_rollout deployment/kubecost-cost-analyzer kubecost 240s
# If the node is under pressure (Pending pods), park Kubecost rather than fail the run.
if kubectl get pods -A --field-selector=status.phase=Pending --no-headers 2>/dev/null | grep -q .; then
  warn "Pending pods detected — node may be full. Consider scaling Kubecost to 0:"
  warn " kubectl scale deploy/kubecost-cost-analyzer -n kubecost --replicas=0"
fi
ok "Kubecost installed (chart $KUBECOST_CHART_VERSION)"

# =============================================================================
# STEP 12 — NETWORK POLICIES (allows FIRST, default-deny LAST)
# =============================================================================
# Ordering matters: apply every 'allow' before the catch-all deny, so there's no
# window where traffic is dropped before its exception exists.
step "12/13 Applying NetworkPolicies (allows first, default-deny last)"
kubectl apply -f k8s/network-policies/allow-dns.yaml \
  -f k8s/network-policies/allow-monitoring.yaml \
  -f k8s/network-policies/app-policies.yaml \
  -f k8s/network-policies/datastore-policies.yaml >/dev/null
kubectl apply -f k8s/network-policies/allow-kyverno-sigstore-egress.yaml >/dev/null 2>&1 || true
kubectl apply -f k8s/network-policies/default-deny.yaml >/dev/null # LAST
ok "default-deny in force with allow-list exceptions"

# =============================================================================
# STEP 13 — SMOKE TEST + ACCESS URLS
# =============================================================================
step "13/13 Smoke test"
PASS=0; TOTAL=0

# check LABEL COMMAND — run one smoke check and tally the result.
#   $1 = human-readable label
#   $2 = shell command string (evaluated with eval; output discarded)
# Only pass trusted, script-authored command strings here — never splice
# user-supplied values (passwords, emails) into $2; wrap those in a function
# and pass the function name instead (see login_returns_200).
check() { TOTAL=$((TOTAL+1)); if eval "$2" >/dev/null 2>&1; then PASS=$((PASS+1)); ok "$1"; else warn "$1 — FAILED"; fi; }

# login_returns_200 — POST basic-auth credentials to the gateway /login NodePort
# and succeed only on HTTP 200. Credentials are passed to curl as a quoted
# argument, never through eval, so any character is safe in the password.
login_returns_200() {
  local code
  code="$(curl -s -m 15 -o /dev/null -w '%{http_code}' -X POST "http://$NODE_IP:30002/login" \
    -u "${LOGIN_EMAIL}:${APP_LOGIN_PASSWORD}")" || return 1
  [ "$code" = 200 ]
}

check "gateway /healthz returns ok" \
  "kubectl exec -n default deploy/gateway -- python -c \"import urllib.request as u,sys; sys.exit(0 if b'ok' in u.urlopen('http://localhost:8080/healthz').read() else 1)\""
check "in-cluster DNS resolves (gateway → rabbitmq)" \
  "kubectl exec -n default deploy/gateway -- python -c \"import socket; socket.gethostbyname('rabbitmq')\""
if [ -n "${APP_LOGIN_PASSWORD:-}" ] && [ -n "$NODE_IP" ]; then
  LOGIN_EMAIL="${APP_LOGIN_EMAIL:-$GMAIL_ADDRESS}"
  check "login returns a JWT (${LOGIN_EMAIL})" login_returns_200
else
  warn "skipping login check (set APP_LOGIN_PASSWORD + ensure NODE_IP to enable it)"
fi

echo
echo "${c_bold}Deploy complete. ${PASS}/${TOTAL} smoke checks passed.${c_reset}"

# ── Access URLs + port-forwards ──────────────────────────────────────────────
echo
echo "${c_bold}Access URLs${c_reset} (NodePorts — need the security group to allow your IP):"
if [ -n "$NODE_IP" ]; then
  echo " Frontend (web UI): http://$NODE_IP:30006"
  echo " Gateway (API): http://$NODE_IP:30002"
  echo " Grafana (dashboards): http://$NODE_IP:30007 (admin / vidcast-demo)"
else
  echo " (NODE_IP unknown — find it: kubectl get nodes -o wide)"
fi
echo
echo "${c_bold}Port-forwards${c_reset} (for tools not exposed publicly — open localhost in a browser):"
echo " Prometheus: kubectl -n monitoring port-forward svc/monitoring-kube-prometheus-prometheus 9090:9090 # http://localhost:9090"
echo " Alertmanager: kubectl -n monitoring port-forward svc/monitoring-kube-prometheus-alertmanager 9093:9093 # http://localhost:9093"
echo " Kubecost: kubectl -n kubecost port-forward deploy/kubecost-cost-analyzer 9091:9090 # http://localhost:9091"
echo " Argo CD: kubectl -n argocd port-forward svc/argocd-server 8080:443 # https://localhost:8080"
echo
echo "Tear it all down when finished: ./deploy.sh --teardown"
diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml
new file mode 100644
index 0000000..a18f759
--- /dev/null
+++ b/docker-compose.swarm.yml
@@ -0,0 +1,122 @@
version: '3.8'

services:
  auth:
    image: vidcast/auth:latest
    ports:
      - "5000:5000"
    networks:
# (continues services.auth.networks from the line above)
# Secrets below use Compose interpolation: export SWARM_PSQL_PASSWORD and
# SWARM_JWT_SECRET in the shell that runs `docker stack deploy` to override
# the staging defaults. The defaults keep the previous behaviour when unset.
      - vidcast-net
    environment:
      DATABASE_HOST: postgres
      DATABASE_NAME: auth
      DATABASE_USER: auth_user
      DATABASE_PORT: "5432"
      # Must equal postgres.environment.POSTGRES_PASSWORD (same variable).
      PSQL_PASSWORD: "${SWARM_PSQL_PASSWORD:-Auth123}"
      JWT_SECRET: "${SWARM_JWT_SECRET:-staging-jwt-secret-change-in-production}"
      AUTH_TABLE: auth_user
    deploy:
      replicas: 1
      update_config:
        failure_action: rollback
      restart_policy:
        condition: on-failure
        max_attempts: 3

  gateway:
    image: vidcast/gateway:latest
    ports:
      - "8080:8080"
    networks:
      - vidcast-net
    environment:
      MONGODB_VIDEOS_URI: mongodb://mongo:27017/videos
      MONGODB_MP3S_URI: mongodb://mongo:27017/mp3s
      RABBITMQ_HOST: rabbitmq
      AUTH_SVC_ADDRESS: auth:5000
    deploy:
      replicas: 2
      update_config:
        failure_action: rollback
      restart_policy:
        condition: on-failure
        max_attempts: 3

  converter:
    image: vidcast/converter:latest
    networks:
      - vidcast-net
    environment:
      MONGODB_URI: mongodb://mongo:27017
      RABBITMQ_HOST: rabbitmq
      VIDEO_QUEUE: video
      MP3_QUEUE: mp3
    deploy:
      replicas: 4
      update_config:
        failure_action: rollback
      restart_policy:
        condition: on-failure
        max_attempts: 3

  notification:
    image: vidcast/notification:latest
    networks:
      - vidcast-net
    environment:
      RABBITMQ_HOST: rabbitmq
      MP3_QUEUE: mp3
      GMAIL_ADDRESS: ""
      GMAIL_PASSWORD: ""
    deploy:
      replicas: 1
      update_config:
        failure_action: rollback
      restart_policy:
        condition: on-failure
        max_attempts: 3

  mongo:
    image: mongo:4.0.8
    volumes:
      - mongo-data:/data/db
    networks:
      - vidcast-net
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure

  postgres:
    image: postgres:14
    environment:
      POSTGRES_DB: auth
      POSTGRES_USER: auth_user
      # Only applied when the pg-data volume is first initialised; changing it
      # later does not change the password of an existing database.
      POSTGRES_PASSWORD: "${SWARM_PSQL_PASSWORD:-Auth123}"
    volumes:
      - pg-data:/var/lib/postgresql/data
    networks:
      - vidcast-net
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure

  rabbitmq:
    image: rabbitmq:3-management
    ports:
      - "15672:15672"
    networks:
      - vidcast-net
    deploy:
      replicas: 1
      restart_policy:
        condition: on-failure

networks:
vidcast-net: + driver: overlay + +volumes: + mongo-data: + pg-data: diff --git a/docs/DECISIONS_MADE.md b/docs/DECISIONS_MADE.md new file mode 100644 index 0000000..8ebf283 --- /dev/null +++ b/docs/DECISIONS_MADE.md @@ -0,0 +1,165 @@ +# Architectural Decisions — RBAC / Notifications / Admin branch + +Trade-off documentation for the `feature/rbac-and-notifications` branch. Each +decision follows the same shape: **what we chose → the alternatives → the +trade-off we accepted → where it breaks → the real fix at scale.** + +--- + +## 1. bcrypt now, alongside RBAC (not deferred) + +We added bcrypt password hashing in the same change as the role model, rather than +shipping RBAC on the existing plaintext passwords and hashing "later." + +The alternative was to defer: keep the plaintext comparison, add only the `role` +column and JWT claim now. It's less code and avoids a coordinated DB+image +migration. + +The trade-off we accepted is a one-time migration cost: bcrypt seeds in `init.sql`, +a `checkpw` path in `/login`, and a merge-time reseed of live Postgres — all of +which must land together or logins break. + +This would be the wrong call if the password store were large and live (re-hashing +millions of users needs a dual-read "verify-then-upgrade-on-login" strategy, not a +reseed). Here the user set is two seeded admins plus dev sign-ups on a disposable +cluster, so a reseed is trivial. + +The deciding reason: "you added role-based access but didn't hash the passwords" is +the first thing an assessor asks. Doing RBAC on plaintext is a half-measure that +invites the question; doing both closes it, and the image rebuilds anyway. + +## 2. Polling, not SSE/WebSockets, for the download bubble + +The "your file is ready" badge polls `GET /notifications/unseen-count` every 5 +seconds rather than holding a server-push channel open. + +The alternatives were Server-Sent Events (one-way push, <1s latency) or WebSockets +(bidirectional). 
Both eliminate the poll and feel instant. + +The trade-off we accepted is up to ~5s of latency before the badge updates — which +is invisible when the conversion it's reporting on takes 5–30s anyway. + +This would be wrong at scale: thousands of concurrent browsers polling every 5s is +load the server feels, and at that point a push transport earns its complexity. + +For a single-user demo, polling is one endpoint, debuggable with `curl`, and +firewall-proof. The honest scaling note for the presentation is "we'd move to SSE +before WebSockets if push became necessary" — SSE is the right next rung, not WS. + +## 3. Skipping the admin stats panel (Grafana already covers it) + +Feature 4 ships the user table + role management but **not** the aggregate stats +panel (uploads today, bytes converted, queue depth) the spec sketched. + +The alternative was a `GET /admin/stats` endpoint aggregating Mongo + RabbitMQ and +a stats card on the page. + +The trade-off we accepted is that an admin reads operational metrics in Grafana +(already deployed on NodePort 30007), not inside the app. + +This would be wrong if the audience for the metrics were non-operators who never +open Grafana — then in-app stats earn their place. Our admin is also the cluster +operator, who already lives in Grafana. + +The deciding reason: building a second, thinner metrics surface duplicates what the +monitoring stack does properly (retention, alerting, dashboards). Don't rebuild +Grafana badly inside the app. + +## 4. Admin enforcement in the gateway only (in-cluster trust gap) + +Authorization for the admin endpoints is checked in the **gateway**; the +auth-service `/users` endpoints have no role check of their own and trust +in-cluster callers — the same trust model as the pre-existing `/login`/`/validate`. + +The alternative is defence in depth: every service validates the JWT and authorizes +independently, so no service is trusted purely by its network position. 
+ +The trade-off we accepted is a real privilege boundary that sits at the **network** +layer (ClusterIP + "only the gateway should call auth") rather than the +**application** layer — an in-cluster pod could call `auth/users` directly. + +This is wrong the moment the cluster is multi-tenant or runs untrusted workloads: +network position is not identity, and "internal" is not "trusted." + +The real fix is one of: mTLS / a shared secret between gateway and auth; the auth +service validating the JWT itself; or a service mesh enforcing "only the gateway +may call auth" via NetworkPolicy + workload identity. Out of scope for a +single-tenant demo, but that's the next step. + +## 5. Audit trail to stdout (not an append-only store) + +Every role change prints `AUDIT admin_role_change admin=<admin-email> target=<target-email> +new_role=<role> result=<outcome>` to the gateway's stdout, captured by `kubectl +logs` and the monitoring stack. + +The alternative is a dedicated `audit_log` table (or an external SIEM sink) written +transactionally with the change. + +The trade-off we accepted is that the record is **mutable and ephemeral**: logs +rotate, pods are replaced, and the line vanishes if the code path changes. It +answers who/whom/what, but it is not tamper-evident. + +This is wrong anywhere with compliance or forensic requirements: "the logs say so" +is not an audit trail if the logs can be edited or lost. + +The real fix is an append-only store written in the **same transaction** as the +role change — immutable timestamps, ideally hash-chained so tampering is +detectable — or shipping to a write-once external system. A whole subsystem; +deliberately out of scope. + +## 6. Admin guardrails: self-demote (403) and last-admin (409) + +The `PATCH /admin/users/<email>` endpoint refuses to let an admin change their own +role (403) or demote the last remaining admin (409), in addition to 404 on an +unknown email and 400 on an invalid role. 
+ +The alternative is to trust admins to not lock themselves out, or to handle lockout +reactively (a manual DB edit to restore an admin). + +The trade-off we accepted is a little extra server-side logic and one pre-check +query (counting admins) before a demotion — negligible cost. + +This is rarely wrong, but the guard is conservative: in a large org you might +legitimately want to demote yourself once another admin exists, which our blanket +self-demote block forbids. We chose the safe default over the flexible one. + +The deciding reason: admin lockout is a self-inflicted outage with no in-app +recovery path. Two cheap guards (plus disabling the self-row button in the UI) +remove the most common ways to cause it, and the 409 last-admin check catches the +case where demoting *someone else* would still empty the admin set. + +--- + +## Addenda — learnings from the post-merge integration test + +### A. The bcrypt migration is a forward-only constraint + +Once live Postgres is migrated to bcrypt hashes, you **cannot roll the auth image +back** to the pre-bcrypt version. The old image compares passwords with `==` +against the stored value; after migration that value is a bcrypt hash, so every +login fails. The clean rollback path (old plaintext image + old plaintext DB) +exists **only before the migration runs** — migration closes it. + +We hit exactly this live: the merge auto-deployed the bcrypt auth image *before* +the DB was migrated, so logins 500'd, and the only correct recovery was to roll +**forward** (run the migration), not back. The operational rule that falls out of +this: the bcrypt image and the schema/seed migration are a single atomic change — +deploy them together, and treat "rollback" post-migration as "fix forward," not +"revert the image." (A true revert would also require restoring a pre-bcrypt DB +snapshot, which a no-PV dev Postgres doesn't have.) + +### B. 
The 403 self-demote and 409 last-admin guards are complementary, not redundant + +At first glance the 409 looks unreachable: in normal operation the only admin +demoting the only admin is caught by the 403 self-demote check first, so 409 never +fires. That's true — for *non-stale* tokens. + +The 409 exists for the **stale-token** case. An admin whose role was revoked in the +DB but who still holds an unexpired admin JWT would pass the gateway's `admin` +claim check, and could then demote the last *real* admin — emptying the admin set +without ever demoting "themselves" (their token's identity is already a non-admin +in the DB). The 403 guards **identity** ("you can't change your own role"); the 409 +guards a **system invariant** ("never zero admins"). Different questions, different +failure modes — together they cover both "don't shoot yourself" and "don't empty +the admin set, even with a token that out-lived its privileges." This is why the +integration test could only trigger 409 by deliberately staling a token. diff --git a/docs/DISASTER_RECOVERY.md b/docs/DISASTER_RECOVERY.md new file mode 100644 index 0000000..0defa6f --- /dev/null +++ b/docs/DISASTER_RECOVERY.md @@ -0,0 +1,177 @@ +# VidCast — Disaster Recovery Runbook + +> Closes narrative gaps **I4** (no automated backup) and **P5** (no DR runbook). +> Companion to the durability work in `feature/improvement-sprint-1-durability-and-backup`. +> +> **Last restore test:** **2026-06-10** — Postgres restore drill performed during the +> Sprint 1 rollout: `pg_dump` of the live DB → `helm upgrade` onto the new EBS PVC +> (fresh/empty volume) → restored the dump → **login E2E passed** (admin JWT issued +> against the restored data). MongoDB backup verified producing valid archives in S3 +> (videos 38 MB, mp3s 7 MB); a full **mongorestore** drill is still outstanding (§5). + +--- + +## 1. 
What this protects against + +| Failure | Before | After this branch | +|---|---|---| +| Postgres pod restart | All registered users except the deploy.sh seed admin are lost (ephemeral pod fs) | Data persists on an EBS PVC (A11); also recoverable from nightly `pg_dump` | +| MongoDB PV loss / corruption | Every uploaded video + converted MP3 + outbox state gone permanently | Recoverable from the latest nightly `mongodump` (up to ~24h old) | +| Whole-cluster loss | App redeployable from Git via Argo CD, but **data gone** | App from Git + **data from S3 backups** = full recovery | + +The application/control plane is already recoverable from Git (Argo CD). This +runbook covers the **stateful tier**, which Git cannot rebuild. + +--- + +## 2. What is backed up, where, and how often + +| Datastore | Tool | Schedule (UTC) | Destination | Format | +|---|---|---|---|---| +| MongoDB `videos` + `mp3s` DBs (GridFS files + outbox) | `mongodump --uri` per DB, gzip archive | nightly **02:00** | `s3://vidcast-backups-501562869470/mongo/` | two gzip archives per run | +| PostgreSQL (`authdb`) | `pg_dump \| gzip` | nightly **02:15** | `s3://vidcast-backups-501562869470/postgres/` | gzipped SQL | + +> **MongoDB auth note (important for restore):** the backup authenticates with the +> **app's own credentials** from `gateway-secret` (`MONGODB_VIDEOS_URI` / +> `MONGODB_MP3S_URI`, user `mongouser`, `authSource=admin`) — **not** the +> `mongodb-secret` root user, whose password is out of sync with the running mongod +> and fails SCRAM-SHA-256. Each run produces two archives (`videos-` and +> `mp3s-`) because a URI pins to a single database. + +- **Bucket:** `vidcast-backups-501562869470` — private, versioned, AES256-encrypted, + created by `terraform/modules/storage`. +- **Retention:** 30 days (object + noncurrent-version lifecycle expiry). 
+- **Object keys** are timestamped: `mongo/videos-YYYYMMDDTHHMMSSZ.archive.gz`, + `mongo/mp3s-YYYYMMDDTHHMMSSZ.archive.gz`, `postgres/postgres-YYYYMMDDTHHMMSSZ.sql.gz`. +- **Auth:** the CronJobs run as the `vidcast-backup` ServiceAccount (IRSA role + `vidcast-cluster-backup-irsa`), which may only `s3:PutObject`/`s3:ListBucket` on + this one bucket — no other AWS access. + +**Objectives** + +| | Target | Why | +|---|---|---| +| **RPO** (max data loss) | **≤ 24h** | Nightly cadence. Tighten by adding a midday run if needed. | +| **RTO** (time to restore) | **≤ 2h** | Re-apply infra (~20m) + restore dumps (minutes–tens of minutes) + E2E verify. | + +--- + +## 3. Prerequisites (provisioned by this branch) + +1. **EBS CSI driver addon** — `terraform/modules/eks` (`aws_eks_addon.ebs_csi` + + its IRSA role). Without it the Postgres PVC stays `Pending`. +2. **gp3 StorageClass `vidcast-ebs-gp3`** + **`postgres-pvc`** — `Helm_charts/Postgres` + (`persistence.enabled=true`). `reclaimPolicy: Retain` so deleting the PVC does + **not** delete the EBS volume. +3. **S3 backup bucket + backup IRSA role** — `terraform/modules/storage`. +4. **CronJobs + `vidcast-backup` SA** — `k8s/base/backup`, wired into both overlays. + +> ⚠️ If `terraform/modules/storage`'s `bucket_prefix` is changed, update +> `BACKUP_BUCKET` in `k8s/base/backup/*-cronjob.yaml` and the bucket name in §2. + +--- + +## 4. Restore procedures + +> Run from a workstation with `kubectl` pointed at the cluster and the relevant +> secrets present (ESO-synced in prod, or `deploy.sh` in dev). Replace +> `<TS>` with the timestamp (`YYYYMMDDTHHMMSSZ`) of the key you picked from `aws s3 ls`. + +### 4.1 Pick the backup to restore + +```bash +aws s3 ls s3://vidcast-backups-501562869470/mongo/ --recursive | sort | tail +aws s3 ls s3://vidcast-backups-501562869470/postgres/ --recursive | sort | tail +``` + +### 4.2 Restore MongoDB + +> Each run produced TWO archives (`videos-` and `mp3s-`). Restore both. 
+> Authenticate with the **app** credentials (the same ones the backup used), not +> the mongodb-secret root user — pull the URI from `gateway-secret`. + +```bash +# 0. Get the app's mongo URIs (these carry the working mongouser credentials). +VIDEOS_URI=$(kubectl get secret gateway-secret -o jsonpath='{.data.MONGODB_VIDEOS_URI}' | base64 -d) +MP3S_URI=$(kubectl get secret gateway-secret -o jsonpath='{.data.MONGODB_MP3S_URI}' | base64 -d) + +# 1. Pull both archives for the chosen timestamp. +aws s3 cp s3://vidcast-backups-501562869470/mongo/videos-<TS>.archive.gz /tmp/videos.gz +aws s3 cp s3://vidcast-backups-501562869470/mongo/mp3s-<TS>.archive.gz /tmp/mp3s.gz + +# 2. Copy into the running mongod pod. +kubectl cp /tmp/videos.gz mongodb-0:/tmp/videos.gz +kubectl cp /tmp/mp3s.gz mongodb-0:/tmp/mp3s.gz + +# 3. Restore each. --drop replaces existing collections with the backup's contents; +# omit --drop to merge. --nsInclude scopes the restore to that database. +kubectl exec -it mongodb-0 -- mongorestore --uri="$VIDEOS_URI" \ + --gzip --archive=/tmp/videos.gz --drop --nsInclude='videos.*' +kubectl exec -it mongodb-0 -- mongorestore --uri="$MP3S_URI" \ + --gzip --archive=/tmp/mp3s.gz --drop --nsInclude='mp3s.*' +``` + +### 4.3 Restore PostgreSQL + +```bash +# 1. Pull + decompress. +aws s3 cp s3://vidcast-backups-501562869470/postgres/postgres-<TS>.sql.gz /tmp/pg.sql.gz +gunzip -f /tmp/pg.sql.gz # -> /tmp/pg.sql + +# 2. Ensure the schema exists (a fresh PVC is empty). The chart's init.sql / +# deploy.sh seed runs on first boot; if restoring into a clean DB, the dump +# itself recreates auth_user. Pipe it in: +POD=$(kubectl get pod -l name=postgres-pod -o jsonpath='{.items[0].metadata.name}') +kubectl exec -i "$POD" -- sh -c 'PGPASSWORD="$POSTGRES_PASSWORD" psql -U pguser -d authdb' < /tmp/pg.sql +``` + +> If the restore target is a brand-new PVC, the bcrypt seed admin from +> `deploy.sh` must exist **or** be contained in the dump — otherwise log in with a +> user that the dump restored. 
+ +### 4.4 Verify integrity (do not skip) + +```bash +# Postgres: row count + the seed admin is present and is a bcrypt hash. +kubectl exec -i "$POD" -- sh -c 'PGPASSWORD="$POSTGRES_PASSWORD" psql -U pguser -d authdb -c \ + "SELECT count(*) FROM auth_user; SELECT email, left(password,4) AS hash_prefix, role FROM auth_user LIMIT 5;"' +# expect hash_prefix like $2a$ / $2b$ (bcrypt), NOT plaintext. + +# Mongo: GridFS file counts are non-zero. +kubectl exec -it mongodb-0 -- mongo --quiet --eval \ + 'print("videos="+db.getSiblingDB("videos")["fs.files"].count()+" mp3s="+db.getSiblingDB("mp3s")["fs.files"].count())' +``` + +### 4.5 Full pipeline smoke test + +Log in (`baabalola@gmail.com / YourPassword123`) → upload a small video → +confirm conversion email → download the MP3. Restore is complete only when this +passes. + +--- + +## 5. The DR drill (perform, then record the date at the top) + +1. Trigger both backups on demand (don't wait for 02:00): + ```bash + kubectl create job --from=cronjob/mongo-backup mongo-backup-drill-$(date +%s) + kubectl create job --from=cronjob/postgres-backup pg-backup-drill-$(date +%s) + ``` + Confirm a fresh object appears under each S3 prefix. +2. In a **non-prod** namespace/cluster (or a disposable re-apply), perform §4.2–4.4. +3. Time it end to end → record actual RTO. Update the **Last restore test** date. +4. File any surprises as issues; a runbook that drifted from reality is worse than none. + +--- + +## 6. Follow-ups (out of scope for this branch) + +- **Backup freshness alert (P5 monitoring):** a `PrometheusRule` that fires if no + successful backup Job completed in the last 25h. The first time you learn + backups stopped should not be the day you need one. (Needs a kube-state-metrics + series on `kube_job_status_completion_time` filtered to the backup CronJobs.) 
+- **Metadata-only Mongo backups:** once P2 (S3 file storage) lands, files live in + S3 with its own durability and the Mongo dump shrinks to metadata — much smaller + and faster. +- **Cross-region copy** of the backup bucket for region-loss survivability + (deliberately omitted now per the single-region cost decision). diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md new file mode 100644 index 0000000..660bebb --- /dev/null +++ b/docs/GETTING_STARTED.md @@ -0,0 +1,288 @@ +# VidCast — Getting Started (Clone → Run → Teardown) + +This is the complete, end-to-end walkthrough: everything from cloning the repo to a +working deployment on AWS EKS, and finally tearing it down so it stops costing money. +It is the operational companion to the high-level [`README.md`](../README.md) and the +narrative [`PROJECT_GUIDE.md`](PROJECT_GUIDE.md). + +> **No secrets live in this repo.** Every credential (DB passwords, JWT secret, Gmail +> app password, AWS account ID) is supplied by *you* at deploy time through gitignored +> files and CI/CD secrets. Placeholders such as ``, `YOUR_STATE_BUCKET`, +> and `` mark every spot you must fill in. + +--- + +## 0. What you need first + +| Tool | Version | Notes | +|------|---------|-------| +| AWS CLI | v2 | `aws configure` with a user that can create EKS/VPC/IAM | +| kubectl | 1.31+ | | +| Helm | 3.x | | +| Terraform | 1.5+ | | +| Docker | 20+ | for building images locally | +| psql | any | PostgreSQL client, for seeding the auth DB | +| mongosh | 7.x | optional, for inspecting MongoDB | + +On WSL2/Ubuntu you can install kubectl, Helm, Python, psql, mongosh and Terraform with: + +```bash +./install_prerequisites.sh +``` + +AWS CLI and Docker are assumed already installed. Verify access before anything else: + +```bash +aws sts get-caller-identity +``` + +> **Account constraint:** this AWS account's SCPs reject T-type instances (EKS auto-adds +> `CreditSpecification: unlimited`, which is denied). 
Use `m7i-flex.large` or any +> M/C/R-series type. The Terraform EKS module enforces this with a validation block. + +--- + +## 1. Clone + +```bash +git clone https://github.com/<your-github-username>/vidcast.git +cd vidcast +``` + +--- + +## 2. Provide your configuration + +Nothing sensitive is committed, so you fill in values in **gitignored** files: + +```bash +# Terraform inputs +cp terraform/environments/dev/terraform.tfvars.example terraform/environments/dev/terraform.tfvars +# then edit: state_bucket, cluster_name, region, instance type +``` + +You will also choose application credentials as you go (Mongo/Postgres passwords, a +32+ char `JWT_SECRET`, an optional Gmail app password for notifications). Keep them in +a local note — `DEPLOYMENT_CONFIG.md` is gitignored for exactly this purpose. + +--- + +## 3. Provision infrastructure (Terraform) + +```bash +cd terraform/environments/dev + +terraform init \ + -backend-config="bucket=YOUR_STATE_BUCKET" \ + -backend-config="key=vidcast/dev/terraform.tfstate" \ + -backend-config="region=eu-west-2" \ + -backend-config="dynamodb_table=vidcast-terraform-locks" + +terraform plan +terraform apply # ~20 minutes for the EKS control plane + node group +cd ../../.. +``` + +This creates the VPC, IAM roles, EKS cluster + node group, security-group NodePort +rules (30002–30008), and the GitHub OIDC deploy role. Grab two outputs you'll reuse: + +```bash +cd terraform/environments/dev +terraform output github_actions_role_arn # → GitHub secret AWS_DEPLOY_ROLE_ARN +cd ../../.. + +aws eks update-kubeconfig --name vidcast-cluster --region eu-west-2 +NODE_IP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="ExternalIP")].address}') +echo "Node external IP: $NODE_IP" +``` + +--- + +## 4. Deploy the data services (Helm) + +```bash +cd Helm_charts/MongoDB && helm install mongodb . && cd ../.. +kubectl wait --for=condition=ready pod/mongodb-0 --timeout=120s +cd Helm_charts/Postgres && helm install postgres . && cd ../.. 
+cd Helm_charts/RabbitMQ && helm install rabbitmq . && cd ../.. +kubectl get pods -w # wait until all are Running +``` + +> Mongo/Postgres/RabbitMQ credentials come from each chart's `values.yaml`. Set them +> there before `helm install`, and make them match the service config/secrets (see the +> "Customisation Checklist" in `CLAUDE.md`). + +--- + +## 5. Seed PostgreSQL + +`Helm_charts/Postgres/init.sql` ships with **placeholders only** — no real admin email +or password hash. Generate a bcrypt hash and edit the file before applying: + +```bash +python3 -c "import bcrypt; print(bcrypt.hashpw(b'YOUR_PASSWORD', bcrypt.gensalt(rounds=12)).decode())" +# paste the result into init.sql in place of the password-hash placeholder, and set your admin email + +PGPASSWORD=YOUR_POSTGRES_PASSWORD psql -h "$NODE_IP" -p 30003 \ + -U YOUR_POSTGRES_USERNAME -d authdb -f Helm_charts/Postgres/init.sql +``` + +--- + +## 6. Create the RabbitMQ queues + +```bash +curl -u guest:guest -X PUT "http://$NODE_IP:30004/api/queues/%2F/video" \ + -H "Content-Type: application/json" -d '{"durable":true}' +curl -u guest:guest -X PUT "http://$NODE_IP:30004/api/queues/%2F/mp3" \ + -H "Content-Type: application/json" -d '{"durable":true}' +``` + +--- + +## 7. Get the images + +**Option A — let CI build them (recommended).** Push to `main` and GitHub Actions lints, +scans (Trivy), builds, and pushes all four backend services to Docker Hub, then deploys +to EKS. This needs the secrets in [section 10](#10-cicd-secrets). + +**Option B — build and push manually.** + +```bash +for svc in auth-service gateway-service converter-service notification-service; do + docker build -t YOUR_DOCKERHUB_USER/$svc:dev src/$svc + docker push YOUR_DOCKERHUB_USER/$svc:dev +done +``` + +The frontend is **not** built by CI; build it and push to your ECR (or Docker Hub), +then set the image in the Kustomize overlay you deploy +(`k8s/overlays/<env>/kustomization.yaml`, the `images:` entry named +`vidcast-frontend`). 
Backend image tags live in the same `images:` block. + +--- + +## 8. Deploy the microservices + +Manifests are managed with Kustomize (`k8s/base` + `k8s/overlays/{dev,prod}`). +Secrets are applied separately (they are not in the Kustomize tree): + +```bash +# Secrets first (gitignored; rabbitmq-secret comes from the RabbitMQ Helm chart): +kubectl apply -f src/auth-service/manifest/secret.yaml +kubectl apply -f src/gateway-service/manifest/secret.yaml +kubectl apply -f src/converter-service/manifest/secret.yaml +kubectl apply -f src/notification-service/manifest/secret.yaml + +# Then the overlay (use overlays/dev for the lighter single-replica dev env): +kubectl apply -k k8s/overlays/prod +kubectl get pods # all should reach Running +``` + +--- + +## 9. Test end-to-end + +```bash +# Login (use the admin email + password you seeded in step 5) +TOKEN=$(curl -s -X POST "http://$NODE_IP:30002/login" -u "admin@example.com:YOUR_PASSWORD") + +# Upload a video +curl -X POST "http://$NODE_IP:30002/upload" \ + -F "file=@assets/video.mp4" -H "Authorization: Bearer $TOKEN" + +# Watch the queue drain +curl -s -u guest:guest "http://$NODE_IP:30004/api/queues/%2F/video" | python3 -m json.tool | grep messages + +# Download the MP3 (file id comes from the notification email or the frontend) +curl -X GET "http://$NODE_IP:30002/download?fid=FILE_ID" \ + -H "Authorization: Bearer $TOKEN" -o output.mp3 +``` + +Or just open the web UI at `http://$NODE_IP:30006` and do it through the browser. + +--- + +## 10. CI/CD secrets + +The pipelines authenticate with secrets you configure in GitHub / Jenkins — none are +stored in the repo. 
+ +### GitHub Actions — CI (`ci.yml`) + +Settings → Secrets and variables → Actions: + +| Secret | Description | Example | +|--------|-------------|---------| +| `DOCKERHUB_USERNAME` | Docker Hub username | your username | +| `DOCKERHUB_TOKEN` | Docker Hub **access token** (not your password) | `dckr_pat_...` | + +Create the token at hub.docker.com → Account Settings → Security → New Access Token. + +### GitHub Actions — CD (`cd.yml`), OIDC — no static AWS keys + +CD assumes an IAM role via GitHub OIDC (short-lived creds). There are **no** +`AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` secrets. The role + OIDC provider are +created by Terraform (`terraform/modules/github-oidc`). + +| Secret | Source | +|--------|--------| +| `AWS_DEPLOY_ROLE_ARN` | `terraform output github_actions_role_arn` (step 3) | +| `AWS_REGION` | `eu-west-2` | +| `EKS_CLUSTER_NAME` | `vidcast-cluster` | +| `DOCKERHUB_USERNAME` | your Docker Hub username (sets the deployment image name) | + +`cd.yml` already sets `permissions: id-token: write` so it can request the OIDC token. + +### Jenkins (`Jenkinsfile`) + +Manage Jenkins → Credentials: + +| Credential ID | Type | Description | +|---------------|------|-------------| +| `dockerhub-credentials` | Username/Password | Docker Hub login | +| `aws-credentials` | AWS Credentials | IAM key for EKS access | +| `swarm-staging-ip` | Secret text | IP of the Swarm staging EC2 | + +--- + +## 11. Monitoring (optional) + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +helm install monitoring prometheus-community/kube-prometheus-stack \ + -f monitoring/values.yaml -n monitoring --create-namespace +kubectl apply -f monitoring/alerts/vidcast-alerts.yaml +``` + +Grafana → `http://$NODE_IP:30007` (admin / vidcast-demo). Alertmanager → `:30008`. + +--- + +## 12. 
Teardown (stop paying for it) + +```bash +kubectl delete -k k8s/overlays/prod # match the overlay you deployed + +helm uninstall mongodb postgres rabbitmq +helm uninstall monitoring -n monitoring + +cd terraform/environments/dev && terraform destroy && cd ../../.. +``` + +Because everything is infrastructure-as-code, `terraform apply` brings the whole stack +back in ~20 minutes whenever you need it again. + +--- + +## Troubleshooting + +- **Pod in `CrashLoopBackOff`** → `kubectl logs <pod>` and `kubectl describe pod <pod>`. + Most often a credential mismatch between a chart `values.yaml` and a service config. +- **Every login fails after deploying a new auth image** → the bcrypt image and the DB + seed must land together; re-run `init.sql`. See [`MERGE_RUNBOOK_RBAC.md`](MERGE_RUNBOOK_RBAC.md). +- **`terraform apply` hangs then fails on the node group** → you used a T-type instance. + Switch to `m7i-flex.large`. +- **Can't reach a NodePort** → confirm the security-group rules for 30002–30008 exist + (Terraform creates them) and that you're hitting the node's *external* IP. diff --git a/docs/GITOPS.md b/docs/GITOPS.md new file mode 100644 index 0000000..b112773 --- /dev/null +++ b/docs/GITOPS.md @@ -0,0 +1,252 @@ +# GITOPS.md — Deployment model with Argo CD (B1) + +> How VidCast deploys after Sprint 3. Tracked (not gitignored): this is the +> contract for how changes reach the cluster. + +--- + +## 1. The model in one paragraph + +Argo CD runs in-cluster and continuously reconciles the `default` namespace to the +Kustomize manifests in this repo under `k8s/overlays/{dev,prod}`. **Git is the +source of truth.** Nobody runs `kubectl apply` or `kubectl set image` against the +app anymore — you change git, and Argo makes the cluster match. **dev auto-syncs; +prod syncs only on a human action (the approval gate).** + +--- + +## 2. 
Why in-repo manifests (Q3 decision) + +The Argo `Application`s point at `k8s/overlays/{dev,prod}` **in this same repo** — +there is no separate manifest repo and no reorganisation into an `apps/` tree. + +- **Separate manifest repo** is the textbook pattern for **multi-team orgs**: it + decouples "who can change app code" from "who can change what's deployed," and + lets many app repos feed one deployment repo. +- **Single-repo** is the right call for a **solo project**: one PR captures both + the code change *and* the manifest/image-tag change, with one review and one + audit trail. The indirection of a second repo would add ceremony with no + separation-of-duties benefit when one person owns everything. + +This is a deliberate, documented trade-off — not an oversight. + +--- + +## 3. Manifest layout (what Argo reads) + +``` +k8s/ + base/<workload>/ # A10 base manifests (one per workload) + overlays/ + dev/ → Application vidcast-dev (auto-sync) 1 replica each + prod/ → Application vidcast-prod (manual-sync) live footprint +``` + +Argo runs `kustomize build` on the overlay path itself — the same command we +validate locally. No Argo-specific manifest format; the overlays are plain +Kustomize. + +--- + +## 4. 
What Argo manages vs what stays manual + +| Layer | Owner | How it's applied | +|---|---|---| +| **App workloads** (Deployments, Services, ConfigMaps, ESO-created Secrets in `overlays/*`) | Argo CD | synced from git | +| Argo CD itself | platform (the operator) | `helm install argocd` | +| ESO (`ClusterSecretStore`, `ExternalSecret`s) | platform | `kubectl apply -f k8s/external-secrets` | +| KEDA (`ScaledObject`, `TriggerAuthentication`) | platform | `kubectl apply -k k8s/keda` | +| NetworkPolicies | platform | `kubectl apply -k k8s/network-policies` | +| Kyverno + ClusterPolicies | platform | `kubectl apply -k k8s/kyverno` | + +**Why the split:** Argo manages the *application*; the *platform* (the control +planes that make the cluster what it is, including Argo's own install) is owned by +the platform engineer. Argo shouldn't manage its own installation (chicken-and-egg), +and platform changes are infrequent, privileged, and not part of the app delivery +loop. (An "app-of-apps" pattern could later bring some platform pieces under Argo, +but that's deliberately out of scope here.) + +--- + +## 5. dev vs prod sync behaviour + +| | vidcast-dev | vidcast-prod | +|---|---|---| +| `syncPolicy.automated` | **present** (`prune: true`, `selfHeal: true`) | **absent** (manual only) | +| Trigger | every change to `overlays/dev` on main, auto | a human runs `argocd app sync vidcast-prod` | +| Drift (manual `kubectl edit`) | auto-reverted (selfHeal) | shown as OutOfSync until a human acts | +| Purpose | fast validation loop | the production approval gate | + +**dev workflow:** `merge to main → CI builds image → image-tag bump in +overlays/dev → Argo auto-syncs within the poll interval (~3 min)`. + +**prod workflow:** `merge image-tag-bump PR → vidcast-prod shows OutOfSync → human +syncs`. The **PR merge is the approval**; the manual Argo sync is the deploy action. 
+ +> ⚠️ **Single-cluster caveat.** Both Applications target the `default` namespace on +> the one demo cluster, so they manage the same-named resources. **Sync only one at +> a time.** In a real deployment, dev and prod Applications point at different +> clusters (`destination.server`). Syncing both here would make them fight over the +> same Deployments. + +--- + +## 6. The approval-gate migration (the important part) + +**Before (push model):** `.github/workflows/cd.yml` runs `kubectl set image` +straight against EKS after CI. The "approval" was an ephemeral Jenkins button; the +record of what's deployed lives only in the cluster. + +**After (pull model):** CI builds+pushes the image, then **something updates the +image tag in the overlay**, and Argo syncs. The deploy becomes a **git change with +a diff, a reviewer, and a permanent audit trail** — you can see exactly which image +SHA went to prod, who approved it, and when, forever. Rollback is `git revert`. + +The "something that updates the tag" is a **CD change the operator writes** (workflows are +the operator's per the execution split). Two options: + +### Option A (recommended) — all-GitHub + +After CI pushes the image, a CD job bumps the tag with `kustomize edit set image` +and opens a PR (prod) / commits to main (dev). Merging the PR is the approval. + +- **dev:** commit the dev-overlay bump straight to main → Argo auto-syncs. +- **prod:** open a PR bumping the prod overlay → review+merge = approval → human + runs `argocd app sync vidcast-prod`. + +**Why recommended:** simplest, single system (GitHub), the PR diff *is* the +audit/approval, and it matches the in-repo Q3 decision. + +### Option B — preserve the Jenkins Swarm smoke-test + +Jenkins keeps building → deploys to Swarm staging → smoke-tests. **On success**, +Jenkins (instead of `kubectl set image`) bumps the overlay tag and opens the same +PR. Merge = approval. 
+ +**Why you might want it:** keeps the real pre-prod verification (Swarm smoke test) +as a gate on *opening* the PR — defence in depth. Cost: two systems to maintain. + +**Recommendation: Option A.** The Swarm smoke-test is valuable but, for a solo +project, the marginal safety doesn't justify maintaining Jenkins + GitHub Actions. +If you keep Jenkins, do Option B and demote Jenkins to "smoke-test then open PR" +(its `kubectl`/rollback-undo stages go away — Argo owns deploy + rollback now). + +### Exact diff for the operator — `cd.yml` (Option A) + +Replace the `kubectl set image` deploy with a tag-bump-and-PR job. The OIDC/EKS +steps are no longer needed in CD (Argo deploys, not the workflow): + +```diff + name: VidCast CD — Deploy to EKS + on: + workflow_run: + workflows: ["VidCast CI — Lint, Scan, Build, Push"] + types: [completed] + branches: [main] + +-permissions: +- id-token: write # required to request the OIDC token +- contents: read ++permissions: ++ contents: write # commit the dev tag bump ++ pull-requests: write # open the prod tag-bump PR + + jobs: + deploy: + if: ${{ github.event.workflow_run.conclusion == 'success' }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 ++ with: { ref: main, fetch-depth: 0 } + +- - name: Configure AWS credentials (OIDC) +- uses: aws-actions/configure-aws-credentials@v4 +- with: +- role-to-assume: ${{ secrets.AWS_DEPLOY_ROLE_ARN }} +- aws-region: ${{ secrets.AWS_REGION }} +- - name: Update kubeconfig for EKS +- run: aws eks update-kubeconfig --name ${{ secrets.EKS_CLUSTER_NAME }} --region ${{ secrets.AWS_REGION }} + + - name: Set short SHA + run: echo "SHORT_SHA=$(echo ${{ github.event.workflow_run.head_sha }} | cut -c1-7)" >> $GITHUB_ENV + +- - name: Deploy services to EKS +- run: | +- for svc in auth-service gateway-service converter-service notification-service; do +- deploy_name="${svc%-service}" +- kubectl set image deployment/${deploy_name} \ +- ${deploy_name}=${{ secrets.DOCKERHUB_USERNAME 
}}/${svc}:${{ env.SHORT_SHA }} || true +- kubectl rollout status deployment/${deploy_name} --timeout=120s || true +- done +- - name: Verify all pods running +- run: kubectl get pods -o wide ++ - name: Install kustomize ++ run: curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash && sudo mv kustomize /usr/local/bin/ ++ ++ # DEV: bump tags and push straight to main → Argo auto-syncs vidcast-dev. ++ - name: Bump dev overlay image tags ++ run: | ++ cd k8s/overlays/dev ++ for svc in auth gateway converter notification; do ++ kustomize edit set image /${svc}-service:${SHORT_SHA} ++ done ++ - name: Commit dev bump ++ run: | ++ git config user.name "vidcast-ci"; git config user.email "ci@vidcast" ++ git commit -am "ci(dev): bump images to ${SHORT_SHA}" && git push origin main || echo "no change" ++ ++ # PROD: open a PR bumping the prod overlay. Merge = approval; then a human ++ # runs `argocd app sync vidcast-prod` (prod Application is manual-sync). ++ - name: Bump prod overlay image tags on a branch ++ run: | ++ git checkout -b "deploy/prod-${SHORT_SHA}" ++ cd k8s/overlays/prod ++ for svc in auth gateway converter notification; do ++ kustomize edit set image /${svc}-service:${SHORT_SHA} ++ done ++ git commit -am "deploy(prod): bump images to ${SHORT_SHA}" ++ git push origin "deploy/prod-${SHORT_SHA}" ++ - name: Open prod deploy PR ++ run: gh pr create --base main --head "deploy/prod-${SHORT_SHA}" --title "Deploy ${SHORT_SHA} to prod" --body "Review = approval. After merge: argocd app sync vidcast-prod" ++ env: { GH_TOKEN: "${{ github.token }}" } +``` + +> Notes for the operator: the `outbox-relay` image (A1) should be added to this loop and to +> the overlays' `images:` lists once CI builds it. The `kustomize edit set image` +> lines assume the overlay `images:` entries A10 created. The CD job no longer needs +> AWS/EKS secrets — drop `AWS_DEPLOY_ROLE_ARN` etc. 
from CD (CI still uses them only +> if it pushed to ECR; Docker Hub images don't need AWS at all). + +--- + +## 7. Rollback + +```bash +git revert <commit-sha> # the image-tag bump (or any manifest change) +# dev: Argo auto-syncs back. prod: argocd app sync vidcast-prod +``` + +Rollback is now a **git operation with history**, not an invisible +`kubectl rollout undo`. You can see in `git log` exactly what was rolled back and +when. + +--- + +## 8. The one rule: don't `kubectl edit` synced resources + +Once Argo owns a resource, **git is the only way to change it.** A manual +`kubectl edit`/`apply` on a synced workload will be **reverted** by dev's +`selfHeal`, or show as **OutOfSync drift** on prod. This includes the converter's +replica count — KEDA owns that at runtime (A7), so the overlay `replicas:` is just +the bootstrap value and Argo won't fight KEDA over it as long as we don't also set +it by hand. To change something, change git. + +--- + +## 9. Status / readiness + +- B1 ships the GitOps **machinery** (Argo install values + two Applications + this + doc). The CD tag-bump flow (§6) is the operator's to implement. +- Runtime verification (Argo UI showing the Application tree syncing) is deferred to + the next live cluster re-apply — the cluster is currently torn down. The + Application CRDs and Helm values are the reviewable artifacts now. diff --git a/docs/INGRESS_DEPLOY.md b/docs/INGRESS_DEPLOY.md new file mode 100644 index 0000000..fcc54c0 --- /dev/null +++ b/docs/INGRESS_DEPLOY.md @@ -0,0 +1,131 @@ +# VidCast — Ingress / TLS / Perimeter Deploy Guide (Sprint 2) + +> Closes **P1 / I7** (ALB Ingress + HTTPS on a hostname) and **I2** (datastores + +> app services NodePort → ClusterIP). Branch: +> `feature/improvement-sprint-2-ingress-tls`. **Nothing here has been applied** — +> this is the deploy runbook for after sign-off. + +--- + +## 1.
What changes + +- The platform moves from `http://<node-ip>:30006` to **`https://<hostname>`**, + served by an **AWS ALB** the Load Balancer Controller provisions from + `k8s/ingress/vidcast-ingress.yaml`. +- **All NodePorts are removed.** MongoDB (30005), PostgreSQL (30003), RabbitMQ + (30004), gateway (30002), frontend (30006) → **ClusterIP**. The ALB is the only + external entrypoint; datastores are admin-accessed via `kubectl port-forward`. + +## 2. Design decisions (deviations from the original prompt — read these) + +1. **Routing is `/` → `frontend`, not `/api` → `gateway`.** The frontend's nginx + already serves the SPA and proxies `/api/` → `gateway:8080` **stripping the + `/api` prefix** (`src/frontend/nginx.conf`). An ALB cannot strip path prefixes, + so a direct `/api` → gateway rule would deliver `/api/login` to a gateway that + only knows `/login` (404). Routing everything through the frontend preserves the + working path for browsers **and** API clients (`https://<hostname>/api/login`) and + keeps the **gateway internal** (ClusterIP) — smaller attack surface. +2. **TLS is ACM, not cert-manager.** The ALB terminates TLS with an **ACM + certificate** (`alb.ingress.kubernetes.io/certificate-arn`). An ALB cannot read + cert-manager's in-cluster TLS secrets, so the `ClusterIssuer` + (`k8s/ingress/cert-manager/`) is shipped only as the **alternative** path (for an + in-cluster ingress controller, or DNS-01 issuance you import to ACM). For the + default ALB path you do **not** need cert-manager. +3. **No new `allow-alb-ingress` NetworkPolicy.** The existing `gateway` and + `frontend` policies (`app-policies.yaml`) already allow ingress on 8080 **from + any source**, so the ALB path is already permitted — a new VPC-CIDR policy would + be a redundant no-op (NetworkPolicy is an additive union).
*Hardening + opportunity (separate change, since this sprint must not edit existing + policies):* tighten those two ingress rules from "anywhere" to the VPC CIDR now + that the ALB is the only entrypoint. +4. **LBC IRSA lives in `terraform/modules/lbc/`, not `modules/iam/`.** The iam + module creates the cluster role the eks module depends on, and eks creates the + OIDC provider the LBC trust policy needs — putting it in iam would form an + iam↔eks cycle. Mirrors the `external-secrets` / `storage` IRSA modules. +5. **Grafana subpath routing deferred.** Routing `/grafana` needs grafana's + `serve_from_sub_path`/`root_url` config (a monitoring change, out of this + sprint's scope) or a dedicated `grafana.<hostname>` subdomain + cert SAN. Left as a + follow-up; the Ingress uses `group.name: vidcast` so a grafana Ingress can later + share the same ALB. + +## 3. Placeholders to fill at deploy time + +From `DEPLOYMENT_CONFIG.md` and `terraform output`: + +| Placeholder | Source | +|---|---| +| `${VIDCAST_HOSTNAME}` | DEPLOYMENT_CONFIG.md (the public DNS name) | +| `${ACM_CERTIFICATE_ARN}` | ACM cert for the hostname (step 2 below) | +| `${LBC_IRSA_ROLE_ARN}` | `terraform output lbc_irsa_role_arn` | +| `${VPC_ID}` | `terraform output vpc_id` | +| `${ALERT_EMAIL}` | DEPLOYMENT_CONFIG.md (cert-manager path only) | + +## 4. Deploy sequence + +```bash +# 1. Terraform: create the LBC IRSA role (idempotent; adds only IAM — no ALB yet). +cd terraform/environments/dev && terraform apply # review: should be additive only +LBC_IRSA_ROLE_ARN=$(terraform output -raw lbc_irsa_role_arn) +VPC_ID=$(terraform output -raw vpc_id) +cd - + +# 2. ACM: request a cert for $VIDCAST_HOSTNAME (DNS-validated) and note its ARN. +# aws acm request-certificate --domain-name "$VIDCAST_HOSTNAME" \ +# --validation-method DNS --region eu-west-2 +# (add the CNAME it returns to your DNS zone; wait for status ISSUED) + +# 3. Install the AWS Load Balancer Controller.
+helm repo add eks https://aws.github.io/eks-charts && helm repo update +helm install aws-load-balancer-controller eks/aws-load-balancer-controller \ + -n kube-system -f k8s/ingress/alb-controller-values.yaml \ + --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="$LBC_IRSA_ROLE_ARN" \ + --set vpcId="$VPC_ID" +kubectl -n kube-system rollout status deploy/aws-load-balancer-controller + +# 4. (ONLY if using cert-manager instead of ACM) +# helm repo add jetstack https://charts.jetstack.io +# helm install cert-manager jetstack/cert-manager -n cert-manager --create-namespace --set installCRDs=true +# envsubst < k8s/ingress/cert-manager/cluster-issuer.yaml | kubectl apply -f - + +# 5. Apply the Ingress (placeholders substituted). The ALB takes a few minutes. +export VIDCAST_HOSTNAME ACM_CERTIFICATE_ARN +envsubst < k8s/ingress/vidcast-ingress.yaml | kubectl apply -f - +kubectl get ingress vidcast-ingress -w # wait for ADDRESS (the ALB DNS name) + +# 6. Point DNS at the ALB: Route 53 ALIAS/CNAME $VIDCAST_HOSTNAME -> <ALB DNS name>. + +# 7. Flip services to ClusterIP. Datastores via Helm; gateway/frontend via Argo +# (it auto-syncs overlays/dev) or `kubectl apply -k k8s/overlays/dev`. +helm upgrade mongodb Helm_charts/MongoDB/ --reuse-values +helm upgrade postgres Helm_charts/Postgres/ -f <(helm get values postgres) # keep the password +helm upgrade rabbitmq Helm_charts/RabbitMQ/ --reuse-values +# NOTE: do this AFTER the ALB is serving — converting frontend/gateway to +# ClusterIP removes the old NodePort access path. +``` + +## 5.
Verification + +```bash +# ALB provisioned + cert attached +kubectl get ingress vidcast-ingress -o wide +# HTTPS end-to-end (expect the SPA, then a working login through /api) +curl -sSI https://$VIDCAST_HOSTNAME/ | head -1 # 200 +curl -sS https://$VIDCAST_HOSTNAME/api/login -u 'baabalola@gmail.com:' | head -c 40 # JWT +# HTTP redirects to HTTPS +curl -sSI http://$VIDCAST_HOSTNAME/ | grep -i location # -> https +# NodePorts are gone (datastores + app) +kubectl get svc | grep -i nodeport || echo "no NodePort services — good" +# Datastores no longer externally reachable; admin via port-forward: +kubectl port-forward svc/rabbitmq 15672:15672 # then localhost:15672 +``` + +## 6. Cost & rollback + +- **Cost:** ALB ~£22/month + low LCU. Route 53 ~£1. Within the assessment's + approved envelope. The LBC IRSA role itself is free; the **ALB is created when + the Ingress is applied** (step 5) — that's the billing trigger. +- **Rollback:** `kubectl delete ingress vidcast-ingress` (ALB de-provisions), + `helm uninstall aws-load-balancer-controller`, and revert the Services to + NodePort (`git revert` the service commits, re-apply). The app keeps running + throughout; only the entrypoint changes. +``` diff --git a/docs/MANAGED_SERVICES.md b/docs/MANAGED_SERVICES.md new file mode 100644 index 0000000..4b48a4a --- /dev/null +++ b/docs/MANAGED_SERVICES.md @@ -0,0 +1,227 @@ +# MANAGED_SERVICES.md — A5 Datastore Trade-off Record + +> **What this document is.** Part A5 of `PHASE_UP_PLAN.md` proposed replacing +> every in-cluster stateful service (PostgreSQL, MongoDB/GridFS, RabbitMQ, Redis) +> with an AWS-managed equivalent, and Sprint 5 proposed *cutting over to them in +> prod*. After costing it honestly (§ below), that cutover was **cancelled**. This +> file is what replaces it: a decision record explaining, for each datastore, +> **what** the managed service would be, **what it replaces**, **when** you would +> actually adopt it, **why**, and **what it costs**. 
+> +> **Status:** in-cluster Helm charts remain the production datastore layer. A5 is +> documented-and-deferred, not built-and-running. No managed-datastore Terraform +> is applied; none is left running. Standing AWS cost of this decision: **$0**. + +--- + +## 0. TL;DR + +| Datastore | Today (kept) | Managed candidate | Adopt when | Standing cost if left on | +|---|---|---|---|---| +| PostgreSQL (auth) | `postgres` Deployment, **no PVC** | **RDS PostgreSQL** db.t3.micro | First real users / any data you can't lose | ~$15 (Single-AZ) / ~$31 (Multi-AZ) /mo | +| MongoDB + GridFS (video/mp3 blobs) | `mongo:4.0.8` StatefulSet | **MongoDB Atlas** (M0 dev / M10 prod) | When blob durability + backups matter | $0 (M0) / ~$57 (M10, ~$1–2 paused) /mo | +| RabbitMQ (pipeline) | `rabbitmq:3-management` StatefulSet | **Amazon MQ for RabbitMQ** mq.m5.large | When the broker must outlive the node | **~$183/mo** (no cheaper instance exists) | +| Redis (A2 idempotency) | in-cluster Redis pod | **ElastiCache** cache.t3.micro | When the lock store must be HA/managed | ~$12/mo | +| **A5 all-managed, left running** | — | RDS + Atlas M10 + Amazon MQ + ElastiCache | — | **~$262–273/mo** | + +**The decision:** keep all four in-cluster. They are durable enough for a +single-node portfolio cluster, they cost $0 when the cluster is off, and the +reliability *patterns* that managed services are usually adopted for (no lost +events, idempotent retries, dead-lettering) are delivered in code by A1/A2/A3 +against the in-cluster brokers instead. See §6. + +--- + +## 1. Why the cutover was cancelled (the cost reality) + +The EKS cluster was deliberately **torn down on 2026-06-03** to save money, +preserving everything for a ~20-minute re-apply. The whole point was to get the +standing bill toward zero. 
A5-as-specified pulls in the opposite direction: + +| Managed service | Cheapest realistic prod-ish | ~$/mo (eu-west-2, 24/7) | Stops billing when… | +|---|---|---|---| +| RDS PostgreSQL (db.t3.micro, Single-AZ) | smallest usable | ~$15 | `terraform destroy` | +| RDS PostgreSQL (db.t3.micro, **Multi-AZ**) | standby doubles it | ~$31 | `terraform destroy` | +| MongoDB Atlas **M10** (2 vCPU, 2 GB) | smallest *dedicated* | ~$57 (paused ~$1–2) | pause or delete cluster | +| **Amazon MQ for RabbitMQ (mq.m5.large)** | **smallest type that exists** | **~$183** | delete broker (no pause) | +| ElastiCache Redis (cache.t3.micro) | single node | ~$12 | delete (no pause) | +| **A5 total, all managed, left running** | — | **~$262–273** | — | + +> **The Amazon MQ correction.** An earlier version of the plan quoted Amazon MQ +> at "~$25–30/mo (mq.t3.micro)." **That instance type does not exist for +> RabbitMQ on Amazon MQ** — the smallest supported broker is **mq.m5.large**, at +> roughly $0.25/hr ≈ **$183/mo** in eu-west-2. There is no T-type and no pause. +> This single correction makes the managed broker the **largest standing cost in +> the entire plan** — bigger than the EKS control plane (~$150/mo) and ~3× the +> rest of A5 combined. It is the main reason the all-managed cutover was dropped. + +That is a **15–40× jump** over the ~$10/mo the cluster was torn down to save, on +a project where the explicit goal is $0-when-off. So A5 is documented here as the +*production migration path*, not adopted as the running architecture. + +--- + +## 2. PostgreSQL → Amazon RDS + +**Today.** `postgres` runs as a single Deployment with **no PersistentVolume** +(`TECHNICAL_ANALYSIS.md` M-3). If the pod is rescheduled, the auth database — and +every user account — is gone. It is re-seeded from `Helm_charts/Postgres/init.sql` +(with the bcrypt-hashed admin user) on each fresh deploy. 
Acceptable for a demo +that is re-seeded anyway; **unacceptable the moment a real user account matters.** + +**Managed candidate.** RDS PostgreSQL, `db.t3.micro`, Single-AZ for a demo +window. Multi-AZ (~$31/mo) is a one-flag change (`multi_az = true`) and is *pure +cost for zero observed benefit on a demo torn down nightly* — documented as +available, not enabled. + +**What changes in the app:** almost nothing. `DATABASE_HOST`/`PSQL_*` already come +from config + the (now ESO-managed) secret. Point the host at the RDS endpoint, +run `init.sql` once against RDS, done. **Order hazard:** the bcrypt +admin seed must land **before** the auth image starts, or login fails — see the +merge runbook, `docs/MERGE_RUNBOOK_RBAC.md`. + +**Adopt when:** you onboard any user whose account you can't cheerfully drop, or +you want point-in-time recovery / automated backups / a restart that doesn't wipe +auth. **Cost:** ~$15/mo Single-AZ, ~$31 Multi-AZ. Destroyable to $0. + +--- + +## 3. MongoDB + GridFS → MongoDB Atlas + +**Today.** `mongo:4.0.8` StatefulSet. GridFS is **load-bearing**: both the raw +videos (`fs_videos`) and the converted MP3s live in GridFS, chunked. Durability +is whatever the PVC gives you; backups are manual. + +**Managed candidate — Atlas, not DocumentDB.** This is the single most important +A5 choice and it is deliberate: + +| Option | GridFS | Cost | Verdict | +|---|---|---|---| +| **MongoDB Atlas** (M0 free dev / M10 prod) | **Real MongoDB → GridFS works unchanged** | $0 / ~$57 | ✅ chosen path | +| Amazon DocumentDB | **Emulates** the Mongo API; historic gaps around `fs.chunks`/GridFS ops — *must be functionally tested before trusting* | ~$200/mo (t3.medium floor) | ❌ rejected: GridFS risk + price | +| In-cluster StatefulSet | native, but PVC-only durability | $0 | ✅ kept today | + +DocumentDB is the **biggest sleeper risk** in A5: it is not MongoDB, it emulates +it, and GridFS is exactly the kind of feature that has had gaps.
It is also the +priciest minimum. **Atlas is genuine MongoDB**, so it is zero application risk, +and the **M0 free tier** covers dev/demo at $0. M10 (dedicated) supports +pause/resume — paused is ~$1–2/mo storage-only. + +**Migration when adopted:** `mongodump` → `mongorestore` to Atlas, then a +**GridFS chunk verification test** (write a >255 KB file so it chunks, read it +back, byte-compare) before trusting it. PrivateLink from the VPC for prod. + +**Adopt when:** blob durability, automated backups, or off-cluster persistence +matter. **Cost:** $0 (M0) / ~$57 (M10). Atlas bills outside AWS, so it survives a +`terraform destroy` — pause or delete it explicitly at teardown. + +--- + +## 4. RabbitMQ → Amazon MQ for RabbitMQ + +**Today.** `rabbitmq:3-management` StatefulSet, single node. The A3 retry/DLQ +topology (retry queues with TTL, terminal `vidcast.dlx`, bounded `MAX_RETRIES`) +is built **against this in-cluster broker** and works there. + +**Managed candidate.** Amazon MQ for RabbitMQ, **mq.m5.large single-instance**. +It is a genuine drop-in: same AMQP, Pika unchanged, same management API, and the +A3 topology ports **verbatim**. Single-instance is **not HA** (cluster mode is a +one-flag change at ~3× cost) — documented honestly. + +**The blocker is cost, not compatibility.** As in §1: mq.m5.large ≈ **$183/mo**, +no T-type, no pause. For a project that exists to demonstrate the *patterns*, the +patterns already run for $0 in-cluster. Amazon MQ buys broker-survives-the-node +durability — which on a **single-node** cluster is moot, because the node *is* the +availability boundary for everything else too. + +**Why MSK (Kafka) is explicitly rejected:** it would require rewriting every +producer/consumer from Pika→Kafka (~$130+/mo minimum *and* a messaging-platform +migration). That is scope creep, not reliability work. Documented as the "if this +were event-sourced at scale" path, not adopted. + +**Adopt when:** the broker genuinely must outlive the node (i.e. 
you move off +single-node), and the $183/mo is justified by real traffic. **Cost:** ~$183/mo, +destroy to stop. **Recommendation: do not adopt for a portfolio cluster.** The +honest production posture for a single-node deployment is: "single-node RabbitMQ +without external HA is acceptable here because the EKS node itself is the HA +boundary, and broker *durability* is handled by A1 (outbox) + A3 (DLQ)." This is +what most small teams actually do before they hit scale. + +--- + +## 5. Redis (A2 idempotency) → ElastiCache + +**Today / chosen.** A2 (idempotency + distributed lock) runs against an +**in-cluster Redis pod** (~50m/128Mi). The lock TTL is short, so a Redis outage +degrades to "occasional duplicate" (which the idempotent consumers absorb), not +"stuck." Cost: $0, dies with the cluster. + +**Managed candidate.** ElastiCache `cache.t3.micro`, single node, ~$12/mo. No +pause; destroy to stop. It buys a managed, monitored, optionally-HA lock store. + +**Adopt when:** the lock store must be HA and survive node loss, in tandem with +the rest of the managed stack. On its own it is the least compelling A5 item — +the in-cluster Redis already gives correct idempotency semantics; ElastiCache +mainly adds operational polish. **Confirmed decision: keep Redis in-cluster.** + +--- + +## 6. The actual architecture decision + +**All four datastores stay in-cluster.** A5's value is captured two ways without +the bill: + +1. **As code-on-demand (optional).** The managed modules can be written behind + `var.use_managed_datastores` (default `false`) so the all-managed version can + be stood up for a *timed demo window* — `apply` → migrate → screenshot the + RDS/Atlas/MQ consoles → `destroy` — proving "I can run the managed version on + demand" at ~$0 standing cost. *(Not yet written; see §7.)* + +2. **As reliability patterns, already delivered in-cluster.** The reason teams + reach for managed datastores is usually durability and not-losing-data. 
A5 is + *not* the only way to get that, and on this topology it is the expensive way: + + | Concern managed services usually address | How VidCast addresses it without A5 | + |---|---| + | Lost events if the broker hiccups | **A1 transactional outbox** + single-replica relay — no upload event dropped | + | Duplicate processing on redelivery | **A2 idempotency** (claim-once + Redis lock) — duplicates are no-ops | + | Poison messages / infinite requeue | **A3 retry + DLQ** (bounded retries, terminal `vidcast.dlx`) | + | Broker config durability | persistent messages + durable queues on the in-cluster broker | + +The result: the **reliability story is real and demonstrable**, the **managed +migration path is documented and costed**, and the **standing bill stays $0** — +which is the entire reason the cluster was torn down in the first place. + +--- + +## 7. What is and isn't built + +| Item | State | +|---|---| +| This trade-off record (`MANAGED_SERVICES.md`) | ✅ this file | +| In-cluster Helm charts (Mongo/Postgres/RabbitMQ) | ✅ unchanged, remain the datastore layer | +| In-cluster Redis for A2 | planned in Sprint 2 (in-cluster, not ElastiCache) | +| A5 managed-datastore Terraform (RDS/Atlas/MQ/ElastiCache, behind `use_managed_datastores=false`) | ⏳ **not written** — optional, build only if a demo-window cutover is wanted | +| Sprint 5 permanent cutover | ❌ **cancelled** — replaced by this document | + +> If you want the on-demand managed version (§6.1) for a portfolio screenshot, +> say so and I'll write the Terraform behind the default-`false` toggle — +> `plan`-only, never applied without an explicit decision, with an AWS Budgets +> alarm in front of it. + +--- + +## 8. 
Standing-cost summary + +| Posture | Standing cost (cluster off) | +|---|---| +| **Chosen: all in-cluster** | **$0** | +| A9 ESO secrets (Parameter Store, standard tier) | **$0** (not Secrets Manager — see note) | +| A5 all-managed, left running | ~$262–273/mo | +| A5 demo-window (apply → demo → destroy) | ~$0 (delete Atlas M0 / everything at teardown) | + +> **A9 cost note.** A9 reads secrets from **SSM Parameter Store**, not Secrets +> Manager. Standard-tier parameters are free and SecureString uses the +> AWS-managed `alias/aws/ssm` key (also free), so A9's standing cost is **$0** — +> not the $0.40/secret/mo that Secrets Manager would charge. (Any cost table +> showing A9 at ~$3–5/mo predates the Parameter Store decision and is stale.) diff --git a/docs/MERGE_RUNBOOK_RBAC.md b/docs/MERGE_RUNBOOK_RBAC.md new file mode 100644 index 0000000..c4a3d87 --- /dev/null +++ b/docs/MERGE_RUNBOOK_RBAC.md @@ -0,0 +1,97 @@ +# Merge-time runbook — RBAC + bcrypt (Fix 1) + +**Run this WITH the operator, at the moment the `feature/rbac-and-notifications` branch is +merged to `main` and CI builds the new auth image.** It is the operational +counterpart to commit `6fd3b83`. + +> This is a *tracked* operational doc (unlike the `*_EXPLAINED.md` study aids, +> which are deliberately gitignored). It contains **no credentials** — the +> Postgres password is read from the environment. Export it first from the +> gitignored `DEPLOYMENT_CONFIG.md` (`POSTGRES_PASSWORD`), never paste it here. + +## Why this is needed + +The new auth image (bcrypt) and the new DB schema/seed **must land together**. If +the bcrypt image rolls while live Postgres still holds the old *plaintext* row, +`bcrypt.checkpw` fails to verify against a non-hash value and **every login +fails**. (As of the F1-F hardening, a malformed stored hash now returns 401 rather +than 500 — but it's still a failed login until the DB is migrated.) + +`init.sql` is **not** run by CD — it's a manual `psql`. 
Live Postgres has no +PersistentVolume, so re-seeding is safe and non-destructive to anything we care +about. + +## Pre-flight + +```bash +# Postgres password from the gitignored config — do NOT hardcode it. +export PGPASSWORD="$(grep -E '^POSTGRES_PASSWORD:' DEPLOYMENT_CONFIG.md | cut -d'"' -f2)" +# App-login plaintext (for the smoke test only), same source: +export APP_PW="$(grep -E '^APP_LOGIN_PASSWORD:' DEPLOYMENT_CONFIG.md | cut -d'"' -f2)" + +kubectl config current-context # expect arn:...:cluster/vidcast-cluster +NODE_IP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="ExternalIP")].address}') +echo "node: $NODE_IP" +``` + +## 1. Migrate the schema (idempotent, additive) + +```bash +psql -h "$NODE_IP" -p 30003 -U pguser -d authdb <<'SQL' +ALTER TABLE auth_user ADD COLUMN IF NOT EXISTS role VARCHAR(32) NOT NULL DEFAULT 'user'; +ALTER TABLE auth_user ADD COLUMN IF NOT EXISTS created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP; +DO $$ BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_constraint WHERE conname = 'auth_user_email_key') THEN + ALTER TABLE auth_user ADD CONSTRAINT auth_user_email_key UNIQUE (email); + END IF; +END $$; +SQL +``` + +## 2. Re-seed admins with bcrypt hashes (idempotent via ON CONFLICT) + +```bash +psql -h "$NODE_IP" -p 30003 -U pguser -d authdb -f Helm_charts/Postgres/init.sql +``` + +> `init.sql` uses `CREATE TABLE IF NOT EXISTS` + `ON CONFLICT (email) DO UPDATE`, +> so running it against the now-migrated table only refreshes the two seeded +> admins' role + bcrypt hash. Any self-registered `user` rows are left untouched. + +## 3. Verify the seed + +```bash +psql -h "$NODE_IP" -p 30003 -U pguser -d authdb \ + -c "SELECT email, role, left(password,7) AS pw_prefix FROM auth_user;" +# expect your seeded admin email(s) as role=admin, pw_prefix = '$2b$12$' +``` + +## 4. Roll the auth image (CD normally does this on merge) + +```bash +kubectl rollout status deployment/auth --timeout=120s +``` + +## 5. 
Smoke test — admin login carries role=admin + +```bash +JWT=$(curl -s -X POST "http://$NODE_IP:30002/login" -u "admin@example.com:$APP_PW") +echo "$JWT" | cut -d. -f2 | base64 -d 2>/dev/null; echo +# expect: {"username":"admin@example.com",...,"admin":true,"role":"admin"} +``` + +## 6. Negative test — a new sign-up is role=user, never admin + +```bash +curl -s -X POST "http://$NODE_IP:30002/register" \ + -H 'Content-Type: application/json' \ + -d '{"email":"rbac-test@example.com","password":"testpass123"}' \ + | cut -d. -f2 | base64 -d 2>/dev/null; echo +# expect: ...,"admin":false,"role":"user" +``` + +## Rollback + +If login misbehaves: `kubectl rollout undo deployment/auth` returns the previous +(plaintext) auth image, which matches the pre-migration DB. Re-running `init.sql` +is always safe (`ON CONFLICT`). When done, `unset PGPASSWORD APP_PW`. diff --git a/docs/OBSERVABILITY.md b/docs/OBSERVABILITY.md new file mode 100644 index 0000000..8301774 --- /dev/null +++ b/docs/OBSERVABILITY.md @@ -0,0 +1,100 @@ +# VidCast — Observability & Abuse Protection (Sprint 3) + +> Closes **I8 / P3** (structured logging + correlation IDs), **A12** (download +> audit log), and **A10** (rate limiting). Application code only — no manifests, +> Terraform, or Helm. Branch: +> `feature/improvement-sprint-3-observability-and-abuse-protection`. + +--- + +## 1. Log format + +Every service logs **one JSON object per line** to stdout (via `jsonlog.py`, +inlined per service). Fields present on every line: + +| Field | Meaning | +|---|---| +| `timestamp` | ISO-8601 UTC | +| `level` | `INFO` / `WARNING` / `ERROR` | +| `service` | `gateway` / `auth` / `converter` / `notification` / `outbox-relay` | +| `correlation_id` | per-request trace id (`"none"` for process-level lines, `"legacy"` for pre-correlation messages) | +| `message` | human-readable text | +| *(extra)* | any call-site context, e.g. 
`fid`, `user`, `file_size_bytes`, `error` | + +Example: +```json +{"timestamp":"2026-06-11T03:11:19Z","level":"INFO","service":"gateway","correlation_id":"abc-123","message":"File downloaded","fid":"6a1a","user":"x@y.com","file_size_bytes":295749} +``` + +## 2. Tracing one request end to end + +The gateway mints a `correlation_id` (UUID4) per request and stamps it into the +RabbitMQ message body. The converter and notification services read it off the +message; the outbox relay republishes the stored payload verbatim, preserving it. +So a single id appears on every log line from upload to email: + +```bash +CID=abc-123 +# Across all services in the default namespace: +for app in gateway converter notification outbox-relay; do + kubectl logs -l app=$app --tail=-1 2>/dev/null \ + | jq -c "select(.correlation_id == \"$CID\")" +done +# or, if shipping to one sink later, a single: jq 'select(.correlation_id=="abc-123")' +``` + +**Flow:** `gateway` (mint id, log "Upload published/queued") → `video` queue → +`converter` ("Conversion complete") → `mp3` queue → `notification` ("Mail sent"). +With the outbox enabled, `outbox-relay` logs "Outbox event published" in between. + +## 3. Download audit (A12) + +Every successful `GET /download` emits one structured line from the gateway: +```json +{"level":"INFO","service":"gateway","message":"File downloaded","correlation_id":"…","fid":"…","user":"…","file_size_bytes":…} +``` +Find all downloads: `kubectl logs -l app=gateway | jq 'select(.message=="File downloaded")'`. +Failed downloads log `"Download failed"` with the `error`. (Admin role changes are +also audited via the existing `"Admin role change"` line.) + +## 4. 
Rate limiting (A10) + +`flask-limiter` on the gateway, backed by the **existing in-cluster Redis**: + +| Endpoint | Limit | Why | +|---|---|---| +| `POST /login` | **10 / minute** per client | brute-force protection | +| `POST /upload` | **20 / hour** per client | upload quota | + +To adjust, edit the `@limiter.limit(...)` decorators in +`src/gateway-service/server.py`. The E2E pipeline (1 login + 1 upload) is well +under both, so it is unaffected. + +**Three things to know for deployment:** + +1. **A `gateway → redis:6379` NetworkPolicy egress rule is required** for the + limit to be shared across gunicorn workers. The gateway's current egress policy + (`app-policies.yaml`) allows auth/mongodb/rabbitmq but **not** redis, and editing + existing NetworkPolicies is out of this code-only sprint's scope. Until that + one-line rule is added (a small follow-on infra PR, like Sprint 1's + `allow-backup-egress`), the limiter **degrades gracefully to a per-process + in-memory limiter** (`in_memory_fallback_enabled=True`) — still functional, but + each of the 2 gunicorn workers counts independently (≈2× the configured limit). +2. **Client IP comes from `X-Forwarded-For`.** The gateway sits behind nginx/ALB, + so it keys on the first XFF hop, not the socket peer (which would make `/login` + one global bucket — a lockout DoS). **Caveat:** XFF is client-spoofable because + nginx appends rather than replaces it; a determined attacker can rotate fake XFF + values to evade per-IP login limits. A robust fix (trust only the proxy hop, or + also limit per target username) is a follow-up. +3. **Redis port is fixed at 6379 in code**, not read from `REDIS_PORT` env — the + in-namespace `redis` Service injects `REDIS_PORT=tcp://<cluster-ip>:6379` via Docker + service links into the gateway pod (which, unlike the consumers, does not set + `enableServiceLinks:false`), and reading it would corrupt the storage URI. + +## 5.
Not yet implemented — log shipping + +Logs are JSON on stdout; they are **not yet shipped to a central store**. The next +additive step (separate infra PR, per the code/infra split) is a **Fluent Bit +DaemonSet → CloudWatch Logs or Grafana Loki**, at which point the `jq` greps above +become a single indexed query (`correlation_id = "…"`) across all services. Until +then, query per-pod with `kubectl logs | jq` as shown above. diff --git a/docs/PROJECT_GUIDE.md b/docs/PROJECT_GUIDE.md new file mode 100644 index 0000000..6e105c6 --- /dev/null +++ b/docs/PROJECT_GUIDE.md @@ -0,0 +1,874 @@ +# VidCast — The Complete Project Guide + +**Last updated:** 2026-06-03 + +> **How to read this:** you do not need a technical background. Every piece of +> jargon is explained in plain English *in the same breath* as it's introduced, +> usually with a real-world comparison. A non-technical reader should never have to +> look anything up; an engineer should still find it substantive. + +--- + +## Table of contents + +1. [What VidCast does](#1-what-vidcast-does) +2. [The big picture — architecture overview](#2-the-big-picture--architecture-overview) +3. [The microservices in detail](#3-the-microservices-in-detail) +4. [The data layer](#4-the-data-layer) +5. [The upload-to-download journey](#5-the-upload-to-download-journey) +6. [Authentication and authorisation — the deep dive](#6-authentication-and-authorisation--the-deep-dive) +7. [Infrastructure — what we provisioned and why](#7-infrastructure--what-we-provisioned-and-why) +8. [The CI pipeline](#8-the-ci-pipeline-github-actions) +9. [The CD pipeline](#9-the-cd-pipeline-github-actions) +10. [How Docker Hub connects to Git](#10-how-docker-hub-connects-to-git) +11. [Dev vs Prod — two pipeline systems](#11-dev-vs-prod--two-pipeline-systems) +12. [Observability](#12-observability) +13. [The journey — problems faced and how we solved them](#13-the-journey--problems-faced-and-how-we-solved-them) +14. 
[Decisions and trade-offs](#14-decisions-and-trade-offs) +15. [Known limitations and the next iteration](#15-known-limitations-and-the-next-iteration) +16. [Glossary](#16-glossary) + +--- + +## 1. What VidCast does + +*VidCast turns a video into a downloadable audio file. You upload a recording, it +strips out the sound, and emails you a link to the MP3 — useful for turning a +recorded talk or Zoom call into a podcast.* + +The problem it solves is mundane but real: people record video but often only want +the **audio** — a lecturer turning a recorded class into a podcast, a journalist +pulling a clip for radio, a student who wants to listen to a webinar on the bus. +Doing that by hand means installing fiddly software. VidCast does it in a few clicks. + +The experience, end to end: you open the website, **sign up or log in**, **upload** +an MP4 video, and carry on with your day. Behind the scenes the system extracts the +audio, and within seconds a small red **badge** appears on the site (and an **email** +lands in your inbox) saying your file is ready. You click **Download** — or open +**My Conversions** to see your whole history — and get your MP3. If you're an +**administrator**, you also see a control panel to manage other users. That's the +whole product. The interesting part — and what this guide is really about — is the +engineering that makes it reliable, secure, and reproducible. + +--- + +## 2. The big picture — architecture overview + +*VidCast is built as **microservices**: instead of one big program, several small +programs each do exactly one job and talk to each other through well-defined +channels. 
They run on Kubernetes (an automated "shift manager" for software) on +Amazon's cloud.* + +> **The metaphor we'll use throughout:** imagine a company where every employee has +> exactly one job — a **receptionist** who greets every visitor, a **bouncer** who +> checks IDs, a **chef** who does the actual work, a **courier** who delivers the +> result, a **librarian** who files things away. Crucially, they never reach into +> each other's desks; they pass **formal memos** down a conveyor belt. That +> discipline is what makes the company easy to reason about, fix, and scale — and +> it's exactly how VidCast is built. + +Here's the cast and how a request flows: + +``` + You (browser) + │ + ▼ + ┌──────────────────┐ + │ Frontend │ React website served by nginx (the dining room + waiter) + └──────────────────┘ + │ /api/... + ▼ + ┌──────────────────┐ + │ Gateway │ the front desk — checks your wristband, routes everything + └──────────────────┘ + │ │ │ + login│ upload│ download│ + ▼ ▼ ▼ + ┌─────────┐ ┌─────────────┐ stream MP3 back + │ Auth │ │ MongoDB │◄────────────────── + │ service │ │ (files) │ + └─────────┘ └─────────────┘ + │ checks │ drop a "convert this" memo + ▼ ▼ + ┌─────────┐ ┌──────────────┐ "video" mailbox ┌────────────┐ + │Postgres │ │ RabbitMQ │───────────────────►│ Converter │ (the chef: MoviePy/ffmpeg) + │ (users) │ │ (conveyor) │◄───────────────────│ │ + └─────────┘ └──────────────┘ "mp3" mailbox └────────────┘ + │ "it's ready" memo + ▼ + ┌────────────────┐ + │ Notification │ the courier — emails YOU the link (Gmail SMTP) + └────────────────┘ +``` + +The **four backend microservices** are *auth* (identity), *gateway* (front door), +*converter* (the chef), and *notification* (the courier). The **frontend** is a +separate React app. 
Behind them sit **three data services**: *MongoDB* (stores the +big video/audio files), *PostgreSQL* (stores the list of users), and *RabbitMQ* (the +conveyor belt that lets the gateway hand a job to the converter without making you +wait). + +All of this runs inside **Kubernetes** (often "K8s") on **AWS EKS** (Amazon's +managed Kubernetes). Kubernetes is a **shift manager for software**: it keeps the +right number of each "employee" on duty, restarts anyone who collapses, and can +clone busy ones. Each running copy of a service is a **pod** — think of a *sealed +glass jar* with the program and everything it needs inside, so it behaves +identically wherever it runs. The outside world reaches specific services through +numbered **doors** punched in the cluster wall (called **NodePorts**): the website +is door `30006`, the gateway `30002`. + +--- + +## 3. The microservices in detail + +Each service is a small Python (or, for the frontend, JavaScript) program. The rule +they all obey: **do one job, trust nobody by accident, and talk through defined +channels.** + +### 3.1 auth-service — the bouncer + +- **Job:** prove *who you are*. It handles `login`, `signup` (`/register`), token + issuing, and — added in this project — telling the rest of the system your **role** + (admin or ordinary user). +- **Built with:** Python + Flask (a lightweight web framework), `PyJWT` (for the + wristband), `bcrypt` (for password scrambling), and `psycopg2` (to talk to + PostgreSQL). +- **Talks to:** PostgreSQL (the user list) downstream. Upstream, only the *gateway* + calls it — it sits on an internal-only address. +- **If it disappeared:** nobody could log in or sign up. Existing wristbands would + keep working until they expired (it's *stateless* — see Section 6), but no new + ones could be issued. +- **Interesting code:** `src/auth-service/server.py`. 
The heart is `CreateJWT`, which + stamps your details onto the wristband, and `/login`, which checks your password + against a scrambled fingerprint: + + ```python + # The wristband carries BOTH a simple admin flag (for older code that reads it) + # AND a richer role string (so we can add more roles later without breaking things). + "admin": role == "admin", + "role": role, + ``` + +### 3.2 gateway-service — the front desk + +- **Job:** the single front door. *Every* request from the website hits the gateway + first. It checks your wristband, then routes you: logins go to auth, uploads go to + storage + the conveyor belt, downloads stream files back, and admin requests are + gated to admins only. It also exposes `/my-files` (your history) and `/admin/users` + (the admin panel). +- **Built with:** Python + Flask, `PyMongo`/`gridfs` (file storage), `pika` (RabbitMQ), + `requests` (to call auth), and `flask-cors` (so the browser is allowed to call it). +- **Talks to:** auth (to validate wristbands), MongoDB (files), RabbitMQ (jobs). + Everything the browser does flows through here. +- **If it disappeared:** the whole app would go dark — it's the only public entrance. +- **Interesting code:** `src/gateway-service/server.py`. Note how upload was changed + from "admins only" to "any logged-in user" — a one-word change with big meaning + (Section 6): + + ```python + # Uploading is a core action for ANY authenticated user — not just admins. + if not access: + return "not authorized", 401 + ``` + +### 3.3 converter-service — the chef + +- **Job:** do the actual work. It waits at the **"video" mailbox**, and whenever a + job appears, it fetches the video, extracts the audio, saves the MP3, and drops a + **"it's ready" memo** in the "mp3" mailbox. +- **Built with:** Python, `pika` (RabbitMQ), `pymongo`/`gridfs`, and **MoviePy** — + a library that drives **ffmpeg** (the industry-standard audio/video tool) under + the hood. 
The actual conversion is essentially one line: + + ```python + audio = moviepy.editor.VideoFileClip(tf.name).audio # pull the audio track out + ``` +- **Talks to:** RabbitMQ (in and out) and MongoDB (read the video, write the MP3). +- **If it disappeared:** uploads would still succeed and pile up in the "video" + mailbox, but nothing would get converted — the queue would grow until a converter + came back to drain it. (This is a *feature* of the conveyor-belt design: a backlog + waits patiently instead of being lost.) +- **It runs 2 copies** so two videos can convert at once. + +### 3.4 notification-service — the courier + +- **Job:** wait at the **"mp3" mailbox**, and whenever a "ready" memo appears, email + the person who uploaded the video, using Gmail. +- **Built with:** Python, `pika`, and Python's built-in email/`smtplib` (the postal + system for *sending* mail). +- **Talks to:** RabbitMQ (in) and Gmail's outgoing mail server (out). +- **If it disappeared:** conversions would still complete and be downloadable — users + just wouldn't get the courtesy email. +- **The "never-raise" contract:** this service was rewritten so that a single bad + email *can never crash it* (the story is in Section 13). It now returns one of two + answers — "done, remove the memo" or "couldn't, try later" — and handles every odd + case gracefully: + + ```python + if not receiver_address: # an old memo with no recipient + return None # skip it, don't crash, carry on + ``` + +### 3.5 frontend — the dining room + +- **Job:** everything you see. Login, sign-up, upload, download, the **My + Conversions** history page, the **admin user-management** page, and a navbar that + shows different tabs depending on your role. +- **Built with:** React (a popular UI library) + Vite (a build tool) + Tailwind CSS + (styling), packaged behind **nginx** (a fast web server that also forwards `/api` + calls to the gateway). +- **Talks to:** only the gateway, via `/api/...`. 
+- **If it disappeared:** power users could still poke the gateway directly with + command-line tools, but normal people would have no way in. +- **Interesting detail:** the website reads your wristband to decide which tabs to + show. But hiding a tab is just tidiness — the *real* lock is on the gateway, so + even typing the admin URL directly bounces a non-admin away. + +--- + +## 4. The data layer + +Three different storage systems, each chosen because it's the right tool for a +different shape of data. + +### 4.1 MongoDB + GridFS — the file room + +MongoDB stores big files (the videos and MP3s). **GridFS** is the part of MongoDB +designed for large objects: it *tears each file into manageable chunks* and shelves +them, reassembling on demand. > **Analogy:** a librarian who tears a thick book into +chapters before shelving, so no single shelf has to hold the whole tome — and can +hand you back the reassembled book when you ask. We also attach a small label to +every file — `owner_email` — so the system can answer "which files are *yours*?" + +### 4.2 PostgreSQL — the staff roster + +PostgreSQL is a classic table-shaped database, perfect for the **user list**: one +row per user, with columns `email`, `password` (a scrambled fingerprint, never the +real password), `role` (admin/user), and `created_at`. > **Analogy:** the staff +roster binder with a role badge next to each name. It's the single source of truth +for *who exists and what they're allowed to do*. + +### 4.3 RabbitMQ — the post office + +RabbitMQ holds two durable **queues** (mailboxes): **`video`** (jobs going in) and +**`mp3`** (results coming out). Its whole purpose is **decoupling**: the gateway can +drop a job and immediately tell you "we're on it" without waiting for the slow +conversion, and the converter picks jobs up whenever it's free. > **Analogy:** a +post office with two mailboxes — *videos in, audio out*. 
"Durable" means the mail +survives even if the post office briefly closes (a pod restart) — letters aren't lost. + +--- + +## 5. The upload-to-download journey + +*Here's exactly what happens, step by step, the moment you upload a video. Follow +the numbers — no technical background needed.* + +1. **You click Upload.** The website (frontend) sends your video to the gateway at + `/api/upload`, attaching your **wristband** (the token proving who you are). +2. **The gateway checks your wristband** by asking the auth service "is this real and + not expired?" If yes, it learns your email. If no, you're turned away (`401`). +3. **The gateway stores the video** in MongoDB, stapling your email to it as the + `owner_email` label — like a coat-check ticket that stays on through the whole + process. +4. **The gateway drops a memo** in the RabbitMQ **"video" mailbox**: *"convert file + X; it belongs to you@example.com."* Then it immediately replies to the website + **"success!"** — you're free to go. *(This is the magic of the conveyor belt: you + never wait for the slow part.)* +5. **A converter picks up the memo** (whenever one is free), fetches the video from + MongoDB, and runs **MoviePy/ffmpeg** to extract the audio — a few seconds for a + short clip. +6. **The converter saves the MP3** back into MongoDB, copying the same `owner_email` + label onto it, and drops a new memo in the **"mp3" mailbox**: *"file X is ready + for you@example.com."* +7. **The notification service picks up that memo** and **emails you** a download + reference, using Gmail. The email goes to *the address you uploaded with* — never + a hard-coded one. +8. **Meanwhile, the website is quietly polling** the gateway every few seconds: + "any new files for me?" The moment your MP3 exists, the count comes back as 1 and + a **red badge** appears on the Download tab. +9. **You click Download** (or open **My Conversions**). 
The gateway confirms your + wristband, fetches the MP3 from MongoDB, and **streams it back** to your browser + as a file. Done. + +From your point of view it felt instant and you got an email. Underneath, five +independent services collaborated through two mailboxes and two databases — and any +one of them could have been restarted mid-flight without losing your job. + +--- + +## 6. Authentication and authorisation — the deep dive + +*This is the area assessors probe hardest, so we go deep. Two ideas that sound alike +but are completely different: **authentication** (proving who you are) and +**authorisation** (what you're allowed to do).* + +### 6.1 Authn vs authz — the core distinction + +- **Authentication ("authn") = "are you who you say you are?"** Showing ID at the + door. In VidCast that's `/login`: email + password → if correct, you get a + wristband. +- **Authorisation ("authz") = "are you allowed to do this?"** Which doors your + keycard opens *once you're inside*. + +> **The hotel analogy:** authentication is the photo ID proving you're a guest; +> authorisation is the keycard saying which doors open. Every guest can ride the lift +> and enter their own room (upload/download); only staff keycards open the back +> office (the admin panel). VidCast's original bug was handing **every** guest a +> *master keycard* — more on that below. + +This distinction drove a concrete fix: uploading a video only requires +**authentication** (any logged-in user). Seeing the admin panel requires +**authorisation** (the admin role specifically). The old code confused the two and +demanded "admin" just to upload — which only "worked" because everyone was secretly +admin. + +### 6.2 The JWT lifecycle — a wristband, not a logbook + +A **JWT** (JSON Web Token) is a **festival wristband**. When you log in, the auth +service issues one stamped with your details and sealed so it can't be forged. You +show it on every request; the gateway reads it. 
Crucially this is **stateless** — +the server keeps **no logbook** of who's logged in. Everything needed is *on the +wristband*, and a cryptographic seal proves it's genuine. (Why that matters: any +copy of the gateway can serve you without sharing a central session list — it scales +effortlessly.) + +The wristband carries four things: + +| Claim | Meaning | Plain English | +|---|---|---| +| `username` | your email | who you are | +| `admin` | true/false | the simple "are you staff?" flag | +| `role` | `"admin"` or `"user"` | the richer role (room to add more later) | +| `exp` | expiry timestamp | the wristband stops working after 1 day | + +Validation: when a request arrives, the gateway hands the wristband back to the auth +service, which re-checks the seal and the expiry. Tamper with it and the seal breaks; +wait too long and `exp` rejects it. + +### 6.3 bcrypt — the one-way blender + +Passwords are never stored as readable text. They're put through **bcrypt**, a +**one-way blender**: you can turn a strawberry into a smoothie, but you can't turn the +smoothie back into a strawberry. At login we blend what you typed and compare +*smoothies* (`bcrypt.checkpw`), never the original fruit. + +```python +if not bcrypt.checkpw(typed_password.encode(), stored_hash.encode()): + return "Could not verify", 401 # the smoothies don't match +``` + +Two properties make bcrypt the right choice: +- **One-way:** a thief who steals the database gets smoothies, not passwords. +- **Salted and slow:** a pinch of randomness (**salt**) means two people with the + same password get *different* smoothies, and the blender is deliberately slow so + an attacker can't try billions of guesses per second. + +### 6.4 RBAC and the three guardrails + +**Role-Based Access Control (RBAC)** is the formal name for "what you can do depends +on your role." 
Enforcement lives at the **gateway**: it reads the `admin` claim from +the (verified) wristband and rejects non-admins from admin endpoints with a `403` +("forbidden"). The admin panel can promote/demote users, protected by three rails: + +- **Self-demotion → `403`.** You cannot change *your own* role. Stops an admin + accidentally locking themselves out. +- **Last-admin demotion → `409`.** The system refuses a change that would leave + **zero** admins — nobody could ever get back in. +- **Unknown user → `404`.** Changing someone who doesn't exist fails cleanly. + +> **A subtle, clever point assessors love:** the `409` "last admin" rule looks +> redundant next to the `403` "not yourself" rule — if you're the only admin, +> demoting yourself is already blocked. But the `409` catches a sneakier case: +> someone whose admin rights were *just revoked* but who still holds a valid +> wristband from a minute ago could otherwise demote the last *real* admin. The two +> rules guard different things — your **identity** versus the **system's health** — +> so they're complementary, not duplicate. + +Every promote/demote also writes an **audit line** to the logs: *who* changed *whom*, +to *what*. (Making that line actually appear was its own small saga — Section 13.8.) + +### 6.5 The "everyone was an admin" story + +When we opened the original code, we found the wristband-stamping function had +`admin: True` **hard-coded** — *every* login, and worse, every *sign-up*, minted an +admin. RBAC was effectively switched off, and a stranger could create an account and +own the system (a **privilege-escalation hole**). We rebuilt it: real roles in the +database, the wristband carrying your *true* role, sign-ups locked to ordinary +"user," and the gateway enforcing the difference. That rebuild is the foundation +everything else in this project sits on. + +--- + +## 7. 
Infrastructure — what we provisioned and why + +*Everything VidCast runs on is defined as code and created on Amazon's cloud. Nothing +was clicked together by hand — which is why we can destroy it to save money and +rebuild it identically in 20 minutes.* + +- **AWS, one region (`eu-west-2`, London).** AWS is the cloud provider — rented + computers, networks, and storage. We use a **single region** deliberately: it's a + learning/dev project, and one region is cheaper and simpler. A bank would spread + across regions for disaster recovery; we don't need to. + +- **EKS — managed Kubernetes.** Running Kubernetes yourself means babysitting its + "brain" (the *control plane*). **EKS** is Amazon running that brain for you, so we + only manage the *workers*. > **Analogy:** EKS is hiring a managed building with the + security and plumbing already run; we just furnish the offices. + +- **Terraform — Infrastructure as Code.** Instead of clicking buttons in a console, + we *write down* the infrastructure we want in files, and Terraform makes reality + match. `terraform plan` shows the diff ("here's what I'll change"); `terraform + apply` does it; `terraform destroy` removes it. The state — Terraform's memory of + what exists — lives in an **S3 bucket** (Amazon's file store), locked by a + **DynamoDB** table so two people can't change it at once. > **Why local state is + forbidden:** if that memory lived on one laptop, a teammate (or the CI robot) + would have no idea what already exists and could create duplicates or clobber + things. A shared, locked memory keeps everyone honest. + +- **VPC, subnets, security groups, IAM roles — the walls and keys.** The **VPC** is + a private network — VidCast's own fenced compound. **Subnets** are rooms within it + (we use two, in two availability zones, for the cluster). **Security groups** are + doormen on each door, allowing only specific traffic (e.g. 
the website port from + the public, the admin ports only from the operator's home IP). **IAM roles** are + job-specific keyrings — the cluster's keyring, the worker nodes' keyring — each + holding only the permissions that job needs and no more. + +- **The node group — one `m7i-flex.large`.** The worker machine where the pods + actually run: 2 CPUs, 8 GB RAM, Kubernetes 1.31. We run **one** node for dev + (auto-scaling allowed between 1 and 2). > **Why this size and not a tiny one:** the + cluster runs ~12 pods at once; a smaller machine couldn't fit them. > **Why not a + cheaper "burstable" T-type machine:** this AWS account rejects a setting EKS forces + on T-type machines — we lost 40 minutes to that in May before switching. For + production you'd run several larger nodes across zones for resilience. + +- **OIDC — temporary visitor badges for the robot.** The CI/CD robot needs + permission to deploy to AWS. The naïve way is to hand it a permanent AWS key — a + master key that, if leaked, is a disaster. Instead we use **OIDC federation**: + GitHub vouches for the robot, and AWS issues a **short-lived visitor badge** valid + for one job. The trust policy says, in effect, *"only accept badges from GitHub + workflows in **this specific repo**"*: + + ``` + token.actions.githubusercontent.com:sub StringLike "repo:GITHUB_OWNER/vidcast:*" + ``` + No long-lived secret ever touches the robot. If GitHub were compromised the badge + still only works for our one repo, and only for the moment a job runs. + +--- + +## 8. The CI pipeline (GitHub Actions) + +*"CI" (Continuous Integration) is the **quality gate**: every time code changes, an +automated assembly line checks it and packs it into shippable containers. Ours runs +on GitHub's servers, defined in `.github/workflows/ci.yml`.* + +It triggers on pull requests and on pushes to `main`, but only when files under +`src/**` change (no point rebuilding for a docs-only edit).
+ +| Stage | What runs | When | Why it's there | +|---|---|---|---| +| **Checkout** | `actions/checkout` | every run | copies the code onto the robot's workbench | +| **Lint** | `ruff check src/ --exclude src/frontend` | PR + push | catches sloppy or broken Python *before* a human reviews it — like spell-check for code | +| **Build** | `docker build` per service (5 in parallel — the four backend services plus `outbox-relay`) | PR + push | proves each service's container actually builds; a typo in the recipe fails here | +| **Security scan** | Trivy (`severity CRITICAL,HIGH`, `exit-code 1`, `ignore-unfixed`) | PR + push | scans each container for known vulnerabilities and **blocks the build** if it finds a serious, fixable one | +| **Push** | `docker push` to Docker Hub | **`main` push only** | publishes the finished containers to the warehouse — but *only* once code is merged | + +A few things worth understanding: + +- **What "lint" actually catches.** `ruff` is a Python linter — it flags unused + imports, undefined names, risky patterns. It's fast and cheap and catches a whole + class of "oops" before review. When it fails, the fix is usually a one-liner. + +- **What Trivy actually does, and why it can fail.** **Trivy** is a security scanner. + It reads everything baked into a container — the operating-system packages, the + Python libraries — and cross-references a public database of known + vulnerabilities. If it finds a **CRITICAL** or **HIGH** issue that *has a fix + available* (`ignore-unfixed` skips ones nobody can fix yet), it stops the line + (`exit-code 1`). Earlier in the project this gate failed repeatedly, and fixing it + meant upgrading library versions until the scan came back clean — a real, instructive + battle (Section 13 references it). + +- **The deliberate choice: PR builds *don't* push images.** On a pull request, CI + builds and scans the containers but does **not** publish them — publishing only + happens on a push to `main`.
*Why:* it keeps the warehouse free of half-baked + experiment images and enforces "nothing ships until it's merged." *The trade-off:* + it means you can't do a true pre-merge test on the real cluster (the images don't + exist yet) — which bit us once and is documented honestly in + `docs/DECISIONS_MADE.md`. + +> **Honest note:** there is **no automated unit-test stage** yet — CI is lint, build, +> scan, push. Adding tests is named as a gap in Section 15. We're not pretending it's +> there. + +--- + +## 9. The CD pipeline (GitHub Actions) + +*"CD" (Continuous Deployment) is the **delivery line**: once CI has approved and +published the containers, CD ships them to the live cluster — automatically, with no +human running commands. Defined in `.github/workflows/cd.yml`.* + +| Stage | What runs | Why | +|---|---|---| +| **Trigger** | when CI finishes successfully on `main` | only deploy code that *passed* the quality gate | +| **Get a visitor badge** | `aws-actions/configure-aws-credentials` via the **OIDC role** | short-lived AWS access, no stored keys (Section 7) | +| **Point kubectl at the cluster** | `aws eks update-kubeconfig` | so the robot can issue cluster commands | +| **Deploy** | `kubectl set image` on each of the 4 backend deployments | swaps in the new container version | +| **Verify** | `kubectl rollout status` | waits and confirms the new version came up healthy | + +The key concept here is the **rolling restart**. When `kubectl set image` runs, +Kubernetes doesn't yank the old version down and leave a gap — it **brings new pods +up first, waits for them to be healthy, then drains the old ones**. > **Analogy:** +swapping the engine on a moving train by attaching a new carriage, moving everyone +across, then detaching the old one — the passengers never stop moving. The app is +never offline during a deploy. 
+ +CD also gives a free **audit trail**: GitHub records *who* triggered each run, *which +commit* it deployed, and the *outcome* — so there's always a record of what went +live and when. + +> **Note:** CD updates the **four backend** services. The **frontend** is deployed +> separately (Section 10) because building it needs Node.js, which this pipeline's +> setup doesn't include. + +--- + +## 10. How Docker Hub connects to Git + +*This is the "trust chain" from a developer's keyboard to a running container — how a +saved code change becomes a live service.* + +A **Docker image** is a **vacuum-sealed package** containing a program and everything +it needs; a **registry** is the warehouse that stores those packages. The chain: + +1. **A developer commits** code to Git and **pushes** to GitHub. +2. **GitHub Actions wakes up**, clones the repo onto a fresh robot, and **builds** the + Docker image for each changed service. +3. **On a `main` push, the robot logs in to Docker Hub** as the project's Docker Hub user, using a + **token** kept in GitHub's encrypted **Secrets** (`DOCKERHUB_USERNAME` + + `DOCKERHUB_TOKEN`), and **pushes** each image. +4. Images are tagged with the exact **commit ID** (e.g. `…/auth-service:c36b319`) — + *not* a moving `:latest` tag. > **Why the commit ID and not `:latest`:** "latest" + is ambiguous — it means something different every day. A commit ID is precise and + permanent, so you always know *exactly* which code is running and can reproduce or + roll back to it. (This is a deliberate choice; many projects use `:latest` for + convenience and regret it.) +5. **On deploy, the cluster pulls** that exact image from Docker Hub by its commit ID. + +> **Why a token, not the account password:** the token is **revocable and scoped** — +> like giving a contractor a key that only opens the supply closet and can be +> cancelled, rather than your house key. If it leaks, you revoke that one token; the +> Docker Hub account itself is never exposed.
+ +**The frontend exception.** The four backend services go to **Docker Hub**. The +**frontend** goes to **ECR** (Amazon's private registry) and is **built by hand**, +because compiling the React app needs Node.js, which the current backend-focused CI +doesn't set up. The cluster's worker machine has built-in permission to pull from the +account's own ECR, so no extra password is needed. (Folding the frontend into CI is a +named next step.) + +--- + +## 11. Dev vs Prod — two pipeline systems + +*VidCast carries two delivery systems on purpose, because the bootcamp curriculum +covers both and they're good at different things.* + +| | **GitHub Actions** (dev — in use today) | **Jenkins** (prod — pipeline written, server not yet running) | +|---|---|---| +| **Runs on** | GitHub's servers | infrastructure *you* control (your own VMs/pods) | +| **Best for** | fast setup, open-source, tight repo integration | heavy custom logic, internal corporate systems, multi-stage approvals | +| **Where it lives** | `.github/workflows/*.yml` | `Jenkinsfile` | + +To be precise about status: the **dev pipeline (GitHub Actions) is mature and in +daily use** — it's what built and deployed everything in this guide. The **Jenkins +pipeline is fully *written*** — `Jenkinsfile` is a complete 122-line, 8-stage +pipeline — but there is **no running Jenkins server executing it yet**. The +*pipeline-as-code* exists; the *machine to run it* is the next iteration. + +What that Jenkinsfile already describes is notably more production-shaped than the +GitHub flow: + +1. **Checkout** → 2. **Lint** → 3. **Build** all four images (in parallel) → +4. **Security scan** (Trivy) → 5. **Push** to the registry → 6. **Deploy to +staging** (a cheap **Docker Swarm** environment via `docker stack deploy`) → +7. **Smoke-test staging** (`curl -f .../healthz` — fail the build if the health +check fails) → 8. **Manual approval gate** (*"Staging passed. Deploy to +Production?"* — a human must click) → 9. 
**Deploy to production** (EKS). + +> **Why Docker Swarm for staging:** a second full EKS cluster for testing would cost +> roughly as much as production. A tiny Docker Swarm setup on a small machine costs a +> fraction and is functionally close enough to catch problems before they reach the +> real cluster. The bootcamp deliberately connects its "Docker Swarm" module to its +> "Kubernetes" module this way. + +The production-grade extras a finished Jenkins setup would add: explicit +staging→production promotion with the **manual approval gate** (already in the file), +automated **rollback** if a health check fails after deploy, **blue-green or canary** +releases (ship to a slice of users first), and hooks into on-call alerting. Those are +the road map, not today's reality — and we say so plainly. + +--- + +## 12. Observability + +*"Observability" answers the question: when something goes wrong at 2 a.m., can you +tell **what** and **why**? There are three complementary layers, because they +answer different questions; VidCast implements the first two today.* + +- **Logs — the diary.** Every service prints what it's doing to its output, captured + by `kubectl logs`. Logs answer *"what happened, in order?"* This is where the + **admin audit trail** lives — every promote/demote prints `AUDIT admin_role_change + admin=… target=… new_role=…`. (Getting those lines to actually appear took a + one-line fix — Section 13.8.) + +- **Metrics — the dashboard gauges.** We install **kube-prometheus-stack**, a bundle + of **Prometheus** (which collects numbers over time — CPU, memory, pod restarts, + node health) and **Grafana** (which draws them as live dashboards, on door `30007`, + with a custom "VidCast Operations" dashboard and alert rules for crash-loops and + high CPU/memory). Metrics answer *"is the system healthy right now, and what's the + trend?"* + + > **Honest scope:** Prometheus here scrapes **cluster- and node-level** metrics — + > it does *not* yet collect custom per-service business metrics (e.g.
"conversions + > per minute"). The app code doesn't expose them (a `prometheus-client` library was + > declared early but left unused and dropped). Per-service metrics are a named gap + > in Section 15. + +- **Traces — the journey map.** *(Not implemented.)* Tracing follows a single request + across every service to find where time was spent. We don't have it; for a system + this size, logs + metrics suffice, and we note tracing as a "if this grew" item. + +> **Why three layers matter:** a metric tells you *the kitchen is on fire* (CPU is +> pegged); a log tells you *which dish caused it* (the error message); a trace would +> tell you *which step in that dish's recipe was slow*. Different questions, different +> tools. + +--- + +## 13. The journey — problems faced and how we solved them + +*Every real project is a sequence of problems. Here are the eight that mattered most, +told as stories, roughly in order. The recurring lesson: discipline — small honest +checks, written-down recovery plans — pays off exactly when things break.* + +### 13.1 The May crash loop — workers stuck in a reboot spiral + +The first deployment looked alive but wasn't working. Two services — the converter +and the courier — were in a **crash loop**: starting, falling over, restarting, +forever. The root cause was mundane and two-fold: the RabbitMQ **mailboxes hadn't +been created**, so the workers panicked trying to listen at a mailbox that didn't +exist; and the Gmail login was misconfigured. We created the queues up front and +fixed the mail settings, and the workers settled. **Lesson:** a service that depends +on something must fail *loudly and early* if that something is missing — which led +directly to the health-check and startup fixes that followed. + +### 13.2 "Everyone is an admin" — the hidden master key + +While planning the roles feature, we read the token code and found `admin: True` +*hard-coded* into every wristband. 
The system had been handing out master keys to +everyone, and nobody had noticed because nothing visibly broke — the door was +unlocked, so every push opened it. This single discovery reframed the whole piece of +work: it wasn't "add roles," it was "the access control has never actually been on." +**Lesson:** "it works" is not the same as "it's correct" — a security control that's +silently disabled looks identical to one that's working, until someone checks. + +### 13.3 The sign-up that made strangers into admins + +Worse than 13.2: the brand-new self-service sign-up handed each new account an +**admin** wristband. Anyone on the internet could create an account and own the +system — a textbook **privilege-escalation hole**. The fix was a few lines (new +accounts are always ordinary "user"), but the *finding* mattered: it was caught by +reading the code adversarially before shipping, not by a user exploiting it. +**Lesson:** review your own work as if you were trying to break it. + +### 13.4 The login that cried "fire" — the psycopg2 `None` bug + +A subtle one. The database library's `execute()` command always returns *nothing* +(`None`), but the login code was written as if that nothing meant "no user found." +The result: when an **unknown** person tried to log in, instead of a clean "you're +not on the list" (`401`), the system threw a confusing internal error (`500`) — the +equivalent of setting off the fire alarm when a stranger knocks. We rewrote it to +decide based on the *actual database result*. **Lesson:** if your front door can't +reliably say "no," every lock you build on top of it is theatre. + +### 13.5 The runbook hiding in a private notebook + +During our own pre-ship review, we caught something easy to miss: the **recovery +recipe** for the risky database upgrade was written inside a file that was +*deliberately excluded from the shared repository* (it was personal study material). 
+Had a teammate cloned the project fresh, the single most important operational +document would have been missing. We moved it into the official, shared docs — +carefully stripping out a password first. **Lesson:** the value of a runbook is zero +if it isn't where the next person will look. + +### 13.6 The pipeline that wouldn't pre-test + +Planning the integration test, we hit a wall: we wanted to test the new code on the +real cluster *before* merging — but the CI pipeline only publishes containers on a +push to `main`, so the pre-merge containers simply didn't exist to deploy. This is a +genuine consequence of a sensible policy (don't pollute the registry with experiments). +We documented the constraint, chose a "merge then verify with a fast rollback ready" +approach, and wrote down the trade-off. **Lesson:** sometimes the right move is to +name a limitation honestly rather than bolt on a hack to route around it. + +### 13.7 The deployment that broke every login — and the runbook that saved it + +This is the one worth telling in full. Our new login uses scrambled (bcrypt) +passwords, which requires the **database** to be upgraded in lockstep — a +bcrypt-expecting login against an old plain-text database is *a new lock fitted to a +door whose keys everyone still holds in the old shape*: nothing opens. When the work +was merged, the automated pipeline did its job and **instantly deployed the new login +code** — but the database upgrade is a deliberate manual step that hadn't run yet. For +a few minutes, **every login on the live site returned an error.** No panic, though: +we'd *written the recovery recipe in advance* (the very runbook from 13.5). We ran the +database upgrade, and logins came back to life immediately; then we shipped the new +frontend and ran a full top-to-bottom test. **Lesson — the whole project in +miniature:** the failure was real, but because the recovery was documented and +rehearsed, it was a five-minute fix, not an outage. 
We also learned a permanent rule: +once the database is upgraded, you can't roll *back* the login code (the old code +can't read the new scrambled passwords) — recovery is always *forward*. That's now +written into the decision log. + +### 13.8 The audit log that wrote to nowhere + +The final test passed but for one oddity: the admin **audit lines** (who promoted +whom) weren't showing up in the logs — even though the code was clearly writing them. +The cause was a classic gotcha: programs **buffer** their output, jotting notes on a +pad and only handing the pad over when it's full, to save effort. For a long-running +service, that pad might not be handed over for ages — so the audit notes sat in +memory, invisible. (Confusingly, the routine request logs *did* show, because they're +written a different, immediate way.) The fix was a single standard setting — +`PYTHONUNBUFFERED=1` — telling the program "hand over every note immediately." We +applied it, watched an audit line appear the instant a role changed, and confirmed it +survived the next automated deploy. **Lesson:** "the code is correct" and "the output +is visible" are two different claims — verify the second, not just the first. + +--- + +## 14. Decisions and trade-offs + +*Good engineering is making deliberate choices and being able to defend them. Each of +these follows the same shape: **what we chose, the alternatives, why we rejected them, +and the trade-off we accepted.** (The full versions live in `docs/DECISIONS_MADE.md`.)* + +- **Scramble passwords now, not "later."** *Alternatives:* add roles now and hash + passwords in a future pass. *Rejected because* doing access-control on unprotected + passwords is a half-measure an assessor would immediately question, and the login + image had to be rebuilt anyway. *Trade-off accepted:* a one-time, carefully + sequenced database upgrade (the one that briefly broke logins in 13.7). 
+ +- **Polling, not live-push, for the "ready" badge.** *Alternatives:* Server-Sent + Events or WebSockets (instant push). *Rejected because* for a single-user demo a + few seconds' lag is invisible (the conversion itself takes longer), and push adds + real complexity. *Trade-off accepted:* a few seconds of latency, and a known + upgrade path if usage ever grew to thousands of concurrent users. + +- **No in-app admin stats panel.** *Alternatives:* build a stats screen of uploads, + bytes, queue depth. *Rejected because* the Grafana dashboard already shows system + metrics properly; a second, weaker copy inside the app duplicates it. *Trade-off:* + admins read operational numbers in Grafana, not the app. + +- **Admin checks at the gateway only.** *Alternatives:* have every service + independently verify the wristband ("defence in depth"). *Rejected (for now) + because* the back-end services are sealed inside the cluster and only the gateway + is exposed. *Trade-off accepted:* a real but contained gap — a malicious pod + *inside* the cluster could call the auth service directly. Documented, with the + proper fix named (service-to-service identity / a mesh). + +- **Audit trail to stdout, not a tamper-proof ledger.** *Alternatives:* a dedicated, + append-only audit table. *Rejected because* it's a whole subsystem for a demo. + *Trade-off:* the audit answers who/whom/what but isn't tamper-evident — fine for a + dev system, named as a gap for a real one. + +- **Conservative admin guardrails.** *Alternatives:* let admins demote themselves + once another admin exists. *Rejected because* admin lockout is a self-inflicted + outage with no in-app recovery. *Trade-off:* slightly less flexibility for far more + safety. + +- **PR builds don't push images.** *Alternatives:* publish every PR's images. + *Rejected because* it clutters the registry and weakens merge discipline. 
+ *Trade-off:* genuine pre-merge cluster testing needs manual image building — which + bit us once (13.6) and is documented. + +--- + +## 15. Known limitations and the next iteration + +*Honest about what isn't built. For each, the "real fix."* + +- **PostgreSQL has no persistent disk.** If its pod restarts, the user table is lost + and must be re-seeded. *Fine for dev; the real fix* is attaching a persistent + volume or using a managed database (AWS RDS). +- **Services trust each other inside the cluster.** The auth service's user-management + endpoints trust any in-cluster caller; only the gateway's outer wall enforces + "admins only." *Real fix:* cryptographic service-to-service identity (mutual TLS or + a service mesh) so every hop re-checks. +- **Audit log is plain stdout.** Visible but not tamper-evident. *Real fix:* an + append-only audit store written in the same transaction as the change. +- **No automated tests.** CI lints, builds, and scans, but runs no unit/integration + tests. *Real fix:* a `pytest` stage gating every PR. +- **No per-service business metrics.** Monitoring is cluster/node level only. *Real + fix:* expose Prometheus metrics from each service (conversions, queue depth, error + rates). +- **Single region, one worker node.** No failover. *Real fix:* multiple nodes across + availability zones, and multi-region for true disaster recovery. +- **No automated rollback on a bad deploy.** *Real fix:* a health-gated deploy that + auto-reverts (the Jenkins pipeline is designed for this; the server isn't running). +- **The production Jenkins server isn't provisioned** (the pipeline-as-code is fully + written). *Real fix:* stand up a Jenkins instance and connect it. +- **Two features deferred:** in-browser **audio preview** (play before downloading) + and **email verification** on sign-up. Both are scoped and waiting, neither needed + for the core demo. + +--- + +## 16. 
Glossary + +- **API / endpoint** — a specific "service window" a program offers, like the + different windows at a post office. `/login` and `/upload` are endpoints. +- **JWT (token)** — a tamper-proof festival wristband proving who you are; shown on + every request so you don't re-log-in each time. +- **bcrypt** — a one-way blender for passwords; you can check a match but can't + reverse it. +- **Microservice** — one small program doing exactly one job, talking to others + through defined channels (vs. one giant do-everything program). +- **Queue (RabbitMQ)** — a mailbox/conveyor belt that lets one service hand work to + another without waiting. +- **Container** — a running program sealed with everything it needs, so it behaves + the same everywhere. +- **Image** — the vacuum-sealed package a container is started from; the recipe. +- **Registry (Docker Hub / ECR)** — the warehouse storing images. +- **Pod** — a sealed glass jar holding a running container, the unit Kubernetes + manages. +- **Kubernetes (K8s)** — the automated "shift manager" that keeps the right software + running and restarts what fails. +- **EKS** — Amazon running Kubernetes' "brain" for you, so you only manage the + workers. +- **Helm** — a package manager for Kubernetes; installs ready-made bundles (we use it + for MongoDB, PostgreSQL, RabbitMQ, and the monitoring stack). Think "app store for + cluster components." +- **Terraform** — Infrastructure as Code: you write down the cloud you want, it makes + reality match. +- **OIDC** — a way to issue short-lived "visitor badges" so the CI robot never holds a + permanent cloud key. +- **IAM** — Amazon's permission system; job-specific keyrings that grant only what's + needed. +- **GridFS** — MongoDB's way of storing big files by tearing them into chunks. +- **CI/CD** — the automated assembly line (CI checks + packs code) and delivery line + (CD ships it). +- **Trivy** — a scanner that blocks containers carrying known serious vulnerabilities. 
+- **ffmpeg** — the industry-standard audio/video tool; MoviePy drives it to extract + the audio. +- **Rolling restart** — deploying a new version by bringing it up before taking the + old one down, so the app is never offline. + +--- + +*This guide is self-contained: a group member can read it cover to cover and have +full context, a guest can follow the upload-to-download story without prior +knowledge, and an assessor can see the reasoning behind every decision. For the +line-by-line code companions, see the `*_EXPLAINED.md` files alongside each service; +for the formal trade-off log, `docs/DECISIONS_MADE.md`; for bringing the cluster back, +`DEPLOYMENT_GUIDE.md`.* diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000..8efea3a --- /dev/null +++ b/docs/README.md @@ -0,0 +1,58 @@ +# VidCast Documentation + +This folder holds the project's documentation. Pick the document that matches what +you're trying to do. + +## Where to start + +| If you want to… | Read this | +|------------------|-----------| +| **Run the project yourself**, from cloning to teardown | [`GETTING_STARTED.md`](GETTING_STARTED.md) | +| **Understand the whole project** — for assessors, teammates, or non-technical guests | [`PROJECT_GUIDE.md`](PROJECT_GUIDE.md) | +| **Look up a specific component**, port, or data flow | [`architecture.md`](architecture.md) | +| **Operate or destroy** an existing deployment in detail | [`deployment-guide.md`](deployment-guide.md) | +| **Present or demo** the project | [`presentation-notes.md`](presentation-notes.md) | +| Know **why** a design choice was made (RBAC, bcrypt, notifications) | [`DECISIONS_MADE.md`](DECISIONS_MADE.md) | +| **Merge the RBAC/bcrypt branch** without breaking logins | [`MERGE_RUNBOOK_RBAC.md`](MERGE_RUNBOOK_RBAC.md) | + +A typical first read: **`PROJECT_GUIDE.md`** to understand it, then **`GETTING_STARTED.md`** +to stand it up. 
+ +## Each document + +- **`GETTING_STARTED.md`** — The complete end-to-end walkthrough: prerequisites, clone, + configure, Terraform infra, Helm data services, seeding, deploying the microservices, + the end-to-end test, CI/CD secrets, monitoring, and teardown. Start here to run it. + +- **`PROJECT_GUIDE.md`** — The single comprehensive guide to VidCast, written so a + non-technical reader and an engineer both get value from it. Covers what the product + does, the architecture, every microservice, the data layer, the platform engineering + (Terraform, CI/CD, monitoring), and the decisions behind it all. + +- **`architecture.md`** — Architecture reference. Service inventory (technology, image, + ports, replicas, security posture per service), the event-driven data flow, and the + port map. Use it as a lookup, not a tutorial. + +- **`deployment-guide.md`** — Phase-by-phase operations reference: one-time state-bucket + bootstrap, Terraform, Helm, deploy, operate, and destroy. More granular than + `GETTING_STARTED.md` and aimed at someone already comfortable with the stack. + +- **`presentation-notes.md`** — A timed (12–15 min) script for demoing the project: + what to show, in what order, and how to frame it for an audience. + +- **`DECISIONS_MADE.md`** — Architectural decision records for the RBAC / notifications / + admin work. Each entry: what we chose, the alternatives, the trade-off accepted, where + it breaks, and the real fix at scale. + +- **`MERGE_RUNBOOK_RBAC.md`** — Operational runbook for the moment the RBAC + bcrypt + branch merges to `main`: the new auth image and the DB seed must land together or every + login fails. Contains no credentials. + +## Conventions + +Documentation contains **no real secrets**. Anything account-specific appears as a +placeholder you fill in — ``, `YOUR_STATE_BUCKET`, `admin@example.com`, +``, `YOUR_POSTGRES_PASSWORD`, and so on. 
+ +Project-level instructions for AI assistants live in [`../CLAUDE.md`](../CLAUDE.md); the +public overview is the root [`../README.md`](../README.md). diff --git a/docs/SLO.md b/docs/SLO.md new file mode 100644 index 0000000..422d3e2 --- /dev/null +++ b/docs/SLO.md @@ -0,0 +1,122 @@ +# SLO.md — VidCast Service Level Objectives (B4) + +> ## ⚠️ These targets are **demonstrative**, not a production guarantee +> +> VidCast runs on a **single-node EKS cluster that is deliberately torn down +> between sessions** to save cost (see the project memory / `MANAGED_SERVICES.md`). +> Every teardown is, by definition, 100% unavailability — so the **availability +> budget is exhausted the moment the cluster goes down**. Do **not** read these +> numbers as a claim that VidCast delivers 99.9% uptime. +> +> **The portfolio artifact is the machinery** — the multi-window multi-burn-rate +> PrometheusRules, the normalised burn-rate recording rules, and the error-budget +> Grafana dashboard — *not* the headline percentages. The same machinery, pointed +> at a real always-on deployment, would enforce real SLOs unchanged. + +--- + +## The three SLOs + +| # | SLO | Target | Window | SLI (how it's measured) | +|---|-----|--------|--------|--------------------------| +| 1 | **Availability** | 99.9% of gateway requests are non-5xx | 30 days | `vidcast_gateway_requests_total` — 1 − (5xx ÷ total) | +| 2 | **Conversion latency** | 95% of conversions finish ≤ 5 min | 30 days | `vidcast_conversion_duration_seconds` — fraction in the `le="300"` bucket | +| 3 | **End-to-end success** | 99% of uploads produce a notification email | 30 days | `vidcast_notifications_total{status="success"}` ÷ `vidcast_uploads_total` | + +All three SLIs come from the **M-2 metrics foundation** built in this sprint +(gateway `/metrics`, converter & notification `start_http_server`, RabbitMQ's +`rabbitmq_prometheus` plugin). Scrape wiring: `monitoring/scrape/`. 
+ +--- + +## Error budgets and burn-rate thresholds + +"Burn rate" = how fast you're spending the budget, **normalised** so **1× is the +exact rate that just exhausts the budget over the SLO window** and **14× is 14× +too fast**. The recording rules in `monitoring/alerts/vidcast-slo-rules.yaml` +store burn rates already normalised, so the alert thresholds are literally `> 14` +and `> 1`. + +### 1. Availability — 99.9% / 30 days +- **Budget factor (1 − SLO):** 0.001 +- **Error budget (time):** 0.1% × 30 d = **43.2 minutes** of allowed 5xx per 30 days +- **Fast-burn (page / critical):** 1h **and** 5m burn rate **> 14×** → at 14× the + 43.2-min budget is gone in ~51 h (30 d ÷ 14 ≈ 2.1 days). `for: 2m`. +- **Slow-burn (ticket / warning):** 6h **and** 30m burn rate **> 1×**. `for: 15m`. + +### 2. Conversion latency — 95% ≤ 5 min / 30 days +- **Budget factor:** 0.05 (5% of conversions may exceed 5 min) +- **Error budget:** 5% of all conversions in the 30-day window may be slow +- **Fast-burn (critical):** 1h **and** 5m burn rate **> 14×** (i.e. >70% of recent + conversions slower than 5 min). `for: 2m`. +- **Slow-burn (warning):** 6h **and** 30m burn rate **> 1×** (>5% slow). `for: 15m`. + +### 3. End-to-end success — 99% / 30 days +- **Budget factor:** 0.01 +- **Error budget (time-equivalent):** 1% × 30 d = **432 minutes (7.2 h)** of total + pipeline failure per 30 days; equivalently 1% of uploads may go un-notified +- **Fast-burn (critical):** 1h **and** 5m burn rate **> 14×**. `for: 5m`. +- **Slow-burn (warning):** 6h **and** 30m burn rate **> 1×**. `for: 30m`. + +Why **multi-window** (long **and** short): the long window (1h/6h) decides +severity; the short window (5m/30m) must *also* be burning, which makes the alert +**clear quickly** once the incident is over instead of latching on for an hour. +(Google SRE workbook, "Alerting on SLOs".) + +--- + +## Runbooks (alert → first action) + +### §Availability +`VidcastAvailabilityFastBurn` / `…SlowBurn` — gateway 5xx rate over budget. +1.
`kubectl logs deploy/gateway` — look for tracebacks / dependency errors. +2. Check `/healthz`: is MongoDB or RabbitMQ the failing dependency? +3. Check the `PodCrashLoopBackOff` alert and gateway pod restarts. + +### §Conversion-latency +`VidcastConversionLatency…` — conversions taking too long. +1. Is the `video` queue backed up? (`RabbitMQQueueBacklog` alert / RabbitMQ UI :30004.) +2. Is KEDA scaling the converter? `kubectl get scaledobject,deploy converter`. + Remember the single-node cap: **`maxReplicaCount: 2`** — at saturation the 2nd + replica may be `Pending` (see the node-budget story), which *is* a latency cause. +3. Converter CPU throttling / OOM? `kubectl top pod -l app=converter`. + +### §End-to-end-success +`VidcastE2ESuccess…` — uploads not turning into emails. +1. Inspect the dead-letter queues (`video.dlq`, `mp3.dlq`) — see `DLQ_TOPOLOGY_EXPLAINED.md`. +2. `kubectl logs deploy/notification` — SMTP/Gmail failures? (If `GMAIL_APP_PASSWORD` + is `SKIP`, sends fail by design and this SLO is not meaningful — disable the alert.) +3. Is the outbox-relay publishing? `kubectl logs deploy/outbox-relay`. + +--- + +## Honest measurement caveats + +1. **30-day budgets vs 7-day retention.** Prometheus retention is **7 days** + (`monitoring/values.yaml`). The *alerts* only use ≤6h windows, so they are + unaffected. But the dashboard's **"budget remaining"** and **"time to + exhaustion"** panels are computed over the **7-day** window and labelled as + such — a true 30-day accounting needs longer retention (Thanos / remote-write), + which is out of scope. +2. **End-to-end SLI is time-shifted.** Uploads and their emails are minutes apart, + so over short windows `sends ÷ uploads` is noisy and can momentarily exceed 1. + It is only meaningful over **long windows (≥6h)** where the shift washes out — + which is exactly why only the 6h/30m slow-burn pair is trustworthy for this SLO. +3. 
**Conversion latency only counts completed jobs.** Jobs that dead-letter never + enter the histogram — they are an *end-to-end-success* failure (SLO 3), not a + latency failure. This is intentional and standard. +4. **No-traffic = no signal.** When idle, the ratios divide by a zero rate → NaN → + alerts stay quiet. Correct for a demo cluster that is often idle. + +--- + +## Where everything lives + +| Artifact | Path | +|----------|------| +| Recording rules + burn-rate alerts | `monitoring/alerts/vidcast-slo-rules.yaml` | +| Error-budget Grafana dashboard | `monitoring/dashboards/vidcast-slo.json` | +| Scrape config (ServiceMonitor/PodMonitor) | `monitoring/scrape/` | +| Gateway metrics | `src/gateway-service/metrics.py`, `server.py` | +| Converter / notification metrics | `src/{converter,notification}-service/consumer.py` | +| Concept companion (gitignored) | `SLO_EXPLAINED.md` | diff --git a/docs/SUPPLY_CHAIN.md b/docs/SUPPLY_CHAIN.md new file mode 100644 index 0000000..d3455d9 --- /dev/null +++ b/docs/SUPPLY_CHAIN.md @@ -0,0 +1,270 @@ +# SUPPLY_CHAIN.md — A8 Supply-Chain Hardening + +How VidCast makes its container images **verifiable**: from a git commit, through +CI, to a signed image whose signature is logged in a public transparency log and +checked at admission by Kyverno (B5). + +``` + git commit ──► CI build ──► image pushed ──► cosign keyless sign ──► Rekor log + (source) (SBOM + (Docker Hub / (Fulcio cert binds (public, + SARIF + ECR, by digest) the GitHub OIDC tamper-evident + Trivy gate) identity to the image) transparency) + │ + ▼ + Kyverno verifyImages at admission (B5) + checks the signature + identity before + a pod is allowed to run. +``` + +Each link adds a property: **SBOM** = know what's inside; **SARIF** = vulnerabilities +visible in GitHub Security; **Trivy gate** = CRITICAL/HIGH block the build; **cosign +sign** = provenance + integrity; **Rekor** = public, append-only proof; **Kyverno +verify** = only signed-by-us images run. 
+ +--- + +## Trust anchors + +| Anchor | Value | Role | +|--------|-------|------| +| OIDC issuer | `https://token.actions.githubusercontent.com` | GitHub vouches for the workflow's identity | +| Fulcio (CA) | `https://fulcio.sigstore.dev` | issues a short-lived (10-min) cert binding that identity to the signature | +| Rekor (log) | `https://rekor.sigstore.dev` | public transparency log — every signature is recorded immutably | +| TUF root | `https://tuf-repo-cdn.sigstore.dev` | bootstraps trust in Fulcio/Rekor keys | + +**Keyless** signing means there is **no private key to store or leak**. The signer's +identity *is* the GitHub Actions OIDC token; Fulcio issues a throwaway certificate +for the ~10 minutes it takes to sign, and the binding is recorded in Rekor forever. + +--- + +## ⭐ Cosign signing identity (B5 needs this EXACTLY) + +The Kyverno `verify-images` policy (B5) must match the certificate identity below +**character-for-character**. It is the GitHub Actions OIDC subject for the signing +workflow on `main`: + +``` +certificate-identity: https://github.com/YOUR_GITHUB_OWNER/vidcast/.github/workflows/ci.yml@refs/heads/main +certificate-oidc-issuer: https://token.actions.githubusercontent.com +``` + +- If signing is moved to a different workflow file, the `.github/workflows/ci.yml` + segment changes — update B5 to match. +- If you lock the OIDC trust to a tag/release instead of a branch, the + `@refs/heads/main` suffix changes to `@refs/tags/TAG_NAME`. + +Repos signed: `YOUR_DOCKERHUB_USERNAME/{auth,gateway,converter,notification}-service` (Docker +Hub) and `YOUR_AWS_ACCOUNT_ID.dkr.ecr.eu-west-2.amazonaws.com/vidcast-frontend` (ECR).
+ +--- + +## Manually verify a signature + +```bash +# Any signed image (by tag or, better, by digest): +cosign verify \ + --certificate-identity 'https://github.com/YOUR_GITHUB_OWNER/vidcast/.github/workflows/ci.yml@refs/heads/main' \ + --certificate-oidc-issuer 'https://token.actions.githubusercontent.com' \ + YOUR_DOCKERHUB_USERNAME/gateway-service:TAG + +# Inspect the attached SBOM attestation: +cosign verify-attestation --type cyclonedx \ + --certificate-identity 'https://github.com/YOUR_GITHUB_OWNER/vidcast/.github/workflows/ci.yml@refs/heads/main' \ + --certificate-oidc-issuer 'https://token.actions.githubusercontent.com' \ + YOUR_DOCKERHUB_USERNAME/gateway-service:TAG +``` + +A passing `cosign verify` proves: this exact image digest was signed by *our* CI +workflow on `main`, and the signature is in Rekor (so it can't have been forged or +back-dated). + +--- + +## Admission verification (B5 — Kyverno `verify-images`) + +The last link: `k8s/kyverno/verify-images.yaml` checks the signature **at admission** +— before a pod is allowed to run. It is now pointed at the real repos and the exact +keyless identity above: + +- **imageReferences:** `docker.io/YOUR_DOCKERHUB_USERNAME/*` (backends) **and** + `YOUR_AWS_ACCOUNT_ID.dkr.ecr.eu-west-2.amazonaws.com/vidcast-frontend*` (frontend) — **both + registries verified**. +- **attestor:** keyless, `subject` = the A8 identity, `issuer` = GitHub OIDC, + `rekor.url` = `https://rekor.sigstore.dev`. +- **mode:** `Audit`, `mutateDigest: false` — observe only. It **stays Audit** until + CI is producing signatures and a signed image verifies PASS on a live cluster; + only then does it go Enforce (+ `mutateDigest: true` to pin admitted pods to the + verified digest). Until then the Audit report shows our images as FAIL ("no + signature") — the expected, honest "not yet signed" state. + +**Network prerequisite:** Kyverno must reach Fulcio/Rekor/TUF + the registries. +`k8s/network-policies/allow-kyverno-sigstore-egress.yaml` (kyverno namespace) is the +egress carve-out.
Honest caveat: vanilla NetworkPolicy can't pin to the Sigstore +*hostnames* (IP/CIDR only), so it's a TCP-443-to-internet allow — FQDN pinning needs +Cilium/an egress proxy (documented in `k8s/network-policies/README.md`). + +Live PASS/FAIL test commands: `k8s/kyverno/README.md` §B5. + +## ECR hardening (mine — Terraform, implemented) + +`terraform/modules/ecr/` (wired into `environments/dev/main.tf` as `module.ecr`): + +| Control | Setting | Why | +|---------|---------|-----| +| Tag immutability | `IMMUTABLE` | a verified digest can't be swapped under the same tag | +| Scan on push | `scan_on_push = true` | basic CVE scan on every push (defence in depth behind the CI Trivy gate) | +| Lifecycle | untagged expire after **7d**; keep last **10** images | bounded storage / cost | +| Encryption | `AES256` (AWS-managed) | **CMK deliberately skipped** — ~$1/mo standing for marginal benefit | + +`terraform validate` passes. **One-time import** (the repo already exists): + +```bash +cd terraform/environments/dev +terraform import 'module.ecr.aws_ecr_repository.this["vidcast-frontend"]' vidcast-frontend +terraform plan # should then show only the immutability/scan/lifecycle deltas +``` + +--- + +## CI diff for the operator (you write these — `.github/workflows/ci.yml`) + +Four steps added to the `build-and-scan` job. Keyless signing + SARIF upload need +extra job permissions. Apply as one coherent change: + +```diff + build-and-scan: + needs: lint + runs-on: ubuntu-latest ++ # id-token: keyless cosign signing + provenance via GitHub OIDC. ++ # security-events: upload the Trivy SARIF report to the Security tab. 
++ permissions: ++ contents: read ++ id-token: write ++ security-events: write + strategy: + fail-fast: false + matrix: + service: [auth-service, gateway-service, converter-service, notification-service] + + steps: + - uses: actions/checkout@v4 + + - name: Set short SHA + run: echo "SHORT_SHA=${GITHUB_SHA::7}" >> $GITHUB_ENV + + - name: Build Docker image + run: | + docker build \ + -t ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }} \ + src/${{ matrix.service }}/ + ++ # ── A8 step 1: SBOM (CycloneDX JSON) ─────────────────────────────────── ++ # syft generates a component inventory; uploaded as a build artifact and ++ # (after push) attached to the image as a cosign attestation below. ++ - name: Generate SBOM (CycloneDX) ++ uses: anchore/sbom-action@v0 ++ with: ++ image: ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }} ++ format: cyclonedx-json ++ output-file: sbom-${{ matrix.service }}.cdx.json ++ - name: Upload SBOM artifact ++ uses: actions/upload-artifact@v4 ++ with: ++ name: sbom-${{ matrix.service }} ++ path: sbom-${{ matrix.service }}.cdx.json + + # ── existing gating scan (unchanged): CRITICAL/HIGH fail the build ────── + - name: Trivy vulnerability scan + uses: aquasecurity/trivy-action@master + with: + image-ref: ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }} + severity: CRITICAL,HIGH + exit-code: '1' + ignore-unfixed: true + format: table + ++ # ── A8 step 2: SARIF → GitHub Security tab ───────────────────────────── ++ # A SECOND, non-gating Trivy run that emits SARIF (exit-code 0 so it never ++ # fails the build — the gate above already did that) and uploads it. 
++ - name: Trivy scan (SARIF, report-only) ++ uses: aquasecurity/trivy-action@master ++ with: ++ image-ref: ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }} ++ severity: CRITICAL,HIGH ++ exit-code: '0' ++ ignore-unfixed: true ++ format: sarif ++ output: trivy-${{ matrix.service }}.sarif ++ - name: Upload SARIF to code-scanning ++ uses: github/codeql-action/upload-sarif@v3 ++ with: ++ sarif_file: trivy-${{ matrix.service }}.sarif ++ category: trivy-${{ matrix.service }} + + - name: Login to Docker Hub + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Push image to Docker Hub + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }} + ++ # ── A8 step 3: cosign keyless sign (main pushes only) ────────────────── ++ - name: Install cosign ++ if: github.ref == 'refs/heads/main' && github.event_name == 'push' ++ uses: sigstore/cosign-installer@v3 ++ - name: Resolve pushed digest ++ if: github.ref == 'refs/heads/main' && github.event_name == 'push' ++ run: | ++ # Sign by DIGEST, never by mutable tag. 
++ echo "IMAGE_DIGEST=$(docker inspect --format='{{index .RepoDigests 0}}' \ ++ ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }})" >> $GITHUB_ENV ++ - name: Sign image (keyless, OIDC) ++ if: github.ref == 'refs/heads/main' && github.event_name == 'push' ++ env: ++ COSIGN_YES: "true" # non-interactive; uses the ambient GitHub OIDC token ++ run: cosign sign "${IMAGE_DIGEST}" + ++ # ── A8 step 4: SLSA provenance + SBOM attestation ────────────────────── ++ # Attach the CycloneDX SBOM to the image as a signed attestation: ++ - name: Attest SBOM ++ if: github.ref == 'refs/heads/main' && github.event_name == 'push' ++ env: ++ COSIGN_YES: "true" ++ run: cosign attest --type cyclonedx --predicate sbom-${{ matrix.service }}.cdx.json "${IMAGE_DIGEST}" ++ # For full SLSA build-provenance (L3), call the reusable generator as a ++ # SEPARATE job that takes the pushed digest as input — it produces a signed ++ # provenance attestation proving which commit + workflow built the image: ++ # uses: slsa-framework/slsa-github-generator/.github/workflows/generator_container_slsa3.yml@v2.0.0 ++ # with: { image: , digest: ${{ env.IMAGE_DIGEST-as-output }} } ++ # secrets: { registry-username: ..., registry-password: ... } +``` + +**Why these belong to the operator:** they live under `.github/workflows/`, which is the +CI/CD boundary you own. The Kyverno side (B5) is mine and only goes to Enforce once +these steps are merged and have produced at least one verifiable signature. + +--- + +## Cost decisions (A8) + +- **No CMK** — AES256 AWS-managed encryption is free; a CMK is ~$1/mo standing. +- ECR scan-on-push, immutability, lifecycle, SBOM, SARIF, cosign keyless, Rekor: + **all $0** within free limits. A8 adds no standing AWS charge. 
+ +--- + +## Status (honest) + +| Item | State | +|------|-------| +| ECR Terraform (immutability/scan/lifecycle) | ✅ written, `terraform validate` passes; `import` + `apply` owed at re-apply | +| Cosign signing identity documented | ✅ (above — B5 consumes it) | +| CI diffs (SBOM/SARIF/cosign/provenance) | ✅ provided for the operator; not applied (his boundary) | +| Kyverno `verify-images` (B5) | ✅ activated, both registries, real identity, **Audit** (parses; `kustomize build` → 7 policies, 0 Enforce) | +| Sigstore egress NetworkPolicy (B5) | ✅ written (kyverno ns, Egress-only); apply + runtime-verify owed | +| Signatures actually in Rekor + a live PASS | ⏳ deferred — needs the operator's CI merged + a real run | diff --git a/docs/TECHNICAL_ANALYSIS.md b/docs/TECHNICAL_ANALYSIS.md new file mode 100644 index 0000000..315135f --- /dev/null +++ b/docs/TECHNICAL_ANALYSIS.md @@ -0,0 +1,389 @@ +# VidCast — Technical Project Analysis + +> A senior-DevOps review of the VidCast video-to-audio microservices platform. +> Covers what the project does, how it is built, what it does well by +> industry standards, and where it falls short — with concrete, prioritised +> recommendations. Every source file (application code, Terraform, CI/CD, +> Helm, manifests, Dockerfiles, monitoring) was read line by line for this +> assessment. + +--- + +## Part 1 — What This Project Is and Does + +### One-line summary +VidCast is a **video-to-audio conversion platform** — "turn video recordings +into podcast-ready audio." A user logs in, uploads an MP4, the system +asynchronously extracts the audio track, stores it as an MP3, and emails them a +download link. + +### The core flow (the actual logic) +The system is an **event-driven, asynchronous pipeline** built around two +RabbitMQ queues (`video` and `mp3`). Following a single upload through the code: + +1. 
**Login** — The frontend (`src/frontend/src/api.js`) POSTs HTTP Basic + credentials to the **gateway** `/login`, which proxies to the + **auth-service** (`auth_svc/access.py` → `auth-service/server.py`). Auth looks + the user up in PostgreSQL, verifies the password with `bcrypt.checkpw` + (constant-time), and mints a **JWT** (`CreateJWT`) carrying `username`, + `role`, a backward-compatible `admin` boolean, `iat`, and a 1-day `exp`. + +2. **Upload** — The frontend POSTs the file plus `Authorization: Bearer <token>` + to gateway `/upload`. The gateway validates the token by calling auth + `/validate` (`auth/validate.py`), then `storage/util.py`: + - Stores the raw video in **MongoDB GridFS** (`fs_videos`), tagging it with + `metadata.owner_email` = the uploader's JWT email. + - Publishes a persistent message `{video_fid, mp3_fid:null, username}` to the + RabbitMQ **`video` queue**. If the publish fails, it rolls back the GridFS + write (`fs.delete`) — a genuine write-consistency guard. + +3. **Convert** — The **converter-service** (`consumer.py` → + `convert/to_mp3.py`) consumes the `video` queue. It pulls the video out of + GridFS into a temp file, uses **MoviePy/ffmpeg** to extract the audio, writes + the MP3 into a *separate* GridFS DB (`fs_mp3s`) — copying the `owner_email` + tag forward — then publishes `{..., mp3_fid}` to the **`mp3` queue**. + ACK/NACK semantics drive retry. + +4. **Notify** — The **notification-service** (`consumer.py` → `send/email.py`) + consumes the `mp3` queue and emails the uploader (recipient = the `username` + carried through the message, never hardcoded) via Gmail SMTP. It is written + defensively: it never raises (a raise would crash-loop the pod), it ACK-drops + unparseable or recipient-less messages, and it NACKs only on *retryable* + failures. + +5. **Download** — The user hits gateway `/download?fid=...`, which streams the + MP3 back out of GridFS via Flask `send_file`.
+ +### Extensions beyond the original fork +The repo is a hardened, extended descendant of `N4si/K8s-video-converter`: + +- **Real RBAC** — a `role` column in PostgreSQL (`user` vs `admin`). + Self-registration always creates a `user` (the comments note the original code + minted admin JWTs — a fixed privilege-escalation hole). Admin-only gateway + endpoints (`/admin/users` GET, PATCH) are guarded by `_require_admin`, with two + real-world safety guardrails: an admin **cannot change their own role**, and the + system **refuses to demote the last remaining admin** (lockout prevention). + Role changes emit an audit log line. +- **Per-user ownership** — `/my-files` lists only the caller's conversions + (GridFS `owner_email` query); `/notifications/unseen-count` powers a "new + conversions" badge using a `since` timestamp. +- **Health endpoints** — auth `/healthz` pings PostgreSQL; gateway `/healthz` + checks MongoDB + RabbitMQ; the queue consumers `touch /tmp/healthy` for + exec-based liveness probes (with a startup touch so idle consumers don't + crash-loop). +- **Frontend** (React + Vite + Tailwind) — Login, Upload, Download, My + Conversions, plus admin-only Dashboard (Grafana iframe), Architecture diagram, + and Users pages. It decodes the JWT client-side **for UX only** and explicitly + documents that the backend is the real authority. 
+ +### Technology stack + +| Layer | Technology | +|---|---| +| **Backend services** | Python 3.10, Flask (auth + gateway), Pika (RabbitMQ), psycopg2, bcrypt, PyJWT, PyMongo/GridFS, MoviePy + ffmpeg, smtplib | +| **Frontend** | React, Vite, Tailwind CSS, React Router, axios; nginx (non-root) | +| **Messaging** | RabbitMQ (`video` & `mp3` durable queues) | +| **Datastores** | MongoDB GridFS (video + mp3 binaries), PostgreSQL (users/auth) | +| **Orchestration** | Kubernetes on AWS EKS (prod, eu-west-2, m7i-flex.large); raw manifests per service + Helm charts for Mongo/Postgres/RabbitMQ | +| **Staging** | Docker Swarm on a t2.micro (`docker-compose.swarm.yml`) | +| **IaC** | Terraform (modules: vpc, iam, eks, security-groups, github-oidc) with S3/DynamoDB state backend | +| **CI/CD** | GitHub Actions (`ci.yml`, `cd.yml`) **and** a `Jenkinsfile` with a Swarm→approval→EKS promotion flow | +| **Observability** | Prometheus + Grafana + Alertmanager (kube-prometheus-stack), custom dashboard + alert rules | + +--- + +## Part 2 — Technical Assessment: What Was Done Well + +This is a strong portfolio/learning project that demonstrably reaches for +production patterns. The following are genuine, industry-standard strengths. + +### 2.1 Architecture & application design +- **Clean event-driven decomposition.** Upload, convert, and notify are + decoupled through durable queues with `PERSISTENT_DELIVERY_MODE`. This is the + correct shape for CPU-heavy media work — the gateway returns immediately and + conversion scales horizontally. +- **Correct messaging semantics.** Consumers ACK on success and NACK on + retryable failure; the gateway compensates a failed publish by deleting the + orphaned GridFS object. The notification service distinguishes *permanent* + failures (ACK-drop) from *transient* ones (NACK-requeue) — a distinction many + juniors miss. 
+- **Separation of concerns inside services.** The gateway splits `auth` + (validate), `auth_svc` (login/register), and `storage` (GridFS + publish) into + focused modules rather than one monolithic `server.py`. +- **Stateless services with externalised state.** All persistence lives in + Mongo/Postgres/RabbitMQ, so the Flask/consumer pods scale and restart freely. + +### 2.2 Security engineering (application layer) +- **bcrypt password hashing** with `gensalt(rounds=12)` and constant-time + `checkpw`; legacy/non-bcrypt rows are treated as auth failures, never 500s. +- **Privilege-escalation fix** — self-registration is hard-pinned to `role=user`; + it cannot mint an admin. +- **Thoughtful RBAC guardrails** — no self-demotion, no last-admin demotion + (returns `409`), plus an audit log line on every role change. These are + operational-maturity touches, not just feature code. +- **Secrets kept out of git.** `.gitignore` excludes `**/secret.yaml`, + `terraform.tfvars`, `*.tfstate`, `customise.sh`, and session docs; tracked + `*.example` templates document the shape without leaking values. Mongo URIs and + the JWT secret were correctly **moved out of ConfigMaps into Secrets** (with a + comment explaining why). +- **Defensive error handling** — endpoints avoid leaking stack traces; + `silent=True` JSON parsing; explicit status codes (`400/401/403/404/409/502`). + +### 2.3 Container & Kubernetes hardening +- **Non-root everywhere.** Every Dockerfile sets `USER 1000/1001`, and every + Deployment sets `runAsNonRoot`, `runAsUser`, `allowPrivilegeEscalation: false`, + and `capabilities: drop: ["ALL"]`. +- **`readOnlyRootFilesystem: true`** on all four backend services, with a + correctly scoped writable `emptyDir` at `/tmp` exactly where it's needed + (Werkzeug multipart buffering, ffmpeg temp files, the `/tmp/healthy` + heartbeat). The comment trail shows this was reasoned, not cargo-culted. 
+- **Liveness/readiness probes** appropriate to each workload type — HTTP + `/healthz` for the web services, exec `test -f /tmp/healthy` for the queue + consumers (which have no HTTP surface). +- **Resource requests and limits** set per service and tuned to the real node + (the converter was deliberately dropped from 4→2 replicas after hitting + "Insufficient cpu" on a 2-vCPU node — a real capacity-planning decision, + documented inline). +- **Frontend multi-stage build** (`node:18-alpine` builder → `nginx:1.25-alpine` + runtime) running as a dedicated non-root uid with pre-chowned nginx dirs and + PID file. Security headers (`X-Frame-Options`, `X-Content-Type-Options`, + `X-XSS-Protection`) and a sane `client_max_body_size 256m` for uploads. + +### 2.4 Supply-chain & dependency hygiene +- **Trivy scanning** wired into *both* pipelines at `CRITICAL,HIGH` with + `exit-code 1` and `ignore-unfixed` — a real, blocking gate. +- **Deliberately curated requirements.** The `requirements.txt` files are + remarkable: each pin carries a comment citing the specific CVE it clears + (Werkzeug CVE-2024-34069, urllib3 2.x line, Pillow ≥10.3.0, numpy <2.0 for + MoviePy compat), and dev-only tooling (pylint/astroid/jedi) and unused + packages (prometheus-client) were stripped from the runtime image. +- **Dockerfiles patch the OS layer** (`apt-get upgrade`) and the Python + toolchain (`pip install --upgrade pip setuptools wheel`) to clear base-image + CVEs, with comments naming them. + +### 2.5 Infrastructure as Code (Terraform) +- **Properly modularised** (`vpc`, `iam`, `eks`, `security-groups`, + `github-oidc`) with a clean root composition in `environments/dev/main.tf`. +- **Remote state done right** — S3 backend with DynamoDB locking, + `required_version >= 1.5`, providers pinned with `~>`. 
+- **Least-privilege-minded CI auth** — GitHub Actions authenticates via **OIDC** + (`aws_iam_openid_connect_provider`) with a trust policy scoped to the repo, and + the deploy role's *only* AWS permission is `eks:DescribeCluster` on one cluster + ARN; Kubernetes-level rights are granted separately via an **EKS access entry** + with `AmazonEKSEditPolicy`. No long-lived AWS keys in GitHub secrets. This is + exactly the modern pattern. +- **A real `validation` block** rejecting T-type instances (encoding a known + account SCP constraint into the type system so it fails fast at plan time), and + IRSA enabled via the cluster OIDC provider. + +### 2.6 CI/CD design +- **Matrix-parallel CI** across all four services (lint → build → scan → + push-on-main-only) with `fail-fast: false` so one service's failure doesn't + mask the others. +- **A genuine promotion pipeline in Jenkins** — lint → parallel build → Trivy → + push → deploy to Swarm staging → smoke test → **manual approval gate** → deploy + to EKS, with an automatic `kubectl rollout undo` on failure. The staging-on- + Swarm choice is a legitimate ~97% cost optimisation over a second EKS cluster. +- **CD via `workflow_run`** gated on CI success, using short-SHA image tags and + `kubectl rollout status` for verification. + +### 2.7 Observability +- kube-prometheus-stack with sensible EKS-specific tuning (etcd/scheduler/ + controller-manager scraping disabled — EKS manages them), 7-day retention, + persistent storage, NodePort-exposed Grafana/Alertmanager. +- **Meaningful alert rules** with runbook-style annotations + (`kubectl logs --previous`, `kubectl describe pod rabbitmq-0`): + CrashLoopBackOff, high node CPU/mem, queue backlog, RabbitMQ down. + +### 2.8 Documentation & operational discipline +- Exceptional inline commenting — the "why," the CVE, the trade-off, and the + backward-compatibility note are captured at the point of change. 
+- A handover/report/problems doc system for crash-safe, resumable multi-session + work, and per-issue `*_EXPLAINED.md` study material. + +**Overall verdict on merits:** the engineering *judgment* on display is well +above typical bootcamp output. The dependency hygiene, OIDC-based CI auth, pod +security contexts, and RBAC guardrails are all things real production teams ship. + +--- + +## Part 3 — Areas for Improvement (Demerits & Risks) + +Ordered roughly by severity. Severity reflects *production* readiness; several +are explicitly acknowledged as acceptable for a learning/demo context. + +### 3.1 Critical / High + +**[H-1] Databases exposed to the public internet via NodePort + `0.0.0.0/0`.** +The security-group module opens ports `30002–30008` to `0.0.0.0/0`, and Postgres +(`30003`), RabbitMQ (`30004`), and MongoDB (`30005`) are all NodePort services. +That publishes the datastores' admin ports to the entire internet. +→ *Fix:* remove DB NodePorts entirely (they're for admin convenience only — +use `kubectl port-forward`); restrict the remaining app NodePorts (or front them +with an ALB/Ingress + security group scoped to the LB). Never expose +stateful-service ports to `0.0.0.0/0`. + +**[H-2] PostgreSQL runs with `POSTGRES_HOST_AUTH_METHOD: trust`.** +In `Helm_charts/Postgres/templates/postgres-deploy.yaml` Postgres accepts **any +connection with no password**. Combined with [H-1], anyone who can reach +`NODE_IP:30003` gets unauthenticated DB access — including the full `auth_user` +table. +→ *Fix:* drop `trust`, rely on `scram-sha-256`, and keep the DB ClusterIP-only. + +**[H-3] A live-looking Gmail app password sits in the working tree.** +`customise.sh` (gitignored, so not committed — good) nonetheless contains a real +16-char `GMAIL_APP_PASSWORD`, the JWT secret, and DB passwords in plaintext on +disk. Gitignore prevents a commit but not local exfiltration, and the credential +is real. 
+→ *Fix:* **rotate that Gmail app password now**, then source these values from +the environment / a secret manager rather than baking them into a script. + +**[H-4] No external secret management.** Secrets live in `stringData` in +gitignored `secret.yaml` files (committed comments even say "back this with AWS +Secrets Manager + External Secrets Operator"). Manual secret files don't rotate, +aren't audited, and drift between environments. +→ *Fix:* adopt the External Secrets Operator backed by the IRSA infra that +already exists. +→ *Status (Phase Up A9 — strong-partial):* **resolved for application secrets.** +ESO now syncs `auth/gateway/converter/notification` secrets from **AWS SSM +Parameter Store** (not Secrets Manager) via a least-privilege IRSA role +(`terraform/modules/external-secrets`, `k8s/external-secrets/`). Parameter Store +was chosen over Secrets Manager precisely to avoid the $0.40/secret/month charge: +standard-tier parameters and the AWS-managed `alias/aws/ssm` SecureString key are +both free, so the **standing cost is $0**. *Pending:* the `rabbitmq-secret` is +still Helm-provisioned (the broker is created from it) and migrates to Parameter +Store only if/when a managed broker is adopted — deferred with reason, see +`MANAGED_SERVICES.md` §4. + +### 3.2 Medium + +**[M-1] Flask development server in production.** Both `auth-service` and +`gateway-service` run `server.run(host=…)` — the single-threaded Werkzeug dev +server, which prints "do not use in a production deployment." Under concurrency +it will serialise requests and degrade badly. +→ *Fix:* run behind `gunicorn`/`uvicorn` workers (e.g. +`gunicorn -w 4 -b 0.0.0.0:8080 server:server`). + +**[M-2] Monitoring scrape/alert mismatch — alerts that can never fire.** +- `monitoring/values.yaml` adds a scrape job for `gateway:8080/metrics`, but the + gateway has **no `/metrics` endpoint** (prometheus-client was intentionally + removed). That target will be permanently `down`. 
+- `vidcast-alerts.yaml` references `rabbitmq_queue_messages{queue="video"}` and + `up{job="rabbitmq"}`, but **no RabbitMQ exporter / scrape job is configured**. + The two most pipeline-relevant alerts (queue backlog, RabbitMQ down) will never + evaluate. +→ *Fix:* either expose real app metrics (re-add a `/metrics` endpoint with +request/queue gauges) and deploy the RabbitMQ Prometheus plugin/exporter, or +remove the dangling scrape job and alerts so the monitoring stack reflects +reality. + +**[M-3] No persistent storage for PostgreSQL.** The Postgres Helm chart is a +`Deployment` with no PVC — a pod reschedule wipes every user account. +(Acknowledged in CLAUDE.md as "use RDS in production.") +→ *Fix:* RDS, or at minimum a StatefulSet + PVC like MongoDB/RabbitMQ already +have. + +**[M-4] Unpinned images.** The Postgres Helm value is `image: postgres` (→ +`latest`), and the staging compose uses `:latest` tags throughout. This breaks +reproducibility and makes rollbacks nondeterministic. +→ *Fix:* pin every image to a digest or explicit version. + +**[M-5] In-cluster service-to-service calls are unauthenticated.** The +auth-service's `/users` and `/validate` endpoints carry no auth of their own and +trust any in-cluster caller; the gateway is the sole enforcer (the code honestly +documents this trust gap). There is **no NetworkPolicy**, so any compromised pod +can call auth directly and enumerate/modify users. +→ *Fix:* default-deny NetworkPolicies scoping who can reach auth:5000, and/or a +shared internal token / service mesh mTLS. + +**[M-6] Frontend image isn't built by CI and uses a placeholder.** +`frontend/manifest/deployment.yaml` points at +`<ACCOUNT_ID>.dkr.ecr…/vidcast-frontend:latest` — a literal placeholder that +won't deploy unedited, built out-of-band (CI only builds the four Python +services). This is a manual, error-prone step and a `:latest` tag.
+→ *Fix:* add the frontend to the CI matrix (or a dedicated job) pushing to ECR +with a SHA tag, and template the account ID via kustomize/Helm. + +### 3.3 Low / Polish + +- **[L-1] No CPU/memory `HPA`** despite the whole point being scalable + conversion; scaling is manual (edit replicas / node desired_size). A queue-depth + or CPU HPA on the converter would close the loop. +- **[L-2] No PodDisruptionBudgets** — voluntary disruptions (node drains) can + take all replicas of a 2-replica service at once. +- **[L-3] Odd `maxSurge` values** — `notification` has `maxSurge: 8` for 2 + replicas, `gateway`/`auth` use `maxSurge: 3`. Harmless but sloppy; pick values + proportional to replica count and set `maxUnavailable` explicitly. +- **[L-4] No connection resilience on broker/DB.** `pika.BlockingConnection` is + established once at import time with `heartbeat=0`; a RabbitMQ blip won't + auto-reconnect (the gateway would need a pod restart). Postgres connections are + opened per-request with no pooling (`psycopg2` raw) — fine at low volume, + costly at scale. +- **[L-5] Single AZ-ish footprint / single node.** `desired_size=1` on one + instance type means the node is a SPOF; the two subnets span AZs but the node + group runs one node. +- **[L-6] Dockerfiles aren't multi-stage for the Python services.** + `build-essential`, `python3-dev`, `libpq-dev` remain in the runtime image, + enlarging it and the attack surface. A builder stage + `psycopg2-binary` (or + copying only the built wheels) would slim them. +- **[L-7] No automated tests.** There are no unit/integration tests in the repo; + CI lints and scans but never asserts behaviour. A few pytest cases around + auth/RBAC and the publish-rollback path would catch regressions the linter + can't. +- **[L-8] CD uses `|| true` on rollout steps**, so a failed `kubectl rollout + status` won't fail the GitHub Actions job — a broken deploy can report green. + (Jenkins handles this better with explicit rollback.) 
+- **[L-9] No Content-Security-Policy** header on the frontend (only the three + legacy headers); `X-XSS-Protection` is deprecated. +- **[L-10] Terraform `dev`-only.** There's one environment dir; staging/prod + parity is via Swarm compose rather than a `prod` Terraform workspace. Fine for + the project's scope, but not a multi-env IaC layout. + +--- + +## Part 4 — Prioritised Recommendations + +**Do now (security):** +1. Rotate the Gmail app password exposed in `customise.sh` [H-3]. +2. Remove DB NodePorts and stop opening `0.0.0.0/0` to stateful ports [H-1]. +3. Remove `POSTGRES_HOST_AUTH_METHOD: trust`; require auth [H-2]. + +**Next (production-readiness):** +4. Put auth/gateway behind gunicorn [M-1]. +5. Adopt External Secrets Operator on the existing IRSA foundation [H-4]. ✅ *Done (A9, Parameter Store, $0 standing; broker creds pending).* +6. Give Postgres durable storage (RDS or StatefulSet+PVC) [M-3]. +7. Reconcile monitoring: real `/metrics` + RabbitMQ exporter, or remove the + dead scrape/alerts [M-2]. +8. Pin all images to digests/versions [M-4]. + +**Then (hardening & scale):** +9. NetworkPolicies (default-deny) + scope auth's internal endpoints [M-5]. +10. Add the frontend to CI/ECR with SHA tags [M-6]. +11. HPA on the converter, PDBs on all services, broker auto-reconnect + [L-1, L-2, L-4]. +12. Multi-stage Python Dockerfiles, a pytest suite, and make CD fail on rollout + errors [L-6, L-7, L-8]. + +--- + +## Part 5 — Bottom Line + +VidCast is, at its core, a **DevOps/cloud-engineering showcase** wrapped around a +deliberately simple media-conversion app. Judged as that, it is **well above +average**: the event-driven architecture is sound, and the surrounding platform +work — OIDC-based CI auth, curated CVE-clearing dependencies, pod security +contexts, RBAC with real lockout guardrails, a Swarm→approval→EKS promotion +pipeline, and unusually honest inline documentation — reflects mature +engineering judgment. 
+ +Its gaps are the predictable ones for a project optimised for a single-node demo +on a budget: **internet-exposed datastores with weak/no DB auth, dev-grade app +servers, no external secret management, no durable Postgres, and a monitoring +layer whose most important alerts can't fire.** None are hard to fix, and most +are already self-identified in the code comments and CLAUDE.md. Close the four +High items and the handful of Medium ones and this moves from "excellent +portfolio project" to "defensible small-scale production deployment." + +*Per project records, the live EKS cluster was torn down on 2026-06-03 for cost +savings, with Terraform state, tfvars, and ECR images preserved for a +one-command re-apply.* diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..f79c119 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,160 @@ +# VidCast — Architecture Reference + +## System Overview + +VidCast is an event-driven microservices platform. When a user uploads a video, it is stored immediately and a message is published to a queue. Worker pods pick up the message asynchronously, convert the video to MP3, and trigger an email notification. The user never waits for conversion — they get a notification when it's ready. + +This pattern (store-and-queue instead of store-and-block) is the same one used by YouTube, TikTok, Spotify, and every media processing platform at scale. 
+ +--- + +## Service Inventory + +### Frontend Service + +- **Technology:** React 18 + Vite + Tailwind CSS, served by nginx +- **Image:** `<registry>/frontend` +- **Port:** NodePort 30006 +- **Replicas:** 1 +- **Purpose:** Web interface — login, upload, download, monitoring dashboard, architecture diagram +- **Build:** Multi-stage Dockerfile (Node.js build → nginx serve) +- **Security:** Runs as non-root uid 1001, HTTP liveness/readiness probes + +### Gateway Service + +- **Technology:** Flask 2.2, PyMongo, Pika +- **Image:** `nasi101/gateway` +- **Port:** NodePort 30002 (8080 in-cluster) +- **Replicas:** 2 +- **Purpose:** The single external entry point. Handles authentication delegation, file storage, and queue publishing. +- **Routes:** + - `POST /login` → delegates to Auth Service → returns JWT + - `POST /upload` → validates JWT → stores video in MongoDB GridFS → publishes file ID to RabbitMQ video queue + - `GET /download?fid=<mp3_id>` → validates JWT → retrieves MP3 from MongoDB GridFS → streams to client + - `GET /healthz` → checks MongoDB + RabbitMQ → 200 ok / 503 degraded +- **Security:** CORS enabled, readOnlyRootFilesystem, resource limits 100m-300m CPU / 128Mi-256Mi RAM + +### Auth Service + +- **Technology:** Flask 2.2, PyJWT, psycopg2 +- **Image:** `nasi101/auth` +- **Port:** ClusterIP 5000 (internal only — not accessible outside the cluster) +- **Replicas:** 2 +- **Purpose:** Issues and validates JWT tokens. Reads user credentials from PostgreSQL.
+- **Routes:** + - `POST /login` → queries PostgreSQL for email/password → returns JWT (1-day expiry) + - `POST /validate` → decodes and verifies JWT → returns claims + - `GET /healthz` → checks PostgreSQL connectivity → 200 ok / 503 error +- **Security:** ClusterIP only, readOnlyRootFilesystem, resource limits 50m-200m CPU / 64Mi-128Mi RAM + +### Converter Service + +- **Technology:** Python, Pika, PyMongo, MoviePy, ffmpeg +- **Image:** `nasi101/converter` +- **Port:** None (queue consumer only — no HTTP interface) +- **Replicas:** 4 +- **Purpose:** Processes the video queue. For each message, fetches the video from MongoDB, runs ffmpeg to extract audio, stores the MP3 back in MongoDB, acknowledges the message, publishes the MP3 file ID to the mp3 queue, and touches `/tmp/healthy`. +- **Security:** emptyDir volume at /tmp (needed for temp files during conversion), readOnlyRootFilesystem, resource limits 250m-500m CPU / 256Mi-512Mi RAM + +### Notification Service + +- **Technology:** Python, Pika, smtplib +- **Image:** `nasi101/notification` +- **Port:** None (queue consumer only — no HTTP interface) +- **Replicas:** 2 +- **Purpose:** Processes the mp3 queue. For each message, sends an email via Gmail SMTP containing the file ID for download. +- **Security:** emptyDir volume at /tmp, readOnlyRootFilesystem, resource limits 50m-100m CPU / 64Mi-128Mi RAM + +--- + +## Infrastructure Services + +### MongoDB (StatefulSet) + +- **Image:** mongo:4.0.8 +- **Port:** NodePort 30005 (27017 in-cluster) +- **Storage:** GridFS — stores binary files (video and MP3) chunked into 255KB pieces +- **Databases:** `videos` (uploaded MP4s), `mp3s` (converted MP3s) +- **Note:** No PersistentVolume — data is lost if the pod is deleted. Acceptable for demo; use Atlas or DocumentDB in production. + +### PostgreSQL (Deployment) + +- **Port:** NodePort 30003 (5432 in-cluster) +- **Database:** `authdb` +- **Table:** `auth_user` (email, password) +- **Note:** No PersistentVolume. 
Use RDS for production. + +### RabbitMQ (StatefulSet) + +- **Image:** rabbitmq:3-management +- **Ports:** NodePort 30004 (management UI), 5672 (AMQP in-cluster) +- **Queues:** `video` (durable), `mp3` (durable) +- **Durability:** Messages survive RabbitMQ restarts + +--- + +## Data Flow — Upload + +``` +1. User POSTs MP4 to Gateway :30002/upload with JWT +2. Gateway validates JWT with Auth Service +3. Gateway stores MP4 binary in MongoDB GridFS → receives file_id +4. Gateway publishes file_id to RabbitMQ "video" queue +5. Gateway returns "success!" to user immediately +6. (Asynchronously) Converter pod picks up file_id from "video" queue +7. Converter fetches MP4 bytes from MongoDB by file_id +8. Converter runs ffmpeg to extract audio as MP3 +9. Converter stores MP3 binary in MongoDB GridFS → receives mp3_id +10. Converter publishes mp3_id to RabbitMQ "mp3" queue +11. (Asynchronously) Notification pod picks up mp3_id from "mp3" queue +12. Notification sends email with mp3_id to user +13. User GETs /download?fid=mp3_id → Gateway streams MP3 from MongoDB +``` + +--- + +## Port Map + +| Port | Service | Access | +|------|---------|--------| +| 30002 | Gateway API | Public — client entry point | +| 30003 | PostgreSQL | Admin only | +| 30004 | RabbitMQ Management | Admin only | +| 30005 | MongoDB | Admin only | +| 30006 | Frontend | Public — web interface | +| 30007 | Grafana | Admin only | +| 30008 | Alertmanager | Admin only | + +--- + +## Security Architecture + +### What's implemented + +- **Non-root containers:** All pods run as uid 1000 (or 1001 for frontend nginx) +- **Read-only root filesystem:** Containers cannot modify their own binaries or config files at runtime. Converter and notification mount an `emptyDir` at `/tmp` for temporary files. 
+- **Capability dropping:** All Linux capabilities dropped (`capabilities.drop: ["ALL"]`) +- **No privilege escalation:** `allowPrivilegeEscalation: false` on all containers +- **Resource limits:** Prevents one service from starving others on the shared node +- **Health probes:** Kubernetes detects and restarts unhealthy pods automatically +- **Secrets not in Git:** `**/secret.yaml` is gitignored; secrets are applied via `kubectl apply` outside of version control +- **Image scanning:** Trivy scans every image build for CRITICAL and HIGH CVEs before push + +### What's discussed but not implemented + +- **mTLS between services:** Requires a service mesh (Istio, Linkerd). Docker Swarm provides mTLS built-in; Kubernetes requires explicit setup. +- **Network Policies:** Currently all pods can talk to all other pods. Network Policies would restrict Auth to only accept traffic from Gateway, etc. +- **External Secrets Operator:** Secrets currently stored in Kubernetes Secret objects (base64, not encrypted). External Secrets + AWS Secrets Manager would fetch secrets at runtime via IRSA. +- **Image signing:** Trivy scans for known CVEs; Cosign/Sigstore would add cryptographic signing so only verified images can run. + +--- + +## Environments + +| Environment | Platform | Purpose | Cost | +|-------------|----------|---------|------| +| Production | AWS EKS eu-west-2 (m7i-flex.large) | Live traffic | ~$150/month | +| Staging | Docker Swarm (t2.micro EC2) | Pre-production via Jenkins | ~$10/month | +| Local | Docker Compose | Developer testing | Free | + +Staging uses Docker Swarm rather than a second EKS cluster — a 97% cost reduction with equivalent functionality for integration testing. 
diff --git a/docs/deployment-guide.md b/docs/deployment-guide.md new file mode 100644 index 0000000..b8ac0b1 --- /dev/null +++ b/docs/deployment-guide.md @@ -0,0 +1,295 @@ +# VidCast — Deployment Guide + +Complete step-by-step instructions for deploying, operating, and destroying the VidCast platform. + +--- + +## Prerequisites + +```bash +# Check all tools are installed +aws --version # AWS CLI v2+ +kubectl version # 1.31+ +helm version # 3.x +terraform version # 1.5+ +psql --version # PostgreSQL client +docker --version # Docker 20+ +``` + +Configure AWS credentials: +```bash +aws configure +# Or export AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_DEFAULT_REGION +aws sts get-caller-identity # Verify +``` + +--- + +## Phase 1 — Infrastructure (Terraform) + +Create the S3 bucket and DynamoDB table for Terraform remote state first (one-time): + +```bash +# State bucket +aws s3 mb s3://YOUR-STATE-BUCKET --region eu-west-2 +aws s3api put-bucket-versioning --bucket YOUR-STATE-BUCKET \ + --versioning-configuration Status=Enabled + +# State lock table +aws dynamodb create-table \ + --table-name vidcast-terraform-locks \ + --attribute-definitions AttributeName=LockID,AttributeType=S \ + --key-schema AttributeName=LockID,KeyType=HASH \ + --billing-mode PAY_PER_REQUEST \ + --region eu-west-2 +``` + +Then apply Terraform: + +```bash +cd terraform/environments/dev +cp terraform.tfvars.example terraform.tfvars +# Edit terraform.tfvars — set state_bucket to YOUR-STATE-BUCKET + +terraform init \ + -backend-config="bucket=YOUR-STATE-BUCKET" \ + -backend-config="key=vidcast/dev/terraform.tfstate" \ + -backend-config="region=eu-west-2" \ + -backend-config="dynamodb_table=vidcast-terraform-locks" + +terraform validate +terraform plan +terraform apply # Takes ~20 minutes for EKS cluster creation +``` + +Get the kubeconfig update command from outputs: +```bash +terraform output kubeconfig_command +# Run the command it prints +kubectl get nodes -o wide # Capture EXTERNAL-IP as NODE_IP 
+``` + +--- + +## Phase 2 — Infrastructure Services (Helm) + +```bash +cd Helm_charts/MongoDB +helm install mongodb . +kubectl wait --for=condition=ready pod/mongodb-0 --timeout=180s + +cd ../Postgres +helm install postgres . +kubectl wait --for=condition=ready pod -l app=postgres --timeout=120s + +cd ../RabbitMQ +helm install rabbitmq . +kubectl wait --for=condition=ready pod/rabbitmq-0 --timeout=120s +cd ../.. +``` + +--- + +## Phase 3 — Initialise PostgreSQL + +```bash +NODE_IP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="ExternalIP")].address}') + +PGPASSWORD=YOUR_POSTGRES_PASSWORD psql \ + -h $NODE_IP -p 30003 \ + -U YOUR_POSTGRES_USERNAME -d authdb \ + -f Helm_charts/Postgres/init.sql + +# Verify +PGPASSWORD=YOUR_POSTGRES_PASSWORD psql \ + -h $NODE_IP -p 30003 \ + -U YOUR_POSTGRES_USERNAME -d authdb \ + -c "SELECT email FROM auth_user;" +``` + +--- + +## Phase 4 — Create RabbitMQ Queues + +```bash +curl -u guest:guest -X PUT http://$NODE_IP:30004/api/queues/%2F/video \ + -H "Content-Type: application/json" -d '{"durable":true}' + +curl -u guest:guest -X PUT http://$NODE_IP:30004/api/queues/%2F/mp3 \ + -H "Content-Type: application/json" -d '{"durable":true}' + +# Verify +curl -s -u guest:guest http://$NODE_IP:30004/api/queues | \ + python3 -c "import json,sys; [print(q['name']) for q in json.load(sys.stdin)]" +``` + +--- + +## Phase 5 — Create Kubernetes Secrets + +Secrets are gitignored (`**/secret.yaml`) and are **not** part of the Kustomize +tree — they are applied separately, before the overlay. A `secret.yaml.example` +template sits in each service's `src/<service>/manifest/` dir — copy it to +`secret.yaml`, fill in real values, and `kubectl apply -f` it.
Or create them +imperatively: + +```bash +# Auth service +kubectl create secret generic auth-secret \ + --from-literal=PSQL_PASSWORD=YOUR_POSTGRES_PASSWORD \ + --from-literal=JWT_SECRET=YOUR_JWT_SECRET + +# Gateway service — MongoDB URIs now live in the Secret, not the ConfigMap +kubectl create secret generic gateway-secret \ + --from-literal=MONGODB_VIDEOS_URI="mongodb://USER:PASS@mongodb:27017/videos?authSource=admin" \ + --from-literal=MONGODB_MP3S_URI="mongodb://USER:PASS@mongodb:27017/mp3s?authSource=admin" + +# Converter service — MongoDB URI now lives in the Secret, not the ConfigMap +kubectl create secret generic converter-secret \ + --from-literal=MONGODB_URI="mongodb://USER:PASS@mongodb:27017/mp3s?authSource=admin" + +# Notification service +kubectl create secret generic notification-secret \ + --from-literal=GMAIL_ADDRESS=YOUR_GMAIL \ + --from-literal=GMAIL_PASSWORD=YOUR_GMAIL_APP_PASSWORD +``` + +--- + +## Phase 6 — Deploy Microservices + +All services deploy in one Kustomize apply (use `overlays/dev` for the lighter +single-replica dev environment): + +```bash +kubectl apply -k k8s/overlays/prod + +for d in auth gateway converter notification frontend; do + kubectl rollout status deployment/$d --timeout=120s +done + +kubectl get pods # All should be Running +``` + +--- + +## Phase 7 — End-to-End Test + +```bash +NODE_IP=$(kubectl get nodes -o jsonpath='{.items[0].status.addresses[?(@.type=="ExternalIP")].address}') + +# Login +TOKEN=$(curl -s -X POST http://$NODE_IP:30002/login -u "EMAIL:PASSWORD") +echo "Token: ${TOKEN:0:30}..." + +# Upload +curl -X POST http://$NODE_IP:30002/upload \ + -F "file=@assets/video.mp4" \ + -H "Authorization: Bearer $TOKEN" +# Expected: "success!" 
+ +# Monitor conversion +sleep 10 +curl -s -u guest:guest http://$NODE_IP:30004/api/queues/%2F/video | \ + python3 -c "import json,sys; q=json.load(sys.stdin); print('video queue:', q.get('messages', 0), 'messages')" + +# Download (file_id from notification email) +curl -X GET "http://$NODE_IP:30002/download?fid=FILE_ID" \ + -H "Authorization: Bearer $TOKEN" \ + -o output.mp3 +ls -lh output.mp3 +``` + +--- + +## Phase 8 — Monitoring (Optional) + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +helm install monitoring prometheus-community/kube-prometheus-stack \ + -f monitoring/values.yaml -n monitoring --create-namespace + +kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=grafana -n monitoring --timeout=180s + +kubectl apply -f monitoring/alerts/vidcast-alerts.yaml + +echo "Grafana: http://$NODE_IP:30007 (admin / vidcast-demo)" +echo "Alertmanager: http://$NODE_IP:30008" +``` + +--- + +## Operational Commands + +```bash +# Pod status +kubectl get pods -o wide + +# Logs +kubectl logs -l app=gateway --tail=50 +kubectl logs -l app=converter --tail=50 -c converter + +# Restart a deployment +kubectl rollout restart deployment/gateway + +# Scale converters for heavy load +kubectl scale deployment/converter --replicas=8 + +# Watch RabbitMQ queue depths +watch -n5 "curl -s -u guest:guest http://$NODE_IP:30004/api/queues/%2F | \ + python3 -c \"import json,sys; [print(q['name'], q.get('messages',0)) for q in json.load(sys.stdin)]\"" + +# Check health endpoints +curl http://$NODE_IP:30002/healthz # Gateway +``` + +--- + +## Cost Management + +Stop/start the node group to pause costs (saves ~$70/month when not in use): + +```bash +# Stop (scale to 0 nodes) +aws eks update-nodegroup-config \ + --cluster-name vidcast-cluster \ + --nodegroup-name vidcast-nodes \ + --scaling-config minSize=0,maxSize=2,desiredSize=0 \ + --region eu-west-2 + +# Start (scale back up) +aws eks update-nodegroup-config \ + 
--cluster-name vidcast-cluster \ + --nodegroup-name vidcast-nodes \ + --scaling-config minSize=1,maxSize=2,desiredSize=1 \ + --region eu-west-2 +``` + +Note: The EKS control plane still costs ~$73/month even with 0 nodes. For extended breaks, run `terraform destroy`. + +--- + +## Teardown (Full Destroy) + +```bash +# 1. Microservices (Kustomize — match the overlay you deployed) +kubectl delete -k k8s/overlays/prod + +# 2. Monitoring +helm uninstall monitoring -n monitoring +kubectl delete namespace monitoring + +# 3. Infrastructure services +helm uninstall mongodb +helm uninstall postgres +helm uninstall rabbitmq + +# 4. EKS + VPC + IAM via Terraform +cd terraform/environments/dev +terraform destroy # Takes ~15 minutes + +# 5. Delete Terraform state bucket (optional) +aws s3 rb s3://YOUR-STATE-BUCKET --force +aws dynamodb delete-table --table-name vidcast-terraform-locks --region eu-west-2 +``` diff --git a/docs/presentation-notes.md b/docs/presentation-notes.md new file mode 100644 index 0000000..f6cc21c --- /dev/null +++ b/docs/presentation-notes.md @@ -0,0 +1,115 @@ +# VidCast — Presentation Notes + +## Timing Guide (12–15 minutes total) + +| Section | Time | What to show | +|---------|------|--------------| +| Open with the product | 2 min | Live demo via web interface | +| Architecture walkthrough | 3 min | Architecture page in the frontend | +| Platform engineering | 5 min | Terraform, CI/CD pipeline, Grafana | +| What I'd do next | 2 min | Whiteboard / verbal | +| Real-world connection | 1 min | Verbal close | + +--- + +## Opening (2 minutes) + +**Don't start with "I built a Kubernetes cluster." Start with the problem.** + +"Content creators record videos — Zoom calls, webinars, conference talks. They need the audio as a standalone podcast. Right now they have to download the video, find a converter tool, wait, re-upload. VidCast does it in one step: upload the video, we email you when the MP3 is ready." + +Then open the web interface and do the upload live. 
+ +--- + +## Architecture Walkthrough (3 minutes) + +Switch to the Architecture page in the frontend. + +**Microservices → Restaurant analogy:** +"In a traditional monolith, one chef does everything — takes the order, cooks, plates, brings it to you. That chef gets overwhelmed at rush hour. VidCast uses specialised roles: the gateway is the host taking orders, the converter is the kitchen, the notification service is the runner bringing the food. Each role can be scaled independently — we run 4 converter workers because conversion is the slow part." + +**Message queue → Post office analogy:** +"When you drop a letter at the post office, you don't wait at the counter for it to be delivered. You hand it over and walk away. RabbitMQ is our post office sorting room. You upload a video, it goes into the queue, and you get on with your day. The converter workers process it on their own schedule." + +**JWT authentication → Security badge analogy:** +"You show your ID at reception once — that's the login. You get a badge — that's the JWT token. You swipe the badge at each door — that's the authorization header on every request. The auth service is reception; the gateway is the building with all the doors." + +--- + +## Platform Engineering Walkthrough (5 minutes) + +### Terraform (~1 minute) +Show the `terraform/` directory structure. + +"Before this project, if someone deleted the cluster, I'd spend an hour clicking through the AWS console trying to remember every setting. Now: `terraform apply` recreates the entire platform in 20 minutes from version-controlled code. VPC, subnets, IAM roles, EKS cluster, security groups — all defined as code, reviewable, reproducible. This is the difference between an experiment and a production system." + +**One important detail:** On this AWS account, T-type instances fail during EKS node group creation because EKS auto-generates a `CreditSpecification: unlimited` parameter that the account's SCP rejects. 
The Terraform EKS module includes a validation block that catches this immediately rather than failing after 15 minutes. That's a lesson in defensive infrastructure — encoding known constraints in the code rather than the documentation. + +### CI/CD Pipeline (~2 minutes) +Show the GitHub Actions UI (or the `.github/workflows/ci.yml` file). + +"Every push to main runs this pipeline automatically. Ruff lints all four Python services. Docker builds all four images in parallel. Trivy scans each image for critical vulnerabilities before any image reaches the registry. If Trivy fails, the pipeline stops — nothing gets pushed to Docker Hub, nothing gets deployed to the cluster. + +This is called shift-left security — catching problems early in development rather than discovering them in production. + +After CI passes, the CD pipeline runs automatically: configures kubectl for EKS, and deploys the new images with `kubectl set image`. Rolling deployment, zero downtime. + +I also wrote a Jenkinsfile for teams using Jenkins — same stages, different syntax. It adds a Docker Swarm staging environment and a manual approval gate before production. A CI/CD pipeline is tool-agnostic; the concepts are the same whether you're using GitHub Actions, Jenkins, or GitLab CI." + +### Grafana Dashboard (~2 minutes) +Open Grafana, navigate to VidCast Operations. + +"This is what the on-call engineer sees. Pod status — are all 4 converters running? Restart count — has anything crashed in the last hour? Node CPU and memory — is the node being saturated? And this is the one I find most interesting for a demo: RabbitMQ queue depth. Watch what happens when I upload a video..." + +[Upload a video and watch the video queue tick up, then back down as the converters process it.] + +"That spike is real. You can see the video enter the queue, the converters pick it up, and the queue drain. 
This is what observability looks like — not just 'is it running,' but 'is it doing what it's supposed to do.'" + +--- + +## Security Hardening (if time permits) + +"Every pod runs as a non-root user — even nginx runs as uid 1001. The root filesystem is read-only, so even if an attacker compromises the converter, they can't modify the application binaries. We mount a writable `/tmp` directory as a separate volume so the ffmpeg conversion has somewhere to write temporary files without compromising the rest of the filesystem. + +Every capability is dropped — no raw sockets, no sys_admin, no process injection. This is the principle of least privilege applied at the kernel level." + +--- + +## What I'd Do Next (2 minutes) + +"Three things I'd add with more time: + +**KEDA — queue-based autoscaling.** Right now I have 4 converter replicas. With KEDA, the converter would watch the RabbitMQ queue depth and scale automatically — 4 replicas for 4 videos waiting, 20 replicas for 20 videos. You pay for compute only when there's work to do. + +**Service mesh for mTLS.** Docker Swarm gives you mutual TLS between services built-in — every connection is encrypted and authenticated. In Kubernetes, you need a service mesh like Istio or Linkerd to get the same thing. For a demo, it's not worth the operational overhead. For production handling sensitive content, it's non-negotiable. + +**External Secrets Operator.** Right now credentials are in Kubernetes Secrets — which are base64-encoded, not encrypted. The right approach is to store them in AWS Secrets Manager and fetch them at runtime via IRSA. The secrets never exist in the cluster YAML files at all." + +--- + +## Closing (1 minute) + +"Every media processing platform uses this pattern. YouTube when you upload a video. Spotify when they transcode your podcast. Companies processing mortgage documents, medical images, satellite data. The scale is different, but the architecture is the same: upload, queue, process, store, notify. 
VidCast is a production-quality implementation of that pattern on real AWS infrastructure." + +--- + +## Common Interview Questions — With Answers + +**"Why microservices instead of a monolith?"** +"For this use case, the converter is the bottleneck — ffmpeg is CPU-intensive and variable in duration. By separating it into its own service, we can scale it independently (4 replicas) without scaling the gateway or auth service. A monolith would require scaling everything together." + +**"Why RabbitMQ instead of SQS or Kafka?"** +"RabbitMQ fits our scale — durable queues, simple consumer model, built-in management UI. SQS would be equally valid and easier to operate in AWS (no StatefulSet needed). Kafka would be overkill for this throughput; it shines at millions of messages per second with multiple consumer groups. For a production system I'd use SQS to reduce operational overhead." + +**"What happens if a converter pod crashes mid-conversion?"** +"The RabbitMQ `basic_ack` is sent only after successful conversion. If the converter crashes before acknowledging, RabbitMQ redelivers the message to another converter. So every video is processed at least once (at-least-once delivery, not exactly-once). The MP3 might be stored twice if the pod crashes after storing but before acking — in production I'd add idempotency via a unique conversion ID." + +**"Why Docker Swarm for staging instead of a second EKS cluster?"** +"A second EKS cluster costs ~$290/month. A Swarm EC2 instance costs ~$8/month. 97% cost reduction for functionally equivalent pre-production testing. The Jenkins pipeline deploys to Swarm first, runs a smoke test against the /healthz endpoint, waits for human approval, then deploys to EKS." + +**"How would you handle secrets in production?"** +"Currently they're in Kubernetes Secrets — base64, not encrypted. In production: AWS Secrets Manager + External Secrets Operator + IRSA.
Secrets are stored in Secrets Manager, fetched at runtime by the pod's service account, never in any YAML file. If EKS envelope encryption is enabled, the Secret objects in etcd are also encrypted at rest." + +**"What is Trivy and why is it in the pipeline?"** +"Trivy is an open-source vulnerability scanner by Aqua Security. It scans container images for known CVEs in OS packages and application dependencies. In our pipeline, it runs after Docker build but before Docker push. If Trivy finds a CRITICAL or HIGH vulnerability that has a fix available, the pipeline fails — the image never reaches the registry. This is shift-left security: catching problems in CI rather than discovering them in production." diff --git a/install_prerequisites.sh b/install_prerequisites.sh new file mode 100644 index 0000000..1e2f0c6 --- /dev/null +++ b/install_prerequisites.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# DevOps Project Prerequisites Installation Guide for WSL2 +# This script installs: kubectl, Helm, Python 3, psql, mongosh, Terraform +# Already installed: AWS CLI, Docker + +set -e # Exit on any error + +echo "==========================================" +echo "DevOps Project Prerequisites Installation" +echo "WSL2 Ubuntu Setup" +echo "==========================================" +echo "" + +# ═══════════════════════════════════════════════════════════════ +# 1. UPDATE PACKAGE MANAGER +# ═══════════════════════════════════════════════════════════════ +echo "[1/7] Updating package manager..." +sudo apt-get update +echo "✓ Package manager updated" +echo "" + +# ═══════════════════════════════════════════════════════════════ +# 2. INSTALL KUBECTL +# ═══════════════════════════════════════════════════════════════ +echo "[2/7] Installing kubectl..." 
+echo " → Downloading kubectl binary" +curl -fLO "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" # -f: abort on an HTTP error instead of saving the error page as the binary +echo " → Making executable" +chmod +x kubectl +echo " → Installing to /usr/local/bin" +sudo mv kubectl /usr/local/bin/kubectl +echo " → Verifying installation" +kubectl version --client +echo "✓ kubectl installed successfully" +echo "" + +# ═══════════════════════════════════════════════════════════════ +# 3. INSTALL HELM +# ═══════════════════════════════════════════════════════════════ +echo "[3/7] Installing Helm..." +echo " → Downloading Helm installation script" +curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash # -f: never pipe an HTTP error page into bash +echo " → Verifying installation" +helm version +echo "✓ Helm installed successfully" +echo "" + +# ═══════════════════════════════════════════════════════════════ +# 4. INSTALL PYTHON 3 +# ═══════════════════════════════════════════════════════════════ +echo "[4/7] Installing Python 3..." +echo " → Installing python3 and pip" +sudo apt-get install -y python3 python3-pip python3-venv +echo " → Verifying Python installation" +python3 --version +echo " → Verifying pip installation" +pip3 --version +echo "✓ Python 3 installed successfully" +echo "" + +# ═══════════════════════════════════════════════════════════════ +# 5. INSTALL POSTGRESQL CLIENT (psql) +# ═══════════════════════════════════════════════════════════════ +echo "[5/7] Installing PostgreSQL client (psql)..." +echo " → Installing postgresql-client" +sudo apt-get install -y postgresql-client +echo " → Verifying installation" +psql --version +echo "✓ PostgreSQL client installed successfully" +echo "" + +# ═══════════════════════════════════════════════════════════════ +# 6. INSTALL MONGODB CLIENT (mongosh) +# ═══════════════════════════════════════════════════════════════ +echo "[6/7] Installing MongoDB client (mongosh)..."
+echo " → Adding MongoDB repository" +curl -fsSL https://www.mongodb.org/static/pgp/server-7.0.asc | sudo gpg --yes --dearmor -o /usr/share/keyrings/mongodb-server-7.0.gpg # dedicated keyring; apt-key is deprecated and trusts the key for every repo +echo "deb [ arch=amd64,arm64 signed-by=/usr/share/keyrings/mongodb-server-7.0.gpg ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/7.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-7.0.list +echo " → Updating package manager" +sudo apt-get update +echo " → Installing mongosh" +sudo apt-get install -y mongosh +echo " → Verifying installation" +mongosh --version +echo "✓ MongoDB client installed successfully" +echo "" + +# ═══════════════════════════════════════════════════════════════ +# 7. INSTALL TERRAFORM +# ═══════════════════════════════════════════════════════════════ +echo "[7/7] Installing Terraform..." +echo " → Adding HashiCorp GPG key" +wget -O- https://apt.releases.hashicorp.com/gpg | \ + sudo gpg --yes --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg # --yes: overwrite the keyring on a re-run instead of failing on the prompt +echo " → Adding HashiCorp apt repository" +echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | \ + sudo tee /etc/apt/sources.list.d/hashicorp.list +echo " → Updating package manager" +sudo apt-get update +echo " → Installing terraform" +sudo apt-get install -y terraform +echo " → Verifying installation" +terraform version +echo "✓ Terraform installed successfully" +echo "" + +# ═══════════════════════════════════════════════════════════════ +# FINAL VERIFICATION +# ═══════════════════════════════════════════════════════════════ +echo "==========================================" +echo "Installation Complete!"
+echo "==========================================" +echo "" +echo "Verification of all tools:" +echo "" +echo "kubectl:" +kubectl version --client # --short was removed in kubectl 1.28; short output is now the default +echo "" +echo "Helm:" +helm version --short +echo "" +echo "Python:" +python3 --version +echo "" +echo "pip:" +pip3 --version +echo "" +echo "psql (PostgreSQL client):" +psql --version +echo "" +echo "mongosh (MongoDB client):" +mongosh --version +echo "" +echo "Terraform:" +terraform version +echo "" +echo "✓ All prerequisites installed successfully!" +echo "" +echo "Next steps:" +echo "1. Clone the repository:" +echo " git clone https://github.com/<your-username>/vidcast.git" +echo " cd vidcast" +echo "" +echo "2. Verify AWS CLI:" +echo " aws --version" +echo "" +echo "3. Verify Docker:" +echo " docker --version" +echo "" +echo "4. Configure AWS credentials (if not already done):" +echo " aws configure" +echo "" +echo "5. Follow the full walkthrough:" +echo " docs/GETTING_STARTED.md" +echo "" diff --git a/k8s/README.md b/k8s/README.md new file mode 100644 index 0000000..6c80780 --- /dev/null +++ b/k8s/README.md @@ -0,0 +1,101 @@ +# k8s/ — Application manifests (Kustomize) + +VidCast's five application workloads (auth, gateway, converter, notification, +frontend) are managed with **Kustomize**: a shared `base/` plus per-environment +`overlays/`. This replaces the old raw per-service manifests under +`src/<service>/manifest/` and is the structure Argo CD (Phase Up B1) syncs from. + +> **Scope.** This tree covers the *application* services only. The stateful +> backends (MongoDB, PostgreSQL, RabbitMQ) remain Helm charts under +> `Helm_charts/` (they are `dev-only` infra; see `MANAGED_SERVICES.md` for the +> managed-service alternatives and why they are documented-but-not-applied). +> Kubernetes **Secrets are not in this tree** — see "Secrets" below.
+ +## Layout + +``` +k8s/ +├── base/ +│ ├── auth/ deployment + service (ClusterIP :5000) + configmap +│ ├── gateway/ deployment + service (NodePort :30002) + configmap +│ ├── converter/ deployment + configmap (queue consumer, no Service) +│ ├── notification/ deployment + configmap (queue consumer, no Service) +│ └── frontend/ deployment + service (NodePort :30006) + configmap +└── overlays/ + ├── dev/ 1 replica per backend; lighter footprint (Argo auto-sync ON) + └── prod/ mirrors current live footprint (2/2/2/2/1) (Argo auto-sync OFF) +``` + +Each `base/<service>/` has its own `kustomization.yaml` so a service can become an +independent Argo CD `Application` later. Each overlay references all five bases +and applies environment-specific transforms (image tags, replica counts, and the +governance labels). + +## Deploy + +```bash +# 1. Secrets first — NOT in the Kustomize tree (see below): +kubectl apply -f ../src/auth-service/manifest/secret.yaml +kubectl apply -f ../src/gateway-service/manifest/secret.yaml +kubectl apply -f ../src/converter-service/manifest/secret.yaml +kubectl apply -f ../src/notification-service/manifest/secret.yaml +# (rabbitmq-secret is created by the RabbitMQ Helm chart) + +# 2. Render to check what you're about to apply: +kubectl kustomize overlays/prod # or overlays/dev + +# 3. Apply: +kubectl apply -k overlays/prod # or overlays/dev +``` + +Teardown: `kubectl delete -k overlays/<env>` (match what you deployed).
+ +## What the overlays change + +| Transform | dev | prod | +|---|---|---| +| Replicas (auth/gateway/converter/notification) | 1 each | 2 each (base) | +| Frontend replicas | 1 | 1 | +| `environment` label | `dev` | `prod` | +| Governance labels (`cost-centre`, `owner`, `app.kubernetes.io/managed-by`) | yes | yes | +| Backend image tags | `images:` block | `images:` block | +| Frontend image | resolved to account ECR via `images:` `newName`/`newTag` | same | + +The governance labels (`environment`, `cost-centre`, `owner`, +`app.kubernetes.io/managed-by`) are what the Kyverno `require-labels` policy +(B2) enforces. `app.kubernetes.io/managed-by` is `kustomize` today and flips to +`argocd` when B1 lands. + +## Image tags = the GitOps source of truth + +Image versions are set in each overlay's `images:` block, **not** by +`kubectl set image`. Today the CD pipeline still patches the live Deployment +directly; under B1 the pipeline will instead open a PR bumping `newTag` here, and +the merge of that PR is the deploy. Backends are on Docker Hub +(`<dockerhub-user>/<name>-service`); the frontend is in this account's ECR (CI does +not build the frontend). + +## Secrets + +Secrets are intentionally **excluded** from Kustomize: +- `**/secret.yaml` is gitignored, so they must never be rendered from tracked + files; and +- Phase Up **A9** replaces the manual `secret.yaml` files with **External + Secrets Operator** (`ExternalSecret` → AWS Parameter Store). At that point a + `secretstore`/`externalsecrets` component is added to this tree and the + manual apply in step 1 goes away. + +`secret.yaml.example` templates still live under `src/<service>/manifest/` and +document the required keys.
+ +## Validation + +```bash +kubectl kustomize overlays/dev >/dev/null && echo "dev OK" +kubectl kustomize overlays/prod >/dev/null && echo "prod OK" +``` + +`prod` is intended to render equivalent to the pre-Kustomize raw manifests apart +from three deliberate additions: the governance labels, `namespace: default`, and +the resolved frontend image. `kubectl apply -k` is also run with +`--dry-run=server` in CI before a real apply. diff --git a/k8s/argocd/README.md b/k8s/argocd/README.md new file mode 100644 index 0000000..5fef9e2 --- /dev/null +++ b/k8s/argocd/README.md @@ -0,0 +1,55 @@ +# k8s/argocd — GitOps with Argo CD (B1) + +Argo CD continuously reconciles the cluster to the manifests in +`k8s/overlays/{dev,prod}`. **dev auto-syncs; prod is manual-sync (the approval +gate).** Full design + the CD gate migration are in `GITOPS.md` (repo root). + +## Install (applied separately, like ESO/KEDA — CRDs first) + +```bash +helm repo add argo https://argoproj.github.io/argo-helm && helm repo update +helm install argocd argo/argo-cd -n argocd --create-namespace -f k8s/argocd/values.yaml + +# register the two Applications (these are argoproj.io CRDs → need Argo installed first) +kubectl apply -k k8s/argocd +``` + +## Access the UI (port-forward — not NodePort) + +The Argo UI is an admin control plane, so it is **not** world-exposed via NodePort +(same posture as the RabbitMQ/DB admin ports under A6). 
Reach it with a port-forward: + +```bash +kubectl -n argocd port-forward svc/argocd-server 8080:443 +# browse https://localhost:8080 (self-signed cert → accept the warning) +# initial admin password: +kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath='{.data.password}' | base64 -d; echo +``` + +## Sync + +```bash +kubectl get applications -n argocd # dev/prod status (Synced/OutOfSync/Health) +argocd app sync vidcast-prod # MANUAL prod sync = the deploy/approval action +# dev auto-syncs; to force: argocd app sync vidcast-dev +``` + +## ⚠️ Single-cluster caveat + +`vidcast-dev` and `vidcast-prod` both target the `default` namespace on this one +cluster, so they manage the same-named resources. **Sync only ONE at a time** (dev +for validation, prod for the live footprint). Syncing both would make them fight +over the same Deployments. Multi-cluster would point each Application at a different +`destination.server`. Explained in `GITOPS.md`. + +## What Argo manages vs what's manual + +- **Argo manages:** the app workloads in `k8s/overlays/{dev,prod}` (Deployments, + Services, ConfigMaps, and the ESO-created Secrets). +- **Manual / platform-owned (the operator):** Argo CD itself, KEDA, ESO, NetworkPolicies, + Kyverno. Platform layer ≠ application layer. See `GITOPS.md`. + +## Rollback + +`git revert` the offending commit → Argo re-syncs to the previous state (dev +automatically; prod on the next manual sync). No `kubectl` needed. diff --git a/k8s/argocd/application-dev.yaml b/k8s/argocd/application-dev.yaml new file mode 100644 index 0000000..77e790a --- /dev/null +++ b/k8s/argocd/application-dev.yaml @@ -0,0 +1,46 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: vidcast-dev + namespace: argocd + # The finalizer makes `kubectl delete application` also prune the synced + # resources (cascade), rather than orphaning them.
+ finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + # In-repo manifests (Q3 decision: no separate manifest repo). Argo points at + # the overlay A10 already built — no reorganisation. + repoURL: https://github.com/johnnybabs/vidcast.git + targetRevision: main + path: k8s/overlays/dev + destination: + server: https://kubernetes.default.svc + namespace: default + # Replica counts are owned by autoscalers, not git: KEDA scales `converter` + # (0..2, scale-to-zero) and the HPA scales `gateway`. Without this, Argo sees + # live replicas != overlay replicas:1 and reports OutOfSync forever — and with + # selfHeal it would fight the autoscaler. Ignore /spec/replicas on both so the + # autoscaler owns that field and Argo owns everything else. + ignoreDifferences: + - group: apps + kind: Deployment + name: converter + jsonPointers: + - /spec/replicas + - group: apps + kind: Deployment + name: gateway + jsonPointers: + - /spec/replicas + syncPolicy: + # DEV = automated. Every commit to main that changes k8s/overlays/dev (or an + # image-tag bump) is auto-synced within the controller's poll interval. + automated: + prune: true # delete resources removed from git + selfHeal: true # revert manual kubectl drift back to git + syncOptions: + - CreateNamespace=false # the `default` namespace already exists + - ApplyOutOfSyncOnly=true + - RespectIgnoreDifferences=true # honour ignoreDifferences during sync too, so a sync never resets autoscaled /spec/replicas diff --git a/k8s/argocd/application-prod.yaml b/k8s/argocd/application-prod.yaml new file mode 100644 index 0000000..953dee7 --- /dev/null +++ b/k8s/argocd/application-prod.yaml @@ -0,0 +1,29 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: vidcast-prod + namespace: argocd + finalizers: + - resources-finalizer.argocd.argoproj.io +spec: + project: default + source: + repoURL: https://github.com/johnnybabs/vidcast.git + targetRevision: main + path: k8s/overlays/prod + destination: + server: https://kubernetes.default.svc + namespace: default + syncPolicy: + # PROD = MANUAL.
There is deliberately NO `automated:` block. This IS the + # approval gate: a prod deploy happens only when a human merges the image-tag + # bump PR (which Argo then shows as OutOfSync) and clicks Sync / runs + # `argocd app sync vidcast-prod`. Do not add `automated:` here. + syncOptions: + - CreateNamespace=false + - ApplyOutOfSyncOnly=true +# NOTE (single-cluster caveat): vidcast-dev and vidcast-prod both target the +# `default` namespace on the same cluster, so they manage the same-named resources +# and must NOT be synced simultaneously. On this demo cluster you run ONE at a time +# (dev for validation, prod for the live footprint). In a real setup the two +# Applications target different clusters via destination.server. See GITOPS.md. diff --git a/k8s/argocd/kustomization.yaml b/k8s/argocd/kustomization.yaml new file mode 100644 index 0000000..ffdab50 --- /dev/null +++ b/k8s/argocd/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# B1 Argo CD Application CRDs. Applied SEPARATELY (after `helm install argocd`) +# because they are argoproj.io CRDs that must exist first — same applied-separately +# pattern as ESO (A9) and KEDA (A7). Argo CD itself, KEDA, ESO, NetworkPolicies and +# Kyverno are PLATFORM resources owned by the platform engineer (John), NOT managed +# by these Applications — the Applications only manage the app workloads under +# k8s/overlays/{dev,prod}. See GITOPS.md for the manage-vs-manual boundary. +resources: + - application-dev.yaml + - application-prod.yaml diff --git a/k8s/argocd/values.yaml b/k8s/argocd/values.yaml new file mode 100644 index 0000000..13183c0 --- /dev/null +++ b/k8s/argocd/values.yaml @@ -0,0 +1,56 @@ +# Helm values for Argo CD (B1) — tuned for a single 2-vCPU demo node, NOT the +# chart defaults (which assume a dedicated control-plane cluster). 
+# helm repo add argo https://argoproj.github.io/argo-helm +# helm install argocd argo/argo-cd -n argocd --create-namespace -f k8s/argocd/values.yaml +# +# Lean: SSO (dex), ApplicationSet, and notifications are disabled — none are needed +# for two hand-written Applications on one cluster. That leaves the core four: +# application-controller, server, repo-server, and the bundled redis. Total +# requests ≈ 250m / 576Mi. + +dex: + enabled: false +notifications: + enabled: false +applicationSet: + enabled: false + +# The reconcile engine — the heaviest component; give it the most headroom. +controller: + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 250m + memory: 512Mi + +# API + UI. Kept ClusterIP (default) — the Argo UI is an admin control plane and is +# NOT world-exposed via NodePort (same posture as not exposing the RabbitMQ/DB admin +# ports in A6). Access is via `kubectl -n argocd port-forward svc/argocd-server 8080:443`. +server: + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 100m + memory: 256Mi + +repoServer: + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 100m + memory: 256Mi + +redis: + resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi diff --git a/src/auth-service/manifest/configmap.yaml b/k8s/base/auth/configmap.yaml similarity index 85% rename from src/auth-service/manifest/configmap.yaml rename to k8s/base/auth/configmap.yaml index c34dacc..980594d 100644 --- a/src/auth-service/manifest/configmap.yaml +++ b/k8s/base/auth/configmap.yaml @@ -5,5 +5,5 @@ metadata: data: DATABASE_HOST: db DATABASE_NAME: authdb - DATABASE_USER: nasi + DATABASE_USER: pguser AUTH_TABLE: auth_user diff --git a/k8s/base/auth/deployment.yaml b/k8s/base/auth/deployment.yaml new file mode 100644 index 0000000..573ccc1 --- /dev/null +++ b/k8s/base/auth/deployment.yaml @@ -0,0 +1,74 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: auth + labels: + app: auth +spec: + 
replicas: 2 + selector: + matchLabels: + app: auth + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 3 + template: + metadata: + labels: + app: auth + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + # B2 gap-fix: pod-level so it also covers any future init/sidecar + # containers. RuntimeDefault blocks ~44 dangerous syscalls (PSS Restricted) + # and satisfies the Kyverno require-seccomp-runtime-default policy. + seccompProfile: + type: RuntimeDefault + volumes: + # Writable scratch dir. readOnlyRootFilesystem is true, but gunicorn's + # sync workers write a heartbeat temp file (tempfile.mkstemp) and need a + # writable temp dir — without this the workers fail to boot (A4). + - name: tmp-volume + emptyDir: {} + containers: + - name: auth + image: johnbaabalola/auth-service:16f49a0 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 5000 + envFrom: + - configMapRef: + name: auth-configmap + - secretRef: + name: auth-secret + volumeMounts: + - name: tmp-volume + mountPath: /tmp + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + httpGet: + path: /healthz + port: 5000 + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /healthz + port: 5000 + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 diff --git a/k8s/base/auth/kustomization.yaml b/k8s/base/auth/kustomization.yaml new file mode 100644 index 0000000..08a39aa --- /dev/null +++ b/k8s/base/auth/kustomization.yaml @@ -0,0 +1,24 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Base manifests for the auth-service (Flask + PostgreSQL, ClusterIP :5000). +# Environment-agnostic: overlays set the image tag, replica count, and the +# org labels (environment / cost-centre / owner). 
The auth-secret and the +# auth-configmap are referenced via envFrom; the Secret is provided out of band +# (gitignored secret.yaml today, ExternalSecret after A9). +resources: + - deployment.yaml + - service.yaml + - configmap.yaml + +# Identity labels. includeSelectors:false is critical — a Deployment's +# spec.selector is immutable, so we must never let a label transformer touch it. +# includeTemplates:true puts the labels on the pods too (handy for Kyverno +# require-labels in B2 and for `kubectl get pods -l app.kubernetes.io/part-of`). +labels: + - pairs: + app.kubernetes.io/name: auth + app.kubernetes.io/component: auth-service + app.kubernetes.io/part-of: vidcast + includeSelectors: false + includeTemplates: true diff --git a/src/auth-service/manifest/service.yaml b/k8s/base/auth/service.yaml similarity index 100% rename from src/auth-service/manifest/service.yaml rename to k8s/base/auth/service.yaml diff --git a/k8s/base/backup/kustomization.yaml b/k8s/base/backup/kustomization.yaml new file mode 100644 index 0000000..7a9e6b1 --- /dev/null +++ b/k8s/base/backup/kustomization.yaml @@ -0,0 +1,23 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Backup base (I4 / P5). Nightly mongodump + pg_dump CronJobs that ship compressed +# dumps to the S3 backup bucket, plus the IRSA ServiceAccount they run as. Wired +# into both overlays (dev + prod) so every environment is recoverable. The restore +# procedure these feed is in docs/DISASTER_RECOVERY.md. 
+# +# Prerequisites (provisioned by terraform/modules/storage + /eks): +# - S3 bucket vidcast-backups-{account-id} +# - IAM role vidcast-cluster-backup-irsa (referenced by serviceaccount.yaml) +# - the Postgres PVC (A11) — what makes a pg_dump worth keeping +resources: + - serviceaccount.yaml + - mongo-backup-cronjob.yaml + - postgres-backup-cronjob.yaml + +labels: + - pairs: + app.kubernetes.io/component: backup + app.kubernetes.io/part-of: vidcast + includeSelectors: false + includeTemplates: true diff --git a/k8s/base/backup/mongo-backup-cronjob.yaml b/k8s/base/backup/mongo-backup-cronjob.yaml new file mode 100644 index 0000000..ce44468 --- /dev/null +++ b/k8s/base/backup/mongo-backup-cronjob.yaml @@ -0,0 +1,119 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: mongo-backup + labels: + app: mongo-backup + app.kubernetes.io/part-of: vidcast + app.kubernetes.io/component: backup +spec: + # Nightly at 02:00 UTC (off-peak). RPO is therefore up to ~24h — documented in + # docs/DISASTER_RECOVERY.md. + schedule: "0 2 * * *" + concurrencyPolicy: Forbid + startingDeadlineSeconds: 600 + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + backoffLimit: 2 + # Don't let a stuck dump run forever and overlap the next night. + activeDeadlineSeconds: 3600 + template: + metadata: + labels: + app: mongo-backup + app.kubernetes.io/part-of: vidcast + app.kubernetes.io/component: backup + spec: + serviceAccountName: vidcast-backup + restartPolicy: OnFailure + securityContext: + runAsNonRoot: true + runAsUser: 1000 + # fsGroup makes the shared emptyDir writable by the non-root uid the + # dump + upload containers run as. + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + volumes: + # Scratch space the dump is written to and the uploader reads from. + - name: backup + emptyDir: {} + # Writable HOME for the AWS CLI's cache (it runs read-only-root otherwise).
+ - name: home + emptyDir: {} + # initContainer dumps Mongo to /backup/videos.gz and /backup/mp3s.gz; it must finish before the + # uploader runs. A dump failure fails the init step → the Job retries. + initContainers: + - name: mongodump + # Same image family as the running mongod (mongo:4.2) so mongodump's + # archive format matches what mongorestore expects on restore. + image: mongo:4.2 + command: + - /bin/sh + - -c + # Dump the two application databases (videos + mp3s) — together they + # hold every GridFS file and the outbox collection. Each URI pins to + # its db, so we run mongodump twice into separate archives. + - > + mongodump --uri="$MONGODB_VIDEOS_URI" --gzip --archive=/backup/videos.gz && + mongodump --uri="$MONGODB_MP3S_URI" --gzip --archive=/backup/mp3s.gz + envFrom: + # Use the app's OWN mongo credentials (gateway-secret, ESO-synced) — + # the exact URIs the gateway/converter authenticate with. The + # mongodb-secret root creds are NOT usable here: that secret's password + # is out of sync with the running mongod (root auth fails SCRAM-SHA-256; + # only the app URIs authenticate). Provides MONGODB_VIDEOS_URI / _MP3S_URI. + - secretRef: + name: gateway-secret + volumeMounts: + - name: backup + mountPath: /backup + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "512Mi" + containers: + - name: upload + image: amazon/aws-cli:2.15.30 + # Date-stamp the S3 keys so each night is a distinct pair of objects + # (bucket versioning + the 30-day lifecycle handle retention). One + # shared timestamp keeps the videos/mp3s archives of a run together. + command: + - /bin/sh + - -c + - > + TS=$(date -u +%Y%m%dT%H%M%SZ); + aws s3 cp /backup/videos.gz "s3://${BACKUP_BUCKET}/mongo/videos-${TS}.archive.gz" && + aws s3 cp /backup/mp3s.gz "s3://${BACKUP_BUCKET}/mongo/mp3s-${TS}.archive.gz" + env: + # Deterministic bucket name (vidcast-backups-{account-id}).
If you + # change terraform/modules/storage bucket_prefix, update this. + - name: BACKUP_BUCKET + value: vidcast-backups-501562869470 + - name: HOME + value: /home + volumeMounts: + - name: backup + mountPath: /backup + - name: home + mountPath: /home + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "250m" + memory: "256Mi" diff --git a/k8s/base/backup/postgres-backup-cronjob.yaml b/k8s/base/backup/postgres-backup-cronjob.yaml new file mode 100644 index 0000000..1cabefb --- /dev/null +++ b/k8s/base/backup/postgres-backup-cronjob.yaml @@ -0,0 +1,104 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: postgres-backup + labels: + app: postgres-backup + app.kubernetes.io/part-of: vidcast + app.kubernetes.io/component: backup +spec: + # Nightly at 02:15 UTC — staggered 15m after the Mongo dump so they don't both + # contend for node CPU/network at once. + schedule: "15 2 * * *" + concurrencyPolicy: Forbid + startingDeadlineSeconds: 600 + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + backoffLimit: 2 + activeDeadlineSeconds: 1800 + template: + metadata: + labels: + app: postgres-backup + app.kubernetes.io/part-of: vidcast + app.kubernetes.io/component: backup + spec: + serviceAccountName: vidcast-backup + restartPolicy: OnFailure + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + volumes: + - name: backup + emptyDir: {} + - name: home + emptyDir: {} + # pg_dump → /backup/dump.sql.gz, then the uploader ships it to S3. + initContainers: + - name: pgdump + # Match the running Postgres major (16) so pg_dump's output restores + # cleanly; 16.4-alpine is the same tag the Helm chart pins. 
+ image: postgres:16.4-alpine + command: + - /bin/sh + - -c + # Let pg_dump gzip and write the file itself (-Z/-f). A `pg_dump | gzip` pipe + # under /bin/sh exits with gzip's status, so a failed dump would still pass + # the init step and a truncated archive would be uploaded. + - > + pg_dump -h db -p 5432 -U pguser -d authdb -Z 6 -f /backup/dump.sql.gz + env: + # pg_dump reads the password from PGPASSWORD. The value lives in + # auth-secret (DATABASE_PASSWORD) — created by ESO from Parameter + # Store (prod) or deploy.sh (dev). + - name: PGPASSWORD + valueFrom: + secretKeyRef: + name: auth-secret + key: DATABASE_PASSWORD + volumeMounts: + - name: backup + mountPath: /backup + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + containers: + - name: upload + image: amazon/aws-cli:2.15.30 + command: + - /bin/sh + - -c + - > + aws s3 cp /backup/dump.sql.gz + "s3://${BACKUP_BUCKET}/postgres/postgres-$(date -u +%Y%m%dT%H%M%SZ).sql.gz" + env: + - name: BACKUP_BUCKET + value: vidcast-backups-501562869470 + - name: HOME + value: /home + volumeMounts: + - name: backup + mountPath: /backup + - name: home + mountPath: /home + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "250m" + memory: "256Mi" diff --git a/k8s/base/backup/serviceaccount.yaml b/k8s/base/backup/serviceaccount.yaml new file mode 100644 index 0000000..428d29b --- /dev/null +++ b/k8s/base/backup/serviceaccount.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: vidcast-backup + namespace: default + annotations: + # IRSA: binds this SA to the IAM role created by terraform/modules/storage + # (output: backup_irsa_role_arn). Role name is deterministic: + # "{cluster-name}-backup-irsa". Account 501562869470, cluster vidcast-cluster. + # The role allows s3:PutObject/ListBucket on the backup bucket ONLY.
+ eks.amazonaws.com/role-arn: arn:aws:iam::501562869470:role/vidcast-cluster-backup-irsa + labels: + app.kubernetes.io/part-of: vidcast + app.kubernetes.io/component: backup + app.kubernetes.io/managed-by: kustomize diff --git a/k8s/base/converter/configmap.yaml b/k8s/base/converter/configmap.yaml new file mode 100644 index 0000000..52bf2f2 --- /dev/null +++ b/k8s/base/converter/configmap.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: converter-configmap +data: + MP3_QUEUE: "mp3" + VIDEO_QUEUE: "video" + # A3 retry/DLQ tuning. After MAX_RETRIES failed attempts a message goes to the + # terminal .dlq; RETRY_TTL_MS is the delay (ms) a failed message waits in + # .retry before it is re-injected into the main queue. + MAX_RETRIES: "3" + RETRY_TTL_MS: "30000" + # A2 idempotency. "false" (default) = consumers behave exactly as before. "true" + # = claim-once on video_fid via Redis so a redelivery isn't converted twice. + # IDEMPOTENCY_TTL_SECONDS bounds the dedup window; REDIS_HOST is the in-cluster + # Redis Service. + IDEMPOTENCY_ENABLED: "false" + IDEMPOTENCY_TTL_SECONDS: "300" + REDIS_HOST: "redis" + # MONGODB_URI moved to the converter-secret Secret — it embeds the MongoDB + # username/password and must not live in a ConfigMap. The env var name is + # unchanged; envFrom pulls it from the Secret instead. diff --git a/k8s/base/converter/deployment.yaml b/k8s/base/converter/deployment.yaml new file mode 100644 index 0000000..216f33e --- /dev/null +++ b/k8s/base/converter/deployment.yaml @@ -0,0 +1,83 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: converter + labels: + app: converter +spec: + # 2 replicas, not 4: the single m7i-flex.large node (2 vCPU) cannot schedule + # 4 converters @ 250m CPU request alongside the other services — they sat + # Pending with "Insufficient cpu". 2 replicas is enough for demo throughput; + # scale up by adding nodes (raise the node group desired_size) if needed. 
+ # NOTE (A7): KEDA will drive this Deployment's replica count from the RabbitMQ + # `video` queue depth (scale-to-zero). The static count here is the floor used + # before KEDA is installed / when KEDA is disabled. + replicas: 2 + selector: + matchLabels: + app: converter + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + template: + metadata: + labels: + app: converter + spec: + # Disable legacy Docker-link service env vars. The `redis` Service otherwise + # injects REDIS_PORT=tcp://{cluster-ip}:6379, clobbering the plain "6379" idempotency.py + # expects → ValueError at import. This consumer needs no service-link vars. + enableServiceLinks: false + securityContext: + runAsNonRoot: true + runAsUser: 1000 + # B2 gap-fix: pod-level so it also covers any future init/sidecar + # containers. RuntimeDefault blocks ~44 dangerous syscalls (PSS Restricted) + # and satisfies the Kyverno require-seccomp-runtime-default policy. + seccompProfile: + type: RuntimeDefault + volumes: + - name: tmp-volume + emptyDir: {} + containers: + - name: converter + image: johnbaabalola/converter-service:16f49a0 + imagePullPolicy: IfNotPresent + ports: + # B4: prometheus metrics (start_http_server) — scraped by a PodMonitor. + - name: metrics + containerPort: 9000 + envFrom: + - configMapRef: + name: converter-configmap + - secretRef: + name: converter-secret + - secretRef: + name: rabbitmq-secret + env: + # Unbuffered stdout so print() diagnostics reach kubectl logs + # immediately, not on a block-buffer flush.
+ - name: PYTHONUNBUFFERED + value: "1" + volumeMounts: + - name: tmp-volume + mountPath: /tmp + resources: + requests: + cpu: "250m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + exec: + command: ["test", "-f", "/tmp/healthy"] + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 diff --git a/k8s/base/converter/kustomization.yaml b/k8s/base/converter/kustomization.yaml new file mode 100644 index 0000000..187d076 --- /dev/null +++ b/k8s/base/converter/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Base manifests for the converter-service (Pika + MoviePy/ffmpeg, queue +# consumer — no Service). Reads the `video` queue, writes mp3 to GridFS, +# publishes to the `mp3` queue. Liveness is exec-based (test -f /tmp/healthy). +resources: + - deployment.yaml + - configmap.yaml + +labels: + - pairs: + app.kubernetes.io/name: converter + app.kubernetes.io/component: converter-service + app.kubernetes.io/part-of: vidcast + includeSelectors: false + includeTemplates: true diff --git a/k8s/base/frontend/configmap.yaml b/k8s/base/frontend/configmap.yaml new file mode 100644 index 0000000..a6e9fb2 --- /dev/null +++ b/k8s/base/frontend/configmap.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: frontend-configmap +data: + VITE_API_URL: "/api" + VITE_GRAFANA_URL: "" diff --git a/k8s/base/frontend/deployment.yaml b/k8s/base/frontend/deployment.yaml new file mode 100644 index 0000000..0af1b6f --- /dev/null +++ b/k8s/base/frontend/deployment.yaml @@ -0,0 +1,63 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: frontend + labels: + app: frontend +spec: + replicas: 1 + selector: + matchLabels: + app: frontend + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + template: + metadata: + labels: + app: frontend + spec: + 
securityContext: + runAsNonRoot: true + runAsUser: 1001 + # B2 gap-fix: pod-level so it also covers any future init/sidecar + # containers. RuntimeDefault blocks ~44 dangerous syscalls (PSS Restricted) + # and satisfies the Kyverno require-seccomp-runtime-default policy. + seccompProfile: + type: RuntimeDefault + containers: + - name: frontend + # Image name is resolved by the overlay `images:` transformer to the + # real ECR path + tag (CI does not build the frontend, so it is not on + # Docker Hub like the backends). Base uses a bare, transformable name + # instead of the old "{account-id}.dkr.ecr…" literal placeholder. + image: vidcast-frontend:latest + ports: + - containerPort: 8080 + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" + securityContext: + readOnlyRootFilesystem: false + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + readinessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 diff --git a/k8s/base/frontend/kustomization.yaml b/k8s/base/frontend/kustomization.yaml new file mode 100644 index 0000000..a668ff4 --- /dev/null +++ b/k8s/base/frontend/kustomization.yaml @@ -0,0 +1,19 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Base manifests for the frontend (React + nginx, ClusterIP :8080). The image +# lives in this account's ECR and is set by the overlay images transformer. +# readOnlyRootFilesystem is intentionally false here (nginx writes its PID and +# temp paths); this is the one backend/frontend exception to the RO-rootfs rule.
+resources: + - deployment.yaml + - service.yaml + - configmap.yaml + +labels: + - pairs: + app.kubernetes.io/name: frontend + app.kubernetes.io/component: frontend + app.kubernetes.io/part-of: vidcast + includeSelectors: false + includeTemplates: true diff --git a/k8s/base/frontend/service.yaml b/k8s/base/frontend/service.yaml new file mode 100644 index 0000000..eb8fde1 --- /dev/null +++ b/k8s/base/frontend/service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: frontend + labels: + app: frontend +spec: + # P1/I2: ClusterIP only. The ALB Ingress (target-type: ip) registers the frontend + # pod IPs directly, so no NodePort is needed; the platform is reached at + # https://{alb-hostname} via the ALB instead of http://{node-ip}:30006. + type: ClusterIP + selector: + app: frontend + ports: + - port: 8080 + targetPort: 8080 + protocol: TCP diff --git a/k8s/base/gateway/configmap.yaml b/k8s/base/gateway/configmap.yaml new file mode 100644 index 0000000..e2c8aeb --- /dev/null +++ b/k8s/base/gateway/configmap.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: gateway-configmap +data: + AUTH_SVC_ADDRESS: "auth:5000" + # A1 transactional outbox feature flag. "false" (default) = the gateway + # publishes uploads directly to RabbitMQ exactly as before. Flip to "true" to + # route uploads through the MongoDB outbox collection (the outbox-relay then + # publishes them). Flip only after the relay image is deployed and verified. + OUTBOX_ENABLED: "false" + # MONGODB_VIDEOS_URI and MONGODB_MP3S_URI moved to the gateway-secret Secret — + # they embed the MongoDB username/password and must not live in a ConfigMap + # (ConfigMaps are not treated as sensitive and are easy to dump). The env var + # names are unchanged; envFrom pulls them from the Secret instead. See + # k8s/base/gateway/ (Secret provided out of band / via ESO after A9).
diff --git a/k8s/base/gateway/deployment.yaml b/k8s/base/gateway/deployment.yaml new file mode 100644 index 0000000..de64b39 --- /dev/null +++ b/k8s/base/gateway/deployment.yaml @@ -0,0 +1,86 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: gateway + labels: + app: gateway +spec: + replicas: 2 + selector: + matchLabels: + app: gateway + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 3 + template: + metadata: + labels: + app: gateway + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + # B2 gap-fix: pod-level so it also covers any future init/sidecar + # containers. RuntimeDefault blocks ~44 dangerous syscalls (PSS Restricted) + # and satisfies the Kyverno require-seccomp-runtime-default policy. + seccompProfile: + type: RuntimeDefault + volumes: + # Writable scratch dir. readOnlyRootFilesystem is true, but Werkzeug + # buffers multipart file uploads to a temp directory; without this the + # /upload handler fails with "No usable temporary directory found". + - name: tmp-volume + emptyDir: {} + containers: + - name: gateway + image: johnbaabalola/gateway-service:16f49a0 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 8080 + envFrom: + - configMapRef: + name: gateway-configmap + - secretRef: + name: gateway-secret + - secretRef: + name: rabbitmq-secret + env: + # Unbuffered stdout so print() (e.g. the admin role-change audit log) + # reaches kubectl logs immediately, not on a block-buffer flush. + - name: PYTHONUNBUFFERED + value: "1" + # B4: prometheus-client multiprocess sample dir. Lives on the writable + # /tmp emptyDir (readOnlyRootFilesystem is true); the 2 gunicorn workers + # write here and /metrics aggregates across them. 
+ - name: PROMETHEUS_MULTIPROC_DIR + value: /tmp/prometheus + volumeMounts: + - name: tmp-volume + mountPath: /tmp + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "300m" + memory: "256Mi" + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 diff --git a/k8s/base/gateway/kustomization.yaml b/k8s/base/gateway/kustomization.yaml new file mode 100644 index 0000000..368e790 --- /dev/null +++ b/k8s/base/gateway/kustomization.yaml @@ -0,0 +1,18 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Base manifests for the gateway-service (Flask + PyMongo + Pika, ClusterIP +# :8080). Fronts login/upload/download/admin. References gateway-secret + +# rabbitmq-secret (the latter is created by the RabbitMQ Helm chart). +resources: + - deployment.yaml + - service.yaml + - configmap.yaml + +labels: + - pairs: + app.kubernetes.io/name: gateway + app.kubernetes.io/component: gateway-service + app.kubernetes.io/part-of: vidcast + includeSelectors: false + includeTemplates: true diff --git a/k8s/base/gateway/service.yaml b/k8s/base/gateway/service.yaml new file mode 100644 index 0000000..643839b --- /dev/null +++ b/k8s/base/gateway/service.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Service +metadata: + name: gateway + labels: + # B4: the ServiceMonitor selects the Service by this label. + app: gateway +spec: + selector: + app: gateway + # I2/P1: ClusterIP only. The gateway is no longer publicly exposed on a NodePort; + # browsers and API clients reach it through the frontend (nginx proxies /api/ → + # gateway), which the ALB Ingress fronts. Smaller attack surface.
+ type: ClusterIP + ports: + # named so the B4 ServiceMonitor can reference it by name for /metrics scraping. + - name: http + port: 8080 + targetPort: 8080 + protocol: TCP diff --git a/k8s/base/notification/configmap.yaml b/k8s/base/notification/configmap.yaml new file mode 100644 index 0000000..5688dc4 --- /dev/null +++ b/k8s/base/notification/configmap.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: notification-configmap +data: + MP3_QUEUE: "mp3" + # A3 retry/DLQ tuning. After MAX_RETRIES failed attempts a message goes to the + # terminal .dlq; RETRY_TTL_MS is the delay (ms) a failed message waits in + # .retry before it is re-injected into the main queue. + MAX_RETRIES: "3" + RETRY_TTL_MS: "30000" + # A2 idempotency. "false" (default) = consumers behave exactly as before. "true" + # = claim-once on mp3_fid via Redis so a redelivery isn't emailed twice. + # IDEMPOTENCY_TTL_SECONDS bounds the dedup window; REDIS_HOST is the in-cluster + # Redis Service. + IDEMPOTENCY_ENABLED: "false" + IDEMPOTENCY_TTL_SECONDS: "300" + REDIS_HOST: "redis" + # Batch summary email: the notification service reads job_status to know when + # a multi-file batch is complete. This default URI has NO credentials, so on an + # auth-required mongod the connection fails and the service safely falls back to + # one email per file. To ENABLE batch summaries in a live cluster, two infra + # additions are needed (out of this code sprint, documented in the assessment): + # 1) override MONGODB_URI with the credentialed value in notification-secret + # (mongouser, authSource=admin), mirroring converter-secret — secretRef wins + # over this configmap value, so a real password never lives in this file; + # 2) a notification→mongodb:27017 NetworkPolicy egress rule (default-deny blocks + # it today), like Sprint 1's allow-backup-egress. + MONGODB_URI: "mongodb://mongodb:27017/videos" + # VIDCAST_URL: public web app URL used in notification emails (UX2/B3). 
Defaults + # in code to a dev placeholder; set the real ALB hostname in the prod overlay. + VIDCAST_URL: "http://localhost:30006" + # VIDEO_QUEUE removed: the notification consumer only reads MP3_QUEUE + # (consumer.py consumes os.environ.get("MP3_QUEUE")). The video queue is + # consumed exclusively by the converter service, so this value was never read. diff --git a/k8s/base/notification/deployment.yaml b/k8s/base/notification/deployment.yaml new file mode 100644 index 0000000..a380bc5 --- /dev/null +++ b/k8s/base/notification/deployment.yaml @@ -0,0 +1,76 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: notification + labels: + app: notification +spec: + replicas: 2 + selector: + matchLabels: + app: notification + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 8 + template: + metadata: + labels: + app: notification + spec: + # Disable legacy Docker-link service env vars. The `redis` Service otherwise + # injects REDIS_PORT=tcp://{cluster-ip}:6379, clobbering the plain "6379" idempotency.py + # expects → ValueError at import. This consumer needs no service-link vars. + enableServiceLinks: false + securityContext: + runAsNonRoot: true + runAsUser: 1000 + # B2 gap-fix: pod-level so it also covers any future init/sidecar + # containers. RuntimeDefault blocks ~44 dangerous syscalls (PSS Restricted) + # and satisfies the Kyverno require-seccomp-runtime-default policy. + seccompProfile: + type: RuntimeDefault + volumes: + - name: tmp-volume + emptyDir: {} + containers: + - name: notification + image: johnbaabalola/notification-service:16f49a0 + imagePullPolicy: IfNotPresent + ports: + # B4: prometheus metrics (start_http_server) — scraped by a PodMonitor. + - name: metrics + containerPort: 9000 + envFrom: + - configMapRef: + name: notification-configmap + - secretRef: + name: notification-secret + - secretRef: + name: rabbitmq-secret + env: + # Unbuffered stdout so print() diagnostics reach kubectl logs + # immediately, not on a block-buffer flush.
+ - name: PYTHONUNBUFFERED + value: "1" + volumeMounts: + - name: tmp-volume + mountPath: /tmp + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "100m" + memory: "128Mi" + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + exec: + command: ["test", "-f", "/tmp/healthy"] + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 diff --git a/k8s/base/notification/kustomization.yaml b/k8s/base/notification/kustomization.yaml new file mode 100644 index 0000000..75e0d86 --- /dev/null +++ b/k8s/base/notification/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Base manifests for the notification-service (Pika + smtplib, queue consumer — +# no Service). Reads the `mp3` queue, emails the uploader. Liveness is +# exec-based (test -f /tmp/healthy). +resources: + - deployment.yaml + - configmap.yaml + +labels: + - pairs: + app.kubernetes.io/name: notification + app.kubernetes.io/component: notification-service + app.kubernetes.io/part-of: vidcast + includeSelectors: false + includeTemplates: true diff --git a/k8s/base/outbox-relay/deployment.yaml b/k8s/base/outbox-relay/deployment.yaml new file mode 100644 index 0000000..ec2be7e --- /dev/null +++ b/k8s/base/outbox-relay/deployment.yaml @@ -0,0 +1,74 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: outbox-relay + labels: + app: outbox-relay +spec: + # CRITICAL: exactly ONE replica. The relay is the sole publisher of outbox + # events; a second replica would double-publish. Do not scale this up, and do + # not let an overlay/HPA touch it. Single replica = single publisher (A1, §3.3). + replicas: 1 + selector: + matchLabels: + app: outbox-relay + strategy: + # Recreate (not RollingUpdate): never run two relay pods at once, even briefly + # during a rollout, so the single-publisher invariant holds across deploys. 
+ type: Recreate + template: + metadata: + labels: + app: outbox-relay + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + # B2 gap-fix: pod-level so it also covers any future init/sidecar + # containers. RuntimeDefault blocks ~44 dangerous syscalls (PSS Restricted) + # and satisfies the Kyverno require-seccomp-runtime-default policy. + seccompProfile: + type: RuntimeDefault + volumes: + # Writable scratch for the /tmp/healthy liveness heartbeat under + # readOnlyRootFilesystem (same pattern as converter/notification). + - name: tmp-volume + emptyDir: {} + containers: + - name: outbox-relay + image: johnbaabalola/outbox-relay:latest + imagePullPolicy: Always # :latest is a mutable tag; IfNotPresent would keep running a stale node-cached image + envFrom: + # Reuse the gateway's existing credential sources — no new paths. + # gateway-secret provides MONGODB_VIDEOS_URI (the outbox lives in the + # same `videos` db); rabbitmq-secret provides the broker credentials. + - secretRef: + name: gateway-secret + - secretRef: + name: rabbitmq-secret + env: + - name: PYTHONUNBUFFERED + value: "1" + - name: OUTBOX_POLL_INTERVAL + value: "30" + volumeMounts: + - name: tmp-volume + mountPath: /tmp + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "100m" + memory: "128Mi" + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + exec: + command: ["test", "-f", "/tmp/healthy"] + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 diff --git a/k8s/base/outbox-relay/kustomization.yaml b/k8s/base/outbox-relay/kustomization.yaml new file mode 100644 index 0000000..c889822 --- /dev/null +++ b/k8s/base/outbox-relay/kustomization.yaml @@ -0,0 +1,18 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Base manifest for the outbox-relay (A1).
A single-replica publisher of the +# MongoDB `outbox` collection to RabbitMQ — no Service (it listens on no port; +# it is a background poller, like the converter/notification consumers). Liveness +# is exec-based (test -f /tmp/healthy). No configmap: it reads MONGODB_VIDEOS_URI +# from gateway-secret and the broker creds from rabbitmq-secret (see deployment). +resources: + - deployment.yaml + +labels: + - pairs: + app.kubernetes.io/name: outbox-relay + app.kubernetes.io/component: outbox-relay + app.kubernetes.io/part-of: vidcast + includeSelectors: false + includeTemplates: true diff --git a/k8s/base/redis/deployment.yaml b/k8s/base/redis/deployment.yaml new file mode 100644 index 0000000..e760f21 --- /dev/null +++ b/k8s/base/redis/deployment.yaml @@ -0,0 +1,84 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis + labels: + app: redis +spec: + # Single replica. The idempotency claim store is intentionally non-HA: claims + # are short-lived (TTL) and claim_once fails OPEN, so a Redis restart degrades + # to "possible occasional duplicate", never a stuck pipeline (risk 2.7). The + # managed/HA alternative (ElastiCache) is documented-but-skipped in + # MANAGED_SERVICES.md §5 per the cost boundary. + replicas: 1 + selector: + matchLabels: + app: redis + strategy: + type: Recreate + template: + metadata: + labels: + app: redis + spec: + securityContext: + runAsNonRoot: true + runAsUser: 999 + runAsGroup: 999 + fsGroup: 999 + # B2 gap-fix: pod-level so it also covers any future init/sidecar + # containers. RuntimeDefault blocks ~44 dangerous syscalls (PSS Restricted) + # and satisfies the Kyverno require-seccomp-runtime-default policy. + seccompProfile: + type: RuntimeDefault + volumes: + # In-memory only — no persistence. /data is mounted writable because + # redis chdirs there; with --save "" --appendonly no nothing is written. 
+ - name: data + emptyDir: {} + containers: + - name: redis + image: redis:7.4-alpine + imagePullPolicy: IfNotPresent + # No persistence (RDB snapshots off, AOF off) — this is an ephemeral + # dedup cache, not a database. maxmemory-policy evicts oldest claims if + # memory is ever pressured rather than OOM-killing the pod. + args: + - "redis-server" + - "--save" + - "" + - "--appendonly" + - "no" + - "--maxmemory" + - "100mb" + - "--maxmemory-policy" + - "allkeys-lru" + ports: + - containerPort: 6379 + volumeMounts: + - name: data + mountPath: /data + resources: + requests: + cpu: "50m" + memory: "128Mi" + limits: + cpu: "100m" + memory: "256Mi" + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + tcpSocket: + port: 6379 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + readinessProbe: + exec: + command: ["redis-cli", "ping"] + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 diff --git a/k8s/base/redis/kustomization.yaml b/k8s/base/redis/kustomization.yaml new file mode 100644 index 0000000..fb29c5e --- /dev/null +++ b/k8s/base/redis/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Base manifests for the in-cluster Redis (A2 idempotency claim store). Single +# replica, no persistence, ClusterIP only. The converter/notification consumers +# reach it at redis:6379 when IDEMPOTENCY_ENABLED=true. 
+resources: + - deployment.yaml + - service.yaml + +labels: + - pairs: + app.kubernetes.io/name: redis + app.kubernetes.io/component: idempotency-store + app.kubernetes.io/part-of: vidcast + includeSelectors: false + includeTemplates: true diff --git a/k8s/base/redis/service.yaml b/k8s/base/redis/service.yaml new file mode 100644 index 0000000..9df8995 --- /dev/null +++ b/k8s/base/redis/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: redis + labels: + app: redis +spec: + selector: + app: redis + ports: + - port: 6379 + targetPort: 6379 + # ClusterIP — internal only. The consumers reach it at REDIS_HOST=redis:6379. + type: ClusterIP diff --git a/k8s/external-secrets/README.md b/k8s/external-secrets/README.md new file mode 100644 index 0000000..ca4c158 --- /dev/null +++ b/k8s/external-secrets/README.md @@ -0,0 +1,98 @@ +# k8s/external-secrets/ — External Secrets Operator (A9) + +Replaces the manual, gitignored `secret.yaml` files as the source of truth for +VidCast's application secrets. Secrets live in **AWS SSM Parameter Store** and are +pulled into the cluster by the **External Secrets Operator (ESO)** via IRSA — no +long-lived AWS keys, no secrets in git. + +**Why Parameter Store, not Secrets Manager:** Secrets Manager bills +$0.40/secret/month (≈$3/mo for our 7 values, and it persists even when the +cluster is destroyed). Standard-tier SSM parameters are **free**, and +`SecureString` uses the **free** AWS-managed `alias/aws/ssm` key. This keeps the +project's "~$0 when the cluster is off" target. ESO supports both backends; the +only difference is `service: ParameterStore` in the ClusterSecretStore. 
+ +## Components + +| File | Purpose | +|---|---| +| `shared/serviceaccount.yaml` | `vidcast-eso` SA, annotated with the IRSA role ARN (Terraform `external_secrets_irsa_role_arn`) | +| `shared/cluster-secret-store.yaml` | `ClusterSecretStore` → Parameter Store, eu-west-2, auth via the SA | +| `dev/`, `prod/` | One `ExternalSecret` per service; each writes the Secret the Deployment consumes (`auth-secret`, `gateway-secret`, `converter-secret`, `notification-secret`) | + +## Prerequisites (one-time per cluster) + +1. **Apply the IRSA role** (part of the Terraform stack): + ```bash + cd terraform/environments/dev && terraform apply # creates *-external-secrets-irsa + ``` + Confirm the SA annotation matches the output: + ```bash + terraform output external_secrets_irsa_role_arn + ``` + +2. **Install ESO** (pin a chart version whose CRDs serve `external-secrets.io/v1` + — that is **>= 0.14**; check with `helm search repo … --versions`): + ```bash + helm repo add external-secrets https://charts.external-secrets.io + helm repo update + helm install external-secrets external-secrets/external-secrets \ + -n external-secrets --create-namespace \ + --version 0.14.0 # or later; CRDs install by default on recent charts + ``` + +## Seed the parameters + +Values are read from environment variables so **no secret is ever written to a +tracked file**. Source them from the gitignored `DEPLOYMENT_CONFIG.md` first. +`prod` shown; for `dev` swap the path prefix to `/vidcast/dev/`. 
+ +```bash +REGION=eu-west-2 +put() { aws ssm put-parameter --region "$REGION" --type SecureString --overwrite --name "$1" --value "$2"; } + +# auth +put /vidcast/prod/auth/psql-password "$POSTGRES_PASSWORD" +put /vidcast/prod/auth/jwt-secret "$JWT_SECRET" +# gateway (full Mongo URIs, user+pass embedded) +put /vidcast/prod/gateway/mongodb-videos-uri "mongodb://$MONGODB_USERNAME:$MONGODB_PASSWORD@mongodb:27017/videos?authSource=admin" +put /vidcast/prod/gateway/mongodb-mp3s-uri "mongodb://$MONGODB_USERNAME:$MONGODB_PASSWORD@mongodb:27017/mp3s?authSource=admin" +# converter +put /vidcast/prod/converter/mongodb-uri "mongodb://$MONGODB_USERNAME:$MONGODB_PASSWORD@mongodb:27017/mp3s?authSource=admin" +# notification +put /vidcast/prod/notification/gmail-address "$GMAIL_ADDRESS" +put /vidcast/prod/notification/gmail-password "$GMAIL_APP_PASSWORD" # strip spaces from the app password +``` + +## Deploy + +```bash +# After ESO is installed and parameters are seeded: +kubectl apply -k k8s/external-secrets/prod # or .../dev + +# ESO reconciles each ExternalSecret into the named Secret. Verify: +kubectl get externalsecret -n default +# NAME STORE READY +# auth-secret vidcast-parameter-store True +# ... +kubectl get secret auth-secret gateway-secret converter-secret notification-secret -n default +``` + +Then deploy the app (`kubectl apply -k k8s/overlays/prod`). The Deployments +reference these Secret names via `envFrom.secretRef`, unchanged — they neither +know nor care that ESO populated them. + +## Rotation + +Update the parameter (`put …` again) — ESO re-syncs within `refreshInterval` +(1h), or force it: `kubectl annotate externalsecret auth-secret force-sync=$(date +%s) --overwrite`. +Pods pick up the new value on their next restart (envFrom is read at start). 
+ +## What is NOT migrated here (honest scope) + +`rabbitmq-secret` (broker credentials) is still created by the RabbitMQ **Helm +chart**, because that same secret provisions the in-cluster broker itself — +having ESO own it would make the dev broker depend on ESO being up first. Broker +credentials move to Parameter Store when the broker moves to **Amazon MQ** +(managed), which is documented-but-not-applied in `MANAGED_SERVICES.md`. The +parameter convention is reserved: `/vidcast/<env>/rabbitmq/{username,password}`. diff --git a/k8s/external-secrets/dev/externalsecret-auth.yaml b/k8s/external-secrets/dev/externalsecret-auth.yaml new file mode 100644 index 0000000..3441f2d --- /dev/null +++ b/k8s/external-secrets/dev/externalsecret-auth.yaml @@ -0,0 +1,30 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: auth-secret + namespace: default +spec: + refreshInterval: 1h + secretStoreRef: + name: vidcast-parameter-store + kind: ClusterSecretStore + target: + # Produces the Secret named "auth-secret" — exactly what the auth Deployment + # references via envFrom.secretRef. ESO owns it (creates/updates/deletes). + name: auth-secret + creationPolicy: Owner + template: + type: Opaque + data: + # The auth service reads the password from the env var DATABASE_PASSWORD + # (server.py: psycopg2.connect(password=os.getenv('DATABASE_PASSWORD'))), and the + # Deployment injects this Secret via envFrom — so the key name MUST be + # DATABASE_PASSWORD. (It was PSQL_PASSWORD, which never matched what the app reads; + # the mismatch was masked while Postgres used trust auth, and surfaced when it was + # switched to scram-sha-256.) 
+ - secretKey: DATABASE_PASSWORD + remoteRef: + key: /vidcast/dev/auth/psql-password + - secretKey: JWT_SECRET + remoteRef: + key: /vidcast/dev/auth/jwt-secret diff --git a/k8s/external-secrets/dev/externalsecret-converter.yaml b/k8s/external-secrets/dev/externalsecret-converter.yaml new file mode 100644 index 0000000..6a13c97 --- /dev/null +++ b/k8s/external-secrets/dev/externalsecret-converter.yaml @@ -0,0 +1,19 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: converter-secret + namespace: default +spec: + refreshInterval: 1h + secretStoreRef: + name: vidcast-parameter-store + kind: ClusterSecretStore + target: + name: converter-secret + creationPolicy: Owner + template: + type: Opaque + data: + - secretKey: MONGODB_URI + remoteRef: + key: /vidcast/dev/converter/mongodb-uri diff --git a/k8s/external-secrets/dev/externalsecret-gateway.yaml b/k8s/external-secrets/dev/externalsecret-gateway.yaml new file mode 100644 index 0000000..2d62ddb --- /dev/null +++ b/k8s/external-secrets/dev/externalsecret-gateway.yaml @@ -0,0 +1,22 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: gateway-secret + namespace: default +spec: + refreshInterval: 1h + secretStoreRef: + name: vidcast-parameter-store + kind: ClusterSecretStore + target: + name: gateway-secret + creationPolicy: Owner + template: + type: Opaque + data: + - secretKey: MONGODB_VIDEOS_URI + remoteRef: + key: /vidcast/dev/gateway/mongodb-videos-uri + - secretKey: MONGODB_MP3S_URI + remoteRef: + key: /vidcast/dev/gateway/mongodb-mp3s-uri diff --git a/k8s/external-secrets/dev/externalsecret-notification.yaml b/k8s/external-secrets/dev/externalsecret-notification.yaml new file mode 100644 index 0000000..26de033 --- /dev/null +++ b/k8s/external-secrets/dev/externalsecret-notification.yaml @@ -0,0 +1,22 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: notification-secret + namespace: default +spec: + refreshInterval: 1h + 
secretStoreRef: + name: vidcast-parameter-store + kind: ClusterSecretStore + target: + name: notification-secret + creationPolicy: Owner + template: + type: Opaque + data: + - secretKey: GMAIL_ADDRESS + remoteRef: + key: /vidcast/dev/notification/gmail-address + - secretKey: GMAIL_PASSWORD + remoteRef: + key: /vidcast/dev/notification/gmail-password diff --git a/k8s/external-secrets/dev/kustomization.yaml b/k8s/external-secrets/dev/kustomization.yaml new file mode 100644 index 0000000..2dca61c --- /dev/null +++ b/k8s/external-secrets/dev/kustomization.yaml @@ -0,0 +1,15 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# DEV ESO overlay. Reads /vidcast/dev/* from Parameter Store and materialises +# the four app Secrets. Apply AFTER the ESO Helm chart is installed (the +# ExternalSecret/ClusterSecretStore CRDs must exist first): +# kubectl apply -k k8s/external-secrets/dev +namespace: default + +resources: + - ../shared + - externalsecret-auth.yaml + - externalsecret-gateway.yaml + - externalsecret-converter.yaml + - externalsecret-notification.yaml diff --git a/k8s/external-secrets/prod/externalsecret-auth.yaml b/k8s/external-secrets/prod/externalsecret-auth.yaml new file mode 100644 index 0000000..9eaca56 --- /dev/null +++ b/k8s/external-secrets/prod/externalsecret-auth.yaml @@ -0,0 +1,22 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: auth-secret + namespace: default +spec: + refreshInterval: 1h + secretStoreRef: + name: vidcast-parameter-store + kind: ClusterSecretStore + target: + name: auth-secret + creationPolicy: Owner + template: + type: Opaque + data: + - secretKey: DATABASE_PASSWORD + remoteRef: + key: /vidcast/prod/auth/psql-password + - secretKey: JWT_SECRET + remoteRef: + key: /vidcast/prod/auth/jwt-secret diff --git a/k8s/external-secrets/prod/externalsecret-converter.yaml b/k8s/external-secrets/prod/externalsecret-converter.yaml new file mode 100644 index 0000000..ee7dab5 --- /dev/null +++ 
b/k8s/external-secrets/prod/externalsecret-converter.yaml @@ -0,0 +1,19 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: converter-secret + namespace: default +spec: + refreshInterval: 1h + secretStoreRef: + name: vidcast-parameter-store + kind: ClusterSecretStore + target: + name: converter-secret + creationPolicy: Owner + template: + type: Opaque + data: + - secretKey: MONGODB_URI + remoteRef: + key: /vidcast/prod/converter/mongodb-uri diff --git a/k8s/external-secrets/prod/externalsecret-gateway.yaml b/k8s/external-secrets/prod/externalsecret-gateway.yaml new file mode 100644 index 0000000..56c756c --- /dev/null +++ b/k8s/external-secrets/prod/externalsecret-gateway.yaml @@ -0,0 +1,22 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: gateway-secret + namespace: default +spec: + refreshInterval: 1h + secretStoreRef: + name: vidcast-parameter-store + kind: ClusterSecretStore + target: + name: gateway-secret + creationPolicy: Owner + template: + type: Opaque + data: + - secretKey: MONGODB_VIDEOS_URI + remoteRef: + key: /vidcast/prod/gateway/mongodb-videos-uri + - secretKey: MONGODB_MP3S_URI + remoteRef: + key: /vidcast/prod/gateway/mongodb-mp3s-uri diff --git a/k8s/external-secrets/prod/externalsecret-notification.yaml b/k8s/external-secrets/prod/externalsecret-notification.yaml new file mode 100644 index 0000000..d96e694 --- /dev/null +++ b/k8s/external-secrets/prod/externalsecret-notification.yaml @@ -0,0 +1,22 @@ +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: notification-secret + namespace: default +spec: + refreshInterval: 1h + secretStoreRef: + name: vidcast-parameter-store + kind: ClusterSecretStore + target: + name: notification-secret + creationPolicy: Owner + template: + type: Opaque + data: + - secretKey: GMAIL_ADDRESS + remoteRef: + key: /vidcast/prod/notification/gmail-address + - secretKey: GMAIL_PASSWORD + remoteRef: + key: 
/vidcast/prod/notification/gmail-password diff --git a/k8s/external-secrets/prod/kustomization.yaml b/k8s/external-secrets/prod/kustomization.yaml new file mode 100644 index 0000000..149943e --- /dev/null +++ b/k8s/external-secrets/prod/kustomization.yaml @@ -0,0 +1,14 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# PROD ESO overlay. Reads /vidcast/prod/* from Parameter Store and materialises +# the four app Secrets. Apply AFTER the ESO Helm chart is installed: +# kubectl apply -k k8s/external-secrets/prod +namespace: default + +resources: + - ../shared + - externalsecret-auth.yaml + - externalsecret-gateway.yaml + - externalsecret-converter.yaml + - externalsecret-notification.yaml diff --git a/k8s/external-secrets/shared/cluster-secret-store.yaml b/k8s/external-secrets/shared/cluster-secret-store.yaml new file mode 100644 index 0000000..313f747 --- /dev/null +++ b/k8s/external-secrets/shared/cluster-secret-store.yaml @@ -0,0 +1,23 @@ +apiVersion: external-secrets.io/v1 +kind: ClusterSecretStore +metadata: + name: vidcast-parameter-store + labels: + app.kubernetes.io/part-of: vidcast + app.kubernetes.io/managed-by: kustomize +spec: + provider: + aws: + # ParameterStore (NOT SecretsManager) — standard-tier params are free and + # SecureString uses the free AWS-managed alias/aws/ssm key. See + # MANAGED_SECRETS_EXPLAINED.md for the cost rationale. + service: ParameterStore + region: eu-west-2 + auth: + # IRSA via the vidcast-eso ServiceAccount. ESO mints a token for this SA + # (TokenRequest) and exchanges it for the IAM role's temporary creds — + # no static AWS keys anywhere. 
+ jwt: + serviceAccountRef: + name: vidcast-eso + namespace: default diff --git a/k8s/external-secrets/shared/kustomization.yaml b/k8s/external-secrets/shared/kustomization.yaml new file mode 100644 index 0000000..4081d89 --- /dev/null +++ b/k8s/external-secrets/shared/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Env-agnostic ESO plumbing: the ServiceAccount (IRSA) and the cluster-scoped +# ClusterSecretStore. Referenced by both the dev/ and prod/ overlays. Applying +# it twice is idempotent (only one env's ExternalSecrets are applied at a time +# on the single cluster). +resources: + - serviceaccount.yaml + - cluster-secret-store.yaml diff --git a/k8s/external-secrets/shared/serviceaccount.yaml b/k8s/external-secrets/shared/serviceaccount.yaml new file mode 100644 index 0000000..ef43650 --- /dev/null +++ b/k8s/external-secrets/shared/serviceaccount.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: vidcast-eso + namespace: default + annotations: + # IRSA: binds this SA to the IAM role created by + # terraform/modules/external-secrets (output: external_secrets_irsa_role_arn). + # Role name is deterministic: "<cluster-name>-external-secrets-irsa". + # Account 501562869470, cluster vidcast-cluster, region eu-west-2. + eks.amazonaws.com/role-arn: arn:aws:iam::501562869470:role/vidcast-cluster-external-secrets-irsa + labels: + app.kubernetes.io/part-of: vidcast + app.kubernetes.io/managed-by: kustomize diff --git a/k8s/ingress/alb-controller-values.yaml b/k8s/ingress/alb-controller-values.yaml new file mode 100644 index 0000000..5361a20 --- /dev/null +++ b/k8s/ingress/alb-controller-values.yaml @@ -0,0 +1,37 @@ +# Helm values for the AWS Load Balancer Controller (eks/aws-load-balancer-controller). +# Install into kube-system. The controller watches Ingress resources with +# ingressClassName: alb and provisions/manages the ALB. 
+# +# PLACEHOLDERS filled at deploy time (do NOT commit real values): +# ${LBC_IRSA_ROLE_ARN} — terraform output lbc_irsa_role_arn (module.lbc) +# ${VPC_ID} — terraform output vpc_id +# +# Install (see docs/INGRESS_DEPLOY.md): +# helm repo add eks https://aws.github.io/eks-charts +# helm install aws-load-balancer-controller eks/aws-load-balancer-controller \ +# -n kube-system -f k8s/ingress/alb-controller-values.yaml \ +# --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=$LBC_IRSA_ROLE_ARN \ +# --set vpcId=$VPC_ID +clusterName: vidcast-cluster +region: eu-west-2 +vpcId: "${VPC_ID}" + +serviceAccount: + # Create the SA named exactly as the IRSA trust policy expects + # (system:serviceaccount:kube-system:aws-load-balancer-controller). + create: true + name: aws-load-balancer-controller + annotations: + eks.amazonaws.com/role-arn: "${LBC_IRSA_ROLE_ARN}" + +# Single replica is fine at this scale (the controller is control-plane only; an +# ALB it already created keeps serving traffic during a brief controller restart). +replicaCount: 1 + +resources: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 200m + memory: 128Mi diff --git a/k8s/ingress/cert-manager/cluster-issuer.yaml b/k8s/ingress/cert-manager/cluster-issuer.yaml new file mode 100644 index 0000000..40aba3a --- /dev/null +++ b/k8s/ingress/cert-manager/cluster-issuer.yaml @@ -0,0 +1,28 @@ +# cert-manager ClusterIssuer (Let's Encrypt production). +# +# NOTE ON HOW THIS FITS THE ALB PATH: the ALB terminates TLS with an ACM +# certificate (see k8s/ingress/vidcast-ingress.yaml certificate-arn), NOT with a +# cert-manager-issued Kubernetes secret — an ALB cannot read in-cluster TLS +# secrets. This issuer is therefore provided as the ALTERNATIVE path: use it if you +# switch to an in-cluster ingress controller (e.g. ingress-nginx) that consumes k8s +# TLS secrets, or to issue/renew a cert via DNS-01 that you then import into ACM. 
+# For the default ALB+ACM path you do not need to install cert-manager at all. +# +# PLACEHOLDER filled at deploy time: ${ALERT_EMAIL} (from DEPLOYMENT_CONFIG.md) — +# Let's Encrypt sends expiry/issuance notices here. +# +# Prereq if used: helm install cert-manager jetstack/cert-manager --set installCRDs=true +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-prod +spec: + acme: + server: https://acme-v02.api.letsencrypt.org/directory + email: "${ALERT_EMAIL}" + privateKeySecretRef: + name: letsencrypt-prod-key + solvers: + - http01: + ingress: + ingressClassName: alb diff --git a/k8s/ingress/vidcast-ingress.yaml b/k8s/ingress/vidcast-ingress.yaml new file mode 100644 index 0000000..09726ee --- /dev/null +++ b/k8s/ingress/vidcast-ingress.yaml @@ -0,0 +1,59 @@ +# Public entrypoint for VidCast via an AWS ALB (provisioned by the AWS +# Load Balancer Controller from this Ingress). +# +# ROUTING DECISION (important): a single rule sends ALL paths to the `frontend` +# service. The frontend's nginx already serves the React SPA AND proxies `/api/` → +# gateway:8080 *stripping the /api prefix* (src/frontend/nginx.conf). An ALB cannot +# rewrite/strip path prefixes, so routing `/api` straight to the gateway would +# deliver `/api/login` to a gateway that only knows `/login` → 404. Going through +# the frontend preserves the working request path for browsers AND API clients +# (https://<host>/api/login) and keeps the gateway internal (ClusterIP, reachable +# only via the frontend) — a smaller attack surface. +# +# TLS DECISION: the ALB terminates TLS using an ACM certificate (certificate-arn +# below). The ALB does NOT consume cert-manager's in-cluster TLS secrets, so the +# cert-manager ClusterIssuer (k8s/ingress/cert-manager/) is included only as the +# alternative path (in-cluster ingress, or DNS-01 issuance you then import to ACM). +# See docs/INGRESS_DEPLOY.md. 
+# +# PLACEHOLDERS filled at deploy time from DEPLOYMENT_CONFIG.md / terraform outputs: +# ${VIDCAST_HOSTNAME} — the public DNS name (Route 53 → ALB) +# ${ACM_CERTIFICATE_ARN} — ACM cert ARN covering ${VIDCAST_HOSTNAME} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: vidcast-ingress + namespace: default + annotations: + alb.ingress.kubernetes.io/scheme: internet-facing + # target-type: ip registers pod IPs directly (works with ClusterIP services). + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/listen-ports: '[{"HTTP":80,"HTTPS":443}]' + # Redirect all HTTP to HTTPS at the ALB. + alb.ingress.kubernetes.io/ssl-redirect: "443" + alb.ingress.kubernetes.io/certificate-arn: "${ACM_CERTIFICATE_ARN}" + # Health check the SPA root. + alb.ingress.kubernetes.io/healthcheck-path: / + # Named ALB group so additional Ingresses (e.g. a future grafana host) can share + # this one ALB instead of each provisioning their own. + alb.ingress.kubernetes.io/group.name: vidcast + # The LBC chart creates the `alb` IngressClass (createIngressClassResource=true). +spec: + ingressClassName: alb + tls: + - hosts: + - "${VIDCAST_HOSTNAME}" + # Not used by the ALB (it terminates with ACM) but documents the host→cert + # intent and is harmless. + secretName: vidcast-tls + rules: + - host: "${VIDCAST_HOSTNAME}" + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: frontend + port: + number: 8080 diff --git a/k8s/keda/README.md b/k8s/keda/README.md new file mode 100644 index 0000000..f88eba2 --- /dev/null +++ b/k8s/keda/README.md @@ -0,0 +1,58 @@ +# k8s/keda — Autoscaling (A7) + +KEDA-driven scale-to-zero for the **converter** + a CPU HPA for the **gateway**. 
+ +## What's here + +| File | Purpose | +|---|---| +| `values.yaml` | KEDA Helm install values (conservative resources for the 2-vCPU node) | +| `triggerauthentication.yaml` | `TriggerAuthentication` → reads the broker connection string from `keda-rabbitmq-secret` | +| `scaledobject-converter.yaml` | `ScaledObject` → scales **converter** 0→2 on `video` queue depth | +| `hpa-gateway.yaml` | `HorizontalPodAutoscaler` → scales **gateway** 1→3 on CPU 70% | +| `secret.yaml.example` | template for the gitignored `secret.yaml` (the `host` amqp URI) | + +## Why two different autoscalers + +- **Converter → KEDA (queue depth, scale-to-zero).** The converter is an async, + CPU-heavy, bursty queue consumer that is idle most of the time. KEDA scales it on + `video` queue length and to **zero** when there's no work — no idle CPU burn. +- **Gateway → HPA (CPU).** The gateway is the synchronous, user-facing request + tier; it must always have ≥1 replica and scales on CPU load. + +**They target different deployments**, so the two controllers never fight over the +same replica count (a classic KEDA+HPA footgun). + +## Install order (CRDs first) + +```bash +helm repo add kedacore https://kedacore.github.io/charts && helm repo update +helm install keda kedacore/keda -n keda --create-namespace -f k8s/keda/values.yaml + +# broker connection string for KEDA (gitignored; from secret.yaml.example or ESO) +cp k8s/keda/secret.yaml.example k8s/keda/secret.yaml # then edit, OR use ESO +kubectl apply -f k8s/keda/secret.yaml + +kubectl apply -k k8s/keda +``` + +## Prerequisites + +- **metrics-server** must be installed for the gateway CPU HPA (EKS doesn't bundle + it). Without it the HPA reports `<unknown>` CPU and won't scale. +- The gateway has a CPU **request** (100m) — required for utilisation-% targeting. 
+ +## Verify + +```bash +kubectl get scaledobject,hpa -n default +kubectl describe scaledobject converter-scaler # READY/ACTIVE conditions +# scale-to-zero: with an empty video queue, converter replicas -> 0 after cooldown +kubectl get deploy converter -w +# scale-up: publish a burst to the video queue, watch replicas climb toward 2 (maxReplicaCount) +``` + +> Note: once KEDA owns the converter, its replica count is managed by KEDA, not the +> overlay. The `replicas:` in the converter base manifest is only the pre-KEDA +> bootstrap value; re-applying the overlay may briefly reset it until KEDA +> reconciles. See `AUTOSCALING_EXPLAINED.md`. diff --git a/k8s/keda/hpa-gateway.yaml b/k8s/keda/hpa-gateway.yaml new file mode 100644 index 0000000..d3ea245 --- /dev/null +++ b/k8s/keda/hpa-gateway.yaml @@ -0,0 +1,25 @@ +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: gateway-hpa + namespace: default +spec: + # Targets the GATEWAY deployment ONLY (distinct from the KEDA ScaledObject, + # which targets the converter). A plain built-in HPA — no KEDA dependency. + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: gateway + minReplicas: 1 + maxReplicas: 3 + metrics: + # CPU utilisation as a % of the gateway's CPU *request* (100m). Requires + # metrics-server in the cluster. The gateway is the synchronous, user-facing + # request tier, so CPU is the right scale signal (vs queue depth for the async + # converter). + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 70 diff --git a/k8s/keda/kustomization.yaml b/k8s/keda/kustomization.yaml new file mode 100644 index 0000000..5599bf4 --- /dev/null +++ b/k8s/keda/kustomization.yaml @@ -0,0 +1,25 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# A7 autoscaling resources. 
Applied SEPARATELY from the app overlay (like the ESO +# resources) because the ScaledObject/TriggerAuthentication are KEDA CRDs that +# must exist first — install KEDA before applying this: +# helm install keda kedacore/keda -n keda --create-namespace -f k8s/keda/values.yaml +# kubectl apply -f k8s/keda/secret.yaml # (gitignored; from secret.yaml.example or ESO) +# kubectl apply -k k8s/keda +# +# The HPA is a built-in type (no CRD) but lives here for cohesion. The secret is +# NOT in this kustomization — it is gitignored and applied out of band. +namespace: default + +resources: + - triggerauthentication.yaml + - scaledobject-converter.yaml + - hpa-gateway.yaml + +labels: + - pairs: + app.kubernetes.io/part-of: vidcast + app.kubernetes.io/managed-by: kustomize + includeSelectors: false + includeTemplates: false diff --git a/k8s/keda/scaledobject-converter.yaml b/k8s/keda/scaledobject-converter.yaml new file mode 100644 index 0000000..9615765 --- /dev/null +++ b/k8s/keda/scaledobject-converter.yaml @@ -0,0 +1,30 @@ +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: converter-scaler + namespace: default +spec: + # Targets the CONVERTER deployment ONLY. The gateway is scaled by a separate + # HPA (hpa-gateway.yaml) on a DIFFERENT deployment — the two controllers never + # share a scaleTargetRef, so they cannot fight over the same replica count. + scaleTargetRef: + name: converter + pollingInterval: 15 # check the queue every 15s + cooldownPeriod: 60 # wait 60s after the queue drains before scaling down + minReplicaCount: 0 # scale to ZERO when there is no work (the point of KEDA + # here — converters are bursty, CPU-heavy, and idle most + # of the time) + maxReplicaCount: 2 # single-node constraint: the 2-vCPU node cannot schedule + # 3 converter replicas at any resource level (datastore + # requests are now scheduler-visible, Sprint 4 gap-fix). + # Horizontal node scaling in production would raise this cap. 
+ triggers: + - type: rabbitmq + metadata: + protocol: amqp + queueName: video # scale on the MAIN video queue depth (not retry/dlq) + mode: QueueLength + value: "5" # ~5 queued messages per converter replica is the + # target; KEDA adds replicas (up to maxReplicaCount) as it grows + authenticationRef: + name: keda-rabbitmq-auth diff --git a/k8s/keda/secret.yaml.example b/k8s/keda/secret.yaml.example new file mode 100644 index 0000000..a4de3f5 --- /dev/null +++ b/k8s/keda/secret.yaml.example @@ -0,0 +1,21 @@ +# EXAMPLE — copy to k8s/keda/secret.yaml (gitignored by **/secret.yaml) and fill +# in the real broker credentials, or provision it via ESO (A9). KEDA's +# TriggerAuthentication reads the `host` key as the rabbitmq connection string. +# +# The value is a full amqp URI: amqp://<user>:<pass>@rabbitmq:5672/ +# (user/pass match the rabbitmq-secret / RabbitMQ Helm values; host "rabbitmq" is +# the in-cluster Service; vhost "/" is URL-encoded as the trailing slash). +# +# ESO alternative (preferred, no plaintext in a file): +# aws ssm put-parameter --name /vidcast/<env>/keda/rabbitmq-host --type SecureString \ +# --value "amqp://$RMQ_USER:$RMQ_PASS@rabbitmq:5672/" +# ...and add an ExternalSecret writing keda-rabbitmq-secret.host from it +# (see k8s/external-secrets/ for the pattern). +apiVersion: v1 +kind: Secret +metadata: + name: keda-rabbitmq-secret + namespace: default +type: Opaque +stringData: + host: "amqp://guest:guest@rabbitmq:5672/" diff --git a/k8s/keda/triggerauthentication.yaml b/k8s/keda/triggerauthentication.yaml new file mode 100644 index 0000000..08f011f --- /dev/null +++ b/k8s/keda/triggerauthentication.yaml @@ -0,0 +1,17 @@ +apiVersion: keda.sh/v1alpha1 +kind: TriggerAuthentication +metadata: + name: keda-rabbitmq-auth + namespace: default +spec: + # KEDA's rabbitmq scaler needs a full connection string (it embeds credentials), + # so it must come from a Secret, not ScaledObject metadata. 
The `host` parameter + # maps to the `host` key of keda-rabbitmq-secret (see secret.yaml.example). + # + # The stock rabbitmq-secret only holds RABBITMQ_DEFAULT_USER/PASS, not a combined + # amqp URI, which is why this dedicated secret exists. With ESO (A9) this becomes + # an ExternalSecret pulling /vidcast//keda/rabbitmq-host from Parameter Store. + secretTargetRef: + - parameter: host + name: keda-rabbitmq-secret + key: host diff --git a/k8s/keda/values.yaml b/k8s/keda/values.yaml new file mode 100644 index 0000000..db61571 --- /dev/null +++ b/k8s/keda/values.yaml @@ -0,0 +1,29 @@ +# Helm values for the KEDA install (A7). +# helm repo add kedacore https://kedacore.github.io/charts +# helm install keda kedacore/keda -n keda --create-namespace -f k8s/keda/values.yaml +# +# Conservative resources for the 2-vCPU node: KEDA runs three pods (operator, +# metrics API server, admission webhook). Total requests ≈ 125m / 160Mi — kept +# small because the node already carries the app + datastores + Redis + relay. +resources: + operator: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + metricServer: + requests: + cpu: 50m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + webhooks: + requests: + cpu: 25m + memory: 32Mi + limits: + cpu: 50m + memory: 64Mi diff --git a/k8s/kubecost/README.md b/k8s/kubecost/README.md new file mode 100644 index 0000000..4fa99d5 --- /dev/null +++ b/k8s/kubecost/README.md @@ -0,0 +1,52 @@ +# k8s/kubecost — FinOps cost visibility (B3) + +Kubecost (OSS / OpenCost core, **no license key**) for per-namespace / per-service / +per-conversion cost. Installed **last** in the upgrade plan because it is the +heaviest add-on and the most likely to pressure the single 2-vCPU node. + +## The one tuning that matters + +By default Kubecost deploys its **own** Prometheus + node-exporter + +kube-state-metrics (~1 CPU) — a duplicate of the kube-prometheus-stack from B4. 
+`values.yaml` **disables all of that** and points Kubecost at the existing +Prometheus, reducing it to a single ~175m cost-analyzer pod. Without this it does +not fit the node. + +## Install (applied separately, like KEDA/ESO/Kyverno/Argo) + +```bash +helm repo add kubecost https://kubecost.github.io/cost-analyzer/ && helm repo update +helm install kubecost kubecost/cost-analyzer -n kubecost --create-namespace \ + -f k8s/kubecost/values.yaml +kubectl apply -f monitoring/scrape/kubecost-servicemonitor.yaml # Prometheus scrapes cost metrics +``` + +## ⚠️ Node-budget gate (do NOT skip) + +Even tuned to ~175m, Kubecost pushes the **prod** footprint over the 90% idle gate +(see the B3 review note). Run it **against the dev (1-replica) footprint** (~81% +idle), **or** scale it to zero between cost-analysis sessions: + +```bash +kubectl scale deploy/kubecost-cost-analyzer -n kubecost --replicas=0 # park it +kubectl scale deploy/kubecost-cost-analyzer -n kubecost --replicas=1 # bring it back; Prometheus 7d backfills +``` + +## Verify (live cluster) + +```bash +kubectl get pods -n kubecost # cost-analyzer Running +# cost metrics present in Prometheus (Status ▸ Targets shows vidcast-kubecost UP): +# node_total_hourly_cost, container_cpu_allocation, ... +kubectl port-forward -n kubecost deploy/kubecost-cost-analyzer 9090:9090 # optional Kubecost UI +``` + +Then load `monitoring/dashboards/vidcast-finops.json` in Grafana. + +## Accuracy + +Kubecost **estimates** from instance list pricing; **AWS Cost Explorer is ground +truth**. m7i-flex.large ≈ **$0.106/hr** (eu-west-2 on-demand — verify current +pricing). Reconcile the dashboard's monthly projection against the real bill; they +will differ (Kubecost doesn't see RIs/Savings Plans, data-transfer, or control-plane +charges unless configured). See `FINOPS_EXPLAINED.md`. 
diff --git a/k8s/kubecost/values-local.yaml b/k8s/kubecost/values-local.yaml new file mode 100644 index 0000000..e0d8856 --- /dev/null +++ b/k8s/kubecost/values-local.yaml @@ -0,0 +1,19 @@ +# Local override for this dev cluster, applied after values.yaml: +# helm install kubecost kubecost/cost-analyzer -n kubecost \ +# -f k8s/kubecost/values.yaml -f k8s/kubecost/values-local.yaml +# +# 1) clusterId: the current cost-analyzer chart bundles a finopsagent subchart that +# hard-requires global.clusterId. We don't use the cloud agent, so disable it and +# set a clusterId for completeness. +# 2) persistentVolume disabled: this cluster has no dynamic EBS provisioner +# (no aws-ebs-csi-driver; in-tree provisioner dead on EKS 1.31). A PVC would hang +# Pending. Kubecost falls back to an emptyDir ETL cache; cost history beyond the +# pod lifetime is still backfilled from the 7d Prometheus retention. +global: + clusterId: vidcast-cluster + +finopsagent: + enabled: false + +persistentVolume: + enabled: false diff --git a/k8s/kubecost/values.yaml b/k8s/kubecost/values.yaml new file mode 100644 index 0000000..362ece2 --- /dev/null +++ b/k8s/kubecost/values.yaml @@ -0,0 +1,63 @@ +# Kubecost (B3 — FinOps), OSS / OpenCost core. NO license key. +# helm repo add kubecost https://kubecost.github.io/cost-analyzer/ +# helm install kubecost kubecost/cost-analyzer -n kubecost --create-namespace \ +# -f k8s/kubecost/values.yaml +# +# ┌─ THE node-budget tuning (the whole reason this fits a 2-vCPU node) ─────────┐ +# │ By DEFAULT Kubecost stands up its OWN Prometheus + node-exporter + │ +# │ kube-state-metrics — heavy, and a duplicate of the kube-prometheus-stack we │ +# │ already run (B4). We DISABLE all of that and point Kubecost at the existing │ +# │ Prometheus in the `monitoring` namespace. That turns Kubecost from a ~1 CPU │ +# │ add-on into a single ~175m cost-analyzer pod. 
│ +# └─────────────────────────────────────────────────────────────────────────────┘ +global: + prometheus: + enabled: false # do NOT deploy a second Prometheus + fqdn: http://monitoring-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090 + grafana: + enabled: false # we already have Grafana (B4); the FinOps dashboard loads there + proxy: false + +# Belt-and-braces: ensure none of the bundled exporters deploy. +prometheus: + nodeExporter: + enabled: false + kube-state-metrics: + disabled: true + server: + # unused (global.prometheus.enabled=false), but pinned small if ever toggled + resources: + requests: {cpu: 10m, memory: 32Mi} + +# OSS / OpenCost core — no productKey, no enterprise features. +kubecostProductConfigs: + clusterName: vidcast-cluster + +# Disable the heavyweight optional subsystems we don't need on one node. +networkCosts: + enabled: false # eBPF per-pod network cost agent (a DaemonSet) — off +clusterController: + enabled: false +forecasting: + enabled: false +kubecostAggregator: + enabled: false + +# ~175m / 224Mi total (model + frontend). Defaults are far higher (model alone +# defaults ~500m/512Mi). Tuned down for the single 2-vCPU node — see the B3 review +# note: even at this size Kubecost pushes the PROD footprint over the 90% idle gate, +# so it is intended to run against the dev (1-replica) footprint or be scaled to 0 +# between cost-analysis sessions (Prometheus 7d retention backfills history). +kubecostModel: + resources: + requests: {cpu: "150m", memory: "192Mi"} + limits: {cpu: "300m", memory: "384Mi"} +kubecostFrontend: + resources: + requests: {cpu: "25m", memory: "32Mi"} + limits: {cpu: "50m", memory: "64Mi"} + +# Small PV for Kubecost's local ETL cache (cost history beyond Prometheus retention). 
+persistentVolume: + enabled: true + size: 2Gi diff --git a/k8s/kyverno/README.md b/k8s/kyverno/README.md new file mode 100644 index 0000000..376e525 --- /dev/null +++ b/k8s/kyverno/README.md @@ -0,0 +1,96 @@ +# k8s/kyverno — Policy-as-Code (B2) + +Seven Kyverno `ClusterPolicy` resources that enforce security/best-practice rules +at admission. **Every policy is in `Audit` mode in Sprint 3** — violations are +reported, nothing is blocked. + +## Policies + +| Policy | Rejects | Mode | +|---|---|---| +| `disallow-latest-tag` | untagged / `:latest` images | Audit | +| `require-requests-limits` | containers without cpu+mem requests AND limits | Audit | +| `require-non-root` | pods not running as non-root | Audit | +| `require-seccomp-runtime-default` | pods without seccomp RuntimeDefault | Audit | +| `require-labels` | pods missing app / environment / app.kubernetes.io/managed-by | Audit | +| `disallow-privileged` | privileged containers + SYS_ADMIN/NET_ADMIN/ALL caps | Audit | +| `verify-images` | **ACTIVATED (B5)** — unsigned `docker.io//*` + ECR `vidcast-frontend` images (cosign keyless) | Audit | + +System and platform namespaces (`kube-system`, `kyverno`, `argocd`, `keda`, +`external-secrets`, `monitoring`, …) are **excluded** so the Audit report stays +focused on the VidCast app in `default`. + +## Install (applied separately, like ESO/KEDA/Argo) + +```bash +helm repo add kyverno https://kyverno.github.io/kyverno && helm repo update +helm install kyverno kyverno/kyverno -n kyverno --create-namespace -f k8s/kyverno/values.yaml +kubectl apply -k k8s/kyverno # the ClusterPolicies (CRDs → need Kyverno first) +``` + +## Verify + +```bash +kubectl get clusterpolicy # all 7 should be READY=true +kubectl get policyreport -A # per-namespace pass/fail (Audit results) +kubectl get clusterpolicyreport + +# manual Audit test: a pod that violates several policies is ADMITTED (Audit), then +# shows up as failures in the report. 
+kubectl run audit-test --image=nginx:latest --restart=Never -n default +kubectl get policyreport -n default -o wide # see audit-test fail disallow-latest-tag, require-* ... +kubectl delete pod audit-test -n default +``` + +On a torn-down cluster this is **runtime-verify on re-apply** — the artifacts now +are the 7 policy files (validated with `kustomize build` + YAML parse). + +## Audit → Enforce promotion (NOT in Sprint 3 — deliberate follow-up) + +Do this only after the known violations (see the B2 review note / gap analysis) are +fixed, one policy at a time: + +```bash +kubectl get policyreport -A # 1. review every violation +# 2. fix the offending manifests (datastore resources/securityContext/labels, seccomp +# on app pods, outbox-relay + postgres image tags) — a separate clean commit +# 3. per policy, flip Audit -> Enforce once its violations are zero: +kubectl patch clusterpolicy require-non-root --type merge \ + -p '{"spec":{"validationFailureAction":"Enforce"}}' +# 4. verify-images stays Audit until B5 signing exists; promote it LAST. +``` + +Never bulk-flip all policies to Enforce — promote each only when its report is clean, +or you'll block legitimate deploys. + +## B5 — verify-images cosign test (live cluster) + +`verify-images` is now pointed at the real repos + the real keyless identity but +stays **Audit**. Until the operator's CI signs images, the Audit report will show our +images as **FAIL ("no signature")** — that is the expected "not yet signed" state. 
+ +Prereq: the Sigstore egress carve-out so Kyverno can reach Fulcio/Rekor/TUF + +the registries: + +```bash +kubectl apply -f k8s/network-policies/allow-kyverno-sigstore-egress.yaml # kyverno ns +``` + +Once CI is signing, prove PASS vs FAIL on a live cluster: + +```bash +# PASS: a signed VidCast image verifies (after the cosign-sign CI job has run) +kubectl run sig-pass --image=docker.io/<dockerhub-user>/gateway-service:<signed-tag> \ + --restart=Never -n default +kubectl describe policyreport -n default | grep -A3 verify-images # result: pass + +# FAIL: an unsigned image is reported (Audit → still admitted, but flagged) +kubectl run sig-fail --image=docker.io/<dockerhub-user>/gateway-service:<unsigned-tag> \ + --restart=Never -n default +kubectl describe policyreport -n default | grep -A3 verify-images # result: fail + +kubectl delete pod sig-pass sig-fail -n default +``` + +Promote `verify-images` to **Enforce LAST** (and set `mutateDigest: true`) only +after a real signed image shows PASS here. Identity + chain: `SUPPLY_CHAIN.md`. diff --git a/k8s/kyverno/disallow-latest-tag.yaml b/k8s/kyverno/disallow-latest-tag.yaml new file mode 100644 index 0000000..f5a65df --- /dev/null +++ b/k8s/kyverno/disallow-latest-tag.yaml @@ -0,0 +1,53 @@ +# WHAT: rejects containers whose image is untagged or uses the `:latest` tag. +# WHY: `:latest` (and untagged, which means latest) is a moving target — you can't +# tell what version is actually running, rollbacks aren't reproducible, and two +# pods created minutes apart can run different code. Pin an immutable tag/digest.
+apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: disallow-latest-tag + annotations: + policies.kyverno.io/title: Disallow Latest Tag + policies.kyverno.io/category: Best Practices +spec: + validationFailureAction: Audit # Sprint 3 = Audit only; promote to Enforce later + background: true + rules: + - name: require-image-tag + match: + any: + - resources: + kinds: [Pod] + exclude: + any: + - resources: + namespaces: &platformNs + - kube-system + - kube-public + - kube-node-lease + - kyverno + - argocd + - keda + - external-secrets + - monitoring + validate: + message: "An explicit image tag is required (untagged images default to :latest)." + pattern: + spec: + containers: + - image: "*:*" + - name: disallow-latest-tag + match: + any: + - resources: + kinds: [Pod] + exclude: + any: + - resources: + namespaces: *platformNs + validate: + message: "Using the mutable ':latest' tag is not allowed; pin a specific version." + pattern: + spec: + containers: + - image: "!*:latest" diff --git a/k8s/kyverno/disallow-privileged.yaml b/k8s/kyverno/disallow-privileged.yaml new file mode 100644 index 0000000..78c48af --- /dev/null +++ b/k8s/kyverno/disallow-privileged.yaml @@ -0,0 +1,66 @@ +# WHAT: rejects privileged containers and containers that add dangerous Linux +# capabilities (SYS_ADMIN, NET_ADMIN, ALL). +# WHY: a privileged container effectively disables all container isolation — it can +# access host devices and the kernel directly, making "container escape" trivial. +# SYS_ADMIN/NET_ADMIN/ALL grant near-root kernel powers for the same reason. No +# VidCast workload needs any of these. 
+apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: disallow-privileged + annotations: + policies.kyverno.io/title: Disallow Privileged and Dangerous Capabilities + policies.kyverno.io/category: Pod Security Standards (Baseline) +spec: + validationFailureAction: Audit + background: true + rules: + - name: disallow-privileged-mode + match: + any: + - resources: + kinds: [Pod] + exclude: + any: + - resources: + namespaces: &platformNs + - kube-system + - kube-public + - kube-node-lease + - kyverno + - argocd + - keda + - external-secrets + - monitoring + validate: + message: "Privileged mode is not allowed." + pattern: + spec: + =(initContainers): + - =(securityContext): + =(privileged): "false" + containers: + - =(securityContext): + =(privileged): "false" + - name: disallow-dangerous-capabilities + match: + any: + - resources: + kinds: [Pod] + exclude: + any: + - resources: + namespaces: *platformNs + validate: + message: "Adding SYS_ADMIN, NET_ADMIN, or ALL capabilities is not allowed." + foreach: + - list: "request.object.spec.[ephemeralContainers, initContainers, containers][]" # every container type, not only spec.containers — an init container adding SYS_ADMIN must be flagged too + deny: + conditions: + any: + - key: "{{ element.securityContext.capabilities.add[] || `[]` }}" + operator: AnyIn + value: + - SYS_ADMIN + - NET_ADMIN + - ALL diff --git a/k8s/kyverno/kustomization.yaml b/k8s/kyverno/kustomization.yaml new file mode 100644 index 0000000..7f2f106 --- /dev/null +++ b/k8s/kyverno/kustomization.yaml @@ -0,0 +1,16 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# B2 Kyverno ClusterPolicies. Applied SEPARATELY (after `helm install kyverno`) +# because these are kyverno.io CRDs — same applied-separately, platform-owned +# pattern as ESO/KEDA/Argo. ALL policies are validationFailureAction: Audit in +# Sprint 3 — nothing is enforced/blocked yet. See README.md for the Audit→Enforce +# promotion procedure (a deliberate, separate follow-up after the gap is fixed).
+resources: + - disallow-latest-tag.yaml + - require-requests-limits.yaml + - require-non-root.yaml + - require-seccomp.yaml + - require-labels.yaml + - disallow-privileged.yaml + - verify-images.yaml diff --git a/k8s/kyverno/require-labels.yaml b/k8s/kyverno/require-labels.yaml new file mode 100644 index 0000000..bb39866 --- /dev/null +++ b/k8s/kyverno/require-labels.yaml @@ -0,0 +1,44 @@ +# WHAT: rejects any pod missing the labels `app`, `environment`, and +# `app.kubernetes.io/managed-by`. +# WHY: unlabelled resources can't be attributed (who owns this?), can't be +# cost-allocated (B3 Kubecost groups by label), and can't be targeted by other +# policies/selectors. Three labels is the minimum useful set — deliberately NOT +# requiring cost-centre/owner yet (the datastore charts don't have them, and we +# don't want to drown the Audit report). "managed-by" = the standard +# app.kubernetes.io/managed-by, which the A10 overlays already stamp. +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: require-labels + annotations: + policies.kyverno.io/title: Require Standard Labels + policies.kyverno.io/category: Best Practices +spec: + validationFailureAction: Audit + background: true + rules: + - name: require-app-environment-managedby + match: + any: + - resources: + kinds: [Pod] + exclude: + any: + - resources: + namespaces: + - kube-system + - kube-public + - kube-node-lease + - kyverno + - argocd + - keda + - external-secrets + - monitoring + validate: + message: "Labels 'app', 'environment', and 'app.kubernetes.io/managed-by' are required." + pattern: + metadata: + labels: + app: "?*" + environment: "?*" + app.kubernetes.io/managed-by: "?*" diff --git a/k8s/kyverno/require-non-root.yaml b/k8s/kyverno/require-non-root.yaml new file mode 100644 index 0000000..eae74cd --- /dev/null +++ b/k8s/kyverno/require-non-root.yaml @@ -0,0 +1,42 @@ +# WHAT: rejects any pod that doesn't set runAsNonRoot: true (at pod OR container level). 
+# WHY: a container running as root that escapes the runtime (via a kernel/runtime CVE) +# owns the host node. Running as a non-root UID is the single highest-leverage +# container hardening step — it turns many "root on the node" escapes into nothing. +apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: require-non-root + annotations: + policies.kyverno.io/title: Require runAsNonRoot + policies.kyverno.io/category: Pod Security Standards (Restricted) +spec: + validationFailureAction: Audit + background: true + rules: + - name: require-run-as-non-root + match: + any: + - resources: + kinds: [Pod] + exclude: + any: + - resources: + namespaces: + - kube-system + - kube-public + - kube-node-lease + - kyverno + - argocd + - keda + - external-secrets + - monitoring + validate: + message: "runAsNonRoot must be true (set it on the pod or every container securityContext)." + anyPattern: + - spec: + securityContext: + runAsNonRoot: true + - spec: + containers: + - securityContext: + runAsNonRoot: true diff --git a/k8s/kyverno/require-requests-limits.yaml b/k8s/kyverno/require-requests-limits.yaml new file mode 100644 index 0000000..e9f7e30 --- /dev/null +++ b/k8s/kyverno/require-requests-limits.yaml @@ -0,0 +1,46 @@ +# WHAT: rejects any pod whose containers don't set BOTH cpu+memory requests AND +# cpu+memory limits. +# WHY: a container with no requests is invisible to the scheduler (it can land on an +# already-full node); a container with no limits can consume the whole node and +# starve everything else. On our single 2-vCPU node that's fatal — this is the +# policy backing all the node-budget tracking done across Sprint 2. 
+apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: require-requests-limits + annotations: + policies.kyverno.io/title: Require Requests and Limits + policies.kyverno.io/category: Best Practices +spec: + validationFailureAction: Audit + background: true + rules: + - name: require-requests-limits + match: + any: + - resources: + kinds: [Pod] + exclude: + any: + - resources: + namespaces: + - kube-system + - kube-public + - kube-node-lease + - kyverno + - argocd + - keda + - external-secrets + - monitoring + validate: + message: "CPU and memory requests AND limits are required on every container." + pattern: + spec: + containers: + - resources: + requests: + cpu: "?*" + memory: "?*" + limits: + cpu: "?*" + memory: "?*" diff --git a/k8s/kyverno/require-seccomp.yaml b/k8s/kyverno/require-seccomp.yaml new file mode 100644 index 0000000..4addddc --- /dev/null +++ b/k8s/kyverno/require-seccomp.yaml @@ -0,0 +1,46 @@ +# WHAT: rejects any pod that doesn't set a seccompProfile of type RuntimeDefault +# (at pod OR container level). +# WHY: seccomp filters which Linux syscalls a container may make. RuntimeDefault +# blocks ~44 dangerous/obscure syscalls the app never needs, shrinking the kernel +# attack surface available to a compromised container. It is a Pod Security +# Standards "Restricted" requirement. 
+apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: require-seccomp-runtime-default + annotations: + policies.kyverno.io/title: Require seccomp RuntimeDefault + policies.kyverno.io/category: Pod Security Standards (Restricted) +spec: + validationFailureAction: Audit + background: true + rules: + - name: require-seccomp-runtime-default + match: + any: + - resources: + kinds: [Pod] + exclude: + any: + - resources: + namespaces: + - kube-system + - kube-public + - kube-node-lease + - kyverno + - argocd + - keda + - external-secrets + - monitoring + validate: + message: "seccompProfile.type must be RuntimeDefault (set on the pod or every container)." + anyPattern: + - spec: + securityContext: + seccompProfile: + type: RuntimeDefault + - spec: + containers: + - securityContext: + seccompProfile: + type: RuntimeDefault diff --git a/k8s/kyverno/values.yaml b/k8s/kyverno/values.yaml new file mode 100644 index 0000000..b3c4831 --- /dev/null +++ b/k8s/kyverno/values.yaml @@ -0,0 +1,44 @@ +# Helm values for Kyverno (B2) — tuned for the 2-vCPU demo node, not chart +# defaults (which request far more for HA). +# helm repo add kyverno https://kyverno.github.io/kyverno +# helm install kyverno kyverno/kyverno -n kyverno --create-namespace -f k8s/kyverno/values.yaml +# +# Four single-replica controllers. Total requests ≈ 125m / 320Mi. Policies +# themselves (ClusterPolicies) are applied separately via `kubectl apply -k`. 
+ +admissionController: + replicas: 1 + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 150m + memory: 256Mi + +backgroundController: + resources: + requests: + cpu: 25m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + +reportsController: + resources: + requests: + cpu: 25m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi + +cleanupController: + resources: + requests: + cpu: 25m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi diff --git a/k8s/kyverno/verify-images.yaml b/k8s/kyverno/verify-images.yaml new file mode 100644 index 0000000..ceb5894 --- /dev/null +++ b/k8s/kyverno/verify-images.yaml @@ -0,0 +1,63 @@ +# WHAT: verifies cosign signatures on VidCast images at admission. ACTIVATED in B5 +# (was an inert placeholder in B2) — now pointed at the real repos and the real +# keyless signing identity, but STILL Audit (reports, never blocks). +# WHY: once CI signs images (cosign keyless via GitHub OIDC, A8), this proves a +# running container was built by OUR pipeline and not tampered with. Kyverno +# checks the signature, the signing identity, and the Rekor log entry. +# +# ⚠️ STAYS Audit — do NOT set Enforce until: (1) John's CI cosign-sign job is merged +# and producing signatures, AND (2) at least one signed image has verified PASS on a +# live cluster. Until CI signs, the Audit report will show these images as FAIL +# ("no signature") — that is the EXPECTED, honest "not yet signed" state, not a bug. +# Promotion checklist + the live PASS/FAIL test are in k8s/kyverno/README.md. 
+apiVersion: kyverno.io/v1 +kind: ClusterPolicy +metadata: + name: verify-images + annotations: + policies.kyverno.io/title: Verify Image Signatures (B5 — Audit) + policies.kyverno.io/category: Supply Chain Security +spec: + validationFailureAction: Audit + background: false # image verification cannot run as a background scan + rules: + - name: verify-cosign-keyless + match: + any: + - resources: + kinds: [Pod] + exclude: + any: + - resources: + # Only verify OUR workloads — platform/system images (kyverno, argo, + # keda, ESO, monitoring, kube-system) are signed by other identities. + namespaces: + - kube-system + - kube-public + - kube-node-lease + - kyverno + - argocd + - keda + - external-secrets + - monitoring + verifyImages: + # Both registries, same signer (the repo's CI workflow). Backends live on + # Docker Hub; the frontend on ECR. NOTE: if the frontend is signed by a + # DIFFERENT workflow file than ci.yml, give it its own attestor entry — the + # keyless `subject` is the exact workflow path (see A8 / SUPPLY_CHAIN.md). + - imageReferences: + - "docker.io/johnbaabalola/*" + - "501562869470.dkr.ecr.eu-west-2.amazonaws.com/vidcast-frontend*" + # Audit-phase: observe only. Don't rewrite tag→digest yet (flip to true + # at Enforce, so admitted pods are pinned to the verified digest). + mutateDigest: false + attestors: + - entries: + # Cosign keyless: the identity IS the GitHub Actions OIDC token; the + # signature is logged in Rekor. No private key to store. The subject + # MUST match A8's documented identity character-for-character. 
+ - keyless: + subject: "https://github.com/johnnybabs/vidcast/.github/workflows/ci.yml@refs/heads/main" + issuer: "https://token.actions.githubusercontent.com" + rekor: + url: "https://rekor.sigstore.dev" diff --git a/k8s/network-policies/README.md b/k8s/network-policies/README.md new file mode 100644 index 0000000..b8a3ffd --- /dev/null +++ b/k8s/network-policies/README.md @@ -0,0 +1,97 @@ +# k8s/network-policies — Default-deny NetworkPolicies (A6) + +A zero-trust network posture for the `default` namespace: every pod denies all +ingress and egress except the flows explicitly allowed here. + +## ⚠️ Hard prerequisite + +The **VPC CNI network-policy agent must be enabled**, or these policies are +accepted by the API server and **never enforced** (decorative YAML). It's enabled +in Terraform: `terraform/modules/eks/main.tf` → `aws_eks_addon.vpc_cni` with +`enableNetworkPolicy = "true"`. Confirm after apply: + +```bash +kubectl get ds aws-node -n kube-system -o yaml | grep -i network-policy # agent flag +``` + +## Files (applied with default-deny LAST) + +| File | What it allows | +|---|---| +| `allow-dns.yaml` | every pod → CoreDNS (UDP/TCP 53) — **must** exist before deny | +| `allow-monitoring.yaml` | Prometheus (`monitoring` ns) → gateway:8080, auth:5000 | +| `app-policies.yaml` | per-app ingress/egress (gateway, auth, frontend, converter, notification, outbox-relay) | +| `datastore-policies.yaml` | mongodb / postgres / rabbitmq / redis ingress from their clients (+ KEDA→rabbitmq) | +| `default-deny.yaml` | deny all ingress + egress (the catch-all) — **apply last** | + +## The traffic matrix + +``` + (browser, NodePort 30002/30006) + │ + ▼ + frontend :8080 ──/api/──► gateway :8080 ──► auth :5000 ──► postgres :5432 + │ + ├──► mongodb :27017 (GridFS + outbox) + └──► rabbitmq :5672 (publish / outbox path) + + outbox-relay ──► mongodb :27017, rabbitmq :5672 + converter ──► rabbitmq :5672, mongodb :27017, redis :6379 + notification ──► rabbitmq :5672, redis :6379, 
SMTP 0.0.0.0/0:587 (Gmail) + KEDA (keda ns) ──► rabbitmq :5672 (queue-depth poll) + Prometheus (monitoring ns) ──► gateway :8080, auth :5000 + all pods ──► CoreDNS :53 +``` + +Anything not in this matrix is denied. Notably the DB/broker admin NodePorts +(30003/30004/30005) are **no longer reachable from outside the cluster** — that +also closes finding **H-1**. Use `kubectl port-forward` for admin access. + +## Apply + +```bash +# (after the CNI agent is enabled and the app is deployed) +kubectl apply -k k8s/network-policies # allows + deny as one coherent set +``` + +## Verify (REQUIRED on a live cluster before declaring A6 done) + +```bash +# positive: an ALLOWED path works +kubectl exec deploy/gateway -- python -c "import socket; socket.create_connection(('auth',5000),3); print('gateway->auth OK')" + +# negative: a DENIED path hangs/times out (e.g. gateway must NOT reach redis) +kubectl exec deploy/gateway -- python -c "import socket; socket.create_connection(('redis',6379),3)" # expect timeout + +# DNS still resolves +kubectl exec deploy/gateway -- python -c "import socket; print(socket.gethostbyname('rabbitmq'))" + +# Prometheus targets still UP for the scraped pods +``` + +## Rollback (fastest in the plan) + +```bash +kubectl delete networkpolicy default-deny-all -n default # instantly reopens networking +# or: kubectl delete -k k8s/network-policies +``` + +## B5 — Sigstore egress for Kyverno (kyverno namespace) + +`allow-kyverno-sigstore-egress.yaml` lets the Kyverno image-verifier reach the OCI +registries + Fulcio/Rekor/TUF. It targets the **kyverno** namespace, so it is +**NOT** part of the `default`-ns kustomization above (that would force it into +`default`). Apply it standalone: + +```bash +kubectl apply -f k8s/network-policies/allow-kyverno-sigstore-egress.yaml +``` + +⚠️ **Honest limitation — no hostname pinning.** Vanilla Kubernetes NetworkPolicy +matches egress by **IP/CIDR, not hostname**, so it cannot pin to `*.sigstore.dev`. 
+Sigstore + the registries sit on rotating CDN IPs, so the only expressible rule is +**TCP 443 to the public internet** (which also permits the registries Kyverno +needs anyway). True FQDN-scoped egress (fulcio/rekor/tuf only) requires a +DNS-aware CNI (Cilium) or an egress proxy — out of scope. The kyverno namespace +ships **no default-deny** today, so this policy is a safe, deliberate hardening to +apply when locking that namespace down. diff --git a/k8s/network-policies/allow-backup-egress.yaml b/k8s/network-policies/allow-backup-egress.yaml new file mode 100644 index 0000000..6c740ea --- /dev/null +++ b/k8s/network-policies/allow-backup-egress.yaml @@ -0,0 +1,92 @@ +# Network exceptions for the backup CronJobs (mongo-backup, postgres-backup). +# +# Under default-deny (ingress AND egress), a backup pod can do nothing until both +# ends of each connection are allowed: +# - the backup pod's EGRESS to the datastore / AWS, and +# - the datastore's INGRESS from the backup pod. +# The existing datastore-policies.yaml only allows the app clients (gateway, +# converter, auth, ...) into Mongo/Postgres — not the backup pods — so without this +# file mongodump/pg_dump hang and the aws-cli upload CrashLoops (no path to STS/S3). +# +# Backup pods are selected by app.kubernetes.io/component: backup (set on both +# CronJob pod templates). +--- +# Egress: backup pods → Mongo (27017), Postgres (5432), and AWS over HTTPS (443). +# 0.0.0.0/0:443 is for AWS STS (IRSA AssumeRoleWithWebIdentity) + S3 (object PUT); +# AWS IP ranges are large/dynamic, so this mirrors the existing 0.0.0.0/0 egress +# pattern used for Gmail SMTP (app-policies) and sigstore (allow-kyverno-sigstore). +# DNS (53) is already granted to every pod by allow-dns.yaml. 
+apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: backup-egress + namespace: default +spec: + podSelector: + matchLabels: + app.kubernetes.io/component: backup + policyTypes: + - Egress + egress: + - to: + - podSelector: + matchLabels: + app: database # MongoDB + ports: + - protocol: TCP + port: 27017 + - to: + - podSelector: + matchLabels: + app: auth-app # PostgreSQL + ports: + - protocol: TCP + port: 5432 + - to: + - ipBlock: + cidr: 0.0.0.0/0 + ports: + - protocol: TCP + port: 443 # AWS STS + S3 (HTTPS) +--- +# Ingress: let the backup pods into MongoDB (mongodump). Additive to mongodb-ingress. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: mongodb-ingress-backup + namespace: default +spec: + podSelector: + matchLabels: + app: database + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + app.kubernetes.io/component: backup + ports: + - protocol: TCP + port: 27017 +--- +# Ingress: let the backup pods into PostgreSQL (pg_dump). Additive to postgres-ingress. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: postgres-ingress-backup + namespace: default +spec: + podSelector: + matchLabels: + app: auth-app + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + app.kubernetes.io/component: backup + ports: + - protocol: TCP + port: 5432 diff --git a/k8s/network-policies/allow-dns.yaml b/k8s/network-policies/allow-dns.yaml new file mode 100644 index 0000000..01e849d --- /dev/null +++ b/k8s/network-policies/allow-dns.yaml @@ -0,0 +1,25 @@ +# A6 — DNS egress for EVERY pod. This must exist before default-deny, or every +# pod loses name resolution (services are addressed by DNS name: rabbitmq, mongodb, +# auth, redis, ...) and the whole app breaks. Allows UDP+TCP 53 to CoreDNS only. 
+apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-dns-egress + namespace: default +spec: + podSelector: {} # all pods + policyTypes: + - Egress + egress: + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + podSelector: + matchLabels: + k8s-app: kube-dns # CoreDNS on EKS carries this label + ports: + - protocol: UDP + port: 53 + - protocol: TCP + port: 53 diff --git a/k8s/network-policies/allow-kyverno-sigstore-egress.yaml b/k8s/network-policies/allow-kyverno-sigstore-egress.yaml new file mode 100644 index 0000000..7c6e3a6 --- /dev/null +++ b/k8s/network-policies/allow-kyverno-sigstore-egress.yaml @@ -0,0 +1,68 @@ +# B5 — Sigstore egress for Kyverno's image-verification (verify-images policy). +# +# When verify-images evaluates a pod, the Kyverno admission controller must reach: +# • the OCI registry — fetch the image manifest + the cosign `.sig` object +# (registry-1.docker.io for johnbaabalola/*, and the +# .dkr.ecr.eu-west-2.amazonaws.com ECR frontend repo) +# • fulcio.sigstore.dev — verify the short-lived signing certificate +# • rekor.sigstore.dev — verify the signature's transparency-log entry +# • tuf-repo-cdn.sigstore.dev — bootstrap trust in the Fulcio/Rekor roots +# Without egress to these, verification fails with network errors (NOT "unsigned"). +# +# ⚠️ HONEST LIMITATION: vanilla Kubernetes NetworkPolicy matches egress by IP/CIDR, +# NOT by hostname — so it CANNOT pin specifically to *.sigstore.dev. The Sigstore +# services and the public registries live on rotating CDN IPs, so the only +# expressible rule is "TCP 443 to the public internet", which necessarily also +# permits the registries Kyverno legitimately needs. True FQDN-level egress pinning +# (fulcio/rekor/tuf only) requires a CNI with DNS-aware policies (Cilium) or an +# egress proxy — out of scope; documented in k8s/network-policies/README.md. +# +# This is the ALLOW half. 
The kyverno namespace ships with NO default-deny today +# (so egress is currently open). Adding THIS policy is therefore NOT a no-op: it selects +# every kyverno pod and restricts egress to exactly DNS + 443 + IMDS :80 (below) — the +# intended hardening; pair it with a kyverno default-deny when locking the ns +# down. policyTypes is Egress ONLY — the admission webhook ingress is untouched. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-kyverno-sigstore-egress + namespace: kyverno +spec: + podSelector: {} # all Kyverno pods (admission/background/reports controllers) + policyTypes: + - Egress + egress: + # DNS — resolve the registry + sigstore hostnames (CoreDNS on EKS). + - to: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: kube-system + podSelector: + matchLabels: + k8s-app: kube-dns + ports: + - protocol: UDP + port: 53 + - protocol: TCP + port: 53 + # HTTPS to the public internet (Sigstore: fulcio/rekor/tuf-repo-cdn) AND the OCI + # registries (Docker Hub, ECR) AND the EKS API endpoint. CIDR-scoped because NP + # can't match the hostnames above (see the limitation note). + - to: + - ipBlock: + cidr: 0.0.0.0/0 + ports: + - protocol: TCP + port: 443 + # EC2 Instance Metadata Service (IMDS) on :80 — verify-images (B5) pulls from the + # PRIVATE ECR repo (vidcast-frontend), and the AWS SDK fetches the node-role + # credentials from IMDS to authenticate. Without this the ECR call hangs past the + # webhook deadline → "context canceled" → failurePolicy:Fail rejects the admission + # of any ECR-image workload. (Public docker.io images need no auth and are + # unaffected.) Scoped to the link-local IMDS address only.
+ - to: + - ipBlock: + cidr: 169.254.169.254/32 + ports: + - protocol: TCP + port: 80 diff --git a/k8s/network-policies/allow-monitoring.yaml b/k8s/network-policies/allow-monitoring.yaml new file mode 100644 index 0000000..85eadce --- /dev/null +++ b/k8s/network-policies/allow-monitoring.yaml @@ -0,0 +1,55 @@ +# A6 — allow Prometheus (kube-prometheus-stack, `monitoring` namespace) to scrape +# the app pods' metrics ports, so default-deny doesn't silently break monitoring. +# Gateway already accepts 8080 from any source, but it's listed here explicitly so +# the scrape intent is documented and survives a tightening of the gateway rule. +# B4 added /metrics to gateway (:8080) AND a dedicated :9000 metrics port on the +# converter and notification consumers — both are scraped by PodMonitors, so the +# monitoring namespace also needs ingress to :9000 on those pods (below). +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-monitoring-scrape + namespace: default +spec: + podSelector: + matchExpressions: + - key: app + operator: In + values: ["gateway", "auth"] + policyTypes: + - Ingress + ingress: + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: monitoring + ports: + - protocol: TCP + port: 8080 + - protocol: TCP + port: 5000 +--- +# Consumers (converter, notification) expose a Prometheus endpoint on :9000 (B4), +# scraped by their PodMonitors. They have no other ingress, so without this the +# default-deny silently drops the scrape and their targets go DOWN. 
+apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-monitoring-scrape-consumers + namespace: default +spec: + podSelector: + matchExpressions: + - key: app + operator: In + values: ["converter", "notification"] + policyTypes: + - Ingress + ingress: + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: monitoring + ports: + - protocol: TCP + port: 9000 diff --git a/k8s/network-policies/app-policies.yaml b/k8s/network-policies/app-policies.yaml new file mode 100644 index 0000000..8a8cae9 --- /dev/null +++ b/k8s/network-policies/app-policies.yaml @@ -0,0 +1,204 @@ +# A6 — per-application allow rules (the exceptions to default-deny). Each policy +# names the minimum ingress a pod accepts and the egress it needs. DNS egress for +# all pods is in allow-dns.yaml; datastore *ingress* is in datastore-policies.yaml. +# Every A→B flow needs BOTH an egress rule on A (here) and an ingress rule on B. +--- +# GATEWAY — public API. Ingress on 8080 from anywhere (it is exposed via NodePort +# 30002 to browsers AND proxied by the frontend). Egress to auth, mongodb (GridFS + +# outbox), and rabbitmq (direct publish, or outbox path). +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: gateway + namespace: default +spec: + podSelector: + matchLabels: + app: gateway + policyTypes: + - Ingress + - Egress + ingress: + - ports: # no `from` => any source (public NodePort + frontend proxy) + - protocol: TCP + port: 8080 + egress: + - to: + - podSelector: + matchLabels: + app: auth + ports: + - protocol: TCP + port: 5000 + - to: + - podSelector: + matchLabels: + app: database # MongoDB + ports: + - protocol: TCP + port: 27017 + - to: + - podSelector: + matchLabels: + app: rabbitmq + ports: + - protocol: TCP + port: 5672 +--- +# AUTH — ingress from gateway on 5000; egress to PostgreSQL. 
+apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: auth + namespace: default +spec: + podSelector: + matchLabels: + app: auth + policyTypes: + - Ingress + - Egress + ingress: + - from: + - podSelector: + matchLabels: + app: gateway + ports: + - protocol: TCP + port: 5000 + egress: + - to: + - podSelector: + matchLabels: + app: auth-app # PostgreSQL + ports: + - protocol: TCP + port: 5432 +--- +# FRONTEND — public web UI. Ingress on 8080 from anywhere (NodePort 30006). Egress +# to gateway (nginx proxies /api/ -> gateway:8080). +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: frontend + namespace: default +spec: + podSelector: + matchLabels: + app: frontend + policyTypes: + - Ingress + - Egress + ingress: + - ports: + - protocol: TCP + port: 8080 + egress: + - to: + - podSelector: + matchLabels: + app: gateway + ports: + - protocol: TCP + port: 8080 +--- +# CONVERTER — queue consumer (no ingress). Egress to rabbitmq (consume video + +# publish mp3 + retry/dlx), mongodb (read video / write mp3), redis (A2 idempotency). +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: converter + namespace: default +spec: + podSelector: + matchLabels: + app: converter + policyTypes: + - Egress + egress: + - to: + - podSelector: + matchLabels: + app: rabbitmq + ports: + - protocol: TCP + port: 5672 + - to: + - podSelector: + matchLabels: + app: database + ports: + - protocol: TCP + port: 27017 + - to: + - podSelector: + matchLabels: + app: redis + ports: + - protocol: TCP + port: 6379 +--- +# NOTIFICATION — queue consumer (no ingress). Egress to rabbitmq, redis (A2), and +# external SMTP (Gmail :587). SMTP is to 0.0.0.0/0:587 because Gmail's IPs are +# dynamic; nothing internal listens on 587 so this opens no in-cluster path. 
+apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: notification + namespace: default +spec: + podSelector: + matchLabels: + app: notification + policyTypes: + - Egress + egress: + - to: + - podSelector: + matchLabels: + app: rabbitmq + ports: + - protocol: TCP + port: 5672 + - to: + - podSelector: + matchLabels: + app: redis + ports: + - protocol: TCP + port: 6379 + - to: + - ipBlock: + cidr: 0.0.0.0/0 + ports: + - protocol: TCP + port: 587 # SMTP submission (Gmail) +--- +# OUTBOX-RELAY (A1) — poller, no ingress. Egress to mongodb (read outbox) and +# rabbitmq (publish). +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: outbox-relay + namespace: default +spec: + podSelector: + matchLabels: + app: outbox-relay + policyTypes: + - Egress + egress: + - to: + - podSelector: + matchLabels: + app: database + ports: + - protocol: TCP + port: 27017 + - to: + - podSelector: + matchLabels: + app: rabbitmq + ports: + - protocol: TCP + port: 5672 diff --git a/k8s/network-policies/datastore-policies.yaml b/k8s/network-policies/datastore-policies.yaml new file mode 100644 index 0000000..af85885 --- /dev/null +++ b/k8s/network-policies/datastore-policies.yaml @@ -0,0 +1,131 @@ +# A6 — datastore INGRESS allow rules. Each datastore accepts connections only from +# the specific in-cluster clients that need it. Note what is deliberately ABSENT: +# no allow-from-anywhere on the DB/broker ports, so once default-deny is in place +# the stateful NodePorts (30003/30004/30005) are no longer reachable from outside +# the cluster — which also closes finding H-1 (0.0.0.0/0 open to stateful ports). +# Admin access is via `kubectl port-forward` instead. +# +# Datastores need no egress policy (they don't initiate connections); DNS for them +# is covered by allow-dns.yaml. +--- +# MongoDB (app: database) — from gateway (GridFS + outbox), converter (read video / +# write mp3), outbox-relay (read outbox). 
+apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: mongodb-ingress + namespace: default +spec: + podSelector: + matchLabels: + app: database + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + app: gateway + - podSelector: + matchLabels: + app: converter + - podSelector: + matchLabels: + app: outbox-relay + ports: + - protocol: TCP + port: 27017 +--- +# PostgreSQL (app: auth-app) — from auth only. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: postgres-ingress + namespace: default +spec: + podSelector: + matchLabels: + app: auth-app + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + app: auth + ports: + - protocol: TCP + port: 5432 +--- +# RabbitMQ (app: rabbitmq) — AMQP 5672 from the four in-cluster clients, plus the +# KEDA scaler in the `keda` namespace (it polls queue depth over AMQP). Management +# 15672 is intentionally NOT exposed externally (port-forward for admin). +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: rabbitmq-ingress + namespace: default +spec: + podSelector: + matchLabels: + app: rabbitmq + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + app: gateway + - podSelector: + matchLabels: + app: outbox-relay + - podSelector: + matchLabels: + app: converter + - podSelector: + matchLabels: + app: notification + ports: + - protocol: TCP + port: 5672 + - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: keda + ports: + - protocol: TCP + port: 5672 + # Prometheus (monitoring ns) scrapes the rabbitmq_prometheus plugin on :15692 + # (/metrics/per-object). Without this, default-deny drops the scrape and the + # rabbitmq target goes DOWN (the two RabbitMQ alerts depend on it). 
+ - from: + - namespaceSelector: + matchLabels: + kubernetes.io/metadata.name: monitoring + ports: + - protocol: TCP + port: 15692 +--- +# Redis (app: redis, A2) — from the two consumers that claim_once. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: redis-ingress + namespace: default +spec: + podSelector: + matchLabels: + app: redis + policyTypes: + - Ingress + ingress: + - from: + - podSelector: + matchLabels: + app: converter + - podSelector: + matchLabels: + app: notification + ports: + - protocol: TCP + port: 6379 diff --git a/k8s/network-policies/default-deny.yaml b/k8s/network-policies/default-deny.yaml new file mode 100644 index 0000000..d8eddae --- /dev/null +++ b/k8s/network-policies/default-deny.yaml @@ -0,0 +1,19 @@ +# A6 — default-deny for the `default` namespace. APPLY THIS LAST (after all the +# allow-* policies below exist), so nothing is cut off before its exceptions are in +# place. NetworkPolicies are additive (a packet is permitted if ANY policy allows +# it), so once this is in effect every pod denies all ingress AND egress except +# what the allow-* policies explicitly permit. +# +# Rollback (fastest in the whole plan): `kubectl delete networkpolicy default-deny-all -n default` +# instantly restores open networking. +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: default-deny-all + namespace: default +spec: + podSelector: {} # every pod in the namespace + policyTypes: + - Ingress + - Egress + # No ingress/egress rules => deny everything not explicitly allowed elsewhere. diff --git a/k8s/network-policies/kustomization.yaml b/k8s/network-policies/kustomization.yaml new file mode 100644 index 0000000..cccd61e --- /dev/null +++ b/k8s/network-policies/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# A6 NetworkPolicies. 
Applied SEPARATELY from the app overlay (these are a security +# layer over whatever is deployed) and ONLY after the VPC CNI network-policy agent +# is enabled in Terraform (terraform/modules/eks aws_eks_addon.vpc_cni) — without +# that agent these policies are accepted but NOT enforced. +# +# Ordering: the allow-* policies and default-deny are listed with default-deny LAST. +# NetworkPolicy is additive (union of allows), so applying the whole set at once is +# safe; the ordering is belt-and-braces and matches "apply allows first, deny last". +namespace: default + +resources: + - allow-dns.yaml + - allow-monitoring.yaml + - app-policies.yaml + - datastore-policies.yaml + # I4 backup CronJobs: egress to Mongo/Postgres/AWS + datastore ingress from backup. + - allow-backup-egress.yaml + - default-deny.yaml diff --git a/k8s/overlays/dev/kustomization.yaml b/k8s/overlays/dev/kustomization.yaml new file mode 100644 index 0000000..ddd7129 --- /dev/null +++ b/k8s/overlays/dev/kustomization.yaml @@ -0,0 +1,68 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# DEV overlay — lighter footprint for a smaller / cost-saving cluster and faster +# Argo syncs. All backends drop to 1 replica (the converter/notification HA of +# prod is unnecessary for dev validation, and it leaves CPU headroom on a +# 2-vCPU node for the in-cluster add-ons added later: KEDA, Kyverno, Kubecost). +# +# Argo CD (B1) points its auto-sync-ON "vidcast-dev" Application here. +namespace: default + +resources: + - ../../base/auth + - ../../base/gateway + - ../../base/converter + - ../../base/notification + - ../../base/frontend + # A1 outbox relay. Single replica — deliberately NOT in the replicas: list + # below, it must stay at 1 (single publisher). The johnbaabalola/outbox-relay + # image is built by CI (it is in the build matrix) and is already pinned in the + # images: list below, like the other services.
+ - ../../base/outbox-relay + # A2 idempotency claim store (in-cluster Redis, single replica). Not in the + # replicas: list — Redis stays at 1. + - ../../base/redis + # I4/P5 nightly mongodump + pg_dump CronJobs → S3 backup bucket. CronJob pods + # are short-lived and carry no replica count. + - ../../base/backup + +labels: + - pairs: + environment: dev + cost-centre: vidcast-portfolio + owner: john-baabalola + app.kubernetes.io/managed-by: kustomize + includeSelectors: false + includeTemplates: true + +images: + - name: johnbaabalola/auth-service + newTag: '65f2f57' + - name: johnbaabalola/gateway-service + newTag: '65f2f57' + - name: johnbaabalola/converter-service + newTag: '65f2f57' + - name: johnbaabalola/notification-service + newTag: '65f2f57' + # B2 gap-fix (disallow-latest-tag): pin the relay off :latest. 65f2f57 is the + # tag actually built and pushed to the registry (the prior placeholder e4d2669 + # was never built → ImagePullBackOff when Argo synced it). outbox-relay is now in + # the CI build matrix, so this tag is expected to be bumped by GitOps (B1) like + # the other services. Tags are quoted so an all-digit SHA stays a string. + - name: johnbaabalola/outbox-relay + newTag: '65f2f57' + - name: vidcast-frontend + newName: 501562869470.dkr.ecr.eu-west-2.amazonaws.com/vidcast-frontend + newTag: '6eb217c' + +# Dev runs one replica of each backend (frontend is already 1 in base). +replicas: + - name: auth + count: 1 + - name: gateway + count: 1 + - name: converter + count: 1 + - name: notification + count: 1 diff --git a/k8s/overlays/prod/kustomization.yaml b/k8s/overlays/prod/kustomization.yaml new file mode 100644 index 0000000..5c87d6f --- /dev/null +++ b/k8s/overlays/prod/kustomization.yaml @@ -0,0 +1,62 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# PROD overlay — mirrors the current live production footprint (the single-node +# EKS cluster).
Replicas are inherited from base unchanged (auth 2, gateway 2, +# converter 2, notification 2, frontend 1) so `kubectl kustomize overlays/prod` +# renders ≈ the pre-Kustomize raw manifests (the only intended deltas are the +# org labels and the resolved frontend image — see k8s/README.md verification). +# +# Argo CD (B1) points its auto-sync-OFF "vidcast-prod" Application here: +# production deploys are gated on a human merging the image-tag-bump PR. +namespace: default + +resources: + - ../../base/auth + - ../../base/gateway + - ../../base/converter + - ../../base/notification + - ../../base/frontend + # A1 outbox relay (single replica). The johnbaabalola/outbox-relay image is + # built by CI (it is in the build matrix) and is already pinned in the images: + # list below, so GitOps (B1) bumps it like the other services. + - ../../base/outbox-relay + # A2 idempotency claim store (in-cluster Redis, single replica). ElastiCache is + # the documented-but-skipped managed alternative (MANAGED_SERVICES.md §5): to + # use it, point the consumers' REDIS_HOST at the ElastiCache endpoint and drop + # this resource. We keep Redis in-cluster per the cost boundary. + - ../../base/redis + # I4/P5 nightly mongodump + pg_dump CronJobs → S3 backup bucket. CronJob pods + # are short-lived and carry no replica count. + - ../../base/backup + +# Org/governance labels. These are what Kyverno require-labels (B2) enforces. +# environment distinguishes prod from dev; managed-by flips to "argocd" in B1. +labels: + - pairs: + environment: prod + cost-centre: vidcast-portfolio + owner: john-baabalola + app.kubernetes.io/managed-by: kustomize + includeSelectors: false + includeTemplates: true + +# Image tags are the source of truth for GitOps: the CD pipeline (B1) bumps +# newTag here via a PR rather than running `kubectl set image`. Backends are +# Docker Hub; the frontend resolves to this account's ECR (CI does not build it).
+images: + - name: johnbaabalola/auth-service + newTag: '65f2f57' + - name: johnbaabalola/gateway-service + newTag: '65f2f57' + - name: johnbaabalola/converter-service + newTag: '65f2f57' + - name: johnbaabalola/notification-service + newTag: '65f2f57' + # outbox-relay is now built by CI (added to the build matrix) and pinned to the + # same CI SHA as the other backends — GitOps (B1) bumps newTag here on each build (keep tags quoted: an all-digit SHA must stay a string). + - name: johnbaabalola/outbox-relay + newTag: '65f2f57' + - name: vidcast-frontend + newName: 501562869470.dkr.ecr.eu-west-2.amazonaws.com/vidcast-frontend + newTag: 'd9e4282' diff --git a/monitoring/README.md b/monitoring/README.md new file mode 100644 index 0000000..af93098 --- /dev/null +++ b/monitoring/README.md @@ -0,0 +1,69 @@ +# VidCast Monitoring Stack + +Prometheus + Grafana + Alertmanager deployed via kube-prometheus-stack. + +## Install + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +helm install monitoring prometheus-community/kube-prometheus-stack \ + -f monitoring/values.yaml \ + -n monitoring \ + --create-namespace +``` + +Wait for all pods to start: +```bash +kubectl get pods -n monitoring -w +``` + +## Access + +| Service | URL | Credentials | +|---------|-----|-------------| +| Grafana | http://NODE_IP:30007 | admin / vidcast-demo | +| Alertmanager | http://NODE_IP:30008 | none | + +Replace `NODE_IP` with the output of `kubectl get nodes -o wide`. + +## Apply Custom Dashboard + +The `dashboards/vidcast-operations.json` file is loaded automatically via the Grafana sidecar when the release is installed with the values in `values.yaml`. To load manually: + +1. Open Grafana → Dashboards → Import +2.
Upload `monitoring/dashboards/vidcast-operations.json` + +## Apply Custom Alert Rules + +```bash +kubectl apply -f monitoring/alerts/vidcast-alerts.yaml +``` + +## B4 — SLO scrape targets, burn-rate rules & error-budget dashboard + +App metrics are scraped via operator-native ServiceMonitor/PodMonitor resources +(the old static `additionalScrapeConfigs` gateway job was retired): + +```bash +kubectl apply -f monitoring/scrape/ # gateway + rabbitmq SM, converter + notification PM +kubectl apply -f monitoring/alerts/vidcast-slo-rules.yaml # recording rules + multi-burn-rate alerts +``` + +These depend on the **M-2 metrics foundation**: the gateway `/metrics` endpoint, +the converter/notification metrics servers (`:9000/metrics`), and RabbitMQ's +`rabbitmq_prometheus` plugin (`:15692`, enabled in `Helm_charts/RabbitMQ`). All +need a fresh image build (gateway/converter/notification) and a RabbitMQ re-deploy. + +- **SLO definitions, budgets, runbooks:** `SLO.md` (repo root) +- **Error-budget dashboard:** `dashboards/vidcast-slo.json` (load like the ops dashboard) + +Verify scrape targets after applying: Prometheus UI → Status → Targets should show +`vidcast-gateway`, `vidcast-rabbitmq`, `vidcast-converter`, `vidcast-notification` **UP**. 
+ +## Uninstall + +```bash +helm uninstall monitoring -n monitoring +kubectl delete namespace monitoring +``` diff --git a/monitoring/alerts/vidcast-alerts.yaml b/monitoring/alerts/vidcast-alerts.yaml new file mode 100644 index 0000000..9776cc1 --- /dev/null +++ b/monitoring/alerts/vidcast-alerts.yaml @@ -0,0 +1,67 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: vidcast-alerts + namespace: monitoring + labels: + release: monitoring +spec: + groups: + - name: vidcast.pods + interval: 1m + rules: + - alert: PodCrashLoopBackOff # >3 container restarts in the last 10m (matches the description; the old rate()*60 > 0.5 meant >5) + expr: | + increase(kube_pod_container_status_restarts_total{namespace="default"}[10m]) > 3 + for: 5m + labels: + severity: critical + annotations: + summary: "Pod {{ $labels.pod }} is crash-looping" + description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has restarted more than 3 times in 10 minutes. Investigate with: kubectl logs {{ $labels.pod }} --previous" + + - name: vidcast.resources + interval: 1m + rules: + - alert: HighNodeMemoryUsage + expr: | + 100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "Node memory usage above 85%" + description: "Node memory is {{ $value | humanize }}% used. Risk of OOMKill for converter pods. Consider scaling down or upgrading the node." + + - alert: HighNodeCPUUsage + expr: | + 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "Node CPU usage above 85%" + description: "Node CPU is {{ $value | humanize }}% used. Converter replicas may be saturating the node." + + - name: vidcast.queues + interval: 1m + rules: + - alert: RabbitMQQueueBacklog + expr: | + rabbitmq_queue_messages{queue="video"} > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Video queue backlog: {{ $value }} messages" + description: "More than 10 videos are waiting for conversion.
Converter workers may be overwhelmed or crashed." + + - alert: RabbitMQUnavailable + expr: | + up{job="rabbitmq"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "RabbitMQ is unreachable" + description: "RabbitMQ has been down for 2 minutes. The entire upload/convert pipeline is blocked. Check: kubectl describe pod rabbitmq-0" diff --git a/monitoring/alerts/vidcast-slo-rules.yaml b/monitoring/alerts/vidcast-slo-rules.yaml new file mode 100644 index 0000000..2a86dce --- /dev/null +++ b/monitoring/alerts/vidcast-slo-rules.yaml @@ -0,0 +1,155 @@ +# B4 — SLO burn-rate rules (multi-window, multi-burn-rate; Google SRE workbook). +# +# Burn rates are RECORDED already NORMALISED (error-ratio ÷ (1−SLO)), so a value of +# 1 means "consuming budget exactly at the sustainable rate" and 14 means "14× too +# fast". That makes the alert thresholds literally `> 14` (fast) and `> 1` (slow), +# and lets the Grafana error-budget dashboard reuse the same series. +# +# Each alert is MULTI-WINDOW: a long window (1h fast / 6h slow) sets the severity, +# a short window (5m fast / 30m slow) must ALSO be burning — this is what stops a +# long-window alert from latching on after the incident is over (the short window +# recovers fast and clears the alert). +# +# SLO budget factors (1 − target): availability 0.001 (99.9%) · conversion_latency +# 0.05 (95%) · e2e_success 0.01 (99%). See SLO.md for budgets + the single-node +# "demonstrative target" caveat. +# +# NOTE: division by a zero scrape-rate (no traffic) yields NaN; NaN > N is false, so +# the alerts simply stay quiet when idle — correct for a demo cluster. 
+apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: vidcast-slo-rules + namespace: monitoring + labels: + release: monitoring +spec: + groups: + # ───────────────────────── recording rules ───────────────────────── + - name: vidcast.slo.availability.recording + interval: 30s + rules: + - record: slo:availability:burnrate5m + expr: | + (sum(rate(vidcast_gateway_requests_total{status=~"5.."}[5m])) + / sum(rate(vidcast_gateway_requests_total[5m]))) / 0.001 + - record: slo:availability:burnrate30m + expr: | + (sum(rate(vidcast_gateway_requests_total{status=~"5.."}[30m])) + / sum(rate(vidcast_gateway_requests_total[30m]))) / 0.001 + - record: slo:availability:burnrate1h + expr: | + (sum(rate(vidcast_gateway_requests_total{status=~"5.."}[1h])) + / sum(rate(vidcast_gateway_requests_total[1h]))) / 0.001 + - record: slo:availability:burnrate6h + expr: | + (sum(rate(vidcast_gateway_requests_total{status=~"5.."}[6h])) + / sum(rate(vidcast_gateway_requests_total[6h]))) / 0.001 + + - name: vidcast.slo.conversion_latency.recording + interval: 30s + rules: + # "bad" = fraction of conversions slower than the 5-minute (300s) bucket. + # le=~"300(\.0)?" tolerates int vs float bucket-label rendering by the client. 
+ - record: slo:conversion_latency:burnrate5m + expr: | + (1 - (sum(rate(vidcast_conversion_duration_seconds_bucket{le=~"300(\\.0)?"}[5m])) + / sum(rate(vidcast_conversion_duration_seconds_count[5m])))) / 0.05 + - record: slo:conversion_latency:burnrate30m + expr: | + (1 - (sum(rate(vidcast_conversion_duration_seconds_bucket{le=~"300(\\.0)?"}[30m])) + / sum(rate(vidcast_conversion_duration_seconds_count[30m])))) / 0.05 + - record: slo:conversion_latency:burnrate1h + expr: | + (1 - (sum(rate(vidcast_conversion_duration_seconds_bucket{le=~"300(\\.0)?"}[1h])) + / sum(rate(vidcast_conversion_duration_seconds_count[1h])))) / 0.05 + - record: slo:conversion_latency:burnrate6h + expr: | + (1 - (sum(rate(vidcast_conversion_duration_seconds_bucket{le=~"300(\\.0)?"}[6h])) + / sum(rate(vidcast_conversion_duration_seconds_count[6h])))) / 0.05 + + - name: vidcast.slo.e2e_success.recording + interval: 30s + rules: + # "bad" = fraction of accepted uploads that did NOT result in a sent email. + # Best evaluated over long windows: uploads and sends are minutes apart, so + # short windows are noisy (can briefly exceed 1). See SLO.md caveat. 
+ - record: slo:e2e_success:burnrate5m + expr: | + (1 - (sum(rate(vidcast_notifications_total{status="success"}[5m])) + / sum(rate(vidcast_uploads_total[5m])))) / 0.01 + - record: slo:e2e_success:burnrate30m + expr: | + (1 - (sum(rate(vidcast_notifications_total{status="success"}[30m])) + / sum(rate(vidcast_uploads_total[30m])))) / 0.01 + - record: slo:e2e_success:burnrate1h + expr: | + (1 - (sum(rate(vidcast_notifications_total{status="success"}[1h])) + / sum(rate(vidcast_uploads_total[1h])))) / 0.01 + - record: slo:e2e_success:burnrate6h + expr: | + (1 - (sum(rate(vidcast_notifications_total{status="success"}[6h])) + / sum(rate(vidcast_uploads_total[6h])))) / 0.01 + + # ───────────────────────── burn-rate alerts ───────────────────────── + - name: vidcast.slo.alerts + rules: + # Availability (99.9%) — 30d budget = 43.2 min; a sustained 14× burn exhausts it in 30d ÷ 14 ≈ 51h. + - alert: VidcastAvailabilityFastBurn + expr: slo:availability:burnrate1h > 14 and slo:availability:burnrate5m > 14 + for: 2m + labels: + severity: critical + slo: availability + annotations: + summary: "Availability error budget burning 14× (fast)" + description: "Gateway 5xx rate is consuming the 30-day availability budget 14× too fast (1h & 5m windows). At this rate the 43.2-min budget is gone in ~2 days (30d ÷ 14 ≈ 51h). Runbook: SLO.md §Availability." + - alert: VidcastAvailabilitySlowBurn + expr: slo:availability:burnrate6h > 1 and slo:availability:burnrate30m > 1 + for: 15m + labels: + severity: warning + slo: availability + annotations: + summary: "Availability error budget burning ≥1× (slow)" + description: "Gateway 5xx rate is over the sustainable burn rate (6h & 30m windows). The budget will be exhausted before the 30-day window resets if this continues. Runbook: SLO.md §Availability."
+ + # Conversion latency (95% < 5 min) + - alert: VidcastConversionLatencyFastBurn + expr: slo:conversion_latency:burnrate1h > 14 and slo:conversion_latency:burnrate5m > 14 + for: 2m + labels: + severity: critical + slo: conversion_latency + annotations: + summary: "Conversion-latency budget burning 14× (fast)" + description: "Far more than 5% of conversions are exceeding 5 minutes (1h & 5m windows). Check converter saturation / KEDA scaling / queue backlog. Runbook: SLO.md §Conversion-latency." + - alert: VidcastConversionLatencySlowBurn + expr: slo:conversion_latency:burnrate6h > 1 and slo:conversion_latency:burnrate30m > 1 + for: 15m + labels: + severity: warning + slo: conversion_latency + annotations: + summary: "Conversion-latency budget burning ≥1× (slow)" + description: "The fraction of conversions slower than 5 minutes is over budget (6h & 30m windows). Runbook: SLO.md §Conversion-latency." + + # End-to-end success (99% upload → email) + - alert: VidcastE2ESuccessFastBurn + expr: slo:e2e_success:burnrate1h > 14 and slo:e2e_success:burnrate5m > 14 + for: 5m + labels: + severity: critical + slo: e2e_success + annotations: + summary: "End-to-end success budget burning 14× (fast)" + description: "Uploads are not turning into notification emails at >14× the budget rate (1h & 5m windows). Check the converter→mp3→notification pipeline + DLQs. Runbook: SLO.md §End-to-end-success." + - alert: VidcastE2ESuccessSlowBurn + expr: slo:e2e_success:burnrate6h > 1 and slo:e2e_success:burnrate30m > 1 + for: 30m + labels: + severity: warning + slo: e2e_success + annotations: + summary: "End-to-end success budget burning ≥1× (slow)" + description: "A sustained fraction of uploads are not producing emails (6h & 30m windows). Runbook: SLO.md §End-to-end-success." 
diff --git a/monitoring/dashboards/vidcast-finops.json b/monitoring/dashboards/vidcast-finops.json new file mode 100644 index 0000000..a29c781 --- /dev/null +++ b/monitoring/dashboards/vidcast-finops.json @@ -0,0 +1,81 @@ +{ + "title": "VidCast FinOps / Cost", + "uid": "vidcast-finops", + "tags": ["vidcast", "finops", "cost"], + "timezone": "browser", + "refresh": "1m", + "schemaVersion": 36, + "panels": [ + { + "id": 100, + "type": "text", + "title": "", + "gridPos": {"h": 4, "w": 24, "x": 0, "y": 0}, + "options": { + "mode": "markdown", + "content": "## VidCast cost (Kubecost OSS estimates)\n**Kubecost estimates; the AWS Cost Explorer bill is ground truth.** Node-cost model is based on instance list pricing — m7i-flex.large @ **~$0.106/hr** (eu-west-2 on-demand; verify current pricing). Cost-per-conversion = cluster $/hr ÷ conversions/hr (uses the B4 `vidcast_conversions_total` counter). Trend/30-day panels are bounded by Prometheus **7d retention**. Namespace cost = CPU-share approximation; the Kubecost UI has precise allocation." 
+ } + }, + { + "id": 1, + "title": "Cluster cost ($/hr)", + "type": "stat", + "gridPos": {"h": 5, "w": 6, "x": 0, "y": 4}, + "fieldConfig": {"defaults": {"unit": "currencyUSD", "decimals": 3}}, + "options": {"colorMode": "value", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "sum(node_total_hourly_cost)", "legendFormat": "$/hr"}] + }, + { + "id": 2, + "title": "Projected monthly cost", + "type": "stat", + "gridPos": {"h": 5, "w": 6, "x": 6, "y": 4}, + "fieldConfig": {"defaults": {"unit": "currencyUSD", "decimals": 0}}, + "options": {"colorMode": "value", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "sum(node_total_hourly_cost) * 730", "legendFormat": "$/mo"}] + }, + { + "id": 3, + "title": "⭐ Cost per conversion", + "type": "stat", + "gridPos": {"h": 5, "w": 6, "x": 12, "y": 4}, + "fieldConfig": {"defaults": {"unit": "currencyUSD", "decimals": 4}}, + "options": {"colorMode": "background", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "sum(node_total_hourly_cost) / clamp_min(sum(rate(vidcast_conversions_total{status=\"success\"}[1h])) * 3600, 1)", "legendFormat": "$/conversion"}] + }, + { + "id": 4, + "title": "Conversions / hour", + "type": "stat", + "gridPos": {"h": 5, "w": 6, "x": 18, "y": 4}, + "fieldConfig": {"defaults": {"unit": "none", "decimals": 1}}, + "options": {"colorMode": "value", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "sum(rate(vidcast_conversions_total{status=\"success\"}[1h])) * 3600", "legendFormat": "conv/hr"}] + }, + { + "id": 5, + "title": "Cluster cost trend ($/hr, ≤7d)", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 9}, + "fieldConfig": {"defaults": {"unit": "currencyUSD", "custom": {"drawStyle": "line", "fillOpacity": 10}}}, + "targets": [{"expr": "sum(node_total_hourly_cost)", "legendFormat": "cluster $/hr"}] + }, + { + "id": 6, + "title": "CPU usage by workload (cost proxy)", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 9}, 
+ "fieldConfig": {"defaults": {"unit": "none", "custom": {"drawStyle": "line", "fillOpacity": 10, "stacking": {"mode": "normal"}}}}, + "targets": [{"expr": "sum by (pod) (rate(container_cpu_usage_seconds_total{namespace=\"default\", container!=\"\"}[5m]))", "legendFormat": "{{pod}}"}] + }, + { + "id": 7, + "title": "Estimated monthly cost by namespace (CPU-share approx.)", + "type": "bargauge", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 17}, + "fieldConfig": {"defaults": {"unit": "currencyUSD", "decimals": 0}}, + "options": {"orientation": "horizontal", "displayMode": "gradient", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "sum by (namespace) (rate(container_cpu_usage_seconds_total{container!=\"\"}[1h])) / scalar(sum(rate(container_cpu_usage_seconds_total{container!=\"\"}[1h]))) * scalar(sum(node_total_hourly_cost)) * 730", "legendFormat": "{{namespace}}"}] + } + ] +} diff --git a/monitoring/dashboards/vidcast-operations.json b/monitoring/dashboards/vidcast-operations.json new file mode 100644 index 0000000..5b5619b --- /dev/null +++ b/monitoring/dashboards/vidcast-operations.json @@ -0,0 +1,139 @@ +{ + "title": "VidCast Operations", + "uid": "vidcast-ops", + "tags": ["vidcast"], + "timezone": "browser", + "refresh": "30s", + "schemaVersion": 36, + "panels": [ + { + "id": 1, + "title": "Pod Status — All Services", + "type": "stat", + "gridPos": {"h": 4, "w": 12, "x": 0, "y": 0}, + "targets": [ + { + "expr": "sum by (pod) (kube_pod_status_phase{namespace='default', phase='Running'})", + "legendFormat": "{{pod}}" + } + ], + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": {"calcs": ["last"]} + } + }, + { + "id": 2, + "title": "Pod Restarts (last 1h)", + "type": "stat", + "gridPos": {"h": 4, "w": 12, "x": 12, "y": 0}, + "targets": [ + { + "expr": "sum by (pod) (increase(kube_pod_container_status_restarts_total{namespace='default'}[1h]))", + "legendFormat": "{{pod}}" + } + ], + "options": { + "colorMode": 
"background", + "thresholds": { + "steps": [ + {"color": "green", "value": 0}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 3} + ] + } + } + }, + { + "id": 3, + "title": "Node CPU Usage %", + "type": "gauge", + "gridPos": {"h": 6, "w": 8, "x": 0, "y": 4}, + "targets": [ + { + "expr": "100 - (avg(rate(node_cpu_seconds_total{mode='idle'}[5m])) * 100)", + "legendFormat": "CPU %" + } + ], + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "thresholds": { + "steps": [ + {"color": "green", "value": 0}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85} + ] + } + } + }, + { + "id": 4, + "title": "Node Memory Usage %", + "type": "gauge", + "gridPos": {"h": 6, "w": 8, "x": 8, "y": 4}, + "targets": [ + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", + "legendFormat": "Memory %" + } + ], + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "thresholds": { + "steps": [ + {"color": "green", "value": 0}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85} + ] + } + } + }, + { + "id": 5, + "title": "RabbitMQ Queue Depth", + "type": "timeseries", + "gridPos": {"h": 6, "w": 8, "x": 16, "y": 4}, + "description": "Messages waiting in video and mp3 queues. 
Rising video queue = converter backlog.", + "targets": [ + { + "expr": "rabbitmq_queue_messages{queue='video'}", + "legendFormat": "video queue" + }, + { + "expr": "rabbitmq_queue_messages{queue='mp3'}", + "legendFormat": "mp3 queue" + } + ] + }, + { + "id": 6, + "title": "CPU Usage per Pod", + "type": "timeseries", + "gridPos": {"h": 6, "w": 12, "x": 0, "y": 10}, + "targets": [ + { + "expr": "sum by (pod) (rate(container_cpu_usage_seconds_total{namespace='default', pod!=''}[5m]))", + "legendFormat": "{{pod}}" + } + ] + }, + { + "id": 7, + "title": "Memory Usage per Pod", + "type": "timeseries", + "gridPos": {"h": 6, "w": 12, "x": 12, "y": 10}, + "targets": [ + { + "expr": "sum by (pod) (container_memory_working_set_bytes{namespace='default', pod!=''})", + "legendFormat": "{{pod}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes" + } + } + } + ] +} diff --git a/monitoring/dashboards/vidcast-slo.json b/monitoring/dashboards/vidcast-slo.json new file mode 100644 index 0000000..a7b7070 --- /dev/null +++ b/monitoring/dashboards/vidcast-slo.json @@ -0,0 +1,146 @@ +{ + "title": "VidCast SLO / Error Budget", + "uid": "vidcast-slo", + "tags": ["vidcast", "slo"], + "timezone": "browser", + "refresh": "30s", + "schemaVersion": 36, + "panels": [ + { + "id": 100, + "type": "text", + "title": "", + "gridPos": {"h": 3, "w": 24, "x": 0, "y": 0}, + "options": { + "mode": "markdown", + "content": "## VidCast SLOs — error budget & burn rate\n**Demonstrative targets on a single-node demo cluster** (every teardown exhausts the availability budget — see SLO.md). Burn rate is normalised: **1× = sustainable**, **14× = fast-burn page**. Budget-remaining is computed over the **7-day Prometheus retention**, not the full 30-day SLO window." 
+ } + }, + + { + "id": 1, + "title": "Availability — budget remaining (7d)", + "type": "stat", + "gridPos": {"h": 6, "w": 6, "x": 0, "y": 3}, + "fieldConfig": {"defaults": {"unit": "percent", "min": 0, "max": 100, + "thresholds": {"steps": [{"color": "red", "value": null}, {"color": "yellow", "value": 25}, {"color": "green", "value": 50}]}}}, + "options": {"colorMode": "background", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "100 * (1 - clamp_max((sum(increase(vidcast_gateway_requests_total{status=~\"5..\"}[7d])) / sum(increase(vidcast_gateway_requests_total[7d]))) / 0.001, 1))", "legendFormat": "remaining"}] + }, + { + "id": 2, + "title": "Availability — burn rate (1h)", + "type": "stat", + "gridPos": {"h": 6, "w": 6, "x": 6, "y": 3}, + "fieldConfig": {"defaults": {"unit": "none", "decimals": 1, + "thresholds": {"steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 14}]}}}, + "options": {"colorMode": "background", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "slo:availability:burnrate1h", "legendFormat": "1h burn"}] + }, + { + "id": 3, + "title": "Availability — hrs to exhaustion (proj.)", + "type": "stat", + "gridPos": {"h": 6, "w": 6, "x": 12, "y": 3}, + "fieldConfig": {"defaults": {"unit": "h", "decimals": 1, + "thresholds": {"steps": [{"color": "red", "value": null}, {"color": "yellow", "value": 72}, {"color": "green", "value": 720}]}}}, + "options": {"colorMode": "background", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "(1 - clamp_max((sum(increase(vidcast_gateway_requests_total{status=~\"5..\"}[7d])) / sum(increase(vidcast_gateway_requests_total[7d]))) / 0.001, 1)) * 720 / clamp_min(slo:availability:burnrate1h, 0.001)", "legendFormat": "hours"}] + }, + { + "id": 4, + "title": "Availability — burn rate trend", + "type": "timeseries", + "gridPos": {"h": 6, "w": 6, "x": 18, "y": 3}, + "fieldConfig": {"defaults": {"unit": "none", "custom": {"drawStyle": 
"line", "fillOpacity": 10}}}, + "targets": [ + {"expr": "slo:availability:burnrate1h", "legendFormat": "1h"}, + {"expr": "slo:availability:burnrate6h", "legendFormat": "6h"} + ] + }, + + { + "id": 5, + "title": "Conversion latency — budget remaining (7d)", + "type": "stat", + "gridPos": {"h": 6, "w": 6, "x": 0, "y": 9}, + "fieldConfig": {"defaults": {"unit": "percent", "min": 0, "max": 100, + "thresholds": {"steps": [{"color": "red", "value": null}, {"color": "yellow", "value": 25}, {"color": "green", "value": 50}]}}}, + "options": {"colorMode": "background", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "100 * (1 - clamp_max((1 - (sum(increase(vidcast_conversion_duration_seconds_bucket{le=~\"300(\\.0)?\"}[7d])) / sum(increase(vidcast_conversion_duration_seconds_count[7d])))) / 0.05, 1))", "legendFormat": "remaining"}] + }, + { + "id": 6, + "title": "Conversion latency — burn rate (1h)", + "type": "stat", + "gridPos": {"h": 6, "w": 6, "x": 6, "y": 9}, + "fieldConfig": {"defaults": {"unit": "none", "decimals": 1, + "thresholds": {"steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 14}]}}}, + "options": {"colorMode": "background", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "slo:conversion_latency:burnrate1h", "legendFormat": "1h burn"}] + }, + { + "id": 7, + "title": "Conversion latency — hrs to exhaustion (proj.)", + "type": "stat", + "gridPos": {"h": 6, "w": 6, "x": 12, "y": 9}, + "fieldConfig": {"defaults": {"unit": "h", "decimals": 1, + "thresholds": {"steps": [{"color": "red", "value": null}, {"color": "yellow", "value": 72}, {"color": "green", "value": 720}]}}}, + "options": {"colorMode": "background", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "(1 - clamp_max((1 - (sum(increase(vidcast_conversion_duration_seconds_bucket{le=~\"300(\\.0)?\"}[7d])) / sum(increase(vidcast_conversion_duration_seconds_count[7d])))) / 0.05, 1)) * 720 / 
clamp_min(slo:conversion_latency:burnrate1h, 0.001)", "legendFormat": "hours"}] + }, + { + "id": 8, + "title": "Conversion latency — burn rate trend", + "type": "timeseries", + "gridPos": {"h": 6, "w": 6, "x": 18, "y": 9}, + "fieldConfig": {"defaults": {"unit": "none", "custom": {"drawStyle": "line", "fillOpacity": 10}}}, + "targets": [ + {"expr": "slo:conversion_latency:burnrate1h", "legendFormat": "1h"}, + {"expr": "slo:conversion_latency:burnrate6h", "legendFormat": "6h"} + ] + }, + + { + "id": 9, + "title": "End-to-end success — budget remaining (7d)", + "type": "stat", + "gridPos": {"h": 6, "w": 6, "x": 0, "y": 15}, + "fieldConfig": {"defaults": {"unit": "percent", "min": 0, "max": 100, + "thresholds": {"steps": [{"color": "red", "value": null}, {"color": "yellow", "value": 25}, {"color": "green", "value": 50}]}}}, + "options": {"colorMode": "background", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "100 * (1 - clamp_max((1 - (sum(increase(vidcast_notifications_total{status=\"success\"}[7d])) / sum(increase(vidcast_uploads_total[7d])))) / 0.01, 1))", "legendFormat": "remaining"}] + }, + { + "id": 10, + "title": "End-to-end success — burn rate (1h)", + "type": "stat", + "gridPos": {"h": 6, "w": 6, "x": 6, "y": 15}, + "fieldConfig": {"defaults": {"unit": "none", "decimals": 1, + "thresholds": {"steps": [{"color": "green", "value": null}, {"color": "yellow", "value": 1}, {"color": "red", "value": 14}]}}}, + "options": {"colorMode": "background", "reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "slo:e2e_success:burnrate1h", "legendFormat": "1h burn"}] + }, + { + "id": 11, + "title": "End-to-end success — hrs to exhaustion (proj.)", + "type": "stat", + "gridPos": {"h": 6, "w": 6, "x": 12, "y": 15}, + "fieldConfig": {"defaults": {"unit": "h", "decimals": 1, + "thresholds": {"steps": [{"color": "red", "value": null}, {"color": "yellow", "value": 72}, {"color": "green", "value": 720}]}}}, + "options": {"colorMode": "background", 
"reduceOptions": {"calcs": ["last"]}}, + "targets": [{"expr": "(1 - clamp_max((1 - (sum(increase(vidcast_notifications_total{status=\"success\"}[7d])) / sum(increase(vidcast_uploads_total[7d])))) / 0.01, 1)) * 720 / clamp_min(slo:e2e_success:burnrate1h, 0.001)", "legendFormat": "hours"}] + }, + { + "id": 12, + "title": "End-to-end success — burn rate trend", + "type": "timeseries", + "gridPos": {"h": 6, "w": 6, "x": 18, "y": 15}, + "fieldConfig": {"defaults": {"unit": "none", "custom": {"drawStyle": "line", "fillOpacity": 10}}}, + "targets": [ + {"expr": "slo:e2e_success:burnrate1h", "legendFormat": "1h"}, + {"expr": "slo:e2e_success:burnrate6h", "legendFormat": "6h"} + ] + } + ] +} diff --git a/monitoring/scrape/converter-podmonitor.yaml b/monitoring/scrape/converter-podmonitor.yaml new file mode 100644 index 0000000..6ff891b --- /dev/null +++ b/monitoring/scrape/converter-podmonitor.yaml @@ -0,0 +1,20 @@ +# B4 (M-2): scrape the converter consumer's metrics (conversion-latency SLO source). +# A PodMonitor (not ServiceMonitor) because the converter has no Service — it is a +# queue consumer; we scrape its pods directly on the named "metrics" container port. +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: vidcast-converter + namespace: monitoring + labels: + release: monitoring +spec: + namespaceSelector: + matchNames: ["default"] + selector: + matchLabels: + app: converter + podMetricsEndpoints: + - port: metrics # the named containerPort (9000) + path: /metrics + interval: 30s diff --git a/monitoring/scrape/gateway-servicemonitor.yaml b/monitoring/scrape/gateway-servicemonitor.yaml new file mode 100644 index 0000000..e63cf97 --- /dev/null +++ b/monitoring/scrape/gateway-servicemonitor.yaml @@ -0,0 +1,21 @@ +# B4 (M-2): scrape the gateway's /metrics (availability + uploads SLO sources). 
+# Supersedes the old static additionalScrapeConfigs job in monitoring/values.yaml:
+# operator-native, and it auto-discovers all gateway pod endpoints behind the Service.
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: vidcast-gateway
+  namespace: monitoring
+  labels:
+    # kube-prometheus-stack adopts only ServiceMonitors that carry the release label.
+    release: monitoring
+spec:
+  namespaceSelector:
+    matchNames: ["default"]
+  selector:
+    matchLabels:
+      app: gateway
+  endpoints:
+    - port: http  # the named Service port (8080)
+      path: /metrics
+      interval: 30s
diff --git a/monitoring/scrape/kubecost-servicemonitor.yaml b/monitoring/scrape/kubecost-servicemonitor.yaml
new file mode 100644
index 0000000..b27f856
--- /dev/null
+++ b/monitoring/scrape/kubecost-servicemonitor.yaml
@@ -0,0 +1,24 @@
+# B3: have the existing Prometheus scrape Kubecost's cost-model /metrics endpoint,
+# the source of the cost series (node_total_hourly_cost, *_allocation_*) that the
+# FinOps dashboard queries. Kubecost READS raw metrics from Prometheus and EXPOSES
+# computed cost metrics back; this monitor closes that loop.
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: vidcast-kubecost
+  namespace: monitoring
+  labels:
+    release: monitoring
+spec:
+  namespaceSelector:
+    matchNames: ["kubecost"]
+  selector:
+    matchLabels:
+      app: cost-analyzer  # the kubecost cost-analyzer Service/pods
+  endpoints:
+    # cost-model exposes its metrics on the 9003 container port. Confirm the exact
+    # named port on the live Service (it is chart-version dependent); if the name
+    # differs, fall back to the numeric targetPort.
+    - port: tcp-model
+      path: /metrics
+      interval: 60s
diff --git a/monitoring/scrape/notification-podmonitor.yaml b/monitoring/scrape/notification-podmonitor.yaml
new file mode 100644
index 0000000..58dd836
--- /dev/null
+++ b/monitoring/scrape/notification-podmonitor.yaml
@@ -0,0 +1,19 @@
+# B4 (M-2): scrape metrics from the notification consumer (end-to-end SLO numerator).
+# A PodMonitor for the same reason as the converter: no Service, so pods are scraped directly.
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: vidcast-notification
+  namespace: monitoring
+  labels:
+    release: monitoring
+spec:
+  namespaceSelector:
+    matchNames: ["default"]
+  selector:
+    matchLabels:
+      app: notification
+  podMetricsEndpoints:
+    - port: metrics  # the named containerPort (9000)
+      path: /metrics
+      interval: 30s
diff --git a/monitoring/scrape/rabbitmq-servicemonitor.yaml b/monitoring/scrape/rabbitmq-servicemonitor.yaml
new file mode 100644
index 0000000..be2cdbb
--- /dev/null
+++ b/monitoring/scrape/rabbitmq-servicemonitor.yaml
@@ -0,0 +1,29 @@
+# B4 (M-2): scrape rabbitmq_prometheus (:15692). This un-dangles the two RabbitMQ alerts
+# in vidcast-alerts.yaml (rabbitmq_queue_messages, up{job="rabbitmq"}).
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: vidcast-rabbitmq
+  namespace: monitoring
+  labels:
+    release: monitoring
+spec:
+  namespaceSelector:
+    matchNames: ["default"]
+  selector:
+    matchLabels:
+      app: rabbitmq
+  endpoints:
+    # Scrape /metrics/per-object (NOT the default /metrics): the default endpoint returns
+    # cluster-AGGREGATED metrics only, while the existing RabbitMQQueueBacklog alert needs
+    # PER-QUEUE rabbitmq_queue_messages{queue="video"}, which is exposed per-object.
+    # Cardinality is trivial here (2 queues + DLQ topology). RabbitMQ ≥3.8.
+    - port: prometheus  # the named Service port (15692)
+      path: /metrics/per-object
+      interval: 30s
+      # Pin job="rabbitmq" so the up{job="rabbitmq"} selector used by the existing
+      # alerts resolves deterministically (otherwise the operator derives the job
+      # label from the Service/port, and that default is version-dependent).
+      relabelings:
+        - targetLabel: job
+          replacement: rabbitmq
diff --git a/monitoring/values-emptydir.yaml b/monitoring/values-emptydir.yaml
new file mode 100644
index 0000000..36e74f0
--- /dev/null
+++ b/monitoring/values-emptydir.yaml
@@ -0,0 +1,20 @@
+# Local override: ephemeral (emptyDir) storage for the monitoring stack.
+# Used because this dev cluster has NO dynamic EBS provisioner (no aws-ebs-csi-driver
+# addon; the in-tree kubernetes.io/aws-ebs provisioner is non-functional on EKS 1.31),
+# and the datastores run on manually-provisioned local PVs only. emptyDir avoids
+# billable EBS volumes and orphaned-volume cost on teardown (project "~$0 when off"
+# posture). Trade-off: metrics/dashboards do not survive a pod restart — acceptable
+# on a transient dev/demo cluster.
Apply alongside values.yaml: +# helm install monitoring prometheus-community/kube-prometheus-stack \ +# -f monitoring/values.yaml -f monitoring/values-emptydir.yaml -n monitoring +grafana: + persistence: + enabled: false + +alertmanager: + alertmanagerSpec: + storage: null + +prometheus: + prometheusSpec: + storageSpec: null diff --git a/monitoring/values.yaml b/monitoring/values.yaml new file mode 100644 index 0000000..fc008d2 --- /dev/null +++ b/monitoring/values.yaml @@ -0,0 +1,79 @@ +# kube-prometheus-stack Helm values for VidCast +# Install: helm install monitoring prometheus-community/kube-prometheus-stack \ +# -f monitoring/values.yaml -n monitoring --create-namespace + +grafana: + adminPassword: vidcast-demo + service: + type: NodePort + nodePort: 30007 + persistence: + enabled: true + size: 2Gi + sidecar: + dashboards: + enabled: true + searchNamespace: monitoring + grafana.ini: + server: + root_url: "%(protocol)s://%(domain)s:30007" + # Allow the frontend Dashboard page to embed Grafana panels in an