From bcfbd2192d6f37f1a63a407e7ffb1ea9c300526c Mon Sep 17 00:00:00 2001 From: John Date: Mon, 1 Jun 2026 09:11:32 +0100 Subject: [PATCH 01/90] chore: add .gitignore, untrack secrets, create directory structure - Added comprehensive .gitignore covering Terraform state, k8s secrets, build artifacts, Python cache, Node modules, and IDE files - Untracked 6 secret.yaml files that should never be in git history - Created directory structure for terraform/, monitoring/, docs/, src/frontend/, .github/workflows/ - Added terraform.tfvars.example template - Added CLAUDE.md and VIDCAST_UPGRADE_PLAN.md project context files Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 56 ++ CLAUDE.md | 645 ++++++++++++++++++ Helm_charts/MongoDB/templates/secret.yaml | 11 - Helm_charts/RabbitMQ/templates/secret.yaml | 7 - VIDCAST_UPGRADE_PLAN.md | 634 +++++++++++++++++ src/auth-service/manifest/secret.yaml | 9 - src/converter-service/manifest/secret.yaml | 7 - src/gateway-service/manifest/secret.yaml | 7 - src/notification-service/manifest/secret.yaml | 10 - .../environments/dev/terraform.tfvars.example | 19 + 10 files changed, 1354 insertions(+), 51 deletions(-) create mode 100644 .gitignore create mode 100644 CLAUDE.md delete mode 100644 Helm_charts/MongoDB/templates/secret.yaml delete mode 100644 Helm_charts/RabbitMQ/templates/secret.yaml create mode 100644 VIDCAST_UPGRADE_PLAN.md delete mode 100644 src/auth-service/manifest/secret.yaml delete mode 100644 src/converter-service/manifest/secret.yaml delete mode 100644 src/gateway-service/manifest/secret.yaml delete mode 100644 src/notification-service/manifest/secret.yaml create mode 100644 terraform/environments/dev/terraform.tfvars.example diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fd88d1f --- /dev/null +++ b/.gitignore @@ -0,0 +1,56 @@ +# Terraform +terraform.tfvars +terraform.tfvars.json +*.tfstate +*.tfstate.* +.terraform/ +.terraform.lock.hcl +crash.log + +# Kubernetes secrets +**/secret.yaml + +# 
Deployment-specific files +DEPLOYMENT_CONFIG.md +DEPLOYMENT_HANDOVER.md +DEPLOYMENT_REPORT.md +SESSION_SUMMARY.md +DEPLOYMENT_PROBLEMS.md +deployment-ids.txt +customise.sh + +# Build artifacts +*.mp3 +!assets/video.mp4 +output.* + +# Python +__pycache__/ +*.pyc +*.pyo +.env +venv/ +*.egg-info/ + +# Node +node_modules/ +dist/ +build/ +.cache/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log + +# Explanation files (study material, not production) +*_EXPLAINED.md diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..324d013 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,645 @@ +# CLAUDE.md — VidCast Platform (Video-to-Audio Microservices on AWS EKS) + +--- + +## ⚠️ READ THIS FIRST — BEFORE ANYTHING ELSE + +### Step 1 — Identify which prompt type is being used + +This file supports two execution modes. The mode determines who builds the CI/CD pipeline, health endpoints, and security hardening. + +``` +FULL PROMPT (CLAUDE_CODE_FULL_PROMPT_V2.md) + → Claude builds everything — all phases, all files + → Sections marked [FULL ONLY] apply + → Sections marked [HYBRID ONLY] do NOT apply — skip them + +HYBRID PROMPT (CLAUDE_CODE_HYBRID_PROMPT_V2.md) + → Claude builds Terraform, monitoring, frontend, Swarm compose, docs + → Developer manually builds CI/CD, health endpoints, security hardening + → Sections marked [HYBRID ONLY] apply + → Sections marked [FULL ONLY] do NOT apply — skip them +``` + +Read the active prompt file to determine mode. If uncertain, ask. + +### Step 2 — Read all companion files + +```bash +ls -la *.md +cat VIDCAST_UPGRADE_PLAN.md +ls DEPLOYMENT_CONFIG.md 2>/dev/null && cat DEPLOYMENT_CONFIG.md +ls DEPLOYMENT_HANDOVER.md 2>/dev/null && cat DEPLOYMENT_HANDOVER.md +``` + +If `DEPLOYMENT_CONFIG.md` has unfilled bracket placeholders (`[VALUE]`), list them and ask the user to fill them before proceeding. Do NOT continue with placeholder values. 
+ +### Step 3 — Check for a previous session + +If `DEPLOYMENT_HANDOVER.md` exists, read it, identify which phases are complete, and resume from the next incomplete phase. Never recreate resources that already exist. + +### Step 4 — Validate AWS access + +```bash +aws sts get-caller-identity +``` + +--- + +## Concurrent File Management (Non-Negotiable) + +Maintain two tracking files throughout ALL work. These are your crash recovery system. + +**DEPLOYMENT_HANDOVER.md** — Session state. Update this: +- BEFORE any destructive operation (terraform destroy, kubectl delete, helm uninstall) +- AFTER every completed phase +- AFTER every successful infrastructure change (terraform apply, helm install, kubectl apply) +- IMMEDIATELY if usage limits are approaching — save state before stopping + +**DEPLOYMENT_REPORT.md** — Full record of everything done. Update after every significant action. + +If Claude Code stops for any reason, the next session reads DEPLOYMENT_HANDOVER.md and resumes exactly from where it left off. Every phase completion and every resource ID must be recorded here. + +DEPLOYMENT_HANDOVER.md structure: +```markdown +# VidCast Deployment Handover +## Last Updated: [timestamp] + +### Base Deployment Phases (0-12) +- [x] Phase 0: Prerequisites +- [ ] Phase 1: IAM Roles +... + +### Upgrade Phases +- [ ] Phase U0: Repo Cleanup +- [ ] Phase U1: Terraform IaC +... + +### AWS Resources +- VPC ID: [value] +- EKS Cluster: [value] +- Node Group: [value] +- Node IP: [value] +- Security Group: [value] + +### Staging Environment +- Swarm EC2 IP: [value] +- Swarm status: [running/stopped/not created] + +### Resume Instructions +[Exact commands to pick up from current state] +``` + +--- + +## Project Overview + +**Product:** VidCast — "Turn video recordings into podcast-ready audio" + +This is a Python microservices platform that converts uploaded MP4 video files to MP3 audio files. It runs on AWS EKS with an event-driven, asynchronous architecture. 
A user uploads a video, it's processed via a RabbitMQ pipeline, and they receive an email with the download link. + +**Repository base:** https://github.com/N4si/K8s-video-converter.git (forked to student's account) + +--- + +## System Architecture + +``` +Client (Browser / curl / React Frontend) + │ + ▼ +┌──────────────────────────────────────────────────────┐ +│ Frontend — React + nginx (NodePort :30006) [NEW] │ +│ Login → Upload → Download → Dashboard → Arch Diagram│ +└──────────────────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────┐ +│ Gateway Service — Flask :8080 (NodePort :30002) │ +│ POST /login → Auth Service (:5000) → PostgreSQL │ +│ POST /upload → MongoDB GridFS + RabbitMQ "video" │ +│ GET /download → MongoDB GridFS → stream MP3 │ +│ GET /healthz → health check endpoint [NEW] │ +└──────────────────────────────────────────────────────┘ + │ + ▼ RabbitMQ "video" queue +┌──────────────────────────────────────────────────────┐ +│ Converter Service — 4 replicas (Pika + ffmpeg) │ +│ Reads video → extracts audio → stores MP3 │ +│ → publishes to RabbitMQ "mp3" queue │ +└──────────────────────────────────────────────────────┘ + │ + ▼ RabbitMQ "mp3" queue +┌──────────────────────────────────────────────────────┐ +│ Notification Service — 2 replicas (Pika + smtplib) │ +│ Sends email with file ID for download │ +└──────────────────────────────────────────────────────┘ +``` + +### Services + +| Service | Technology | Replicas | Access | Health Check | +|---------|-----------|----------|--------|-------------| +| Frontend | React + nginx | 1 | NodePort :30006 | HTTP GET / | +| Auth Service | Flask + PyJWT + psycopg2 | 2 | ClusterIP :5000 | HTTP GET /healthz | +| Gateway Service | Flask + PyMongo + Pika | 2 | NodePort :30002 | HTTP GET /healthz | +| Converter Service | Pika + MoviePy + ffmpeg | 4 | None (queue consumer) | Exec: test -f /tmp/healthy | +| Notification Service | Pika + smtplib | 2 | None (queue 
consumer) | Exec: test -f /tmp/healthy | +| MongoDB | mongo:4.0.8 | 1 (StatefulSet) | NodePort :30005 | TCP :27017 | +| PostgreSQL | postgres | 1 (Deployment) | NodePort :30003 | TCP :5432 | +| RabbitMQ | rabbitmq:3-management | 1 (StatefulSet) | NodePort :30004 | TCP :5672 | + +### Environments + +| Environment | Platform | Purpose | Cost | +|-------------|----------|---------|------| +| Production | AWS EKS eu-west-2 (m7i-flex.large) | Live traffic | ~$150/month | +| Staging | Docker Swarm (t2.micro EC2) | Pre-production via Jenkins | ~$7.50/month | +| Local | Docker Compose | Developer testing | Free | + +**Why Docker Swarm for staging:** A second EKS staging environment costs ~$0.40/hour (~$290/month). A Swarm staging environment on a single t2.micro costs ~$0.01/hour (~$7.50/month, free tier eligible). That is a ~97% cost reduction for a functionally equivalent testing environment. The Jenkins pipeline deploys to Swarm first, runs a smoke test, waits for human approval, then deploys to EKS. This directly connects the Docker Swarm bootcamp module to the Kubernetes production deployment. 
+ +### Port Map + +| Port | Service | Type | Purpose | +|------|---------|------|---------| +| 30002 | Gateway | NodePort | Client API | +| 30003 | PostgreSQL | NodePort | Admin access | +| 30004 | RabbitMQ UI | NodePort | Queue management | +| 30005 | MongoDB | NodePort | Admin access | +| 30006 | Frontend | NodePort | Web interface | +| 30007 | Grafana | NodePort | Monitoring dashboard | +| 30008 | Alertmanager | NodePort | Alert management | + +--- + +## Repository Structure + +``` +vidcast/ +├── CLAUDE.md # THIS FILE +├── VIDCAST_UPGRADE_PLAN.md # Detailed improvement plan +├── MEDIAFLOW_COMPARISON.md # MediaFlow comparison analysis +├── README.md # Public-facing documentation +├── .gitignore # Comprehensive — secrets, state, artifacts +├── Jenkinsfile # Staging → Approval → Production pipeline +├── docker-compose.swarm.yml # Docker Swarm staging environment +├── DEPLOYMENT_CONFIG.md # GITIGNORED — your AWS + app configuration +├── DEPLOYMENT_HANDOVER.md # GITIGNORED — session state +├── DEPLOYMENT_REPORT.md # GITIGNORED — deployment timeline +│ +├── .github/ +│ └── workflows/ +│ ├── ci.yml # Lint + Trivy + build + push +│ └── cd.yml # Deploy to EKS +│ +├── terraform/ +│ ├── environments/ +│ │ └── dev/ +│ │ ├── main.tf # Root module +│ │ ├── variables.tf # Inputs +│ │ ├── outputs.tf # Cluster endpoint, node IP, kubeconfig cmd +│ │ ├── backend.tf # S3 + DynamoDB state +│ │ └── terraform.tfvars # GITIGNORED — actual values +│ └── modules/ +│ ├── vpc/ # VPC, 2 subnets, IGW, routes +│ ├── eks/ # Cluster + node group + OIDC +│ ├── iam/ # Cluster role, node role +│ └── security-groups/ # NodePort rules 30002-30008 +│ +├── Helm_charts/ +│ ├── MongoDB/ +│ ├── Postgres/ +│ └── RabbitMQ/ +│ +├── src/ +│ ├── auth-service/ +│ ├── gateway-service/ +│ ├── converter-service/ +│ ├── notification-service/ +│ └── frontend/ # React web app +│ ├── Dockerfile +│ ├── nginx.conf +│ ├── package.json +│ ├── src/ +│ └── manifest/ +│ +├── monitoring/ +│ ├── values.yaml +│ ├── dashboards/ 
+│ │ └── vidcast-operations.json +│ └── alerts/ +│ └── vidcast-alerts.yaml +│ +├── docs/ +│ ├── architecture.md +│ ├── deployment-guide.md +│ └── presentation-notes.md +│ +└── assets/ + └── video.mp4 +``` + +--- + +## Configuration Values (from DEPLOYMENT_CONFIG.md) + +Parse DEPLOYMENT_CONFIG.md before proceeding. Validate no bracket placeholders remain: +```bash +grep -n '\[.*\]' DEPLOYMENT_CONFIG.md +``` + +| Variable | Description | +|----------|-------------| +| YOUR_NAME | For deployment report | +| AWS_ACCOUNT_ID | Auto-detect: `aws sts get-caller-identity` | +| AWS_REGION | eu-west-2 (London) | +| CLUSTER_NAME | e.g., vidcast-cluster | +| NODE_INSTANCE_TYPE | m7i-flex.large (NEVER T-type — see constraints) | +| NODE_COUNT | 1 | +| VPC_ID | Leave blank to create new | +| DOCKER_HUB_USERNAME | Your Docker Hub username | +| APP_LOGIN_EMAIL | Login email for the app | +| APP_LOGIN_PASSWORD | App login password | +| GMAIL_ADDRESS | Gmail for sending notifications | +| GMAIL_APP_PASSWORD | 16-char app password (or SKIP) | +| MONGODB_USERNAME | MongoDB app user | +| MONGODB_PASSWORD | MongoDB password | +| POSTGRES_USERNAME | PostgreSQL username | +| POSTGRES_PASSWORD | PostgreSQL password | +| JWT_SECRET | Random 32+ char string | + +--- + +## Customisation Checklist + +After setting config values, update these files consistently: + +### MongoDB Credentials (3 files must match) +- `Helm_charts/MongoDB/values.yaml` → username, password +- `src/gateway-service/manifest/configmap.yaml` → MONGODB_VIDEOS_URI, MONGODB_MP3S_URI +- `src/converter-service/manifest/configmap.yaml` → MONGODB_URI + +### PostgreSQL Credentials (4 files must match) +- `Helm_charts/Postgres/values.yaml` → user, password, db +- `Helm_charts/Postgres/init.sql` → INSERT INTO auth_user +- `src/auth-service/manifest/secret.yaml` → PSQL_PASSWORD (base64) +- `src/auth-service/manifest/configmap.yaml` → DATABASE_USER + +### JWT Secret, Gmail, Docker Images +- `src/auth-service/manifest/secret.yaml` → 
JWT_SECRET (base64) +- `src/notification-service/manifest/secret.yaml` → GMAIL_ADDRESS, GMAIL_PASSWORD (base64) +- All 4 deployment YAML files → image name + +Generate and run `customise.sh` using sed to apply all substitutions atomically. +Validate (the command should return no matches): `grep -r "nasi\|sarcasm\|iambatmanthegoat" . --include="*.yaml" --include="*.sql"` + +--- + +## Part 1 — Base Deployment Phases (Original Project) + +These phases deploy the base application. If already complete, check DEPLOYMENT_HANDOVER.md and skip to Part 2. + +``` +Phase 0: Prerequisites (tools + AWS credentials + repo) +Phase 1: IAM roles (eks-cluster-role, eks-node-role) +Phase 2: VPC and networking (CLI only — no console) +Phase 3: EKS cluster + node group (~20 minutes) +Phase 4: Security group rules (30002-30008) +Phase 5: Customise files + apply bug fixes +Phase 6: Helm deployments (MongoDB → PostgreSQL → RabbitMQ) +Phase 7: PostgreSQL init (run init.sql) +Phase 8: RabbitMQ queues (via HTTP Management API) +Phase 9: Docker images (prebuilt or build+push) +Phase 10: Deploy microservices +Phase 11: End-to-end test +Phase 12: Deployment report +``` + +### Phase 1: IAM Roles +```bash +# Check before creating — skip if already exists +aws iam get-role --role-name eks-cluster-role 2>/dev/null || \ + aws iam create-role --role-name eks-cluster-role \ + --assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"eks.amazonaws.com"},"Action":"sts:AssumeRole"}]}' +aws iam attach-role-policy --role-name eks-cluster-role \ + --policy-arn arn:aws:iam::aws:policy/AmazonEKSClusterPolicy + +aws iam get-role --role-name eks-node-role 2>/dev/null || \ + aws iam create-role --role-name eks-node-role \ + --assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"ec2.amazonaws.com"},"Action":"sts:AssumeRole"}]}' +aws iam attach-role-policy --role-name eks-node-role \ + --policy-arn arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy 
+aws iam attach-role-policy --role-name eks-node-role \ + --policy-arn arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy +aws iam attach-role-policy --role-name eks-node-role \ + --policy-arn arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly +``` +Save role ARNs to DEPLOYMENT_HANDOVER.md. + +### Phase 2: VPC and Networking (only if VPC_ID blank) +```bash +VPC_ID=$(aws ec2 create-vpc --cidr-block 10.0.0.0/16 \ + --tag-specifications 'ResourceType=vpc,Tags=[{Key=Name,Value=vidcast-vpc}]' \ + --query Vpc.VpcId --output text) +IGW_ID=$(aws ec2 create-internet-gateway --query InternetGateway.InternetGatewayId --output text) +aws ec2 attach-internet-gateway --internet-gateway-id $IGW_ID --vpc-id $VPC_ID +SUBNET_1=$(aws ec2 create-subnet --vpc-id $VPC_ID --cidr-block 10.0.1.0/24 \ + --availability-zone eu-west-2a --query Subnet.SubnetId --output text) +SUBNET_2=$(aws ec2 create-subnet --vpc-id $VPC_ID --cidr-block 10.0.2.0/24 \ + --availability-zone eu-west-2b --query Subnet.SubnetId --output text) +aws ec2 create-tags --resources $SUBNET_1 $SUBNET_2 \ + --tags Key=kubernetes.io/role/elb,Value=1 +aws ec2 modify-subnet-attribute --subnet-id $SUBNET_1 --map-public-ip-on-launch +aws ec2 modify-subnet-attribute --subnet-id $SUBNET_2 --map-public-ip-on-launch +RTB=$(aws ec2 create-route-table --vpc-id $VPC_ID --query RouteTable.RouteTableId --output text) +aws ec2 create-route --route-table-id $RTB --destination-cidr-block 0.0.0.0/0 \ + --gateway-id $IGW_ID +aws ec2 associate-route-table --route-table-id $RTB --subnet-id $SUBNET_1 +aws ec2 associate-route-table --route-table-id $RTB --subnet-id $SUBNET_2 +``` +Save all IDs to DEPLOYMENT_HANDOVER.md. + +### Phase 3: EKS Cluster + +⚠️ NEVER use T-type instances. Use m7i-flex.large or M/C/R-series only. 
+ +```bash +aws eks create-cluster --name vidcast-cluster --region eu-west-2 \ + --kubernetes-version 1.31 \ + --role-arn arn:aws:iam::ACCOUNT_ID:role/eks-cluster-role \ + --resources-vpc-config subnetIds=SUBNET_1,SUBNET_2,endpointPublicAccess=true + +aws eks wait cluster-active --name vidcast-cluster --region eu-west-2 +aws eks update-kubeconfig --name vidcast-cluster --region eu-west-2 + +aws eks create-nodegroup --cluster-name vidcast-cluster \ + --nodegroup-name vidcast-nodes \ + --node-role arn:aws:iam::ACCOUNT_ID:role/eks-node-role \ + --subnets SUBNET_1 SUBNET_2 \ + --instance-types m7i-flex.large \ + --scaling-config minSize=1,maxSize=2,desiredSize=1 \ + --ami-type AL2_x86_64 --region eu-west-2 + +aws eks wait nodegroup-active --cluster-name vidcast-cluster \ + --nodegroup-name vidcast-nodes --region eu-west-2 + +kubectl get nodes -o wide # capture EXTERNAL-IP as NODE_IP +``` + +### Phase 4: Security Group Rules +```bash +NODE_SG=$(aws ec2 describe-security-groups \ + --filters "Name=tag:kubernetes.io/cluster/vidcast-cluster,Values=owned" \ + --query "SecurityGroups[0].GroupId" --output text) +for PORT in 30002 30003 30004 30005 30006 30007 30008; do + aws ec2 authorize-security-group-ingress \ + --group-id $NODE_SG --protocol tcp --port $PORT --cidr 0.0.0.0/0 +done +``` + +### Phase 6: Helm Deployments +```bash +cd Helm_charts/MongoDB && helm install mongodb . && cd ../.. +kubectl get pods -w # wait for mongodb-0 Running +cd Helm_charts/Postgres && helm install postgres . && cd ../.. +kubectl get pods -w # wait for postgres Running +cd Helm_charts/RabbitMQ && helm install rabbitmq . && cd ../.. 
+kubectl get pods -w # wait for rabbitmq-0 Running +``` + +### Phase 7: PostgreSQL Init +```bash +PGPASSWORD=YOUR_POSTGRES_PASSWORD psql -h NODE_IP -p 30003 \ + -U YOUR_POSTGRES_USERNAME -d authdb -f Helm_charts/Postgres/init.sql +PGPASSWORD=YOUR_POSTGRES_PASSWORD psql -h NODE_IP -p 30003 \ + -U YOUR_POSTGRES_USERNAME -d authdb -c "SELECT * FROM auth_user;" +``` + +### Phase 8: RabbitMQ Queues (HTTP API — not browser) +```bash +curl -u guest:guest -X PUT http://NODE_IP:30004/api/queues/%2F/video \ + -H "Content-Type: application/json" -d '{"durable":true}' +curl -u guest:guest -X PUT http://NODE_IP:30004/api/queues/%2F/mp3 \ + -H "Content-Type: application/json" -d '{"durable":true}' +curl -s -u guest:guest http://NODE_IP:30004/api/queues | python3 -m json.tool | grep name +``` + +### Phase 10: Deploy Microservices +```bash +kubectl apply -f src/auth-service/manifest/ +kubectl rollout status deployment/auth +kubectl apply -f src/gateway-service/manifest/ +kubectl rollout status deployment/gateway +kubectl apply -f src/converter-service/manifest/ +kubectl rollout status deployment/converter +kubectl apply -f src/notification-service/manifest/ +kubectl rollout status deployment/notification +kubectl get pods # all should be Running +``` + +### Phase 11: End-to-End Test +```bash +# Login +JWT=$(curl -s -X POST http://NODE_IP:30002/login -u "EMAIL:PASSWORD") +echo "JWT: $JWT" + +# Upload +curl -X POST http://NODE_IP:30002/upload \ + -F "file=@assets/video.mp4" -H "Authorization: Bearer $JWT" + +# Monitor queues +sleep 5 +curl -s -u guest:guest http://NODE_IP:30004/api/queues/%2F/video \ + | python3 -m json.tool | grep messages + +# Download (use FILE_ID from email) +curl -X GET "http://NODE_IP:30002/download?fid=FILE_ID" \ + -H "Authorization: Bearer $JWT" -o output.mp3 +``` + +--- + +## Part 2 — Upgrade Phases + +These phases transform the base project into a production-grade platform. 
+ +``` +Phase U0: Repo cleanup + .gitignore +Phase U1: Terraform IaC (VPC, IAM, EKS, SGs) +Phase U2: CI/CD Pipeline + [FULL ONLY]: Claude generates ci.yml, cd.yml, Jenkinsfile + [HYBRID ONLY]: Claude generates docker-compose.swarm.yml only + Developer manually writes ci.yml, cd.yml, Jenkinsfile +Phase U3: Security Hardening + [FULL ONLY]: Claude adds probes, limits, security contexts, health endpoints + [HYBRID ONLY]: Developer writes all security hardening manually +Phase U4: Monitoring Stack (Prometheus + Grafana + Alertmanager) +Phase U5: Frontend Application (React) +Phase U6: Documentation +``` + +### Phase U2: CI/CD Pipeline + +**GitHub Actions ci.yml — all modes:** + +Matrix strategy running lint + Trivy scan + build + push for all four services in parallel: +- Matrix: `service: [auth-service, gateway-service, converter-service, notification-service]` +- Lint: ruff check +- Build: docker build tagged with SHORT_SHA (`${GITHUB_SHA::7}`) +- Scan: aquasecurity/trivy-action with CRITICAL,HIGH severity, exit-code 1, ignore-unfixed +- Push: docker/login-action + docker push (main branch only) + +**GitHub Actions cd.yml — all modes:** + +Trigger: `workflow_run` on CI completion (main branch). Uses `aws-actions/configure-aws-credentials@v4`, then `aws eks update-kubeconfig`, then `kubectl set image` + `kubectl rollout status` for each service. + +**Jenkinsfile — key stages (all modes):** + +``` +Stage 1: Lint (ruff) +Stage 2: Build Images (parallel — all 4 services) +Stage 3: Security Scan (Trivy — all 4 images) +Stage 4: Push Images (Docker Hub) +Stage 5: Deploy Staging → docker stack deploy to Swarm EC2 +Stage 6: Smoke Test → curl -f http://${STAGING_IP}:8080/healthz || exit 1 +Stage 7: Approve Production → input message: 'Deploy to Production?' 
+Stage 8: Deploy Production → kubectl set image + kubectl rollout status +post { failure { kubectl rollout undo all services } } +``` + +**docker-compose.swarm.yml:** All 7 services with overlay networking, named volumes for MongoDB and PostgreSQL, failure_action: rollback on all services, restart_policy: on-failure max 3. + +**[HYBRID ONLY]:** Developer builds ci.yml, cd.yml, and Jenkinsfile manually. See HYBRID_IMPLEMENTATION_GUIDE_V2.md for step-by-step instructions. + +### Phase U3: Security Hardening + +**Health endpoints:** +- `src/auth-service/server.py`: add Flask `/healthz` route testing PostgreSQL connectivity +- `src/gateway-service/server.py`: add `/healthz` testing MongoDB + RabbitMQ. Add flask-cors to requirements.txt and `CORS(server)` after app creation +- `src/converter-service/consumer.py`: `pathlib.Path("/tmp/healthy").touch()` once at startup and again in the main loop after processing each message, so an idle consumer still passes the `test -f /tmp/healthy` probe +- `src/notification-service/consumer.py`: same touch file pattern + +**Deployment manifests — all four services:** + +Probes (auth/gateway — HTTP as shown; converter/notification — exec: replace `httpGet` with `exec: {command: ["test", "-f", "/tmp/healthy"]}`): +```yaml +livenessProbe: + httpGet: {path: /healthz, port: PORT} + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 +readinessProbe: + httpGet: {path: /healthz, port: PORT} + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 +``` + +Resources (request/limit): +``` +Auth: cpu 50m/200m mem 64Mi/128Mi +Gateway: cpu 100m/300m mem 128Mi/256Mi +Converter: cpu 250m/500m mem 256Mi/512Mi +Notification: cpu 50m/100m mem 64Mi/128Mi +``` + +Security context (all pods): +```yaml +securityContext: + runAsNonRoot: true + runAsUser: 1000 + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] +``` + +Converter and notification: add writable emptyDir volume at /tmp. + +**[HYBRID ONLY]:** Developer writes all security hardening manually. See HYBRID_IMPLEMENTATION_GUIDE_V2.md. 
+ +### Phase U4: Monitoring Stack + +Install via Helm: `helm install monitoring prometheus-community/kube-prometheus-stack -f monitoring/values.yaml -n monitoring` + +Key config: Grafana NodePort 30007 (password: vidcast-demo), Alertmanager 30008, 7d retention, 10Gi storage. Disable etcd/scheduler/controller-manager (EKS manages these). + +Custom dashboard "VidCast Operations": pod status, restarts, node CPU/memory, queue depth. +Alert rules: PodCrashLoopBackOff (critical), HighNodeMemory >85% (warning), HighNodeCPU >85% (warning). + +### Phase U5: Frontend + +React + Vite + Tailwind CSS. Pages: Login, Upload, Download, Dashboard (Grafana iframe), Architecture (animated diagram). Nginx multi-stage Dockerfile, runs as non-root on port 8080. NodePort 30006. + +--- + +## Known Issues and Applied Fixes + +| # | Severity | Issue | Fix | +|---|----------|-------|-----| +| 1 | High | NameError in gateway-service/server.py — unauth_count.inc() | Remove lines 36 and 60 | +| 2 | High | JWT secret was "sarcasm" | Replace with 32+ char random string | +| 3 | High | Plaintext passwords in PostgreSQL | Document — acceptable for learning | +| 4 | High | Credentials in source YAML | .gitignore for secret.yaml files | +| 5 | Low | ffmpeg in notification Dockerfile | Remove if rebuilding images | +| 6 | Medium | No liveness/readiness probes | Fixed in Phase U3 | +| 7 | Medium | No resource limits | Fixed in Phase U3 | +| 8 | Medium | PostgreSQL has no PersistentVolume | Acceptable — use RDS in production | +| 9 | Low | prometheus-client unused in gateway | Remove if rebuilding | + +--- + +## AWS Account Constraints + +- **NEVER use T-type instances.** SCPs reject `CreditSpecification: unlimited` which EKS auto-generates for T-type. Every attempt fails after a long wait. +- **Working instance type:** m7i-flex.large (2 vCPU, 8 GB) +- **Region:** eu-west-2 (London) +- This constraint is already encoded as a validation block in the Terraform eks module. 
+ +--- + +## Error Handling Rules + +1. Never silently continue past a non-zero exit code — stop, report, diagnose +2. Show every command before running it +3. Pod in CrashLoopBackOff → immediately `kubectl logs` and `kubectl describe pod`, fix before continuing +4. Never delete AWS resources without explicit user confirmation +5. Update DEPLOYMENT_HANDOVER.md AND DEPLOYMENT_REPORT.md after every phase +6. If GMAIL_APP_PASSWORD is SKIP, skip Gmail configuration — user checks queues manually +7. If usage limits are approaching, update both tracking files immediately before stopping + +--- + +## Cleanup and Destroy + +```bash +# Helm +helm uninstall mongodb postgres rabbitmq +helm uninstall monitoring -n monitoring + +# Kubernetes +kubectl delete -f src/auth-service/manifest/ +kubectl delete -f src/gateway-service/manifest/ +kubectl delete -f src/converter-service/manifest/ +kubectl delete -f src/notification-service/manifest/ +kubectl delete -f src/frontend/manifest/ + +# EKS +aws eks delete-nodegroup --cluster-name vidcast-cluster \ + --nodegroup-name vidcast-nodes --region eu-west-2 +aws eks wait nodegroup-deleted --cluster-name vidcast-cluster \ + --nodegroup-name vidcast-nodes --region eu-west-2 +aws eks delete-cluster --name vidcast-cluster --region eu-west-2 + +# Terraform (if used) +cd terraform/environments/dev && terraform destroy + +# VPC (if created manually — use IDs from DEPLOYMENT_HANDOVER.md) +aws ec2 delete-route-table --route-table-id RTB_ID +aws ec2 detach-internet-gateway --internet-gateway-id IGW_ID --vpc-id VPC_ID +aws ec2 delete-internet-gateway --internet-gateway-id IGW_ID +aws ec2 delete-subnet --subnet-id SUBNET_1_ID +aws ec2 delete-subnet --subnet-id SUBNET_2_ID +aws ec2 delete-vpc --vpc-id VPC_ID +``` diff --git a/Helm_charts/MongoDB/templates/secret.yaml b/Helm_charts/MongoDB/templates/secret.yaml deleted file mode 100644 index 8f280ab..0000000 --- a/Helm_charts/MongoDB/templates/secret.yaml +++ /dev/null @@ -1,11 +0,0 @@ -apiVersion: v1 
-kind: Secret -metadata: - name: mongodb-secret -type: Opaque -stringData: - MONGO_ROOT_USERNAME: {{ .Values.secret.root_username }} - MONGO_ROOT_PASSWORD: {{ .Values.secret.root_password }} - MONGO_USERNAME: {{ .Values.secret.username }} - MONGO_PASSWORD: {{ .Values.secret.password }} - MONGO_USERS_LIST: {{ .Values.secret.users_list }} diff --git a/Helm_charts/RabbitMQ/templates/secret.yaml b/Helm_charts/RabbitMQ/templates/secret.yaml deleted file mode 100644 index d714599..0000000 --- a/Helm_charts/RabbitMQ/templates/secret.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: rabbitmq-secret -stringData: - PLACEHOLDER: "NONE" -type: Opaque \ No newline at end of file diff --git a/VIDCAST_UPGRADE_PLAN.md b/VIDCAST_UPGRADE_PLAN.md new file mode 100644 index 0000000..953bc87 --- /dev/null +++ b/VIDCAST_UPGRADE_PLAN.md @@ -0,0 +1,634 @@ +# VidCast — Production Upgrade Plan + +**Project:** Video-to-Audio Microservices Platform on AWS EKS +**Product Name:** VidCast — "Turn video recordings into podcast-ready audio" +**Date:** May 2026 +**Status:** Base platform deployed and passing end-to-end tests. This document covers planned improvements. + +--- + +## How to Read This Document + +This document is for the team. It explains every improvement we plan to make, why it matters, what it costs (in time and money), and what the alternatives were. If you're picking up a phase to work on, read the relevant section fully before writing any code. If something isn't clear, ask — don't guess. + +Every improvement falls into one of three categories: + +- **Build It** — We will implement this. It goes into the repo and the demo. +- **Talk About It** — We understand this and can explain it in the presentation, but we're not implementing it. +- **Skip It** — Not relevant for this project at this stage. + +--- + +## Table of Contents + +1. [Current State — What We Have](#1-current-state--what-we-have) +2. 
[Product Concept — VidCast](#2-product-concept--vidcast) +3. [Phase 1 — Terraform Infrastructure as Code](#3-phase-1--terraform-infrastructure-as-code) +4. [Phase 2 — CI/CD Pipeline](#4-phase-2--cicd-pipeline) +5. [Phase 3 — Security Hardening](#5-phase-3--security-hardening) +6. [Phase 4 — Monitoring and Observability](#6-phase-4--monitoring-and-observability) +7. [Phase 5 — Frontend Web Application](#7-phase-5--frontend-web-application) +8. [Phase 6 — Documentation and Presentation](#8-phase-6--documentation-and-presentation) +9. [Things We Talk About But Don't Build](#9-things-we-talk-about-but-dont-build) +10. [Repository Structure](#10-repository-structure) +11. [Branch Strategy](#11-branch-strategy) +12. [Cost Breakdown](#12-cost-breakdown) +13. [Real-World Use Cases](#13-real-world-use-cases) +14. [Presentation Strategy](#14-presentation-strategy) + +--- + +## 1. Current State — What We Have + +The base platform is deployed on AWS EKS in eu-west-2. It consists of four Python microservices (auth, gateway, converter, notification) and three infrastructure services (MongoDB, PostgreSQL, RabbitMQ) deployed via Helm charts. The application accepts video uploads via HTTP, converts them to MP3 asynchronously using RabbitMQ as a message broker, and emails the user when the audio file is ready for download. + +What works: end-to-end flow (login, upload, convert, notify, download), JWT authentication, event-driven async processing, Helm-managed infrastructure services, multi-replica deployments. + +What's missing: no infrastructure as code (cluster built manually via console), no CI/CD pipeline (images built and deployed manually), no health checks or resource limits on pods, no monitoring or alerting, credentials stored in plaintext YAML committed to the repo, no web interface (API-only via curl), no documentation beyond the deployment guide. + +These gaps are normal for a first-pass learning project. The purpose of this upgrade plan is to close them systematically. 
+ +--- + +## 2. Product Concept — VidCast + +Instead of presenting this as "a Kubernetes exercise," we're framing it as a product that solves a real problem. This makes the demo accessible to non-technical audiences and gives the architecture a business context. + +**The product story:** Content creators record video — Zoom interviews, webinars, conference talks. They need the audio as a standalone podcast episode. VidCast lets them upload the video, converts it automatically, and emails them when the MP3 is ready to download. + +**Why this framing matters:** Every architectural decision now has a business justification. "Why do we use a message queue?" becomes "Because the creator shouldn't have to wait 5 minutes staring at a loading screen — they upload and walk away." "Why do we have 4 converter replicas?" becomes "Because if 20 creators upload at once, we need parallel processing capacity." + +**Why not YouTube downloads:** Downloading from YouTube violates their Terms of Service, yt-dlp breaks regularly as YouTube fights it, and a failed download during a live demo would derail the presentation. Our demo uses locally-stored video files that we control. + +--- + +## 3. Phase 1 — Terraform Infrastructure as Code + +### What We're Building + +Terraform modules that create and manage all AWS infrastructure: VPC, subnets, internet gateway, route tables, security groups, IAM roles, EKS cluster, and managed node group. After this phase, the entire platform can be destroyed and recreated from a single `terraform apply` command. + +### Why This Matters + +Right now, if someone deletes the EKS cluster, we'd need to click through the AWS Console for 30-60 minutes to rebuild it, hoping we remember every setting. With Terraform, the infrastructure is version-controlled, reviewable, and repeatable. This is the single most impactful improvement for the CV and the demo. + +In industry, this is non-negotiable. 
Every company running cloud infrastructure uses some form of IaC — Terraform, CloudFormation, Pulumi, or CDK. "I can destroy and recreate this entire platform from scratch with one command" is a sentence that separates you from most bootcamp graduates. + +### What the Industry Calls This + +Infrastructure as Code (IaC). The practice comes from the DevOps principle that infrastructure should be treated like application code: version-controlled, peer-reviewed, tested, and reproducible. The term was popularised by tools like Chef and Puppet in the 2010s, and Terraform (by HashiCorp, now part of IBM) became the dominant multi-cloud IaC tool. + +### Trade-off Analysis + +| Dimension | Terraform (Chosen) | AWS CloudFormation | Pulumi | +|---|---|---|---| +| Multi-cloud support | Yes — works with AWS, Azure, GCP | AWS only | Yes | +| Language | HCL (domain-specific) | JSON/YAML | Python, TypeScript, Go | +| Industry adoption | Dominant in multi-cloud shops | Dominant in AWS-only shops | Growing but smaller | +| Learning curve | Moderate — HCL is readable | Low for simple stacks | Low if you know the language | +| State management | Remote state in S3 + DynamoDB lock | Managed by AWS automatically | Managed by Pulumi Cloud or self-hosted | +| Bootcamp relevance | Taught in most DevOps curricula | Less commonly taught | Rarely taught in bootcamps | + +**Why Terraform:** It's what we learned, it's what most job postings list, and it works across cloud providers. CloudFormation would also be fine for an AWS-only project, but Terraform demonstrates a transferable skill. + +### What We're Creating + +``` +terraform/ +├── environments/ +│ └── dev/ +│ ├── main.tf # Root module — calls all child modules +│ ├── variables.tf # Input variables (region, instance type, etc.) 
+│ ├── outputs.tf # Cluster endpoint, node IP, kubeconfig command +│ └── terraform.tfvars # Actual values (gitignored — never committed) +└── modules/ + ├── vpc/ # VPC, subnets, IGW, route tables, NAT + ├── eks/ # EKS cluster, node group, OIDC provider + ├── iam/ # Cluster role, node role, policies + └── security-groups/ # NodePort rules (30002-30005) +``` + +### Key Decisions + +**Remote state in S3 with DynamoDB locking.** Local state files are not acceptable for any shared project. If two people run `terraform apply` simultaneously with local state, one of them will corrupt the infrastructure. S3 stores the state file, and DynamoDB prevents concurrent modifications. This is standard practice. + +**Module structure instead of a single flat file.** Each concern (networking, compute, identity) is a separate module with its own inputs and outputs. This means one person can modify the security groups without touching the VPC configuration. It also means modules can be reused across environments (dev, staging, prod) with different variable values. + +**terraform.tfvars is gitignored.** This file contains the actual values for your deployment — AWS account ID, region, instance type. It's environment-specific and must never be committed to the repo. Each team member creates their own from a template. + +### Estimated Effort + +4-6 hours to write and test all modules. Most of the time is in the EKS module (cluster creation takes 15 minutes per attempt, so iteration is slow). + +--- + +## 4. Phase 2 — CI/CD Pipeline + +### What We're Building + +A GitHub Actions workflow that automatically lints, scans, builds, and deploys the application whenever code is pushed. A Jenkinsfile that achieves the same pipeline for teams using Jenkins. + +### Why This Matters + +Right now, deploying a code change means: manually build a Docker image on your laptop, manually push it to Docker Hub, manually run `kubectl apply` against the cluster, and hope you didn't forget a step. 
This is error-prone, unreviewable, and unauditable. Nobody knows who deployed what, when, or from which commit. + +A CI/CD pipeline enforces a consistent process: every change goes through the same steps, every deployment is traceable to a specific commit, and security scanning happens automatically before any image reaches the cluster. + +### What the Industry Calls This + +Continuous Integration (CI) — automatically building and testing every change. Continuous Delivery/Deployment (CD) — automatically deploying validated changes to environments. Together, CI/CD. The practice originated in the early 2000s with tools like CruiseControl and Hudson (which became Jenkins). Modern implementations use GitHub Actions, GitLab CI, CircleCI, or Jenkins. + +### Trade-off Analysis + +| Dimension | GitHub Actions (Chosen) | Jenkins | GitLab CI | +|---|---|---|---| +| Infrastructure cost | Free for public repos, generous free tier | Must host and maintain Jenkins server | Free for public repos | +| Setup complexity | Zero — lives in the repo | High — needs a server, plugins, configuration | Low if using GitLab.com | +| Plugin ecosystem | Growing (Actions marketplace) | Massive (1800+ plugins) | Built-in features | +| Enterprise adoption | High and growing | Very high (legacy and current) | High in European companies | +| Pipeline as code | YAML in .github/workflows/ | Jenkinsfile in repo root | .gitlab-ci.yml in repo root | +| Demo-ability | Excellent — visible in GitHub UI | Requires Jenkins server running | Requires GitLab instance | + +**Why both:** GitHub Actions for the actual pipeline (easy to demo, no infrastructure needed). Jenkinsfile in the repo to show we can work in enterprise environments. During the presentation, we show GitHub Actions running; we mention Jenkins as "the enterprise alternative I also wrote." 
+ +### Pipeline Stages + +``` +Push to any branch + │ + ├── Lint (ruff for Python) + ├── Trivy Scan (container vulnerability scanning) + │ + └── If main branch: + ├── Build Docker Image + ├── Tag with Git SHA (never :latest) + ├── Push to Docker Hub + ├── Configure kubectl for EKS + └── Deploy to cluster (kubectl apply or helm upgrade) +``` + +### Security Scanning — Where Trivy Fits + +Trivy is an open-source vulnerability scanner by Aqua Security. It scans container images for known CVEs (Common Vulnerabilities and Exposures) in OS packages and application dependencies. In our pipeline, Trivy runs after the Docker image is built but before it's pushed to the registry. If Trivy finds a CRITICAL or HIGH severity CVE, the pipeline fails and the image never reaches the cluster. + +This is the same concept as Docker Content Trust from Docker Swarm — ensuring that only verified, safe images run in your cluster. Trivy is the scanning step; Docker Content Trust (or Cosign/Sigstore in Kubernetes) is the signing step. We implement scanning; we talk about signing. + +In industry, this is called "shift-left security" — catching security issues early in the development process rather than discovering them in production. Most companies run Trivy, Snyk, or Grype as a CI pipeline gate. + +### Jenkins Pipeline + +The Jenkinsfile mirrors the GitHub Actions workflow exactly. Same stages, same tools, different syntax. This demonstrates that the pipeline logic is tool-agnostic — the stages (lint, scan, build, push, deploy) are the same regardless of whether you're using GitHub Actions, Jenkins, GitLab CI, or CircleCI. + +```groovy +// Jenkinsfile — same pipeline, different syntax +pipeline { + agent any + stages { + stage('Lint') { steps { sh 'ruff check src/' } } + stage('Scan') { steps { sh 'trivy image ...' } } + stage('Build') { steps { sh 'docker build ...' } } + stage('Push') { steps { sh 'docker push ...' } } + stage('Deploy') { steps { sh 'kubectl apply ...' 
} } + } +} +``` + +### Estimated Effort + +3-4 hours. The workflow files are straightforward; most time goes into configuring GitHub Secrets (Docker Hub credentials, AWS credentials, kubeconfig) and testing the pipeline end-to-end. + +--- + +## 5. Phase 3 — Security Hardening + +### What We're Building + +Four categories of security improvements applied to every Kubernetes deployment manifest. + +### 5a. Liveness and Readiness Probes + +**What they are:** Health checks that Kubernetes runs continuously to determine if a pod is alive (liveness) and ready to receive traffic (readiness). If a liveness probe fails, Kubernetes restarts the pod. If a readiness probe fails, Kubernetes stops sending traffic to that pod but doesn't restart it. + +**Why they matter:** Right now, Kubernetes has no way to know if our pods are actually healthy. It only knows they're running. If the Gateway loses its RabbitMQ connection, Kubernetes keeps routing traffic to it, and every upload silently fails. With probes, Kubernetes detects the failure and either restarts the pod or routes traffic to a healthy replica. + +**Where this concept comes from:** Health checks are a core Kubernetes primitive, inspired by process monitoring in traditional infrastructure (like systemd watchdog timers or Nagios checks). The distinction between liveness and readiness was introduced by Kubernetes to handle the common case where a service is alive but temporarily unable to serve (e.g., during startup or when a dependency is down). 
+ +**What we're adding:** + +| Service | Check Method | Probe Type | What It Checks | +|---|---|---|---| +| Auth | HTTP GET /healthz | Liveness + Readiness | Flask is responding, PostgreSQL is reachable | +| Gateway | HTTP GET /healthz | Liveness + Readiness | Flask is responding, MongoDB and RabbitMQ are reachable | +| Converter | Exec command | Liveness | Process is alive, RabbitMQ connection is active | +| Notification | Exec command | Liveness | Process is alive, RabbitMQ connection is active | + +This requires adding a small `/healthz` endpoint to the Flask services (auth and gateway) — about 10 lines of Python each. + +### 5b. Resource Requests and Limits + +**What they are:** CPU and memory boundaries set on each pod. Requests are the guaranteed minimum — Kubernetes uses these for scheduling decisions. Limits are the hard ceiling — if a pod exceeds its memory limit, it gets killed (OOMKilled). + +**Why they matter:** The converter service runs ffmpeg, which is CPU-intensive. Without limits, four converter replicas could consume all 2 vCPUs on our m7i-flex.large node, starving the gateway and auth services. Users would be able to upload files but never log in, because the auth service can't get CPU time to process JWT validation. + +**What we're setting:** + +| Service | CPU Request | CPU Limit | Memory Request | Memory Limit | Rationale | +|---|---|---|---|---|---| +| Auth | 50m | 200m | 64Mi | 128Mi | Lightweight Flask app, small queries | +| Gateway | 100m | 300m | 128Mi | 256Mi | HTTP handling + GridFS uploads | +| Converter | 250m | 500m | 256Mi | 512Mi | ffmpeg is CPU and memory hungry | +| Notification | 50m | 100m | 64Mi | 128Mi | Sends emails — minimal resources | + +Total request across all replicas: approximately 1.5 vCPU and 1.5GB RAM, which fits comfortably on a 2 vCPU / 8GB node. + +### 5c. Security Contexts (Runtime Hardening) + +**What they are:** Linux-level security constraints applied to the container process.
This is the direct Kubernetes equivalent of the Docker Swarm runtime hardening we learned in class. + +**Where this concept comes from:** The principle of least privilege — a container should have only the permissions it needs to do its job, nothing more. In Docker Swarm, we configured this through service spec options. In Kubernetes, the same concepts exist in the `securityContext` block of each container in the pod spec (`readOnlyRootFilesystem`, `allowPrivilegeEscalation`, and `capabilities` are container-level fields and are not valid in the pod-level `securityContext`). + +**What we're adding to every container in every pod:** + +```yaml +securityContext: + runAsNonRoot: true # Container cannot run as root user + runAsUser: 1000 # Run as a non-privileged user + readOnlyRootFilesystem: true # Filesystem is read-only (prevents malware writing to disk) + allowPrivilegeEscalation: false # Cannot gain more privileges than it started with + capabilities: + drop: ["ALL"] # Drop all Linux capabilities (network raw, sys admin, etc.) +``` + +**Special case — Converter service:** The converter needs to write temporary files (the video input and MP3 output during conversion). We set `readOnlyRootFilesystem: true` but mount a writable `emptyDir` volume at `/tmp`. This means the converter can write temp files but cannot modify its own binaries, configuration, or any other part of the filesystem. If an attacker compromises the converter, they can write to /tmp but cannot install tools, modify the application, or persist across pod restarts.
+ +**Mapping from Docker Swarm to Kubernetes:** + +| Swarm Concept | Kubernetes Equivalent | +|---|---| +| `--user` flag | `securityContext.runAsUser` | +| `--read-only` flag | `securityContext.readOnlyRootFilesystem` | +| `--cap-drop ALL` | `securityContext.capabilities.drop: ["ALL"]` | +| `--no-new-privileges` | `securityContext.allowPrivilegeEscalation: false` | +| mTLS between services | Requires a service mesh (Istio/Linkerd) — Talk About It, don't build | +| Rotating join tokens | Managed by EKS automatically — Talk About It | +| Certificate management | ACM for external certs, EKS manages internal — Talk About It | + +### 5d. .gitignore and Secrets Audit + +**What we're adding:** A comprehensive .gitignore that prevents credentials, state files, and generated artifacts from being committed. We're also auditing every file in the repo for hardcoded secrets and documenting which files contain sensitive values. + +**Files that must never be committed:** + +``` +# Terraform +terraform.tfvars +*.tfstate +*.tfstate.backup +.terraform/ + +# Kubernetes secrets (generated by customise.sh) +**/secret.yaml + +# Credentials and state +deployment-ids.txt +DEPLOYMENT_CONFIG.md +DEPLOYMENT_HANDOVER.md +customise.sh + +# Build artifacts +*.mp3 +*.mp4 +node_modules/ +__pycache__/ +.env +``` + +### Estimated Effort + +2-3 hours for all four categories. Most of the work is YAML editing and adding small health endpoints to the Python services. + +--- + +## 6. Phase 4 — Monitoring and Observability + +### What We're Building + +A Prometheus + Grafana + Alertmanager monitoring stack deployed via the kube-prometheus-stack Helm chart, with one custom Grafana dashboard for the demo. + +### Why This Matters + +Right now, if the converter pods crash, if RabbitMQ fills up, if MongoDB runs out of disk — nobody knows until a user complains (or, more likely, until we notice during a demo that nothing is working). In industry, this is unacceptable for anything beyond a personal experiment. 
+ +Monitoring answers three questions: Is the system healthy right now? Was it healthy over the past hour/day/week? When did it stop being healthy, and what changed? + +### What the Industry Calls This + +Observability — the ability to understand the internal state of a system by examining its outputs. The "three pillars of observability" are metrics (numerical measurements over time), logs (structured event records), and traces (request paths across services). We're implementing metrics and dashboards. We'll discuss logs and traces in the presentation. + +### Trade-off Analysis + +| Dimension | kube-prometheus-stack (Chosen) | AWS CloudWatch | Datadog | +|---|---|---|---| +| Cost | Free (self-hosted) | Pay per metric/log/alarm | $15-23/host/month | +| Setup complexity | One Helm install | Requires CloudWatch agent, IAM roles | Agent install + SaaS config | +| Kubernetes integration | Native — built for K8s | Good but requires extra config | Excellent | +| Dashboard quality | Grafana — highly customisable | Basic but functional | Excellent out of the box | +| Industry relevance | Prometheus is the CNCF standard | Common in AWS-heavy shops | Common in well-funded startups | +| Demo impact | High — Grafana looks impressive | Medium | High but costs money | + +**Why kube-prometheus-stack:** One Helm install gives us Prometheus (metrics collection), Grafana (dashboards), Alertmanager (alerts), kube-state-metrics (Kubernetes object metrics), and node-exporter (host-level metrics). It's free, it's the CNCF standard, and Grafana dashboards look professional in a demo. + +### What We Get + +**Out of the box (no extra configuration):** CPU and memory usage per pod, per node, and cluster-wide. Pod restart counts and crash loop detection. Network I/O. Disk usage. Kubernetes object status (deployments, statefulsets, pods). 
+ +**Custom dashboard for the demo ("VidCast Operations"):** RabbitMQ queue depth (video queue and mp3 queue) — this is the most compelling visual during a demo. Pod status for all four microservices. Node resource utilisation. Converter processing rate (if we add custom metrics to the Python code). + +**Alerts:** + +| Alert | Condition | Severity | Why | +|---|---|---|---| +| Pod CrashLoopBackOff | Pod restarted 3+ times in 10 minutes | Critical | Service is broken | +| High Node Memory | Node memory > 85% for 5 minutes | Warning | Risk of OOMKill | +| RabbitMQ Queue Backlog | Video queue depth > 10 for 5 minutes | Warning | Conversions are backing up | +| RabbitMQ Unavailable | RabbitMQ pod not ready for 2 minutes | Critical | Entire pipeline is blocked | + +### Estimated Effort + +3-4 hours. The Helm install takes 5 minutes; building a good custom dashboard takes iteration. + +--- + +## 7. Phase 5 — Frontend Web Application + +### What We're Building + +A React web application that serves as the VidCast product interface. It communicates with the existing Gateway API and provides a visual way to interact with the platform during the demo. + +### Why This Matters + +Right now, the demo involves running curl commands in a terminal. This is fine for a technical audience, but for a bootcamp presentation where we need to explain the system to non-technical people, a visual interface makes the flow immediately understandable. The frontend also gives us a place to show the monitoring dashboard and the architecture diagram during the presentation. + +### Pages + +**Login Page:** Email and password form. Calls `/login` on the Gateway, stores the JWT in React state (not localStorage — anything in localStorage is readable by any script running on the page, so a single XSS bug would leak the token; the trade-off is that the user must log in again after a page refresh, and it's a security consideration worth mentioning). Clean VidCast branding. + +**Upload Page:** Drag-and-drop file upload. Sends the video to `/upload` with the JWT. Shows a success confirmation: "Your file is being processed.
You'll receive an email when it's ready." + +**Download Page:** Text input for the file ID (from the email notification). Calls `/download` with the JWT and file ID. Triggers a browser download of the MP3. + +**Dashboard Page:** Embedded Grafana panels showing RabbitMQ queue depth and pod health, or a simplified custom view. This is the "behind the scenes" view for the presentation. + +**Architecture Page:** An interactive system diagram showing the microservices and data flow. During the demo, this helps explain what happens when you upload a file — "the request hits the Gateway here, then the video goes into the queue here, then a converter worker picks it up here..." + +### Deployment + +The frontend gets its own Dockerfile (Node.js, nginx to serve the built React app), its own Kubernetes Deployment and Service (NodePort or Ingress), and its own entry in the CI/CD pipeline. It becomes the fifth microservice in the cluster. + +### Trade-off Analysis + +| Dimension | React SPA (Chosen) | Plain HTML/CSS/JS | Next.js | +|---|---|---|---| +| Complexity | Moderate | Low | High | +| State management | React hooks (useState) | Manual DOM manipulation | React + SSR complexity | +| Component reuse | Excellent | Poor | Excellent | +| Build step required | Yes (npm build) | No | Yes | +| Team familiarity | Depends | Everyone knows HTML | Fewer people know Next.js | +| Demo appearance | Professional | Can look professional | Professional | + +**Why React:** Component-based architecture makes the dashboard and architecture views easier to build. Tailwind CSS keeps styling consistent without custom CSS. The built app is served as static files by nginx, so it's lightweight and fast. + +### Estimated Effort + +6-8 hours. This is the most visible piece but not the most complex — the backend already works, so the frontend is mostly API calls and UI design. + +--- + +## 8. 
Phase 6 — Documentation and Presentation + +### What We're Producing + +An updated README.md that explains the project from the perspective of someone finding it on GitHub — what it does, how to deploy it, how to destroy it. Architecture diagrams. Presentation notes with talking points and analogies for non-technical audiences. + +### Analogies for Non-Technical Audiences + +**Microservices → Restaurant:** A monolith is one chef doing everything. Microservices are specialised roles: host, cook, runner, cashier. Each can be scaled independently. + +**Message Queue → Post Office:** You don't wait at the counter for your letter to be delivered. You drop it off, and the postal workers process it on their own schedule. + +**JWT Authentication → Security Badge:** You show your ID at reception once (login), get a badge (token), and swipe it for access to different rooms (upload, download) without going back to reception. + +**Containers → Shipping Containers:** Standardised boxes that work the same everywhere — your laptop, a data centre, the cloud. + +**Kubernetes → Port Authority:** Manages where containers go, replaces ones that fall off the ship, and adds more when demand increases. + +**Infrastructure as Code → Building Blueprints:** Instead of telling builders "make it like the last one," you hand them exact blueprints. Anyone can build the same building from the same plans. + +**CI/CD Pipeline → Factory Assembly Line:** Raw materials (code) go in one end, pass through quality checks, and a finished product (deployed application) comes out the other end. Every step is automated and inspected. + +--- + +## 9. Things We Talk About But Don't Build + +These are concepts we understand and can discuss in the presentation or interviews, but we're not implementing them in this project. For each one, the reason for not building it is included. + +### ArgoCD / GitOps + +**What it is:** A deployment model where Git is the single source of truth. 
Instead of running `kubectl apply` from a pipeline, ArgoCD watches the Git repo and automatically syncs the cluster state to match what's in Git. If someone manually changes something in the cluster, ArgoCD detects the drift and reverts it. + +**Why we're not building it:** ArgoCD adds significant operational complexity (it needs its own deployment, RBAC, and repository credentials). For a single-developer project, the CI/CD pipeline with `kubectl apply` achieves the same outcome. ArgoCD shines in multi-team environments where drift detection and audit trails matter. + +**What to say in an interview:** "For a single-developer project, I used direct deployment from the CI/CD pipeline. In a team environment, I'd introduce ArgoCD for drift detection and to enforce that all changes go through Git." + +### KEDA / Queue-Based Autoscaling + +**What it is:** Kubernetes Event-Driven Autoscaling. Instead of scaling based on CPU (which HPA does), KEDA scales based on external metrics — in our case, RabbitMQ queue depth. If 50 videos are in the queue, KEDA would scale the converter from 4 replicas to 20. When the queue drains, it scales back down. + +**Why we're not building it:** Our demo processes one video at a time. KEDA is impressive but meaningless without a load-testing scenario to demonstrate it. Implementing it without a visible demo adds complexity without presentation value. + +**What to say in an interview:** "The converter service would benefit from queue-based autoscaling with KEDA. Instead of a fixed 4 replicas, KEDA would watch the RabbitMQ queue depth and scale converter workers dynamically. This means we pay for compute only when there's work to do." + +### Service Mesh / mTLS + +**What it is:** A service mesh (Istio, Linkerd) adds a sidecar proxy to every pod that handles service-to-service communication. This enables mutual TLS (mTLS) — every connection between services is encrypted and both sides verify each other's identity. 
In Docker Swarm, mTLS is built in. In Kubernetes, it requires a service mesh. + +**Why we're not building it:** Installing Istio would triple the resource consumption on our single node and add significant operational complexity. For a four-service demo with no sensitive data, it's overkill. + +**What to say in an interview:** "In production, I'd add a service mesh like Istio or Linkerd for mTLS between services. Even if an attacker gets inside the cluster network, they can't intercept or modify traffic between the gateway and auth service. The same encryption that Docker Swarm provides built-in requires a service mesh in Kubernetes." + +### Managed Database Services (RDS, DocumentDB, Amazon MQ) + +**What it is:** Instead of running MongoDB, PostgreSQL, and RabbitMQ as containers in the cluster, use AWS managed services: RDS for PostgreSQL, DocumentDB or MongoDB Atlas for MongoDB, and Amazon MQ for RabbitMQ. AWS handles backups, patching, replication, and failover. + +**Why we're not building it:** Managed services cost $200-400/month for a project we run for demos. They also remove the Kubernetes operational experience (running StatefulSets, Helm charts) that makes the project valuable. The in-cluster approach demonstrates more skills. + +**What to say in an interview:** "In production, I'd migrate PostgreSQL to RDS and RabbitMQ to Amazon MQ. Managed services handle backups, patching, and replication — operational burden the platform team shouldn't own. I kept them as StatefulSets in this project to demonstrate Kubernetes data service management." + +### External Secrets Operator / AWS Secrets Manager + +**What it is:** Instead of storing secrets in Kubernetes Secret objects (which are just base64-encoded, not encrypted), store them in AWS Secrets Manager and use the External Secrets Operator to sync them into the cluster at runtime. 
+ +**Why we might not build it:** It requires an OIDC provider configured on the EKS cluster and IRSA (IAM Roles for Service Accounts). This is achievable but adds 2-3 hours of work. If time permits, we'll add it. If not, we document the approach and explain it. + +**What to say in an interview:** "Credentials are currently in Kubernetes Secrets, which are base64-encoded but not encrypted at rest unless you enable EKS envelope encryption. In production, I'd use AWS Secrets Manager with the External Secrets Operator. Secrets are stored in Secrets Manager, retrieved at runtime via IRSA, and never exist in Git." + +### Network Policies + +**What it is:** Kubernetes NetworkPolicy resources that restrict which pods can communicate with each other. By default, every pod in a Kubernetes cluster can talk to every other pod. Network Policies implement the principle of least privilege at the network level. + +**Why we should try to build it (stretch goal):** It's a 20-minute task that demonstrates security awareness. The auth service should only accept traffic from the gateway. MongoDB should only accept traffic from the gateway and converter. + +**What to say in an interview:** "I implemented Network Policies to restrict east-west traffic. The auth service only accepts connections from the gateway — even if an attacker compromises the converter, they can't directly access the auth database." + +--- + +## 10. 
Repository Structure + +``` +vidcast/ (repo root) +│ +├── README.md # Public-facing: what, why, how to deploy, how to destroy +├── VIDCAST_UPGRADE_PLAN.md # This document +├── .gitignore # Comprehensive — secrets, state, artifacts +├── Jenkinsfile # Enterprise CI/CD alternative +│ +├── .github/ +│ └── workflows/ +│ ├── ci.yml # Lint + scan + build + push +│ └── cd.yml # Deploy to EKS +│ +├── terraform/ +│ ├── environments/ +│ │ └── dev/ +│ │ ├── main.tf +│ │ ├── variables.tf +│ │ ├── outputs.tf +│ │ ├── backend.tf # S3 + DynamoDB state config +│ │ └── terraform.tfvars # GITIGNORED — actual values +│ └── modules/ +│ ├── vpc/ +│ ├── eks/ +│ ├── iam/ +│ └── security-groups/ +│ +├── Helm_charts/ # Existing — unchanged +│ ├── MongoDB/ +│ ├── Postgres/ +│ └── RabbitMQ/ +│ +├── src/ +│ ├── auth-service/ # Existing + health endpoint + security context +│ ├── gateway-service/ # Existing + health endpoint + security context +│ ├── converter-service/ # Existing + security context + resource limits +│ ├── notification-service/ # Existing + security context +│ └── frontend/ # NEW — React web application +│ ├── Dockerfile +│ ├── nginx.conf +│ ├── package.json +│ ├── src/ +│ │ ├── App.jsx +│ │ ├── pages/ +│ │ │ ├── Login.jsx +│ │ │ ├── Upload.jsx +│ │ │ ├── Download.jsx +│ │ │ ├── Dashboard.jsx +│ │ │ └── Architecture.jsx +│ │ └── components/ +│ └── manifest/ +│ ├── deployment.yaml +│ ├── service.yaml +│ └── configmap.yaml +│ +├── monitoring/ +│ ├── values.yaml # Custom values for kube-prometheus-stack +│ ├── dashboards/ +│ │ └── vidcast-operations.json # Custom Grafana dashboard +│ └── alerts/ +│ └── vidcast-alerts.yaml # Custom alert rules +│ +├── docs/ +│ ├── architecture.md +│ ├── deployment-guide.md +│ └── presentation-notes.md +│ +└── assets/ + └── video.mp4 # Test video +``` + +--- + +## 11. 
Branch Strategy + +``` +main ← current working state (base project) + │ + ├── feature/terraform-infra ← Phase 1: all Terraform code + ├── feature/ci-cd-pipeline ← Phase 2: GitHub Actions + Jenkinsfile + ├── feature/security-harden ← Phase 3: probes, limits, security contexts, .gitignore + ├── feature/monitoring ← Phase 4: kube-prometheus-stack + dashboard + ├── feature/frontend ← Phase 5: React web application + └── feature/documentation ← Phase 6: README, arch docs, presentation notes +``` + +Each branch is merged to main via a Pull Request when complete and tested. This gives us a clean Git history where each PR represents a meaningful improvement. The PR descriptions become talking points: "Here's the PR where I added infrastructure as code. Here's where I introduced container security scanning." + +**Rules:** +- Never push directly to main. Always use a feature branch and PR. +- Each PR should have a description explaining what changed and why. +- Merge in order: Phase 1 → 2 → 3 → 4 → 5 → 6 (though 2 and 3 can be parallel). + +--- + +## 12. Cost Breakdown + +| Component | Monthly Cost | Notes | +|---|---|---| +| EKS cluster | ~$73 | $0.10/hour for the control plane | +| EC2 node (m7i-flex.large) | ~$70 on-demand | Could reduce with Spot (~$25) but not for a demo | +| EBS storage (30GB gp3) | ~$2.40 | Root volume for the node | +| S3 (Terraform state) | <$0.10 | A few KB of state files | +| DynamoDB (state lock) | <$0.10 | On-demand pricing, minimal usage | +| Data transfer | ~$5 | Minimal for a demo | +| Docker Hub | Free | Public repos, free tier | +| **Total (running 24/7)** | **~$150/month** | | +| **Total (8 hours/day, weekdays only)** | **~$40/month** | Requires destroying the whole cluster (`terraform destroy`) outside working hours — stopping only the node group still leaves the ~$73 control plane charge | + +**Cost-saving tip:** The EC2 node is roughly half the bill. If you're not actively using the cluster, delete the node group (`aws eks delete-nodegroup`) and recreate it when you need it.
The EKS control plane still costs $73/month even with no nodes, so for extended breaks, destroy the whole cluster and recreate it from Terraform. + +--- + +## 13. Real-World Use Cases + +This architecture pattern — API gateway, async processing queue, worker services, notification — is used everywhere in industry. Here are concrete examples to reference during the presentation: + +**Media processing (YouTube, TikTok, Spotify):** When you upload a video, it goes through a processing pipeline: transcoding to multiple resolutions, thumbnail generation, audio extraction for captions, content moderation. Each step is a separate service consuming from a queue. Our project does the same thing at a smaller scale. + +**E-commerce order processing (Amazon, ASOS):** When you place an order, separate services handle payment, inventory, warehouse notification, shipping labels, and confirmation email. The queue absorbs traffic spikes (Black Friday) without dropping orders. + +**Banking document processing:** Mortgage applications, bank statements, and identity documents go through OCR, data extraction, fraud checks, and compliance verification — each as a separate service. + +**Healthcare imaging:** MRI and X-ray images are uploaded, converted to standard formats, analysed by AI, stored in archives, and the referring doctor is notified. Upload, queue, process, store, notify — same pattern. + +--- + +## 14. Presentation Strategy + +### Flow (12-15 minutes) + +**Open with the product (2 min):** "This is VidCast — a platform that converts video recordings into podcast-ready audio." Demo the upload through the web interface. Everyone understands what the system does. + +**Explain the architecture (3 min):** Switch to the architecture view. Use the restaurant analogy for microservices, the post office analogy for queues. Walk through the data flow. + +**Show the platform engineering (5 min):** Show Terraform creating infrastructure. Show the CI/CD pipeline deploying a change. 
Show the Grafana dashboard. Show the security contexts. Explain each in terms the audience can follow. + +**Talk about what you'd do next (2 min):** Managed databases, service mesh, KEDA, GitOps. Shows you see beyond what you built. + +**Close with real-world connection (1 min):** "This is the same pattern used by YouTube, Spotify, and every media processing platform. The scale is different, but the principles are identical." + +### Teaching Tips + +- Start with the problem, not the technology. +- One analogy per concept. Don't stack metaphors. +- If you're about to say a technical term, explain it immediately: "RabbitMQ — that's our post office sorting room — was showing a backlog." +- Show, don't tell. A live demo is worth ten slides. +- End each section with "and this is why it matters" before moving on. diff --git a/src/auth-service/manifest/secret.yaml b/src/auth-service/manifest/secret.yaml deleted file mode 100644 index a662735..0000000 --- a/src/auth-service/manifest/secret.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: auth-secret -stringData: - PSQL_PASSWORD: nasi1234 - JWT_SECRET: sarcasm -type: Opaque - diff --git a/src/converter-service/manifest/secret.yaml b/src/converter-service/manifest/secret.yaml deleted file mode 100644 index 18a8217..0000000 --- a/src/converter-service/manifest/secret.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: converter-secret -stringData: - PLACEHOLDER: "NONE" -type: Opaque \ No newline at end of file diff --git a/src/gateway-service/manifest/secret.yaml b/src/gateway-service/manifest/secret.yaml deleted file mode 100644 index f9582f4..0000000 --- a/src/gateway-service/manifest/secret.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: gateway-secret -stringData: - PLACEHOLDER: nothing -type: Opaque \ No newline at end of file diff --git a/src/notification-service/manifest/secret.yaml 
b/src/notification-service/manifest/secret.yaml deleted file mode 100644 index 011b22b..0000000 --- a/src/notification-service/manifest/secret.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: v1 -kind: Secret -metadata: - name: notification-secret -stringData: - GMAIL_ADDRESS: "iambatmanthegoat@gmail.com" #enter your email to get the id - GMAIL_PASSWORD: "gkxk acif rhgv erjr" -type: Opaque - -# Passw0rd@1234 \ No newline at end of file diff --git a/terraform/environments/dev/terraform.tfvars.example b/terraform/environments/dev/terraform.tfvars.example new file mode 100644 index 0000000..8fea421 --- /dev/null +++ b/terraform/environments/dev/terraform.tfvars.example @@ -0,0 +1,19 @@ +# Copy this file to terraform.tfvars and fill in your values. +# NEVER commit terraform.tfvars — it is gitignored. + +aws_region = "eu-west-2" +cluster_name = "vidcast-cluster" +node_instance_type = "m7i-flex.large" +node_min_count = 1 +node_max_count = 2 +node_desired_count = 1 +kubernetes_version = "1.31" + +# Leave blank to create a new VPC, or provide an existing VPC ID +vpc_id = "" + +# S3 bucket for Terraform remote state (must exist before terraform init) +state_bucket = "your-terraform-state-bucket" + +# DynamoDB table for state locking (must exist before terraform init) +state_lock_table = "vidcast-terraform-locks" From 2362cb63ab6b5a81e8f331ac3fc99f1cfb2f695f Mon Sep 17 00:00:00 2001 From: John Date: Mon, 1 Jun 2026 09:13:37 +0100 Subject: [PATCH 02/90] feat: add Terraform IaC modules for VPC, IAM, EKS, and security groups - VPC module: VPC, 2 public subnets (eu-west-2a/b), IGW, route table - IAM module: EKS cluster role + node role with correct policy attachments - EKS module: cluster v1.31, managed node group, OIDC provider for IRSA - Validation block rejects T-type instances (blocked by account SCP) - Security groups module: NodePort rules for ports 30002-30008 - Dev environment: root module wiring all child modules + S3/DynamoDB backend - All resources tagged: 
Project=vidcast, ManagedBy=terraform, Environment=dev Co-Authored-By: Claude Sonnet 4.6 --- terraform/environments/dev/backend.tf | 30 +++++++++ terraform/environments/dev/main.tf | 48 ++++++++++++++ terraform/environments/dev/outputs.tf | 34 ++++++++++ terraform/environments/dev/variables.tf | 64 +++++++++++++++++++ terraform/modules/eks/main.tf | 47 ++++++++++++++ terraform/modules/eks/outputs.tf | 29 +++++++++ terraform/modules/eks/variables.tf | 60 +++++++++++++++++ terraform/modules/iam/main.tf | 51 +++++++++++++++ terraform/modules/iam/outputs.tf | 9 +++ terraform/modules/iam/variables.tf | 10 +++ terraform/modules/security-groups/main.tf | 26 ++++++++ terraform/modules/security-groups/outputs.tf | 4 ++ .../modules/security-groups/variables.tf | 21 ++++++ terraform/modules/vpc/main.tf | 44 +++++++++++++ terraform/modules/vpc/outputs.tf | 14 ++++ terraform/modules/vpc/variables.tf | 22 +++++++ 16 files changed, 513 insertions(+) create mode 100644 terraform/environments/dev/backend.tf create mode 100644 terraform/environments/dev/main.tf create mode 100644 terraform/environments/dev/outputs.tf create mode 100644 terraform/environments/dev/variables.tf create mode 100644 terraform/modules/eks/main.tf create mode 100644 terraform/modules/eks/outputs.tf create mode 100644 terraform/modules/eks/variables.tf create mode 100644 terraform/modules/iam/main.tf create mode 100644 terraform/modules/iam/outputs.tf create mode 100644 terraform/modules/iam/variables.tf create mode 100644 terraform/modules/security-groups/main.tf create mode 100644 terraform/modules/security-groups/outputs.tf create mode 100644 terraform/modules/security-groups/variables.tf create mode 100644 terraform/modules/vpc/main.tf create mode 100644 terraform/modules/vpc/outputs.tf create mode 100644 terraform/modules/vpc/variables.tf diff --git a/terraform/environments/dev/backend.tf b/terraform/environments/dev/backend.tf new file mode 100644 index 0000000..f5b4d93 --- /dev/null +++ 
b/terraform/environments/dev/backend.tf @@ -0,0 +1,30 @@ +terraform { + required_version = ">= 1.5" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + tls = { + source = "hashicorp/tls" + version = "~> 4.0" + } + } + + backend "s3" { + # Values are provided at init time: + # terraform init -backend-config="bucket=YOUR_BUCKET" \ + # -backend-config="key=vidcast/dev/terraform.tfstate" \ + # -backend-config="region=eu-west-2" \ + # -backend-config="dynamodb_table=vidcast-terraform-locks" + # + # Or configure in terraform.tfvars (gitignored). + key = "vidcast/dev/terraform.tfstate" + region = "eu-west-2" + } +} + +provider "aws" { + region = var.aws_region +} diff --git a/terraform/environments/dev/main.tf b/terraform/environments/dev/main.tf new file mode 100644 index 0000000..1f81ed2 --- /dev/null +++ b/terraform/environments/dev/main.tf @@ -0,0 +1,48 @@ +locals { + common_tags = { + Project = "vidcast" + ManagedBy = "terraform" + Environment = "dev" + Region = var.aws_region + } +} + +module "vpc" { + source = "../../modules/vpc" + + cluster_name = var.cluster_name + vpc_cidr = var.vpc_cidr + availability_zones = var.availability_zones + tags = local.common_tags +} + +module "iam" { + source = "../../modules/iam" + + cluster_name = var.cluster_name + tags = local.common_tags +} + +module "eks" { + source = "../../modules/eks" + + cluster_name = var.cluster_name + kubernetes_version = var.kubernetes_version + cluster_role_arn = module.iam.cluster_role_arn + node_role_arn = module.iam.node_role_arn + subnet_ids = module.vpc.public_subnet_ids + node_instance_type = var.node_instance_type + node_min_count = var.node_min_count + node_max_count = var.node_max_count + node_desired_count = var.node_desired_count + tags = local.common_tags +} + +module "security_groups" { + source = "../../modules/security-groups" + + cluster_name = var.cluster_name + vpc_id = module.vpc.vpc_id + nodeport_ports = [30002, 30003, 30004, 30005, 30006, 
30007, 30008] + tags = local.common_tags +} diff --git a/terraform/environments/dev/outputs.tf b/terraform/environments/dev/outputs.tf new file mode 100644 index 0000000..9d8d9fa --- /dev/null +++ b/terraform/environments/dev/outputs.tf @@ -0,0 +1,34 @@ +output "cluster_endpoint" { + description = "EKS cluster API endpoint" + value = module.eks.cluster_endpoint +} + +output "cluster_name" { + description = "EKS cluster name" + value = module.eks.cluster_name +} + +output "vpc_id" { + description = "VPC ID" + value = module.vpc.vpc_id +} + +output "public_subnet_ids" { + description = "Public subnet IDs" + value = module.vpc.public_subnet_ids +} + +output "node_security_group_id" { + description = "NodePort security group ID" + value = module.security_groups.security_group_id +} + +output "kubeconfig_command" { + description = "Run this command to configure kubectl" + value = module.eks.kubeconfig_command +} + +output "oidc_provider_arn" { + description = "OIDC provider ARN for IRSA setup" + value = module.eks.oidc_provider_arn +} diff --git a/terraform/environments/dev/variables.tf b/terraform/environments/dev/variables.tf new file mode 100644 index 0000000..22d1e55 --- /dev/null +++ b/terraform/environments/dev/variables.tf @@ -0,0 +1,64 @@ +variable "aws_region" { + description = "AWS region for all resources" + type = string + default = "eu-west-2" +} + +variable "cluster_name" { + description = "EKS cluster name" + type = string + default = "vidcast-cluster" +} + +variable "vpc_cidr" { + description = "CIDR block for the VPC" + type = string + default = "10.0.0.0/16" +} + +variable "availability_zones" { + description = "Availability zones for public subnets" + type = list(string) + default = ["eu-west-2a", "eu-west-2b"] +} + +variable "kubernetes_version" { + description = "Kubernetes version for the EKS cluster" + type = string + default = "1.31" +} + +variable "node_instance_type" { + description = "EC2 instance type for worker nodes. 
Must be M/C/R-series — T-type is blocked by SCP." + type = string + default = "m7i-flex.large" +} + +variable "node_min_count" { + description = "Minimum node count" + type = number + default = 1 +} + +variable "node_max_count" { + description = "Maximum node count" + type = number + default = 2 +} + +variable "node_desired_count" { + description = "Desired node count" + type = number + default = 1 +} + +variable "state_bucket" { + description = "S3 bucket name for Terraform remote state" + type = string +} + +variable "state_lock_table" { + description = "DynamoDB table name for Terraform state locking" + type = string + default = "vidcast-terraform-locks" +} diff --git a/terraform/modules/eks/main.tf b/terraform/modules/eks/main.tf new file mode 100644 index 0000000..08f89ad --- /dev/null +++ b/terraform/modules/eks/main.tf @@ -0,0 +1,47 @@ +resource "aws_eks_cluster" "this" { + name = var.cluster_name + version = var.kubernetes_version + role_arn = var.cluster_role_arn + + vpc_config { + subnet_ids = var.subnet_ids + endpoint_public_access = true + endpoint_private_access = false + } + + tags = var.tags + + depends_on = [var.cluster_role_arn] +} + +resource "aws_eks_node_group" "this" { + cluster_name = aws_eks_cluster.this.name + node_group_name = "${var.cluster_name}-nodes" + node_role_arn = var.node_role_arn + subnet_ids = var.subnet_ids + instance_types = [var.node_instance_type] + ami_type = "AL2_x86_64" + + scaling_config { + min_size = var.node_min_count + max_size = var.node_max_count + desired_size = var.node_desired_count + } + + tags = var.tags + + depends_on = [aws_eks_cluster.this] +} + +# OIDC provider — required for IRSA (IAM Roles for Service Accounts) +data "tls_certificate" "eks_oidc" { + url = aws_eks_cluster.this.identity[0].oidc[0].issuer +} + +resource "aws_iam_openid_connect_provider" "eks" { + client_id_list = ["sts.amazonaws.com"] + thumbprint_list = [data.tls_certificate.eks_oidc.certificates[0].sha1_fingerprint] + url = 
aws_eks_cluster.this.identity[0].oidc[0].issuer + + tags = var.tags +} diff --git a/terraform/modules/eks/outputs.tf b/terraform/modules/eks/outputs.tf new file mode 100644 index 0000000..0698374 --- /dev/null +++ b/terraform/modules/eks/outputs.tf @@ -0,0 +1,29 @@ +output "cluster_endpoint" { + description = "Endpoint URL of the EKS cluster API server" + value = aws_eks_cluster.this.endpoint +} + +output "cluster_name" { + description = "Name of the EKS cluster" + value = aws_eks_cluster.this.name +} + +output "cluster_ca_certificate" { + description = "Base64-encoded certificate authority data for the cluster" + value = aws_eks_cluster.this.certificate_authority[0].data +} + +output "oidc_provider_arn" { + description = "ARN of the OIDC provider (needed for IRSA)" + value = aws_iam_openid_connect_provider.eks.arn +} + +output "oidc_provider_url" { + description = "URL of the OIDC provider" + value = aws_iam_openid_connect_provider.eks.url +} + +output "kubeconfig_command" { + description = "Command to update local kubeconfig for this cluster" + value = "aws eks update-kubeconfig --name ${aws_eks_cluster.this.name} --region ${var.tags["Region"] != null ? 
var.tags["Region"] : "eu-west-2"}" +} diff --git a/terraform/modules/eks/variables.tf b/terraform/modules/eks/variables.tf new file mode 100644 index 0000000..01cf9e5 --- /dev/null +++ b/terraform/modules/eks/variables.tf @@ -0,0 +1,60 @@ +variable "cluster_name" { + description = "EKS cluster name" + type = string +} + +variable "kubernetes_version" { + description = "Kubernetes version for the EKS cluster" + type = string + default = "1.31" +} + +variable "cluster_role_arn" { + description = "ARN of the IAM role for the EKS cluster" + type = string +} + +variable "node_role_arn" { + description = "ARN of the IAM role for the EKS node group" + type = string +} + +variable "subnet_ids" { + description = "List of subnet IDs for the EKS cluster and node group" + type = list(string) +} + +variable "node_instance_type" { + description = "EC2 instance type for EKS worker nodes. Must NOT be a T-type — SCPs on this account reject CreditSpecification:unlimited which EKS auto-generates for T-type instances." + type = string + default = "m7i-flex.large" + + validation { + condition = !startswith(var.node_instance_type, "t") + error_message = "T-type instances (t2, t3, t4g, etc.) are blocked by SCP on this AWS account. Use m7i-flex.large or another M/C/R-series instance." 
+ } +} + +variable "node_min_count" { + description = "Minimum number of nodes in the node group" + type = number + default = 1 +} + +variable "node_max_count" { + description = "Maximum number of nodes in the node group" + type = number + default = 2 +} + +variable "node_desired_count" { + description = "Desired number of nodes in the node group" + type = number + default = 1 +} + +variable "tags" { + description = "Common tags applied to all resources" + type = map(string) + default = {} +} diff --git a/terraform/modules/iam/main.tf b/terraform/modules/iam/main.tf new file mode 100644 index 0000000..85486c8 --- /dev/null +++ b/terraform/modules/iam/main.tf @@ -0,0 +1,51 @@ +data "aws_iam_policy_document" "eks_cluster_assume_role" { + statement { + actions = ["sts:AssumeRole"] + principals { + type = "Service" + identifiers = ["eks.amazonaws.com"] + } + } +} + +data "aws_iam_policy_document" "eks_node_assume_role" { + statement { + actions = ["sts:AssumeRole"] + principals { + type = "Service" + identifiers = ["ec2.amazonaws.com"] + } + } +} + +resource "aws_iam_role" "cluster" { + name = "${var.cluster_name}-cluster-role" + assume_role_policy = data.aws_iam_policy_document.eks_cluster_assume_role.json + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "cluster_policy" { + role = aws_iam_role.cluster.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy" +} + +resource "aws_iam_role" "node" { + name = "${var.cluster_name}-node-role" + assume_role_policy = data.aws_iam_policy_document.eks_node_assume_role.json + tags = var.tags +} + +resource "aws_iam_role_policy_attachment" "node_worker_policy" { + role = aws_iam_role.node.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" +} + +resource "aws_iam_role_policy_attachment" "node_cni_policy" { + role = aws_iam_role.node.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy" +} + +resource "aws_iam_role_policy_attachment" "node_ecr_readonly" { + role = 
aws_iam_role.node.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" +} diff --git a/terraform/modules/iam/outputs.tf b/terraform/modules/iam/outputs.tf new file mode 100644 index 0000000..3d02ddd --- /dev/null +++ b/terraform/modules/iam/outputs.tf @@ -0,0 +1,9 @@ +output "cluster_role_arn" { + description = "ARN of the EKS cluster IAM role" + value = aws_iam_role.cluster.arn +} + +output "node_role_arn" { + description = "ARN of the EKS node group IAM role" + value = aws_iam_role.node.arn +} diff --git a/terraform/modules/iam/variables.tf b/terraform/modules/iam/variables.tf new file mode 100644 index 0000000..dc4d1e1 --- /dev/null +++ b/terraform/modules/iam/variables.tf @@ -0,0 +1,10 @@ +variable "cluster_name" { + description = "EKS cluster name — used for role naming" + type = string +} + +variable "tags" { + description = "Common tags applied to all resources" + type = map(string) + default = {} +} diff --git a/terraform/modules/security-groups/main.tf b/terraform/modules/security-groups/main.tf new file mode 100644 index 0000000..096f36e --- /dev/null +++ b/terraform/modules/security-groups/main.tf @@ -0,0 +1,26 @@ +resource "aws_security_group" "node_ports" { + name = "${var.cluster_name}-nodeport-sg" + description = "Allow inbound traffic to Kubernetes NodePort services" + vpc_id = var.vpc_id + + dynamic "ingress" { + for_each = var.nodeport_ports + content { + from_port = ingress.value + to_port = ingress.value + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + description = "NodePort ${ingress.value}" + } + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + description = "Allow all outbound" + } + + tags = merge(var.tags, { Name = "${var.cluster_name}-nodeport-sg" }) +} diff --git a/terraform/modules/security-groups/outputs.tf b/terraform/modules/security-groups/outputs.tf new file mode 100644 index 0000000..7e158ac --- /dev/null +++ b/terraform/modules/security-groups/outputs.tf 
@@ -0,0 +1,4 @@ +output "security_group_id" { + description = "ID of the NodePort security group" + value = aws_security_group.node_ports.id +} diff --git a/terraform/modules/security-groups/variables.tf b/terraform/modules/security-groups/variables.tf new file mode 100644 index 0000000..e826d04 --- /dev/null +++ b/terraform/modules/security-groups/variables.tf @@ -0,0 +1,21 @@ +variable "cluster_name" { + description = "EKS cluster name — used for resource naming" + type = string +} + +variable "vpc_id" { + description = "VPC ID where the security group will be created" + type = string +} + +variable "nodeport_ports" { + description = "List of NodePort port numbers to open for inbound traffic" + type = list(number) + default = [30002, 30003, 30004, 30005, 30006, 30007, 30008] +} + +variable "tags" { + description = "Common tags applied to all resources" + type = map(string) + default = {} +} diff --git a/terraform/modules/vpc/main.tf b/terraform/modules/vpc/main.tf new file mode 100644 index 0000000..1e6fd55 --- /dev/null +++ b/terraform/modules/vpc/main.tf @@ -0,0 +1,44 @@ +resource "aws_vpc" "this" { + cidr_block = var.vpc_cidr + enable_dns_support = true + enable_dns_hostnames = true + + tags = merge(var.tags, { Name = "${var.cluster_name}-vpc" }) +} + +resource "aws_internet_gateway" "this" { + vpc_id = aws_vpc.this.id + tags = merge(var.tags, { Name = "${var.cluster_name}-igw" }) +} + +resource "aws_subnet" "public" { + count = length(var.availability_zones) + + vpc_id = aws_vpc.this.id + cidr_block = cidrsubnet(var.vpc_cidr, 8, count.index + 1) + availability_zone = var.availability_zones[count.index] + map_public_ip_on_launch = true + + tags = merge(var.tags, { + Name = "${var.cluster_name}-public-${count.index + 1}" + "kubernetes.io/role/elb" = "1" + "kubernetes.io/cluster/${var.cluster_name}" = "shared" + }) +} + +resource "aws_route_table" "public" { + vpc_id = aws_vpc.this.id + + route { + cidr_block = "0.0.0.0/0" + gateway_id = 
aws_internet_gateway.this.id + } + + tags = merge(var.tags, { Name = "${var.cluster_name}-public-rt" }) +} + +resource "aws_route_table_association" "public" { + count = length(aws_subnet.public) + subnet_id = aws_subnet.public[count.index].id + route_table_id = aws_route_table.public.id +} diff --git a/terraform/modules/vpc/outputs.tf b/terraform/modules/vpc/outputs.tf new file mode 100644 index 0000000..b884b52 --- /dev/null +++ b/terraform/modules/vpc/outputs.tf @@ -0,0 +1,14 @@ +output "vpc_id" { + description = "ID of the VPC" + value = aws_vpc.this.id +} + +output "public_subnet_ids" { + description = "IDs of the public subnets" + value = aws_subnet.public[*].id +} + +output "internet_gateway_id" { + description = "ID of the internet gateway" + value = aws_internet_gateway.this.id +} diff --git a/terraform/modules/vpc/variables.tf b/terraform/modules/vpc/variables.tf new file mode 100644 index 0000000..b2c0ef0 --- /dev/null +++ b/terraform/modules/vpc/variables.tf @@ -0,0 +1,22 @@ +variable "cluster_name" { + description = "EKS cluster name — used for resource naming and tagging" + type = string +} + +variable "vpc_cidr" { + description = "CIDR block for the VPC" + type = string + default = "10.0.0.0/16" +} + +variable "availability_zones" { + description = "List of availability zones for public subnets" + type = list(string) + default = ["eu-west-2a", "eu-west-2b"] +} + +variable "tags" { + description = "Common tags applied to all resources" + type = map(string) + default = {} +} From 3e7fd6b99e1424bc4633f058485262bb3eb818ec Mon Sep 17 00:00:00 2001 From: John Date: Mon, 1 Jun 2026 09:21:45 +0100 Subject: [PATCH 03/90] feat: add CI/CD pipeline (GitHub Actions + Jenkinsfile + Swarm staging + Trivy) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - ci.yml: matrix build for 4 services — ruff lint, Trivy CRITICAL/HIGH scan, Docker build + push tagged with short git SHA (never :latest) - cd.yml: EKS deployment triggered 
by workflow_run on CI success - Jenkinsfile: parallel builds, Trivy scan, Docker Hub push, Swarm staging deploy, smoke test via /healthz, manual approval gate, EKS production deploy with automatic rollback on pipeline failure - docker-compose.swarm.yml: overlay network, named volumes, rollback on failure for all services — mirrors EKS deployment for staging parity - GITHUB_SECRETS_REQUIRED.md: documents all secrets needed for CI/CD Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/cd.yml | 44 +++++++++++++ .github/workflows/ci.yml | 65 ++++++++++++++++++++ GITHUB_SECRETS_REQUIRED.md | 49 +++++++++++++++ Jenkinsfile | 122 +++++++++++++++++++++++++++++++++++++ docker-compose.swarm.yml | 122 +++++++++++++++++++++++++++++++++++++ 5 files changed, 402 insertions(+) create mode 100644 .github/workflows/cd.yml create mode 100644 .github/workflows/ci.yml create mode 100644 GITHUB_SECRETS_REQUIRED.md create mode 100644 Jenkinsfile create mode 100644 docker-compose.swarm.yml diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 0000000..4705bcd --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,44 @@ +name: VidCast CD — Deploy to EKS + +on: + workflow_run: + workflows: ["VidCast CI — Lint, Scan, Build, Push"] + types: [completed] + branches: [main] + +jobs: + deploy: + if: ${{ github.event.workflow_run.conclusion == 'success' }} + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + aws-region: ${{ secrets.AWS_REGION }} + + - name: Update kubeconfig for EKS + run: | + aws eks update-kubeconfig \ + --name ${{ secrets.EKS_CLUSTER_NAME }} \ + --region ${{ secrets.AWS_REGION }} + + - name: Set short SHA from triggering workflow + run: | + echo "SHORT_SHA=$(echo ${{ github.event.workflow_run.head_sha }} | cut 
-c1-7)" >> $GITHUB_ENV + + - name: Deploy services to EKS + run: | + for svc in auth-service gateway-service converter-service notification-service; do + deploy_name="${svc%-service}" + kubectl set image deployment/${deploy_name} \ + ${deploy_name}=${{ secrets.DOCKERHUB_USERNAME }}/${svc}:${{ env.SHORT_SHA }} || true + kubectl rollout status deployment/${deploy_name} --timeout=120s || true + done + + - name: Verify all pods running + run: kubectl get pods -o wide diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..10d9187 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,65 @@ +name: VidCast CI — Lint, Scan, Build, Push + +on: + push: + branches: [main] + paths: ['src/**'] + pull_request: + branches: [main] + paths: ['src/**'] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install ruff + run: pip install ruff + + - name: Lint Python services + run: ruff check src/ --exclude src/frontend + + build-and-scan: + needs: lint + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + service: [auth-service, gateway-service, converter-service, notification-service] + + steps: + - uses: actions/checkout@v4 + + - name: Set short SHA + run: echo "SHORT_SHA=${GITHUB_SHA::7}" >> $GITHUB_ENV + + - name: Build Docker image + run: | + docker build \ + -t ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }} \ + src/${{ matrix.service }}/ + + - name: Trivy vulnerability scan + uses: aquasecurity/trivy-action@master + with: + image-ref: ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }} + severity: CRITICAL,HIGH + exit-code: '1' + ignore-unfixed: true + format: table + + - name: Login to Docker Hub + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + 
password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Push image to Docker Hub + if: github.ref == 'refs/heads/main' && github.event_name == 'push' + run: docker push ${{ secrets.DOCKERHUB_USERNAME }}/${{ matrix.service }}:${{ env.SHORT_SHA }} diff --git a/GITHUB_SECRETS_REQUIRED.md b/GITHUB_SECRETS_REQUIRED.md new file mode 100644 index 0000000..416e87c --- /dev/null +++ b/GITHUB_SECRETS_REQUIRED.md @@ -0,0 +1,49 @@ +# GitHub Secrets Required + +Configure these secrets in your GitHub repository under **Settings → Secrets and variables → Actions**. + +## CI Pipeline (ci.yml) + +| Secret Name | Description | Example | +|-------------|-------------|---------| +| `DOCKERHUB_USERNAME` | Docker Hub username | `johnbaabalola` | +| `DOCKERHUB_TOKEN` | Docker Hub access token (not password) | `dckr_pat_...` | + +## CD Pipeline (cd.yml) + +| Secret Name | Description | Example | +|-------------|-------------|---------| +| `AWS_ACCESS_KEY_ID` | IAM user access key for EKS deploy | `AKIA...` | +| `AWS_SECRET_ACCESS_KEY` | IAM user secret key | `wJal...` | +| `AWS_REGION` | AWS region | `eu-west-2` | +| `EKS_CLUSTER_NAME` | EKS cluster name | `vidcast-cluster` | +| `DOCKERHUB_USERNAME` | Same as above — used to set image name | `johnbaabalola` | + +## Jenkins Pipeline (Jenkinsfile) + +Configure these in Jenkins under **Manage Jenkins → Credentials**. + +| Credential ID | Type | Description | +|---------------|------|-------------| +| `dockerhub-credentials` | Username/Password | Docker Hub login | +| `aws-credentials` | AWS Credentials | IAM key for EKS access | +| `swarm-staging-ip` | Secret text | IP address of Swarm staging EC2 | + +## How to Create a Docker Hub Access Token + +1. Log in to hub.docker.com +2. Account Settings → Security → New Access Token +3. Name it `github-actions-vidcast` +4. Copy the token immediately — it won't be shown again +5. 
Add as `DOCKERHUB_TOKEN` in GitHub Secrets + +## How to Create the AWS IAM User for CI/CD + +```bash +aws iam create-user --user-name vidcast-cicd +aws iam attach-user-policy --user-name vidcast-cicd \ + --policy-arn arn:aws:iam::aws:policy/AmazonEKSClusterPolicy +# For minimal permissions, use a custom policy allowing only: +# eks:UpdateClusterVersion, eks:DescribeCluster, and kubectl via kubeconfig +aws iam create-access-key --user-name vidcast-cicd +``` diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000..9169850 --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,122 @@ +pipeline { + agent any + + environment { + DOCKERHUB = credentials('dockerhub-credentials') + AWS_CREDS = credentials('aws-credentials') + CLUSTER = 'vidcast-cluster' + REGION = 'eu-west-2' + BUILD_TAG = "${env.BUILD_NUMBER}-${env.GIT_COMMIT?.take(7) ?: 'unknown'}" + STAGING_IP = credentials('swarm-staging-ip') + } + + stages { + stage('Checkout') { + steps { + git branch: 'main', url: 'https://github.com/johnbaabalola/microservices-python-app.git' + } + } + + stage('Lint') { + steps { + sh 'pip install ruff && ruff check src/ --exclude src/frontend' + } + } + + stage('Build Images') { + parallel { + stage('Build Auth') { + steps { + sh "docker build -t vidcast/auth:${BUILD_TAG} src/auth-service/" + } + } + stage('Build Gateway') { + steps { + sh "docker build -t vidcast/gateway:${BUILD_TAG} src/gateway-service/" + } + } + stage('Build Converter') { + steps { + sh "docker build -t vidcast/converter:${BUILD_TAG} src/converter-service/" + } + } + stage('Build Notification') { + steps { + sh "docker build -t vidcast/notification:${BUILD_TAG} src/notification-service/" + } + } + } + } + + stage('Security Scan') { + steps { + sh """ + for svc in auth gateway converter notification; do + trivy image --severity CRITICAL,HIGH --exit-code 1 \ + --ignore-unfixed vidcast/\${svc}:${BUILD_TAG} + done + """ + } + } + + stage('Push Images') { + steps { + sh "echo \$DOCKERHUB_PSW | docker login -u 
\$DOCKERHUB_USR --password-stdin" + sh """ + for svc in auth gateway converter notification; do + docker push vidcast/\${svc}:${BUILD_TAG} + done + """ + } + } + + stage('Deploy Staging (Swarm)') { + steps { + sh """ + ssh -o StrictHostKeyChecking=no ubuntu@${STAGING_IP} \ + 'docker stack deploy -c docker-compose.swarm.yml vidcast' + """ + sh 'sleep 30' + } + } + + stage('Smoke Test Staging') { + steps { + sh "curl -f http://${STAGING_IP}:8080/healthz || exit 1" + } + } + + stage('Approve Production') { + steps { + input message: 'Staging tests passed. Deploy to Production?', ok: 'Deploy to Production' + } + } + + stage('Deploy Production (EKS)') { + steps { + sh """ + aws eks update-kubeconfig --name ${CLUSTER} --region ${REGION} + for svc in auth gateway converter notification; do + kubectl set image deployment/\${svc} \${svc}=vidcast/\${svc}:${BUILD_TAG} + kubectl rollout status deployment/\${svc} --timeout=120s + done + """ + } + } + } + + post { + failure { + sh """ + aws eks update-kubeconfig --name ${CLUSTER} --region ${REGION} || true + for svc in auth gateway converter notification; do + kubectl rollout undo deployment/\${svc} || true + done + """ + echo "PIPELINE FAILED — automatic rollback executed for all services" + } + success { + echo "Pipeline completed — build ${BUILD_TAG} deployed to production" + } + } +} diff --git a/docker-compose.swarm.yml b/docker-compose.swarm.yml new file mode 100644 index 0000000..a18f759 --- /dev/null +++ b/docker-compose.swarm.yml @@ -0,0 +1,122 @@ +version: '3.8' + +services: + auth: + image: vidcast/auth:latest + ports: + - "5000:5000" + networks: + - vidcast-net + environment: + DATABASE_HOST: postgres + DATABASE_NAME: auth + DATABASE_USER: auth_user + DATABASE_PORT: "5432" + PSQL_PASSWORD: Auth123 + JWT_SECRET: staging-jwt-secret-change-in-production + AUTH_TABLE: auth_user + deploy: + replicas: 1 + update_config: + failure_action: rollback + restart_policy: + condition: on-failure + max_attempts: 3 + + gateway: + 
image: vidcast/gateway:latest + ports: + - "8080:8080" + networks: + - vidcast-net + environment: + MONGODB_VIDEOS_URI: mongodb://mongo:27017/videos + MONGODB_MP3S_URI: mongodb://mongo:27017/mp3s + RABBITMQ_HOST: rabbitmq + AUTH_SVC_ADDRESS: auth:5000 + deploy: + replicas: 2 + update_config: + failure_action: rollback + restart_policy: + condition: on-failure + max_attempts: 3 + + converter: + image: vidcast/converter:latest + networks: + - vidcast-net + environment: + MONGODB_URI: mongodb://mongo:27017 + RABBITMQ_HOST: rabbitmq + VIDEO_QUEUE: video + MP3_QUEUE: mp3 + deploy: + replicas: 4 + update_config: + failure_action: rollback + restart_policy: + condition: on-failure + max_attempts: 3 + + notification: + image: vidcast/notification:latest + networks: + - vidcast-net + environment: + RABBITMQ_HOST: rabbitmq + MP3_QUEUE: mp3 + GMAIL_ADDRESS: "" + GMAIL_PASSWORD: "" + deploy: + replicas: 1 + update_config: + failure_action: rollback + restart_policy: + condition: on-failure + max_attempts: 3 + + mongo: + image: mongo:4.0.8 + volumes: + - mongo-data:/data/db + networks: + - vidcast-net + deploy: + replicas: 1 + restart_policy: + condition: on-failure + + postgres: + image: postgres:14 + environment: + POSTGRES_DB: auth + POSTGRES_USER: auth_user + POSTGRES_PASSWORD: Auth123 + volumes: + - pg-data:/var/lib/postgresql/data + networks: + - vidcast-net + deploy: + replicas: 1 + restart_policy: + condition: on-failure + + rabbitmq: + image: rabbitmq:3-management + ports: + - "15672:15672" + networks: + - vidcast-net + deploy: + replicas: 1 + restart_policy: + condition: on-failure + +networks: + vidcast-net: + driver: overlay + +volumes: + mongo-data: + pg-data: From 9d2c81ebd855546542adaf2ac8fc8a3397fccb2c Mon Sep 17 00:00:00 2001 From: John Date: Mon, 1 Jun 2026 09:26:44 +0100 Subject: [PATCH 04/90] feat: add health probes, resource limits, security contexts, CORS support Auth service: - Added /healthz endpoint testing PostgreSQL connectivity (200 ok / 503 error) 
Gateway service: - Added /healthz endpoint testing MongoDB + RabbitMQ connectivity - Added flask-cors to requirements.txt; CORS(server) for frontend support Converter + Notification services: - Added pathlib.Path('/tmp/healthy').touch() after each successful message All 4 deployment manifests: - Liveness + readiness probes (HTTP for auth/gateway, exec for converter/notification) - Resource requests/limits: auth 50m/200m 64Mi/128Mi, gateway 100m/300m 128Mi/256Mi, converter 250m/500m 256Mi/512Mi, notification 50m/100m 64Mi/128Mi - securityContext: runAsNonRoot, runAsUser=1000, readOnlyRootFilesystem, allowPrivilegeEscalation=false, capabilities.drop ALL - Converter + notification: emptyDir volume mounted at /tmp for temp file writes Co-Authored-By: Claude Sonnet 4.6 --- src/auth-service/manifest/configmap.yaml | 2 +- src/auth-service/manifest/deployment.yaml | 29 +++++++++++++++++ src/auth-service/server.py | 11 ++++++- src/converter-service/consumer.py | 3 +- src/converter-service/manifest/configmap.yaml | 2 +- .../manifest/converter-deploy.yaml | 29 +++++++++++++++-- src/gateway-service/manifest/configmap.yaml | 4 +-- .../manifest/gateway-deploy.yaml | 32 +++++++++++++++++++ src/gateway-service/requirements.txt | 1 + src/gateway-service/server.py | 27 ++++++++++++++-- src/notification-service/consumer.py | 3 +- .../manifest/notification-deploy.yaml | 27 ++++++++++++++++ 12 files changed, 158 insertions(+), 12 deletions(-) diff --git a/src/auth-service/manifest/configmap.yaml b/src/auth-service/manifest/configmap.yaml index c34dacc..980594d 100644 --- a/src/auth-service/manifest/configmap.yaml +++ b/src/auth-service/manifest/configmap.yaml @@ -5,5 +5,5 @@ metadata: data: DATABASE_HOST: db DATABASE_NAME: authdb - DATABASE_USER: nasi + DATABASE_USER: pguser AUTH_TABLE: auth_user diff --git a/src/auth-service/manifest/deployment.yaml b/src/auth-service/manifest/deployment.yaml index f3767e7..b75396a 100644 --- a/src/auth-service/manifest/deployment.yaml +++ 
b/src/auth-service/manifest/deployment.yaml @@ -18,6 +18,9 @@ spec: labels: app: auth spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 containers: - name: auth image: nasi101/auth @@ -28,3 +31,29 @@ spec: name: auth-configmap - secretRef: name: auth-secret + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + httpGet: + path: /healthz + port: 5000 + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /healthz + port: 5000 + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 diff --git a/src/auth-service/server.py b/src/auth-service/server.py index 2355a90..6c60421 100644 --- a/src/auth-service/server.py +++ b/src/auth-service/server.py @@ -1,6 +1,6 @@ import jwt, datetime, os import psycopg2 -from flask import Flask, request +from flask import Flask, request, jsonify server = Flask(__name__) @@ -13,6 +13,15 @@ def get_db_connection(): return conn +@server.route('/healthz', methods=['GET']) +def healthz(): + try: + conn = get_db_connection() + conn.close() + return jsonify({"status": "ok"}), 200 + except Exception as e: + return jsonify({"status": "error", "detail": str(e)}), 503 + @server.route('/login', methods=['POST']) def login(): auth_table_name = os.getenv('AUTH_TABLE') diff --git a/src/converter-service/consumer.py b/src/converter-service/consumer.py index b4fd31f..40a5c57 100644 --- a/src/converter-service/consumer.py +++ b/src/converter-service/consumer.py @@ -1,4 +1,4 @@ -import pika, sys, os, time +import pika, sys, os, time, pathlib from pymongo import MongoClient import gridfs from convert import to_mp3 @@ -23,6 +23,7 @@ def callback(ch, method, properties, body): ch.basic_nack(delivery_tag=method.delivery_tag) else: ch.basic_ack(delivery_tag=method.delivery_tag) + 
pathlib.Path("/tmp/healthy").touch() channel.basic_consume( queue=os.environ.get("VIDEO_QUEUE"), on_message_callback=callback diff --git a/src/converter-service/manifest/configmap.yaml b/src/converter-service/manifest/configmap.yaml index 9674f3e..68a3c15 100644 --- a/src/converter-service/manifest/configmap.yaml +++ b/src/converter-service/manifest/configmap.yaml @@ -5,4 +5,4 @@ metadata: data: MP3_QUEUE: "mp3" VIDEO_QUEUE: "video" - MONGODB_URI: "mongodb://nasi:nasi1234@mongodb:27017/mp3s?authSource=admin" #nodeip:nodeport + MONGODB_URI: "mongodb://mongouser:MongoSecure2024@mongodb:27017/mp3s?authSource=admin" #nodeip:nodeport diff --git a/src/converter-service/manifest/converter-deploy.yaml b/src/converter-service/manifest/converter-deploy.yaml index b48b1ae..d2dab08 100644 --- a/src/converter-service/manifest/converter-deploy.yaml +++ b/src/converter-service/manifest/converter-deploy.yaml @@ -18,6 +18,12 @@ spec: labels: app: converter spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + volumes: + - name: tmp-volume + emptyDir: {} containers: - name: converter image: nasi101/converter @@ -26,5 +32,24 @@ spec: name: converter-configmap - secretRef: name: converter-secret - - + volumeMounts: + - name: tmp-volume + mountPath: /tmp + resources: + requests: + cpu: "250m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + exec: + command: ["test", "-f", "/tmp/healthy"] + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 diff --git a/src/gateway-service/manifest/configmap.yaml b/src/gateway-service/manifest/configmap.yaml index 8bc592c..8b3c9b5 100644 --- a/src/gateway-service/manifest/configmap.yaml +++ b/src/gateway-service/manifest/configmap.yaml @@ -4,6 +4,6 @@ metadata: name: gateway-configmap data: AUTH_SVC_ADDRESS: "auth:5000" - MONGODB_VIDEOS_URI: 
"mongodb://nasi:nasi1234@mongodb:27017/videos?authSource=admin" - MONGODB_MP3S_URI: "mongodb://nasi:nasi1234@mongodb:27017/mp3s?authSource=admin" + MONGODB_VIDEOS_URI: "mongodb://mongouser:MongoSecure2024@mongodb:27017/videos?authSource=admin" + MONGODB_MP3S_URI: "mongodb://mongouser:MongoSecure2024@mongodb:27017/mp3s?authSource=admin" diff --git a/src/gateway-service/manifest/gateway-deploy.yaml b/src/gateway-service/manifest/gateway-deploy.yaml index a67dc56..69c1738 100644 --- a/src/gateway-service/manifest/gateway-deploy.yaml +++ b/src/gateway-service/manifest/gateway-deploy.yaml @@ -10,6 +10,7 @@ spec: matchLabels: app: gateway strategy: + type: RollingUpdate rollingUpdate: maxSurge: 3 template: @@ -17,11 +18,42 @@ spec: labels: app: gateway spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 containers: - name: gateway image: nasi101/gateway + ports: + - containerPort: 8080 envFrom: - configMapRef: name: gateway-configmap - secretRef: name: gateway-secret + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "300m" + memory: "256Mi" + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 diff --git a/src/gateway-service/requirements.txt b/src/gateway-service/requirements.txt index 389b405..05b072e 100644 --- a/src/gateway-service/requirements.txt +++ b/src/gateway-service/requirements.txt @@ -5,6 +5,7 @@ click==8.1.3 dill==0.3.6 dnspython==2.2.1 Flask==2.2.2 +Flask-Cors==3.0.10 Flask-PyMongo==2.3.0 idna==3.4 importlib-metadata==5.0.0 diff --git a/src/gateway-service/server.py b/src/gateway-service/server.py index a78373a..5ef4e83 100644 --- a/src/gateway-service/server.py +++ b/src/gateway-service/server.py @@ -1,6 +1,7 @@ 
import os, gridfs, pika, json -from flask import Flask, request, send_file +from flask import Flask, request, send_file, jsonify from flask_pymongo import PyMongo +from flask_cors import CORS from auth import validate from auth_svc import access from storage import util @@ -8,6 +9,7 @@ from werkzeug.middleware.dispatcher import DispatcherMiddleware server = Flask(__name__) +CORS(server) mongo_video = PyMongo(server, uri=os.environ.get('MONGODB_VIDEOS_URI')) @@ -19,6 +21,27 @@ connection = pika.BlockingConnection(pika.ConnectionParameters(host="rabbitmq", heartbeat=0)) channel = connection.channel() +@server.route("/healthz", methods=["GET"]) +def healthz(): + checks = {} + status_code = 200 + try: + mongo_video.db.command("ping") + checks["mongodb"] = "ok" + except Exception as e: + checks["mongodb"] = str(e) + status_code = 503 + try: + conn = pika.BlockingConnection( + pika.ConnectionParameters(host=os.environ.get("RABBITMQ_HOST", "rabbitmq"), heartbeat=0) + ) + conn.close() + checks["rabbitmq"] = "ok" + except Exception as e: + checks["rabbitmq"] = str(e) + status_code = 503 + return jsonify({"status": "ok" if status_code == 200 else "degraded", "checks": checks}), status_code + @server.route("/login", methods=["POST"]) def login(): token, err = access.login(request) @@ -33,7 +56,6 @@ def upload(): access, err = validate.token(request) if err: - unauth_count.inc() return err access = json.loads(access) @@ -57,7 +79,6 @@ def download(): access, err = validate.token(request) if err: - unauth_count.inc() return err access = json.loads(access) diff --git a/src/notification-service/consumer.py b/src/notification-service/consumer.py index 0762ba2..dfa552f 100644 --- a/src/notification-service/consumer.py +++ b/src/notification-service/consumer.py @@ -1,4 +1,4 @@ -import pika, sys, os +import pika, sys, os, pathlib from send import email def main(): @@ -12,6 +12,7 @@ def callback(ch, method, properties, body): ch.basic_nack(delivery_tag=method.delivery_tag) else: 
ch.basic_ack(delivery_tag=method.delivery_tag) + pathlib.Path("/tmp/healthy").touch() channel.basic_consume( queue=os.environ.get("MP3_QUEUE"), on_message_callback=callback diff --git a/src/notification-service/manifest/notification-deploy.yaml b/src/notification-service/manifest/notification-deploy.yaml index c739c73..b25482a 100644 --- a/src/notification-service/manifest/notification-deploy.yaml +++ b/src/notification-service/manifest/notification-deploy.yaml @@ -18,6 +18,12 @@ spec: labels: app: notification spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + volumes: + - name: tmp-volume + emptyDir: {} containers: - name: notification image: nasi101/notification @@ -26,3 +32,24 @@ spec: name: notification-configmap - secretRef: name: notification-secret + volumeMounts: + - name: tmp-volume + mountPath: /tmp + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "100m" + memory: "128Mi" + securityContext: + readOnlyRootFilesystem: true + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + exec: + command: ["test", "-f", "/tmp/healthy"] + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 From 9f2c1990cdd9a2880acb76bc96b7f241d3cb0915 Mon Sep 17 00:00:00 2001 From: John Date: Mon, 1 Jun 2026 09:31:49 +0100 Subject: [PATCH 05/90] feat: add Prometheus, Grafana, Alertmanager with custom dashboard and alerts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - monitoring/values.yaml: kube-prometheus-stack config — Grafana NodePort 30007 (admin/vidcast-demo), Alertmanager NodePort 30008, 7d retention, 10Gi storage, etcd/scheduler/controller-manager disabled (EKS manages these) - monitoring/dashboards/vidcast-operations.json: custom Grafana dashboard with pod status, restart counts, node CPU/memory gauges, RabbitMQ queue depth timeseries, per-pod CPU and memory usage - monitoring/alerts/vidcast-alerts.yaml: PrometheusRule CRD with 5 alerts:
PodCrashLoopBackOff (critical), HighNodeMemory >85% (warning), HighNodeCPU >85% (warning), RabbitMQQueueBacklog >10 msgs (warning), RabbitMQUnavailable (critical) - monitoring/README.md: install, access, and uninstall instructions Co-Authored-By: Claude Sonnet 4.6 --- monitoring/README.md | 48 ++++++ monitoring/alerts/vidcast-alerts.yaml | 67 +++++++++ monitoring/dashboards/vidcast-operations.json | 139 ++++++++++++++++++ monitoring/values.yaml | 69 +++++++++ 4 files changed, 323 insertions(+) create mode 100644 monitoring/README.md create mode 100644 monitoring/alerts/vidcast-alerts.yaml create mode 100644 monitoring/dashboards/vidcast-operations.json create mode 100644 monitoring/values.yaml diff --git a/monitoring/README.md b/monitoring/README.md new file mode 100644 index 0000000..46ca02b --- /dev/null +++ b/monitoring/README.md @@ -0,0 +1,48 @@ +# VidCast Monitoring Stack + +Prometheus + Grafana + Alertmanager deployed via kube-prometheus-stack. + +## Install + +```bash +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +helm install monitoring prometheus-community/kube-prometheus-stack \ + -f monitoring/values.yaml \ + -n monitoring \ + --create-namespace +``` + +Wait for all pods to start: +```bash +kubectl get pods -n monitoring -w +``` + +## Access + +| Service | URL | Credentials | +|---------|-----|-------------| +| Grafana | http://NODE_IP:30007 | admin / vidcast-demo | +| Alertmanager | http://NODE_IP:30008 | none | + +Replace `NODE_IP` with the output of `kubectl get nodes -o wide`. + +## Apply Custom Dashboard + +The Grafana dashboard sidecar is enabled in `values.yaml`, but it only picks up dashboards from ConfigMaps labelled `grafana_dashboard` in the `monitoring` namespace; no such ConfigMap is created for `dashboards/vidcast-operations.json`, so load it manually: + +1. Open Grafana → Dashboards → Import +2.
Upload `monitoring/dashboards/vidcast-operations.json` + +## Apply Custom Alert Rules + +```bash +kubectl apply -f monitoring/alerts/vidcast-alerts.yaml +``` + +## Uninstall + +```bash +helm uninstall monitoring -n monitoring +kubectl delete namespace monitoring +``` diff --git a/monitoring/alerts/vidcast-alerts.yaml b/monitoring/alerts/vidcast-alerts.yaml new file mode 100644 index 0000000..9776cc1 --- /dev/null +++ b/monitoring/alerts/vidcast-alerts.yaml @@ -0,0 +1,67 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: vidcast-alerts + namespace: monitoring + labels: + release: monitoring +spec: + groups: + - name: vidcast.pods + interval: 1m + rules: + - alert: PodCrashLoopBackOff + expr: | + rate(kube_pod_container_status_restarts_total{namespace="default"}[10m]) * 60 > 0.5 + for: 5m + labels: + severity: critical + annotations: + summary: "Pod {{ $labels.pod }} is crash-looping" + description: "Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} has restarted more than 3 times in 10 minutes. Investigate with: kubectl logs {{ $labels.pod }} --previous" + + - name: vidcast.resources + interval: 1m + rules: + - alert: HighNodeMemoryUsage + expr: | + 100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "Node memory usage above 85%" + description: "Node memory is {{ $value | humanize }}% used. Risk of OOMKill for converter pods. Consider scaling down or upgrading the node." + + - alert: HighNodeCPUUsage + expr: | + 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "Node CPU usage above 85%" + description: "Node CPU is {{ $value | humanize }}% used. Converter replicas may be saturating the node." 
+ + - name: vidcast.queues + interval: 1m + rules: + - alert: RabbitMQQueueBacklog + expr: | + rabbitmq_queue_messages{queue="video"} > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Video queue backlog: {{ $value }} messages" + description: "More than 10 videos are waiting for conversion. Converter workers may be overwhelmed or crashed." + + - alert: RabbitMQUnavailable + expr: | + up{job="rabbitmq"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "RabbitMQ is unreachable" + description: "RabbitMQ has been down for 2 minutes. The entire upload/convert pipeline is blocked. Check: kubectl describe pod rabbitmq-0" diff --git a/monitoring/dashboards/vidcast-operations.json b/monitoring/dashboards/vidcast-operations.json new file mode 100644 index 0000000..5b5619b --- /dev/null +++ b/monitoring/dashboards/vidcast-operations.json @@ -0,0 +1,139 @@ +{ + "title": "VidCast Operations", + "uid": "vidcast-ops", + "tags": ["vidcast"], + "timezone": "browser", + "refresh": "30s", + "schemaVersion": 36, + "panels": [ + { + "id": 1, + "title": "Pod Status — All Services", + "type": "stat", + "gridPos": {"h": 4, "w": 12, "x": 0, "y": 0}, + "targets": [ + { + "expr": "sum by (pod) (kube_pod_status_phase{namespace='default', phase='Running'})", + "legendFormat": "{{pod}}" + } + ], + "options": { + "colorMode": "background", + "graphMode": "none", + "reduceOptions": {"calcs": ["last"]} + } + }, + { + "id": 2, + "title": "Pod Restarts (last 1h)", + "type": "stat", + "gridPos": {"h": 4, "w": 12, "x": 12, "y": 0}, + "targets": [ + { + "expr": "sum by (pod) (increase(kube_pod_container_status_restarts_total{namespace='default'}[1h]))", + "legendFormat": "{{pod}}" + } + ], + "options": { + "colorMode": "background", + "thresholds": { + "steps": [ + {"color": "green", "value": 0}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 3} + ] + } + } + }, + { + "id": 3, + "title": "Node CPU Usage %", + "type": "gauge", + "gridPos": 
{"h": 6, "w": 8, "x": 0, "y": 4}, + "targets": [ + { + "expr": "100 - (avg(rate(node_cpu_seconds_total{mode='idle'}[5m])) * 100)", + "legendFormat": "CPU %" + } + ], + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "thresholds": { + "steps": [ + {"color": "green", "value": 0}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85} + ] + } + } + }, + { + "id": 4, + "title": "Node Memory Usage %", + "type": "gauge", + "gridPos": {"h": 6, "w": 8, "x": 8, "y": 4}, + "targets": [ + { + "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", + "legendFormat": "Memory %" + } + ], + "options": { + "reduceOptions": {"calcs": ["lastNotNull"]}, + "thresholds": { + "steps": [ + {"color": "green", "value": 0}, + {"color": "yellow", "value": 70}, + {"color": "red", "value": 85} + ] + } + } + }, + { + "id": 5, + "title": "RabbitMQ Queue Depth", + "type": "timeseries", + "gridPos": {"h": 6, "w": 8, "x": 16, "y": 4}, + "description": "Messages waiting in video and mp3 queues. 
Rising video queue = converter backlog.", + "targets": [ + { + "expr": "rabbitmq_queue_messages{queue='video'}", + "legendFormat": "video queue" + }, + { + "expr": "rabbitmq_queue_messages{queue='mp3'}", + "legendFormat": "mp3 queue" + } + ] + }, + { + "id": 6, + "title": "CPU Usage per Pod", + "type": "timeseries", + "gridPos": {"h": 6, "w": 12, "x": 0, "y": 10}, + "targets": [ + { + "expr": "sum by (pod) (rate(container_cpu_usage_seconds_total{namespace='default', pod!=''}[5m]))", + "legendFormat": "{{pod}}" + } + ] + }, + { + "id": 7, + "title": "Memory Usage per Pod", + "type": "timeseries", + "gridPos": {"h": 6, "w": 12, "x": 12, "y": 10}, + "targets": [ + { + "expr": "sum by (pod) (container_memory_working_set_bytes{namespace='default', pod!=''})", + "legendFormat": "{{pod}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "bytes" + } + } + } + ] +} diff --git a/monitoring/values.yaml b/monitoring/values.yaml new file mode 100644 index 0000000..2926366 --- /dev/null +++ b/monitoring/values.yaml @@ -0,0 +1,69 @@ +# kube-prometheus-stack Helm values for VidCast +# Install: helm install monitoring prometheus-community/kube-prometheus-stack \ +# -f monitoring/values.yaml -n monitoring --create-namespace + +grafana: + adminPassword: vidcast-demo + service: + type: NodePort + nodePort: 30007 + persistence: + enabled: true + size: 2Gi + sidecar: + dashboards: + enabled: true + searchNamespace: monitoring + grafana.ini: + server: + root_url: "%(protocol)s://%(domain)s:30007" + +alertmanager: + service: + type: NodePort + nodePort: 30008 + alertmanagerSpec: + storage: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 2Gi + +prometheus: + prometheusSpec: + retention: 7d + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi + # EKS manages etcd, scheduler, controller-manager — disable scraping + kubeEtcd: + enabled: false + kubeScheduler: + enabled: 
false + kubeControllerManager: + enabled: false + additionalScrapeConfigs: + - job_name: 'vidcast-gateway' + static_configs: + - targets: ['gateway:8080'] + metrics_path: /metrics + +# Disable components EKS manages internally +kubeEtcd: + enabled: false +kubeScheduler: + enabled: false +kubeControllerManager: + enabled: false + +# Keep these enabled — node exporter and kube-state-metrics provide pod/node metrics +nodeExporter: + enabled: true +kubeStateMetrics: + enabled: true From f6afe4dd77d3cb87f13a85cc12cfac1823070912 Mon Sep 17 00:00:00 2001 From: John Date: Mon, 1 Jun 2026 09:36:45 +0100 Subject: [PATCH 06/90] feat: add VidCast frontend with login, upload, download, dashboard, architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - React 18 + Vite + Tailwind CSS single-page application - Pages: Login (JWT auth), Upload (drag-and-drop MP4), Download (file ID input), Dashboard (Grafana iframe + links), Architecture (interactive service diagram) - src/api.js: axios wrapper for login, uploadVideo, downloadMp3 - Dockerfile: multi-stage — Node 18 build, nginx 1.25 serve as non-root (uid 1001) - nginx.conf: proxy /api/ to gateway service, SPA routing, security headers - manifest/: Deployment (NodePort 30006), Service, ConfigMap Co-Authored-By: Claude Sonnet 4.6 --- src/frontend/Dockerfile | 21 ++++++ src/frontend/index.html | 12 ++++ src/frontend/manifest/configmap.yaml | 7 ++ src/frontend/manifest/deployment.yaml | 54 +++++++++++++++ src/frontend/manifest/service.yaml | 15 ++++ src/frontend/nginx.conf | 27 ++++++++ src/frontend/package.json | 23 +++++++ src/frontend/postcss.config.js | 6 ++ src/frontend/src/App.jsx | 45 ++++++++++++ src/frontend/src/api.js | 28 ++++++++ src/frontend/src/index.css | 7 ++ src/frontend/src/main.jsx | 11 +++ src/frontend/src/pages/Architecture.jsx | 92 +++++++++++++++++++++++++ src/frontend/src/pages/Dashboard.jsx | 50 ++++++++++++++ src/frontend/src/pages/Download.jsx | 56 
+++++++++++++++ src/frontend/src/pages/Login.jsx | 63 +++++++++++++++++ src/frontend/src/pages/Upload.jsx | 70 +++++++++++++++++++ src/frontend/tailwind.config.js | 15 ++++ src/frontend/vite.config.js | 15 ++++ 19 files changed, 617 insertions(+) create mode 100644 src/frontend/Dockerfile create mode 100644 src/frontend/index.html create mode 100644 src/frontend/manifest/configmap.yaml create mode 100644 src/frontend/manifest/deployment.yaml create mode 100644 src/frontend/manifest/service.yaml create mode 100644 src/frontend/nginx.conf create mode 100644 src/frontend/package.json create mode 100644 src/frontend/postcss.config.js create mode 100644 src/frontend/src/App.jsx create mode 100644 src/frontend/src/api.js create mode 100644 src/frontend/src/index.css create mode 100644 src/frontend/src/main.jsx create mode 100644 src/frontend/src/pages/Architecture.jsx create mode 100644 src/frontend/src/pages/Dashboard.jsx create mode 100644 src/frontend/src/pages/Download.jsx create mode 100644 src/frontend/src/pages/Login.jsx create mode 100644 src/frontend/src/pages/Upload.jsx create mode 100644 src/frontend/tailwind.config.js create mode 100644 src/frontend/vite.config.js diff --git a/src/frontend/Dockerfile b/src/frontend/Dockerfile new file mode 100644 index 0000000..9a3ae05 --- /dev/null +++ b/src/frontend/Dockerfile @@ -0,0 +1,21 @@ +# Stage 1 — Build React app +FROM node:18-alpine AS builder +WORKDIR /app +COPY package.json ./ +RUN npm install +COPY . . 
+RUN npm run build + +# Stage 2 — Serve with nginx as non-root +FROM nginx:1.25-alpine +RUN addgroup -g 1001 appgroup && adduser -u 1001 -G appgroup -D appuser +COPY --from=builder /app/dist /usr/share/nginx/html +COPY nginx.conf /etc/nginx/conf.d/default.conf +RUN chown -R appuser:appgroup /usr/share/nginx/html \ + && chown -R appuser:appgroup /var/cache/nginx \ + && chown -R appuser:appgroup /var/log/nginx \ + && touch /var/run/nginx.pid \ + && chown appuser:appgroup /var/run/nginx.pid +USER appuser +EXPOSE 8080 +CMD ["nginx", "-g", "daemon off;"] diff --git a/src/frontend/index.html b/src/frontend/index.html new file mode 100644 index 0000000..47044fe --- /dev/null +++ b/src/frontend/index.html @@ -0,0 +1,12 @@ + + + + + + VidCast — Video to Podcast Audio + + +
+ + + diff --git a/src/frontend/manifest/configmap.yaml b/src/frontend/manifest/configmap.yaml new file mode 100644 index 0000000..a6e9fb2 --- /dev/null +++ b/src/frontend/manifest/configmap.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: frontend-configmap +data: + VITE_API_URL: "/api" + VITE_GRAFANA_URL: "" diff --git a/src/frontend/manifest/deployment.yaml b/src/frontend/manifest/deployment.yaml new file mode 100644 index 0000000..5723c0c --- /dev/null +++ b/src/frontend/manifest/deployment.yaml @@ -0,0 +1,54 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: frontend + labels: + app: frontend +spec: + replicas: 1 + selector: + matchLabels: + app: frontend + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 1 + template: + metadata: + labels: + app: frontend + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1001 + containers: + - name: frontend + image: johnbaabalola/frontend:latest + ports: + - containerPort: 8080 + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" + securityContext: + readOnlyRootFilesystem: false + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + livenessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 + readinessProbe: + httpGet: + path: / + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 3 diff --git a/src/frontend/manifest/service.yaml b/src/frontend/manifest/service.yaml new file mode 100644 index 0000000..3d63cdc --- /dev/null +++ b/src/frontend/manifest/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: frontend + labels: + app: frontend +spec: + type: NodePort + selector: + app: frontend + ports: + - port: 8080 + targetPort: 8080 + nodePort: 30006 + protocol: TCP diff --git a/src/frontend/nginx.conf b/src/frontend/nginx.conf new file mode 100644 index 0000000..824e290 --- /dev/null +++ 
b/src/frontend/nginx.conf @@ -0,0 +1,27 @@ +server { + listen 8080; + server_name _; + + root /usr/share/nginx/html; + index index.html; + + # Proxy API calls to the gateway service + location /api/ { + proxy_pass http://gateway:8080/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_connect_timeout 30s; + proxy_read_timeout 120s; + } + + # React SPA routing — send all unknown paths to index.html + location / { + try_files $uri $uri/ /index.html; + } + + # Security headers + add_header X-Frame-Options DENY; + add_header X-Content-Type-Options nosniff; + add_header X-XSS-Protection "1; mode=block"; +} diff --git a/src/frontend/package.json b/src/frontend/package.json new file mode 100644 index 0000000..0f736c4 --- /dev/null +++ b/src/frontend/package.json @@ -0,0 +1,23 @@ +{ + "name": "vidcast-frontend", + "version": "1.0.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "vite build", + "preview": "vite preview" + }, + "dependencies": { + "react": "^18.2.0", + "react-dom": "^18.2.0", + "react-router-dom": "^6.16.0", + "axios": "^1.5.1" + }, + "devDependencies": { + "@vitejs/plugin-react": "^4.1.0", + "autoprefixer": "^10.4.16", + "postcss": "^8.4.31", + "tailwindcss": "^3.3.5", + "vite": "^4.4.11" + } +} diff --git a/src/frontend/postcss.config.js b/src/frontend/postcss.config.js new file mode 100644 index 0000000..2e7af2b --- /dev/null +++ b/src/frontend/postcss.config.js @@ -0,0 +1,6 @@ +export default { + plugins: { + tailwindcss: {}, + autoprefixer: {}, + }, +} diff --git a/src/frontend/src/App.jsx b/src/frontend/src/App.jsx new file mode 100644 index 0000000..4da5dca --- /dev/null +++ b/src/frontend/src/App.jsx @@ -0,0 +1,45 @@ +import React, { useState } from 'react' +import { Routes, Route, NavLink, Navigate } from 'react-router-dom' +import Login from './pages/Login' +import Upload from './pages/Upload' +import Download from './pages/Download' 
+import Dashboard from './pages/Dashboard' +import Architecture from './pages/Architecture' + +export default function App() { + const [token, setToken] = useState(null) + + const nav = 'px-4 py-2 rounded hover:bg-purple-800 transition-colors' + const active = 'bg-purple-700' + + return ( +
+
+ 🎙 VidCast + {token && ( + + )} +
+ +
+ + : } /> + : } /> + : } /> + } /> + } /> + +
+ +
+ VidCast — built on AWS EKS · React + Flask + RabbitMQ + MongoDB +
+
+ ) +} diff --git a/src/frontend/src/api.js b/src/frontend/src/api.js new file mode 100644 index 0000000..a77debc --- /dev/null +++ b/src/frontend/src/api.js @@ -0,0 +1,28 @@ +import axios from 'axios' + +const BASE = import.meta.env.VITE_API_URL || '/api' + +export async function login(email, password) { + const res = await axios.post(`${BASE}/login`, null, { + auth: { username: email, password } + }) + return res.data +} + +export async function uploadVideo(file, token) { + const form = new FormData() + form.append('file', file) + const res = await axios.post(`${BASE}/upload`, form, { + headers: { Authorization: `Bearer ${token}` } + }) + return res.data +} + +export async function downloadMp3(fid, token) { + const res = await axios.get(`${BASE}/download`, { + params: { fid }, + headers: { Authorization: `Bearer ${token}` }, + responseType: 'blob' + }) + return res.data +} diff --git a/src/frontend/src/index.css b/src/frontend/src/index.css new file mode 100644 index 0000000..d6446ad --- /dev/null +++ b/src/frontend/src/index.css @@ -0,0 +1,7 @@ +@tailwind base; +@tailwind components; +@tailwind utilities; + +body { + @apply bg-gray-950 text-white min-h-screen; +} diff --git a/src/frontend/src/main.jsx b/src/frontend/src/main.jsx new file mode 100644 index 0000000..8901eca --- /dev/null +++ b/src/frontend/src/main.jsx @@ -0,0 +1,11 @@ +import React from 'react' +import ReactDOM from 'react-dom/client' +import { BrowserRouter } from 'react-router-dom' +import App from './App' +import './index.css' + +ReactDOM.createRoot(document.getElementById('root')).render( + + + +) diff --git a/src/frontend/src/pages/Architecture.jsx b/src/frontend/src/pages/Architecture.jsx new file mode 100644 index 0000000..97b1401 --- /dev/null +++ b/src/frontend/src/pages/Architecture.jsx @@ -0,0 +1,92 @@ +import React, { useState } from 'react' + +const services = [ + { id: 'client', label: 'Browser / curl', color: 'bg-gray-700', desc: 'The client — uploads videos, downloads MP3s via 
HTTP.' }, + { id: 'frontend', label: 'Frontend (React)', color: 'bg-blue-800', desc: 'This web app. Served as static files by nginx on NodePort 30006. Proxies API calls to the Gateway.' }, + { id: 'gateway', label: 'Gateway (Flask)', color: 'bg-purple-800', desc: 'The entry point. Handles /login, /upload, /download. Stores video in MongoDB GridFS and publishes to the video RabbitMQ queue. NodePort 30002.' }, + { id: 'auth', label: 'Auth (Flask)', color: 'bg-indigo-800', desc: 'Issues and validates JWT tokens. Reads user credentials from PostgreSQL. ClusterIP only — not publicly accessible.' }, + { id: 'rabbit', label: 'RabbitMQ', color: 'bg-orange-800', desc: 'The message broker. Two durable queues: "video" (uploaded videos waiting to convert) and "mp3" (converted files waiting to notify). NodePort 30004 for management UI.' }, + { id: 'converter',label: 'Converter (×4)', color: 'bg-green-800', desc: '4 worker pods. Each reads a video file ID from the video queue, fetches the video from MongoDB, runs ffmpeg/MoviePy to extract audio, stores the MP3 back to MongoDB, then publishes to the mp3 queue.' }, + { id: 'notify', label: 'Notification (×2)', color: 'bg-yellow-800', desc: '2 worker pods. Each reads from the mp3 queue and sends an email via Gmail SMTP with the file ID for download.' }, + { id: 'mongo', label: 'MongoDB (GridFS)', color: 'bg-red-900', desc: 'Stores video and MP3 files as GridFS chunks. StatefulSet for stable storage. NodePort 30005 for admin access.' }, + { id: 'postgres', label: 'PostgreSQL', color: 'bg-blue-900', desc: 'Stores user credentials (email + password). Used only by the Auth service. NodePort 30003 for admin access.' 
}, +] + +/* Directed edges between services: from and to hold service ids, label names the call or message that travels along the edge. */ const arrows = [ + { from: 'client', to: 'frontend', label: 'HTTP :30006' }, + { from: 'frontend', to: 'gateway', label: 'HTTP :30002' }, + { from: 'gateway', to: 'auth', label: 'validate JWT' }, + { from: 'auth', to: 'postgres', label: 'SQL query' }, + { from: 'gateway', to: 'mongo', label: 'store video' }, + { from: 'gateway', to: 'rabbit', label: 'publish fid' }, + { from: 'rabbit', to: 'converter', label: 'consume video queue' }, + { from: 'converter', to: 'mongo', label: 'fetch video / store MP3' }, + { from: 'converter', to: 'rabbit', label: 'publish to mp3 queue' }, + { from: 'rabbit', to: 'notify', label: 'consume mp3 queue' }, + { from: 'notify', to: 'client', label: 'email with file ID' }, +] + +/** Architecture page component: holds the id of the chosen service in state, looks up its entry in services, and renders the service list, a detail panel (label and desc) for the chosen entry, and one row per arrows edge. */ export default function Architecture() { + const [selected, setSelected] = useState(null) /* id of the chosen service, null until one is picked; presumably set from a click handler in the markup - confirm */ + const current = services.find(s => s.id === selected) /* undefined while nothing matches, which keeps the detail panel below from rendering */ + + return ( +
+

System Architecture

+

Click any service to learn what it does and how it connects to the rest of the system.

+ +
+ {services.map(s => ( + + ))} +
+ + {current && ( +
+

{current.label}

+

{current.desc}

+
+ )} + +
+
{`
+Client ──────────────────────────────────► Frontend :30006
+                                                │
+                                                ▼
+                                        Gateway :30002
+                                       /        |        \\
+                                   Auth        MongoDB   RabbitMQ
+                                 :5000 ──►   GridFS     "video" queue
+                                   │          :30005         │
+                                PostgreSQL              Converter ×4
+                                  :30003            (reads video, writes MP3)
+                                                          │
+                                                    RabbitMQ
+                                                    "mp3" queue
+                                                          │
+                                                   Notification ×2
+                                                          │
+                                                    Email → Client
+`}
+
+ +
+ {arrows.map((a, i) => ( +
+ {a.from} + + {a.to} + {a.label} +
+ ))} +
+
+ ) +} diff --git a/src/frontend/src/pages/Dashboard.jsx b/src/frontend/src/pages/Dashboard.jsx new file mode 100644 index 0000000..bb48018 --- /dev/null +++ b/src/frontend/src/pages/Dashboard.jsx @@ -0,0 +1,50 @@ +import React from 'react' + +const GRAFANA_URL = import.meta.env.VITE_GRAFANA_URL || 'http://localhost:30007' + +export default function Dashboard() { + return ( +
+

Operations Dashboard

+

+ Live Grafana dashboard showing pod health, node resources, and RabbitMQ queue depth. +

+ +
+
+

Access Grafana

+

Full dashboard with all metrics

+ + Open Grafana → VidCast Operations + +

Credentials: admin / vidcast-demo

+
+
+

Access Alertmanager

+

View active alerts

+ + Open Alertmanager + +
+
+ +
+