diff --git a/CHANGELOG.md b/CHANGELOG.md index 82e91db..3b788dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,67 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [2.1.0] - 2026-04-02 + +### Added + +- OpenTelemetry integration for distributed tracing, metrics, and log correlation. + Traces are exported via OTLP gRPC to Tempo; metrics are exposed via a Prometheus + `/metrics` endpoint; structured JSON logs include `trace_id` and `span_id` for + Loki–Tempo correlation. +- RED metrics (Rate, Errors, Duration) and saturation gauges via custom + `MetricsMiddleware`: `http_requests_total`, `http_request_errors_total`, + `http_request_duration_seconds`, `http_active_requests`. +- Image processing metrics: `image_processing_duration_seconds`, + `image_uploads_total`, `images_currently_processing`. +- Auto-instrumentation for FastAPI routes and SQLAlchemy queries using + `opentelemetry-instrumentation-fastapi` and + `opentelemetry-instrumentation-sqlalchemy`. +- Observability stack Kubernetes manifests (`minikube/observability/`): + Prometheus, Tempo, Loki, Promtail (DaemonSet), and Grafana with + pre-provisioned datasources and dashboards. +- Three Grafana dashboards provisioned automatically: + - **RED Metrics** — request rate, error rate, latency percentiles, saturation + - **Traces** — service map, recent traces, duration distribution + - **Logs** — application logs, log volume by level, error log filter +- `IMG_OTEL_ENABLED`, `IMG_OTEL_EXPORTER_OTLP_ENDPOINT`, and + `IMG_OTEL_SERVICE_NAME` configuration settings for opt-in observability. +- Trace context (`trace_id`, `span_id`, `trace_flags`) injected into JSON log + output via `opentelemetry-instrumentation-logging`. +- `minikube/observability/setup.sh` and `teardown.sh` scripts for one-command + deployment of the full observability stack. 
+ +### Fixed + +- Trace context (`trace_id`, `span_id`) now correctly appears in log records by + using a `log_hook` callback in `LoggingInstrumentor` instead of relying on the + no-op `set_logging_format=False` mode. +- OpenTelemetry `TracerProvider` and `MeterProvider` are now initialized in + `create_app()` before `FastAPIInstrumentor.instrument_app()`, ensuring spans + are created with the real provider instead of the no-op default. +- Switched Dockerfile CMD to Uvicorn `--factory` mode + (`src.main:create_app --factory`) so each worker process initializes its own + `TracerProvider` and gRPC exporter, avoiding broken state from pre-fork setup. +- Promtail log collection: added static `__path__` glob + (`/var/log/pods/cv-platform_image-service-*/*/*.log`) as a reliable fallback + alongside Kubernetes SD, with `docker: {}` pipeline stage for container log + format unwrapping. +- Grafana provisioned datasources now have explicit `uid` fields (`prometheus`, + `tempo`, `loki`), fixing "Datasource prometheus was not found" errors in + Tempo's Service Map panel and dashboard cross-references. +- Replaced `${DS_PROMETHEUS}`, `${DS_TEMPO}`, and `${DS_LOKI}` template + variables in all provisioned dashboard JSON files with hardcoded datasource + UIDs, since Grafana provisioned dashboards do not resolve template variables. +- Enabled Tempo metrics generator with `service-graphs` and `span-metrics` + processors, and added `--web.enable-remote-write-receiver` to Prometheus, so + the Service Map panel receives `traces_service_graph_*` metrics. +- Fixed `06-grafana-dashboards.yaml` ConfigMap which contained `PLACEHOLDER` + instead of actual dashboard JSON; now embeds the real dashboard definitions. +- Changed Grafana anonymous org role from `Viewer` to `Editor` so trace ID links + in the Traces dashboard can open the Explore view. 
+- `image_uploads_total` counter is now incremented in `UploadImageUseCase` on + each successful upload; previously the metric was defined but never recorded. + ## [2.0.2] - 2026-04-02 ### Added @@ -249,7 +310,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fix `type: ignore` comment on `rowcount` to use correct mypy error code `attr-defined`. - Add proper type annotation for `settings` parameter in retention sweep endpoint. -[unreleased]: https://github.com/vlantonov/ImageProcessingServiceDemo/compare/v2.0.2...HEAD +[unreleased]: https://github.com/vlantonov/ImageProcessingServiceDemo/compare/v2.1.0...HEAD +[2.1.0]: https://github.com/vlantonov/ImageProcessingServiceDemo/compare/v2.0.2...v2.1.0 [2.0.2]: https://github.com/vlantonov/ImageProcessingServiceDemo/compare/v2.0.1...v2.0.2 [2.0.1]: https://github.com/vlantonov/ImageProcessingServiceDemo/compare/v2.0.0...v2.0.1 [2.0.0]: https://github.com/vlantonov/ImageProcessingServiceDemo/compare/v1.3.0...v2.0.0 diff --git a/Dockerfile b/Dockerfile index abfe990..f0c6dbe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,4 +27,4 @@ USER appuser EXPOSE 8000 -CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"] +CMD ["uvicorn", "src.main:create_app", "--factory", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"] diff --git a/PROJECT_DESCRIPTION.md b/PROJECT_DESCRIPTION.md index 165899e..35ecc8d 100644 --- a/PROJECT_DESCRIPTION.md +++ b/PROJECT_DESCRIPTION.md @@ -120,6 +120,44 @@ A complete **Minikube demo** is included (`minikube/`) with automated setup, tea --- +## Observability (OpenTelemetry) + +The service is fully instrumented with **OpenTelemetry** for distributed tracing, metrics, and log correlation — enabled via `IMG_OTEL_ENABLED=true`. + +### Tracing + +- Auto-instrumentation for FastAPI routes and SQLAlchemy queries via `opentelemetry-instrumentation-fastapi` and `opentelemetry-instrumentation-sqlalchemy`. 
+- Traces exported via OTLP gRPC to **Grafana Tempo**, with `trace_id` and `span_id` injected into structured JSON logs for correlation. + +### Metrics (RED + Saturation) + +- **Rate**: `http_requests_total` — total HTTP requests by method, path, status. +- **Errors**: `http_request_errors_total` — 4xx/5xx errors by status code. +- **Duration**: `http_request_duration_seconds` — request latency histogram. +- **Saturation**: `http_active_requests` — in-flight request gauge. +- **Image processing**: `image_processing_duration_seconds`, `image_uploads_total`, `images_currently_processing`. +- Metrics are exposed in Prometheus format at the `/metrics` endpoint and scraped by **Prometheus**. + +### Logging + +- Structured JSON logs with `trace_id`, `span_id`, and `correlation_id` fields. +- Collected by **Promtail** and shipped to **Grafana Loki** for aggregation and search. +- Derived fields on Grafana's Loki datasource link `trace_id` to Tempo for seamless log-to-trace navigation. + +### Grafana Dashboards + +Three dashboards are provisioned automatically: + +| Dashboard | Description | +|-----------|-------------| +| **RED Metrics** | Request rate, error rate, latency percentiles, saturation gauges | +| **Traces** | Service map, recent traces with clickable trace IDs, duration distribution | +| **Logs** | Application logs, log volume by level, error log filter | + +The full observability stack (Prometheus, Tempo, Loki, Promtail, Grafana) deploys to a separate `observability` Kubernetes namespace via `minikube/observability/setup.sh`. 
+ +--- + ## 12-Factor Configuration All settings are provided via environment variables (prefix `IMG_`) using **pydantic-settings**, ensuring type-safe, validated configuration: @@ -145,6 +183,9 @@ All settings are provided via environment variables (prefix `IMG_`) using **pyda | `IMG_RATE_LIMIT_READ_MAX` | `60` | Max read requests per window per IP | | `IMG_RATE_LIMIT_READ_WINDOW` | `60` | Read rate limit window (seconds) | | `IMG_DEBUG` | `false` | Enable debug logging | +| `IMG_OTEL_ENABLED` | `false` | Enable OpenTelemetry instrumentation | +| `IMG_OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4317` | OTLP gRPC endpoint for trace export | +| `IMG_OTEL_SERVICE_NAME` | `image-processing-service` | Service name in traces and metrics | --- @@ -188,6 +229,8 @@ Test tooling: pytest, pytest-asyncio (auto mode), httpx, aiosqlite (in-memory SQ | Multi-Stage Docker | Builder/runtime separation for minimal production images | | Horizontal Autoscaling | Kubernetes HPA scales pods based on CPU and memory metrics | | 12-Factor Config | Type-safe environment variables via pydantic-settings | +| OpenTelemetry | Distributed tracing, metrics, and log correlation across services | +| RED Metrics | Rate, Errors, Duration monitoring via custom middleware | | FastAPI DI | Routes depend on use case abstractions, not concrete implementations | --- @@ -205,4 +248,5 @@ Test tooling: pytest, pytest-asyncio (auto mode), httpx, aiosqlite (in-memory SQ | **Orchestration** | Kubernetes, Minikube (local demo) | | **Testing** | pytest, pytest-asyncio, httpx, aiosqlite | | **Code Quality** | ruff (linter/formatter), mypy (type checker) | +| **Observability** | OpenTelemetry SDK, Prometheus, Grafana, Tempo, Loki, Promtail | | **Migrations** | Alembic | diff --git a/README.md b/README.md index c569308..2271006 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,16 @@ cd minikube && ./setup.sh # deploy full stack See [minikube/README.md](minikube/README.md) for details. 
+### Observability Stack (Minikube) + +```bash +cd minikube/observability && ./setup.sh # deploy Prometheus, Tempo, Loki, Grafana +minikube service grafana --namespace=observability # open Grafana +./teardown.sh # clean up +``` + +See [minikube/observability/README.md](minikube/observability/README.md) for dashboards and configuration. + ### Kubernetes (Production) ```bash @@ -99,6 +109,9 @@ All settings via environment variables (prefix `IMG_`), validated by [pydantic-s | `IMG_RATE_LIMIT_READ_MAX` | `60` | Max read requests per window per IP | | `IMG_RATE_LIMIT_READ_WINDOW` | `60` | Read rate limit window (seconds) | | `IMG_DEBUG` | `false` | Enable debug logging | +| `IMG_OTEL_ENABLED` | `false` | Enable OpenTelemetry instrumentation | +| `IMG_OTEL_EXPORTER_OTLP_ENDPOINT` | `http://localhost:4317` | OTLP gRPC endpoint for trace export | +| `IMG_OTEL_SERVICE_NAME` | `image-processing-service` | Service name in traces and metrics | ## Project Structure @@ -109,11 +122,13 @@ src/ ├── domain/ # Entities & ports (zero external deps) ├── application/ # Use cases & DTOs ├── infrastructure/ # Adapters (PostgreSQL, Pillow, filesystem) +│ └── observability/ # OpenTelemetry setup, metrics, middleware └── presentation/ # FastAPI routes, schemas, middleware cpp/ # Optional C++ resize module (pybind11) k8s/ # Kubernetes manifests (Deployment, HPA, PVC, …) minikube/ # Local K8s demo scripts +│ └── observability/ # Prometheus, Tempo, Loki, Grafana manifests tests/ # tests across all architecture layers ``` diff --git a/minikube/01-configmap.yaml b/minikube/01-configmap.yaml index 289a6a8..801afbb 100644 --- a/minikube/01-configmap.yaml +++ b/minikube/01-configmap.yaml @@ -13,3 +13,6 @@ data: IMG_DB_MAX_OVERFLOW: "10" IMG_RETENTION_BATCH_SIZE: "50" IMG_DEBUG: "true" + IMG_OTEL_ENABLED: "true" + IMG_OTEL_EXPORTER_OTLP_ENDPOINT: "http://tempo.observability.svc.cluster.local:4317" + IMG_OTEL_SERVICE_NAME: "image-processing-service" diff --git a/minikube/05-service.yaml 
b/minikube/05-service.yaml index 86f24d3..6494aea 100644 --- a/minikube/05-service.yaml +++ b/minikube/05-service.yaml @@ -8,7 +8,8 @@ spec: selector: app: image-service ports: - - port: 80 + - name: http + port: 80 targetPort: 8000 nodePort: 30080 protocol: TCP diff --git a/minikube/observability/00-namespace.yaml b/minikube/observability/00-namespace.yaml new file mode 100644 index 0000000..4f75b8c --- /dev/null +++ b/minikube/observability/00-namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: observability diff --git a/minikube/observability/01-prometheus.yaml b/minikube/observability/01-prometheus.yaml new file mode 100644 index 0000000..15cc397 --- /dev/null +++ b/minikube/observability/01-prometheus.yaml @@ -0,0 +1,113 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: observability +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + + scrape_configs: + - job_name: "image-service" + metrics_path: /metrics + kubernetes_sd_configs: + - role: endpoints + namespaces: + names: + - cv-platform + relabel_configs: + - source_labels: [__meta_kubernetes_service_name] + action: keep + regex: image-service + - source_labels: [__meta_kubernetes_endpoint_port_name] + action: keep + regex: http + # Fallback static config if K8s SD is not available + - job_name: "image-service-static" + metrics_path: /metrics + static_configs: + - targets: ["image-service.cv-platform.svc.cluster.local:80"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: + - apiGroups: [""] + resources: ["services", "endpoints", "pods"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: default + namespace: observability +--- 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + namespace: observability + labels: + app: prometheus +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + containers: + - name: prometheus + image: prom/prometheus:v2.53.0 + args: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + - "--storage.tsdb.retention.time=7d" + - "--web.enable-remote-write-receiver" + ports: + - containerPort: 9090 + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + volumeMounts: + - name: config + mountPath: /etc/prometheus + - name: data + mountPath: /prometheus + volumes: + - name: config + configMap: + name: prometheus-config + - name: data + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: observability +spec: + selector: + app: prometheus + ports: + - port: 9090 + targetPort: 9090 + type: ClusterIP diff --git a/minikube/observability/02-tempo.yaml b/minikube/observability/02-tempo.yaml new file mode 100644 index 0000000..09891f9 --- /dev/null +++ b/minikube/observability/02-tempo.yaml @@ -0,0 +1,124 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tempo + namespace: observability + labels: + app: tempo +spec: + replicas: 1 + selector: + matchLabels: + app: tempo + template: + metadata: + labels: + app: tempo + spec: + containers: + - name: tempo + image: grafana/tempo:2.4.1 + args: + - "-config.file=/etc/tempo/tempo.yaml" + ports: + - containerPort: 3200 # HTTP API + - containerPort: 4317 # OTLP gRPC + - containerPort: 4318 # OTLP HTTP + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + volumeMounts: + - name: config + mountPath: /etc/tempo + - name: data + mountPath: /tmp/tempo + volumes: + - name: config + configMap: + name: tempo-config + - name: data + emptyDir: {} +--- +apiVersion: v1 +kind: 
ConfigMap +metadata: + name: tempo-config + namespace: observability +data: + tempo.yaml: | + server: + http_listen_port: 3200 + + distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + http: + endpoint: "0.0.0.0:4318" + + storage: + trace: + backend: local + local: + path: /tmp/tempo/blocks + wal: + path: /tmp/tempo/wal + + compactor: + compaction: + block_retention: 48h + + metrics_generator: + processor: + service_graphs: + dimensions: + - http.method + - http.route + span_metrics: + dimensions: + - http.method + - http.route + - http.status_code + registry: + external_labels: + source: tempo + cluster: minikube + storage: + path: /tmp/tempo/generator/wal + remote_write: + - url: http://prometheus.observability.svc.cluster.local:9090/api/v1/write + send_exemplars: true + + overrides: + defaults: + metrics_generator: + processors: + - service-graphs + - span-metrics +--- +apiVersion: v1 +kind: Service +metadata: + name: tempo + namespace: observability +spec: + selector: + app: tempo + ports: + - name: http + port: 3200 + targetPort: 3200 + - name: otlp-grpc + port: 4317 + targetPort: 4317 + - name: otlp-http + port: 4318 + targetPort: 4318 + type: ClusterIP diff --git a/minikube/observability/03-loki.yaml b/minikube/observability/03-loki.yaml new file mode 100644 index 0000000..9100842 --- /dev/null +++ b/minikube/observability/03-loki.yaml @@ -0,0 +1,102 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: loki-config + namespace: observability +data: + loki.yaml: | + auth_enabled: false + + server: + http_listen_port: 3100 + grpc_listen_port: 9096 + + common: + path_prefix: /tmp/loki + storage: + filesystem: + chunks_directory: /tmp/loki/chunks + rules_directory: /tmp/loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + + schema_config: + configs: + - from: 2020-10-24 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + + limits_config: + reject_old_samples: 
true + reject_old_samples_max_age: 168h + allow_structured_metadata: true + + query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: loki + namespace: observability + labels: + app: loki +spec: + replicas: 1 + selector: + matchLabels: + app: loki + template: + metadata: + labels: + app: loki + spec: + containers: + - name: loki + image: grafana/loki:2.9.4 + args: + - "-config.file=/etc/loki/loki.yaml" + ports: + - containerPort: 3100 + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + volumeMounts: + - name: config + mountPath: /etc/loki + - name: data + mountPath: /tmp/loki + volumes: + - name: config + configMap: + name: loki-config + - name: data + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: loki + namespace: observability +spec: + selector: + app: loki + ports: + - port: 3100 + targetPort: 3100 + type: ClusterIP diff --git a/minikube/observability/04-promtail.yaml b/minikube/observability/04-promtail.yaml new file mode 100644 index 0000000..d4079aa --- /dev/null +++ b/minikube/observability/04-promtail.yaml @@ -0,0 +1,159 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: promtail-config + namespace: observability +data: + promtail.yaml: | + server: + http_listen_port: 9080 + grpc_listen_port: 0 + + positions: + filename: /tmp/positions.yaml + + clients: + - url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push + + scrape_configs: + - job_name: kubernetes-pods + kubernetes_sd_configs: + - role: pod + relabel_configs: + # Only keep pods in cv-platform namespace + - source_labels: [__meta_kubernetes_namespace] + action: keep + regex: cv-platform + # Set log file path from pod metadata + - source_labels: + - __meta_kubernetes_pod_uid + - __meta_kubernetes_pod_container_name + target_label: __path__ + separator: / + regex: (.+)/(.+) + replacement: 
/var/log/pods/*$1/$2/*.log + - source_labels: [__meta_kubernetes_pod_label_app] + target_label: app + - source_labels: [__meta_kubernetes_namespace] + target_label: namespace + - source_labels: [__meta_kubernetes_pod_name] + target_label: pod + - source_labels: [__meta_kubernetes_pod_container_name] + target_label: container + pipeline_stages: + - cri: {} + - json: + expressions: + level: level + trace_id: trace_id + span_id: span_id + correlation_id: correlation_id + - labels: + level: + trace_id: + + - job_name: cv-platform-pods + static_configs: + - targets: [localhost] + labels: + job: cv-platform + namespace: cv-platform + app: image-service + __path__: /var/log/pods/cv-platform_image-service-*/*/*.log + pipeline_stages: + - docker: {} + - json: + expressions: + level: level + trace_id: trace_id + span_id: span_id + correlation_id: correlation_id + - labels: + level: + trace_id: +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: promtail + namespace: observability + labels: + app: promtail +spec: + selector: + matchLabels: + app: promtail + template: + metadata: + labels: + app: promtail + spec: + serviceAccountName: promtail + containers: + - name: promtail + image: grafana/promtail:2.9.4 + args: + - "-config.file=/etc/promtail/promtail.yaml" + ports: + - containerPort: 9080 + resources: + requests: + cpu: "50m" + memory: "64Mi" + limits: + cpu: "200m" + memory: "128Mi" + volumeMounts: + - name: config + mountPath: /etc/promtail + - name: varlog + mountPath: /var/log + readOnly: true + - name: pods + mountPath: /var/log/pods + readOnly: true + - name: containers + mountPath: /var/lib/docker/containers + readOnly: true + volumes: + - name: config + configMap: + name: promtail-config + - name: varlog + hostPath: + path: /var/log + - name: pods + hostPath: + path: /var/log/pods + - name: containers + hostPath: + path: /var/lib/docker/containers +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: promtail + namespace: observability +--- 
+apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: promtail +rules: + - apiGroups: [""] + resources: ["nodes", "nodes/proxy", "services", "endpoints", "pods"] + verbs: ["get", "watch", "list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: promtail +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: promtail +subjects: + - kind: ServiceAccount + name: promtail + namespace: observability diff --git a/minikube/observability/05-grafana.yaml b/minikube/observability/05-grafana.yaml new file mode 100644 index 0000000..d31c5fd --- /dev/null +++ b/minikube/observability/05-grafana.yaml @@ -0,0 +1,143 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasources + namespace: observability +data: + datasources.yaml: | + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + uid: prometheus + access: proxy + url: http://prometheus.observability.svc.cluster.local:9090 + isDefault: true + editable: false + + - name: Tempo + type: tempo + uid: tempo + access: proxy + url: http://tempo.observability.svc.cluster.local:3200 + editable: false + jsonData: + tracesToLogsV2: + datasourceUid: loki + filterByTraceID: true + filterBySpanID: false + tracesToMetrics: + datasourceUid: prometheus + serviceMap: + datasourceUid: prometheus + nodeGraph: + enabled: true + lokiSearch: + datasourceUid: loki + + - name: Loki + type: loki + access: proxy + uid: loki + url: http://loki.observability.svc.cluster.local:3100 + editable: false + jsonData: + derivedFields: + - datasourceUid: tempo + matcherRegex: '"trace_id":"(\w+)"' + name: TraceID + url: "$${__value.raw}" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-provider + namespace: observability +data: + dashboards.yaml: | + apiVersion: 1 + providers: + - name: default + orgId: 1 + folder: "" + type: file + disableDeletion: false + editable: true + options: + path: 
/var/lib/grafana/dashboards + foldersFromFilesStructure: false +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana + namespace: observability + labels: + app: grafana +spec: + replicas: 1 + selector: + matchLabels: + app: grafana + template: + metadata: + labels: + app: grafana + spec: + containers: + - name: grafana + image: grafana/grafana:10.4.0 + ports: + - containerPort: 3000 + env: + - name: GF_SECURITY_ADMIN_USER + value: admin + - name: GF_SECURITY_ADMIN_PASSWORD + value: admin + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "true" + - name: GF_AUTH_ANONYMOUS_ORG_ROLE + value: Editor + resources: + requests: + cpu: "100m" + memory: "256Mi" + limits: + cpu: "500m" + memory: "512Mi" + volumeMounts: + - name: datasources + mountPath: /etc/grafana/provisioning/datasources + - name: dashboard-provider + mountPath: /etc/grafana/provisioning/dashboards + - name: dashboards + mountPath: /var/lib/grafana/dashboards + - name: data + mountPath: /var/lib/grafana + volumes: + - name: datasources + configMap: + name: grafana-datasources + - name: dashboard-provider + configMap: + name: grafana-dashboard-provider + - name: dashboards + configMap: + name: grafana-dashboards + - name: data + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana + namespace: observability +spec: + selector: + app: grafana + ports: + - port: 3000 + targetPort: 3000 + nodePort: 30300 + type: NodePort diff --git a/minikube/observability/06-grafana-dashboards.yaml b/minikube/observability/06-grafana-dashboards.yaml new file mode 100644 index 0000000..2f8d72f --- /dev/null +++ b/minikube/observability/06-grafana-dashboards.yaml @@ -0,0 +1,364 @@ +apiVersion: v1 +data: + logs.json: | + { + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Application Logs", + "type": "logs", + "gridPos": { "h": 16, "w": 24, "x": 0, "y": 0 }, + "datasource": { "type": "loki", "uid": "loki" }, + 
"targets": [ + { + "expr": "{app=\"image-service\"}", + "legendFormat": "" + } + ], + "options": { + "showTime": true, + "showLabels": false, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending", + "dedupStrategy": "none" + } + }, + { + "title": "Log Volume by Level", + "type": "timeseries", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "expr": "sum(count_over_time({app=\"image-service\"} | json | level=~\"ERROR|WARNING\" [1m])) by (level)", + "legendFormat": "{{level}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "bars", + "fillOpacity": 50, + "stacking": { "mode": "normal" } + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "ERROR" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", "options": "WARNING" }, + "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] + } + ] + } + }, + { + "title": "Error Logs", + "type": "logs", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 24 }, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "expr": "{app=\"image-service\"} | json | level=\"ERROR\"", + "legendFormat": "" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "wrapLogMessage": true, + "prettifyLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending" + } + } + ], + "schemaVersion": 39, + "tags": ["image-service", "logs", "loki"], + "templating": { + "list": [] + }, + "time": { "from": "now-1h", "to": "now" }, + "title": "Image Service - Logs", + "uid": "image-service-logs", + "version": 1 + } + red-metrics.json: | + { + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Request Rate (req/s)", + 
"type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total[5m])) by (method, path)", + "legendFormat": "{{method}} {{path}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "title": "Error Rate (errors/s)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_request_errors_total[5m])) by (status_code)", + "legendFormat": "HTTP {{status_code}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }, + "color": { "mode": "palette-classic" } + } + } + }, + { + "title": "Request Latency (p50 / p90 / p99)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.90, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p90" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "title": "Active Requests (Saturation)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(http_active_requests)", + "legendFormat": "In-flight requests" + }, + { + "expr": "sum(images_currently_processing)", + "legendFormat": 
"Images processing" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { "drawStyle": "line", "fillOpacity": 20, "lineWidth": 2 } + } + } + }, + { + "title": "Image Processing Latency (p50 / p90 / p99)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(image_processing_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.90, sum(rate(image_processing_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p90" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(image_processing_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "title": "Image Uploads (total)", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(image_uploads_total)", + "legendFormat": "Uploads" + } + ], + "fieldConfig": { + "defaults": { "unit": "short" } + } + }, + { + "title": "Error Rate %", + "type": "gauge", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_request_errors_total[5m])) / sum(rate(http_requests_total[5m])) * 100", + "legendFormat": "Error %" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ] + }, + "min": 0, + "max": 100 + } + } + } + ], + "schemaVersion": 39, + "tags": ["image-service", "RED", "metrics"], + "templating": { + "list": [] + }, + "time": { "from": "now-1h", "to": "now" }, + 
"title": "Image Service - RED Metrics", + "uid": "image-service-red", + "version": 1 + } + traces.json: | + { + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Traces - Service Map", + "type": "nodeGraph", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 0 }, + "datasource": { "type": "tempo", "uid": "tempo" }, + "targets": [ + { + "queryType": "serviceMap" + } + ] + }, + { + "title": "Recent Traces", + "type": "table", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 10 }, + "datasource": { "type": "tempo", "uid": "tempo" }, + "targets": [ + { + "queryType": "nativeSearch", + "serviceName": "image-processing-service", + "limit": 20 + } + ], + "fieldConfig": { + "overrides": [ + { + "matcher": { "id": "byName", "options": "traceID" }, + "properties": [ + { + "id": "links", + "value": [ + { + "title": "View Trace", + "internal": { + "datasourceUid": "tempo", + "datasourceName": "Tempo", + "query": { + "refId": "A", + "queryType": "traceql", + "query": "${__value.raw}", + "datasource": { + "type": "tempo", + "uid": "tempo" + } + } + } + } + ] + } + ] + } + ] + } + }, + { + "title": "Trace Duration Distribution", + "type": "timeseries", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 20 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{path=~\"/api/.*\"}[5m])) by (le, path))", + "legendFormat": "p50 {{path}}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{path=~\"/api/.*\"}[5m])) by (le, path))", + "legendFormat": "p95 {{path}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + } + ], + "schemaVersion": 39, + "tags": ["image-service", "traces", "tempo"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "title": "Image 
Service - Traces", + "uid": "image-service-traces", + "version": 1 + } +kind: ConfigMap +metadata: + name: grafana-dashboards + namespace: observability diff --git a/minikube/observability/README.md b/minikube/observability/README.md new file mode 100644 index 0000000..f5ae409 --- /dev/null +++ b/minikube/observability/README.md @@ -0,0 +1,90 @@ +# Observability Stack — Minikube + +Deploys Prometheus, Tempo, Loki, Promtail, and Grafana into a separate +`observability` namespace alongside the image-processing-service. + +## Architecture + +``` +cv-platform namespace observability namespace +┌─────────────────────┐ ┌───────────────────────────┐ +│ image-service │──── metrics ───→ │ Prometheus (:9090) │ +│ (:8000 /metrics) │ │ │ +│ │──── traces ────→ │ Tempo (:4317 OTLP gRPC) │ +│ │ │ │ +│ (stdout JSON logs) │──── logs ──────→ │ Promtail → Loki (:3100) │ +└─────────────────────┘ │ │ + │ Grafana (:3000) │ + │ ├─ RED Metrics dashboard│ + │ ├─ Traces dashboard │ + │ └─ Logs dashboard │ + └───────────────────────────┘ +``` + +## Quick Start + +```bash +# 1. Deploy the app first +./minikube/setup.sh + +# 2. Deploy observability stack +./minikube/observability/setup.sh + +# 3. Open Grafana +minikube service grafana --namespace=observability + +# 4. 
Clean up +./minikube/observability/teardown.sh +``` + +## Grafana Dashboards + +| Dashboard | UID | Description | +|-----------|-----|-------------| +| **RED Metrics** | `image-service-red` | Request rate, error rate, latency percentiles, saturation | +| **Traces** | `image-service-traces` | Service map, recent traces, duration distribution | +| **Logs** | `image-service-logs` | Application logs, log volume by level, error logs | + +Default credentials: `admin` / `admin` + +## Components + +| Component | Image | Port | Purpose | +|-----------|-------|------|---------| +| Prometheus | `prom/prometheus:v2.53.0` | 9090 | Metrics scraping and storage | +| Tempo | `grafana/tempo:2.4.1` | 3200 (HTTP), 4317 (OTLP gRPC) | Distributed trace storage | +| Loki | `grafana/loki:2.9.4` | 3100 | Log aggregation | +| Promtail | `grafana/promtail:2.9.4` | 9080 | Log collection (DaemonSet) | +| Grafana | `grafana/grafana:10.4.0` | 3000 (NodePort 30300) | Visualization | + +## Configuration + +The image-service is configured via the ConfigMap with: + +| Variable | Value | Purpose | +|----------|-------|---------| +| `IMG_OTEL_ENABLED` | `true` | Enable OpenTelemetry instrumentation | +| `IMG_OTEL_EXPORTER_OTLP_ENDPOINT` | `http://tempo.observability.svc.cluster.local:4317` | Tempo OTLP endpoint | +| `IMG_OTEL_SERVICE_NAME` | `image-processing-service` | Service name in traces | + +## Metrics (RED + Saturation) + +- **Rate**: `http_requests_total` — total HTTP requests by method, path, status +- **Errors**: `http_request_errors_total` — 4xx/5xx errors by status code +- **Duration**: `http_request_duration_seconds` — request latency histogram +- **Saturation**: `http_active_requests` — in-flight request count +- **Image processing**: `image_processing_duration_seconds`, `image_uploads_total` + +## Traces + +OpenTelemetry auto-instruments FastAPI routes and SQLAlchemy queries. 
Trace IDs +are injected into structured JSON logs for correlation in Grafana (click from +log → trace, or trace → logs). + +## Logs + +The app emits structured JSON logs (when `IMG_OTEL_ENABLED=true`) with fields: +`timestamp`, `level`, `logger`, `message`, `correlation_id`, `trace_id`, `span_id`, `trace_flags`. + +Promtail collects pod stdout, parses JSON, and ships to Loki. Grafana's Loki +datasource is configured with derived fields to link `trace_id` → Tempo. diff --git a/minikube/observability/dashboards/logs.json b/minikube/observability/dashboards/logs.json new file mode 100644 index 0000000..3e5df59 --- /dev/null +++ b/minikube/observability/dashboards/logs.json @@ -0,0 +1,93 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Application Logs", + "type": "logs", + "gridPos": { "h": 16, "w": 24, "x": 0, "y": 0 }, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "expr": "{app=\"image-service\"}", + "legendFormat": "" + } + ], + "options": { + "showTime": true, + "showLabels": false, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending", + "dedupStrategy": "none" + } + }, + { + "title": "Log Volume by Level", + "type": "timeseries", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "expr": "sum(count_over_time({app=\"image-service\"} | json | level=~\"ERROR|WARNING\" [1m])) by (level)", + "legendFormat": "{{level}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { + "drawStyle": "bars", + "fillOpacity": 50, + "stacking": { "mode": "normal" } + } + }, + "overrides": [ + { + "matcher": { "id": "byName", "options": "ERROR" }, + "properties": [{ "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }] + }, + { + "matcher": { "id": "byName", 
"options": "WARNING" }, + "properties": [{ "id": "color", "value": { "fixedColor": "yellow", "mode": "fixed" } }] + } + ] + } + }, + { + "title": "Error Logs", + "type": "logs", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 24 }, + "datasource": { "type": "loki", "uid": "loki" }, + "targets": [ + { + "expr": "{app=\"image-service\"} | json | level=\"ERROR\"", + "legendFormat": "" + } + ], + "options": { + "showTime": true, + "showLabels": true, + "wrapLogMessage": true, + "prettifyLogMessage": true, + "enableLogDetails": true, + "sortOrder": "Descending" + } + } + ], + "schemaVersion": 39, + "tags": ["image-service", "logs", "loki"], + "templating": { + "list": [] + }, + "time": { "from": "now-1h", "to": "now" }, + "title": "Image Service - Logs", + "uid": "image-service-logs", + "version": 1 +} diff --git a/minikube/observability/dashboards/red-metrics.json b/minikube/observability/dashboards/red-metrics.json new file mode 100644 index 0000000..26d5b0c --- /dev/null +++ b/minikube/observability/dashboards/red-metrics.json @@ -0,0 +1,172 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Request Rate (req/s)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_requests_total[5m])) by (method, path)", + "legendFormat": "{{method}} {{path}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "title": "Error Rate (errors/s)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_request_errors_total[5m])) by (status_code)", + "legendFormat": "HTTP {{status_code}}" + } + ], + "fieldConfig": { + 
"defaults": { + "unit": "reqps", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 }, + "color": { "mode": "palette-classic" } + } + } + }, + { + "title": "Request Latency (p50 / p90 / p99)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.90, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p90" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "title": "Active Requests (Saturation)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(http_active_requests)", + "legendFormat": "In-flight requests" + }, + { + "expr": "sum(images_currently_processing)", + "legendFormat": "Images processing" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "custom": { "drawStyle": "line", "fillOpacity": 20, "lineWidth": 2 } + } + } + }, + { + "title": "Image Processing Latency (p50 / p90 / p99)", + "type": "timeseries", + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(image_processing_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p50" + }, + { + "expr": "histogram_quantile(0.90, sum(rate(image_processing_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p90" + }, + { + "expr": "histogram_quantile(0.99, 
sum(rate(image_processing_duration_seconds_bucket[5m])) by (le))", + "legendFormat": "p99" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + }, + { + "title": "Image Uploads (total)", + "type": "stat", + "gridPos": { "h": 8, "w": 6, "x": 12, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(image_uploads_total)", + "legendFormat": "Uploads" + } + ], + "fieldConfig": { + "defaults": { "unit": "short" } + } + }, + { + "title": "Error Rate %", + "type": "gauge", + "gridPos": { "h": 8, "w": 6, "x": 18, "y": 16 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "sum(rate(http_request_errors_total[5m])) / sum(rate(http_requests_total[5m])) * 100", + "legendFormat": "Error %" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 1 }, + { "color": "red", "value": 5 } + ] + }, + "min": 0, + "max": 100 + } + } + } + ], + "schemaVersion": 39, + "tags": ["image-service", "RED", "metrics"], + "templating": { + "list": [] + }, + "time": { "from": "now-1h", "to": "now" }, + "title": "Image Service - RED Metrics", + "uid": "image-service-red", + "version": 1 +} diff --git a/minikube/observability/dashboards/traces.json b/minikube/observability/dashboards/traces.json new file mode 100644 index 0000000..51ce6e3 --- /dev/null +++ b/minikube/observability/dashboards/traces.json @@ -0,0 +1,93 @@ +{ + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "title": "Traces - Service Map", + "type": "nodeGraph", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 0 }, + "datasource": { "type": "tempo", "uid": "tempo" }, + "targets": [ + { + "queryType": "serviceMap" + } + ] + }, + 
{ + "title": "Recent Traces", + "type": "table", + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 10 }, + "datasource": { "type": "tempo", "uid": "tempo" }, + "targets": [ + { + "queryType": "nativeSearch", + "serviceName": "image-processing-service", + "limit": 20 + } + ], + "fieldConfig": { + "overrides": [ + { + "matcher": { "id": "byName", "options": "traceID" }, + "properties": [ + { + "id": "links", + "value": [ + { + "title": "View Trace", + "internal": { + "datasourceUid": "tempo", + "datasourceName": "Tempo", + "query": { + "refId": "A", + "queryType": "traceql", + "query": "${__value.raw}", + "datasource": { + "type": "tempo", + "uid": "tempo" + } + } + } + } + ] + } + ] + } + ] + } + }, + { + "title": "Trace Duration Distribution", + "type": "timeseries", + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 20 }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(http_request_duration_seconds_bucket{path=~\"/api/.*\"}[5m])) by (le, path))", + "legendFormat": "p50 {{path}}" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket{path=~\"/api/.*\"}[5m])) by (le, path))", + "legendFormat": "p95 {{path}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s", + "custom": { "drawStyle": "line", "fillOpacity": 10, "lineWidth": 2 } + } + } + } + ], + "schemaVersion": 39, + "tags": ["image-service", "traces", "tempo"], + "templating": { "list": [] }, + "time": { "from": "now-1h", "to": "now" }, + "title": "Image Service - Traces", + "uid": "image-service-traces", + "version": 1 +} diff --git a/minikube/observability/setup.sh b/minikube/observability/setup.sh new file mode 100755 index 0000000..cb7cbd3 --- /dev/null +++ b/minikube/observability/setup.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +# ─── minikube/observability/setup.sh ───────────────────────────────────────── +# Deploys the observability stack (Prometheus, Tempo, Loki, Promtail, Grafana) +# into a 
separate "observability" namespace in minikube. +# +# Prerequisites: +# - minikube running (start it via ../setup.sh first) +# - kubectl installed +# +# Usage: +# cd +# ./minikube/observability/setup.sh +# ────────────────────────────────────────────────────────────────────────────── +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +DASHBOARD_DIR="$SCRIPT_DIR/dashboards" + +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +NC='\033[0m' + +info() { echo -e "${CYAN}[INFO]${NC} $*"; } +ok() { echo -e "${GREEN}[OK]${NC} $*"; } + +# ── 1. Create namespace ───────────────────────────────────────────────────── +info "Creating observability namespace..." +kubectl apply -f "$SCRIPT_DIR/00-namespace.yaml" +ok "Namespace created" + +# ── 2. Generate Grafana dashboards ConfigMap from JSON files ───────────────── +info "Generating Grafana dashboards ConfigMap..." +kubectl create configmap grafana-dashboards \ + --namespace=observability \ + --from-file="$DASHBOARD_DIR/red-metrics.json" \ + --from-file="$DASHBOARD_DIR/traces.json" \ + --from-file="$DASHBOARD_DIR/logs.json" \ + --dry-run=client -o yaml | kubectl apply -f - +ok "Dashboard ConfigMap created" + +# ── 3. Deploy observability stack ──────────────────────────────────────────── +info "Deploying Prometheus..." +kubectl apply -f "$SCRIPT_DIR/01-prometheus.yaml" +ok "Prometheus deployed" + +info "Deploying Tempo..." +kubectl apply -f "$SCRIPT_DIR/02-tempo.yaml" +ok "Tempo deployed" + +info "Deploying Loki..." +kubectl apply -f "$SCRIPT_DIR/03-loki.yaml" +ok "Loki deployed" + +info "Deploying Promtail..." +kubectl apply -f "$SCRIPT_DIR/04-promtail.yaml" +ok "Promtail deployed" + +info "Deploying Grafana..." +kubectl apply -f "$SCRIPT_DIR/05-grafana.yaml" +ok "Grafana deployed" + +# ── 4. Wait for pods ──────────────────────────────────────────────────────── +info "Waiting for observability pods to be ready (timeout 180s)..." 
+kubectl wait --namespace=observability \ + --for=condition=ready pod \ + --selector=app=prometheus \ + --timeout=180s 2>/dev/null || true + +kubectl wait --namespace=observability \ + --for=condition=ready pod \ + --selector=app=tempo \ + --timeout=180s 2>/dev/null || true + +kubectl wait --namespace=observability \ + --for=condition=ready pod \ + --selector=app=loki \ + --timeout=180s 2>/dev/null || true + +kubectl wait --namespace=observability \ + --for=condition=ready pod \ + --selector=app=grafana \ + --timeout=180s 2>/dev/null || true + +ok "All observability pods ready" + +# ── 5. Print access URLs ──────────────────────────────────────────────────── +GRAFANA_URL=$(minikube service grafana --namespace=observability --url 2>/dev/null || true) + +echo "" +echo "════════════════════════════════════════════════════════════════════" +echo "" +ok "Observability stack deployed!" +echo "" +if [ -n "$GRAFANA_URL" ]; then + echo -e " Grafana : ${GREEN}${GRAFANA_URL}${NC} (admin/admin)" + echo -e " RED Metrics : ${GREEN}${GRAFANA_URL}/d/image-service-red${NC}" + echo -e " Traces : ${GREEN}${GRAFANA_URL}/d/image-service-traces${NC}" + echo -e " Logs : ${GREEN}${GRAFANA_URL}/d/image-service-logs${NC}" +else + echo -e " Run: ${YELLOW}minikube service grafana --namespace=observability${NC}" +fi +echo "" +echo -e " Prometheus : port-forward with ${CYAN}kubectl port-forward -n observability svc/prometheus 9090${NC}" +echo "" +echo "════════════════════════════════════════════════════════════════════" diff --git a/minikube/observability/teardown.sh b/minikube/observability/teardown.sh new file mode 100755 index 0000000..c8ed522 --- /dev/null +++ b/minikube/observability/teardown.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +# ─── minikube/observability/teardown.sh ────────────────────────────────────── +# Removes all observability resources. 
+# +# Usage: +# ./minikube/observability/teardown.sh +# ────────────────────────────────────────────────────────────────────────────── +set -euo pipefail + +GREEN='\033[0;32m' +CYAN='\033[0;36m' +NC='\033[0m' + +info() { echo -e "${CYAN}[INFO]${NC} $*"; } +ok() { echo -e "${GREEN}[OK]${NC} $*"; } + +info "Deleting observability namespace and all its resources..." +kubectl delete namespace observability --ignore-not-found=true + +# Clean up cluster-wide RBAC resources +kubectl delete clusterrole prometheus --ignore-not-found=true +kubectl delete clusterrolebinding prometheus --ignore-not-found=true +kubectl delete clusterrole promtail --ignore-not-found=true +kubectl delete clusterrolebinding promtail --ignore-not-found=true + +ok "Observability stack removed" diff --git a/minikube/setup.sh b/minikube/setup.sh index b0482db..4a41c8e 100755 --- a/minikube/setup.sh +++ b/minikube/setup.sh @@ -98,5 +98,6 @@ else fi echo "" echo -e " Run the demo: ${CYAN}./minikube/demo.sh${NC}" +echo -e " Observability: ${CYAN}./minikube/observability/setup.sh${NC}" echo "" echo "════════════════════════════════════════════════════════════════════" diff --git a/pyproject.toml b/pyproject.toml index ebb54b3..17b8988 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "image-processing-service" -version = "2.0.2" +version = "2.1.0" description = "High-performance image processing microservice with Clean Architecture" requires-python = ">=3.11" dependencies = [ @@ -13,6 +13,14 @@ dependencies = [ "Pillow>=11.0,<12", "python-multipart>=0.0.20", "alembic>=1.14,<2", + "opentelemetry-api>=1.20,<2", + "opentelemetry-sdk>=1.20,<2", + "opentelemetry-instrumentation-fastapi>=0.41b0", + "opentelemetry-instrumentation-sqlalchemy>=0.41b0", + "opentelemetry-instrumentation-logging>=0.41b0", + "opentelemetry-exporter-otlp-proto-grpc>=1.20,<2", + "opentelemetry-exporter-prometheus>=0.41b0", + "prometheus-client>=0.20,<1", ] [project.optional-dependencies] diff --git 
a/requirements.txt b/requirements.txt index 495e1cc..702d136 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,11 @@ pydantic-settings>=2.0,<3 Pillow>=11.0,<12 python-multipart>=0.0.20 alembic>=1.14,<2 +opentelemetry-api>=1.20,<2 +opentelemetry-sdk>=1.20,<2 +opentelemetry-instrumentation-fastapi>=0.41b0 +opentelemetry-instrumentation-sqlalchemy>=0.41b0 +opentelemetry-instrumentation-logging>=0.41b0 +opentelemetry-exporter-otlp-proto-grpc>=1.20,<2 +opentelemetry-exporter-prometheus>=0.41b0 +prometheus-client>=0.20,<1 diff --git a/src/application/use_cases/process_image.py b/src/application/use_cases/process_image.py index dcf4820..9731c1c 100644 --- a/src/application/use_cases/process_image.py +++ b/src/application/use_cases/process_image.py @@ -7,6 +7,7 @@ from __future__ import annotations import logging +import time import uuid from src.domain.entities.image import ImageMetadata @@ -38,6 +39,7 @@ async def execute(self, image_id: uuid.UUID) -> bool: image.mark_processing() await self._repository.save(image) + start = time.perf_counter() thumb_path: str | None = None try: raw_data = await self._storage.retrieve(image.original_path) @@ -54,6 +56,10 @@ async def execute(self, image_id: uuid.UUID) -> bool: ) image.mark_completed(thumb_path, metadata) await self._repository.save(image) + + elapsed = time.perf_counter() - start + self._record_duration(elapsed) + logger.info( "Processing completed: image=%s width=%d height=%d format=%s", image_id, @@ -73,3 +79,12 @@ async def execute(self, image_id: uuid.UUID) -> bool: raise return True + + @staticmethod + def _record_duration(elapsed: float) -> None: + try: + from src.infrastructure.observability.metrics import image_processing_duration + + image_processing_duration.record(elapsed, {"operation": "thumbnail"}) + except Exception: + pass diff --git a/src/application/use_cases/upload_image.py b/src/application/use_cases/upload_image.py index 70561f6..01accf5 100644 --- 
a/src/application/use_cases/upload_image.py +++ b/src/application/use_cases/upload_image.py @@ -40,8 +40,18 @@ async def execute( saved = await self._repository.save(image) logger.info("Image persisted: id=%s filename=%s path=%s", saved.id, filename, storage_path) + self._record_upload() return _to_response(saved) + @staticmethod + def _record_upload() -> None: + try: + from src.infrastructure.observability.metrics import image_uploads_total + + image_uploads_total.add(1) + except Exception: + pass + def _to_response(img: Image) -> ImageResponse: return ImageResponse( diff --git a/src/config.py b/src/config.py index 661767f..207acd9 100644 --- a/src/config.py +++ b/src/config.py @@ -58,4 +58,9 @@ def database_url(self) -> str: # ── Authentication ─────────────────────────────────────────────────── api_key: str = "" + # ── Observability ──────────────────────────────────────────────────── + otel_enabled: bool = False + otel_exporter_otlp_endpoint: str = "http://localhost:4317" + otel_service_name: str = "image-processing-service" + model_config = {"env_prefix": "IMG_"} diff --git a/src/infrastructure/observability/__init__.py b/src/infrastructure/observability/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/infrastructure/observability/metrics.py b/src/infrastructure/observability/metrics.py new file mode 100644 index 0000000..672e3bb --- /dev/null +++ b/src/infrastructure/observability/metrics.py @@ -0,0 +1,58 @@ +"""Application-level RED + saturation metrics using OpenTelemetry. 
+ +Exposes: +- request_duration (histogram) — latency +- request_total (counter) — traffic +- request_errors_total (counter) — errors +- active_requests (up-down counter) — saturation +- image_processing_duration (histogram) — processing latency +- image_uploads_total (counter) — upload traffic +- images_processing (up-down counter) — processing saturation +""" + +from __future__ import annotations + +from opentelemetry import metrics + +_meter = metrics.get_meter("image-processing-service") + +# ── Latency ────────────────────────────────────────────────────────────────── +request_duration = _meter.create_histogram( + name="http_request_duration_seconds", + description="HTTP request latency in seconds", + unit="s", +) + +image_processing_duration = _meter.create_histogram( + name="image_processing_duration_seconds", + description="Image processing latency in seconds", + unit="s", +) + +# ── Traffic ────────────────────────────────────────────────────────────────── +request_total = _meter.create_counter( + name="http_requests_total", + description="Total HTTP requests", +) + +image_uploads_total = _meter.create_counter( + name="image_uploads_total", + description="Total image uploads", +) + +# ── Errors ─────────────────────────────────────────────────────────────────── +request_errors_total = _meter.create_counter( + name="http_request_errors_total", + description="Total HTTP request errors (4xx, 5xx)", +) + +# ── Saturation ─────────────────────────────────────────────────────────────── +active_requests = _meter.create_up_down_counter( + name="http_active_requests", + description="Number of in-flight HTTP requests", +) + +images_processing = _meter.create_up_down_counter( + name="images_currently_processing", + description="Number of images currently being processed", +) diff --git a/src/infrastructure/observability/middleware.py b/src/infrastructure/observability/middleware.py new file mode 100644 index 0000000..eeb8dd4 --- /dev/null +++ 
b/src/infrastructure/observability/middleware.py @@ -0,0 +1,47 @@ +"""Middleware that records RED metrics for every HTTP request.""" + +from __future__ import annotations + +import time + +from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint +from starlette.requests import Request +from starlette.responses import Response + +from src.infrastructure.observability.metrics import ( + active_requests, + request_duration, + request_errors_total, + request_total, +) + + +class MetricsMiddleware(BaseHTTPMiddleware): + """Record request duration, count, and error rate as OpenTelemetry metrics.""" + + async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response: + if request.url.path in ("/health", "/metrics"): + return await call_next(request) + + attrs = {"method": request.method, "path": request.url.path} + active_requests.add(1, attrs) + start = time.perf_counter() + + try: + response = await call_next(request) + except Exception: + request_errors_total.add(1, {**attrs, "status_code": "500"}) + active_requests.add(-1, attrs) + raise + + elapsed = time.perf_counter() - start + status_code = str(response.status_code) + + request_duration.record(elapsed, {**attrs, "status_code": status_code}) + request_total.add(1, {**attrs, "status_code": status_code}) + + if response.status_code >= 400: + request_errors_total.add(1, {**attrs, "status_code": status_code}) + + active_requests.add(-1, attrs) + return response diff --git a/src/infrastructure/observability/setup.py b/src/infrastructure/observability/setup.py new file mode 100644 index 0000000..0370a1b --- /dev/null +++ b/src/infrastructure/observability/setup.py @@ -0,0 +1,87 @@ +"""OpenTelemetry setup — traces, metrics, and log correlation. 
+ +Configures: +- OTLP gRPC exporter for traces (→ Tempo) +- Prometheus exporter for metrics (→ Prometheus → Grafana) +- FastAPI and SQLAlchemy auto-instrumentation +- Custom RED metrics (request rate, error rate, duration) + saturation gauges +""" + +from __future__ import annotations + +import logging +from typing import TYPE_CHECKING + +from opentelemetry import metrics, trace +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.exporter.prometheus import PrometheusMetricReader +from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor +from opentelemetry.instrumentation.logging import LoggingInstrumentor +from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.resources import SERVICE_NAME, Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor + +if TYPE_CHECKING: + from fastapi import FastAPI + +logger = logging.getLogger(__name__) + + +def _build_resource(service_name: str) -> Resource: + return Resource.create({SERVICE_NAME: service_name}) + + +def setup_tracing( + *, + service_name: str, + otlp_endpoint: str, +) -> TracerProvider: + """Configure and set the global TracerProvider with OTLP export.""" + resource = _build_resource(service_name) + provider = TracerProvider(resource=resource) + exporter = OTLPSpanExporter(endpoint=otlp_endpoint, insecure=True) + provider.add_span_processor(BatchSpanProcessor(exporter)) + trace.set_tracer_provider(provider) + logger.info("Tracing configured → %s", otlp_endpoint) + return provider + + +def setup_metrics(*, service_name: str) -> PrometheusMetricReader: + """Configure and set the global MeterProvider with Prometheus export.""" + resource = _build_resource(service_name) + reader = PrometheusMetricReader() + provider = MeterProvider(resource=resource, metric_readers=[reader]) + 
metrics.set_meter_provider(provider) + logger.info("Metrics configured (Prometheus endpoint)") + return reader + + +def instrument_app(app: FastAPI) -> None: + """Auto-instrument FastAPI with OpenTelemetry.""" + FastAPIInstrumentor.instrument_app( + app, + excluded_urls="health,metrics", + ) + logger.info("FastAPI instrumented with OpenTelemetry") + + +def instrument_sqlalchemy(engine: object) -> None: + """Auto-instrument a SQLAlchemy engine.""" + SQLAlchemyInstrumentor().instrument(engine=engine) # type: ignore[arg-type] + logger.info("SQLAlchemy instrumented with OpenTelemetry") + + +def instrument_logging() -> None: + """Inject trace context (trace_id, span_id) into log records.""" + + def _log_hook(span: trace.Span, record: logging.LogRecord) -> None: + ctx = span.get_span_context() + if ctx is not None: + record.otelTraceID = format(ctx.trace_id, "032x") # type: ignore[attr-defined] + record.otelSpanID = format(ctx.span_id, "016x") # type: ignore[attr-defined] + record.otelTraceSampled = ctx.trace_flags.sampled # type: ignore[attr-defined] + + LoggingInstrumentor().instrument(set_logging_format=False, log_hook=_log_hook) + logger.info("Logging instrumented with OpenTelemetry trace context") diff --git a/src/main.py b/src/main.py index e89dd21..fc3a4f9 100644 --- a/src/main.py +++ b/src/main.py @@ -6,7 +6,7 @@ import logging from contextlib import asynccontextmanager -from fastapi import FastAPI +from fastapi import FastAPI, Response from src.infrastructure.database.models import Base from src.presentation.api.dependencies import get_settings @@ -14,7 +14,15 @@ from src.presentation.api.routes import health, images, retention from src.presentation.logging_config import configure_logging -configure_logging(json_output=False) + +def _is_otel_enabled() -> bool: + """Check OTel flag without requiring full Settings (DB creds may be absent).""" + import os + + return os.getenv("IMG_OTEL_ENABLED", "false").lower() in ("1", "true", "yes") + + 
+configure_logging(json_output=_is_otel_enabled()) logger = logging.getLogger(__name__) @@ -25,6 +33,18 @@ async def lifespan(app: FastAPI): settings = get_settings() engine = build_engine(settings) + + # ── OpenTelemetry SQLAlchemy + logging instrumentation ─────────────── + if _is_otel_enabled(): + from src.infrastructure.observability.setup import ( + instrument_logging, + instrument_sqlalchemy, + ) + + instrument_sqlalchemy(engine.sync_engine) + instrument_logging() + logger.info("OpenTelemetry initialized") + async with engine.begin() as conn: await conn.run_sync(Base.metadata.create_all) logger.info("Database tables ready") @@ -36,6 +56,23 @@ async def lifespan(app: FastAPI): def create_app() -> FastAPI: + otel_enabled = _is_otel_enabled() + + # ── OpenTelemetry tracing/metrics must be set up before app creation ─ + if otel_enabled: + from src.infrastructure.observability.setup import ( + instrument_app, + setup_metrics, + setup_tracing, + ) + + settings = get_settings() + setup_tracing( + service_name=settings.otel_service_name, + otlp_endpoint=settings.otel_exporter_otlp_endpoint, + ) + setup_metrics(service_name=settings.otel_service_name) + app = FastAPI( title="Image Processing Service", description="High-performance image processing microservice — Clean Architecture demo", @@ -45,12 +82,26 @@ def create_app() -> FastAPI: lifespan=lifespan, ) + if otel_enabled: + from src.infrastructure.observability.middleware import MetricsMiddleware + + app.add_middleware(MetricsMiddleware) + instrument_app(app) + app.add_middleware(RequestLoggingMiddleware) app.include_router(health.router) app.include_router(images.router) app.include_router(retention.router) + if otel_enabled: + + @app.get("/metrics", include_in_schema=False) + async def metrics_endpoint() -> Response: + from prometheus_client import generate_latest + + return Response(content=generate_latest(), media_type="text/plain; charset=utf-8") + return app diff --git a/src/presentation/logging_config.py 
b/src/presentation/logging_config.py index b2d641d..74f531c 100644 --- a/src/presentation/logging_config.py +++ b/src/presentation/logging_config.py @@ -37,6 +37,9 @@ def format(self, record: logging.LogRecord) -> str: "logger": record.name, "message": record.getMessage(), "correlation_id": getattr(record, "correlation_id", ""), + "trace_id": getattr(record, "otelTraceID", ""), + "span_id": getattr(record, "otelSpanID", ""), + "trace_flags": getattr(record, "otelTraceSampled", ""), } if record.exc_info and record.exc_info[1] is not None: log_entry["exception"] = self.formatException(record.exc_info)