From 17a64a7c22ae57714c765d77aa0505188b9e2958 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Tue, 19 May 2026 08:12:50 +0100 Subject: [PATCH 01/18] fix(orchestrator): fail DB creation job on actual errors instead of silently succeeding --- charts/backstage/templates/sonataflows.yaml | 22 ++++++++++++++++++--- charts/backstage/values.yaml | 2 ++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/charts/backstage/templates/sonataflows.yaml b/charts/backstage/templates/sonataflows.yaml index d957f89d..8007b165 100644 --- a/charts/backstage/templates/sonataflows.yaml +++ b/charts/backstage/templates/sonataflows.yaml @@ -186,11 +186,27 @@ spec: command: [ "sh", "-c" ] {{- if .Values.upstream.postgresql.enabled }} args: - - "psql -h {{ .Release.Name }}-postgresql{{- if eq .Values.upstream.postgresql.architecture "replication" }}-primary{{- end }} -p 5432 -U postgres -c 'CREATE DATABASE sonataflow;' || echo WARNING: Could not create database" + - | + psql -h {{ .Release.Name }}-postgresql{{- if eq .Values.upstream.postgresql.architecture "replication" }}-primary{{- end }} -p 5432 -U postgres -c 'CREATE DATABASE sonataflow;' 2>&1 || { + if psql -h {{ .Release.Name }}-postgresql{{- if eq .Values.upstream.postgresql.architecture "replication" }}-primary{{- end }} -p 5432 -U postgres -tc "SELECT 1 FROM pg_database WHERE datname='sonataflow'" | grep -q 1; then + echo "Database 'sonataflow' already exists, skipping creation." + else + echo "ERROR: Failed to create database 'sonataflow'." 
+ exit 1 + fi + } {{- else }} args: - - "psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -d {{ .Values.orchestrator.sonataflowPlatform.externalDBName }} -c 'CREATE DATABASE sonataflow;' || echo WARNING: Could not create database" + - | + psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -d {{ .Values.orchestrator.sonataflowPlatform.externalDBName }} -c 'CREATE DATABASE sonataflow;' 2>&1 || { + if psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -d {{ .Values.orchestrator.sonataflowPlatform.externalDBName }} -tc "SELECT 1 FROM pg_database WHERE datname='sonataflow'" | grep -q 1; then + echo "Database 'sonataflow' already exists, skipping creation." + else + echo "ERROR: Failed to create database 'sonataflow'." + exit 1 + fi + } {{- end }} restartPolicy: Never - backoffLimit: 2 + backoffLimit: {{ .Values.orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit }} {{- end }} diff --git a/charts/backstage/values.yaml b/charts/backstage/values.yaml index 630f772c..d5ff818c 100644 --- a/charts/backstage/values.yaml +++ b/charts/backstage/values.yaml @@ -557,6 +557,8 @@ orchestrator: initContainerImage: "{{ .Values.upstream.postgresql.image.registry }}/{{ .Values.upstream.postgresql.image.repository }}:{{ .Values.upstream.postgresql.image.tag }}" # -- Image for the container used by the create-db job createDBJobImage: "{{ .Values.upstream.postgresql.image.registry }}/{{ .Values.upstream.postgresql.image.repository }}:{{ .Values.upstream.postgresql.image.tag }}" + # -- Number of retries for the create-db job if it fails + dbCreationJobBackoffLimit: 2 # -- Image for the container used by the sonataflow jobs service, optional and used for disconnected environments jobServiceImage: "" # -- Image for the container used by the sonataflow data index, optional and used for disconnected environments From c4cd0dd64632ebb463e17038a3ddb7029c277028 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Tue, 19 May 2026 08:21:04 +0100 
Subject: [PATCH 02/18] bump chart Signed-off-by: Fortune-Ndlovu --- charts/backstage/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/backstage/Chart.yaml b/charts/backstage/Chart.yaml index 59fca4ed..293ae765 100644 --- a/charts/backstage/Chart.yaml +++ b/charts/backstage/Chart.yaml @@ -47,4 +47,4 @@ sources: [] # Versions are expected to follow Semantic Versioning (https://semver.org/) # Note that when this chart is published to https://github.com/openshift-helm-charts/charts # it will follow the RHDH versioning 1.y.z -version: 5.11.1 +version: 5.11.2 From 23248730d5fa7d1b089e37809aa46990d62aa43d Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Tue, 19 May 2026 08:25:35 +0100 Subject: [PATCH 03/18] update Signed-off-by: Fortune-Ndlovu --- charts/backstage/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/backstage/Chart.yaml b/charts/backstage/Chart.yaml index 0c50edf5..947080f3 100644 --- a/charts/backstage/Chart.yaml +++ b/charts/backstage/Chart.yaml @@ -47,4 +47,4 @@ sources: [] # Versions are expected to follow Semantic Versioning (https://semver.org/) # Note that when this chart is published to https://github.com/openshift-helm-charts/charts # it will follow the RHDH versioning 1.y.z -version: 5.13.2 +version: 5.11.3 From 43dca16a207a238b37fe446a703f8d85b4d28455 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Tue, 19 May 2026 08:30:34 +0100 Subject: [PATCH 04/18] bump chart version to 5.13.3 --- charts/backstage/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/backstage/Chart.yaml b/charts/backstage/Chart.yaml index 947080f3..40bd4aac 100644 --- a/charts/backstage/Chart.yaml +++ b/charts/backstage/Chart.yaml @@ -47,4 +47,4 @@ sources: [] # Versions are expected to follow Semantic Versioning (https://semver.org/) # Note that when this chart is published to https://github.com/openshift-helm-charts/charts # it will follow the RHDH versioning 1.y.z 
-version: 5.11.3 +version: 5.13.3 From ce31748b31a3f9d409c804e8c44f9e18f54c47ae Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Tue, 19 May 2026 08:34:03 +0100 Subject: [PATCH 05/18] Add dbCreationJobBackoffLimit to both values.schema.json and values.schema.tmpl.json. --- charts/backstage/values.schema.json | 5 +++++ charts/backstage/values.schema.tmpl.json | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/charts/backstage/values.schema.json b/charts/backstage/values.schema.json index d6cf6738..783da841 100644 --- a/charts/backstage/values.schema.json +++ b/charts/backstage/values.schema.json @@ -470,6 +470,11 @@ "title": "Image for the container used by the create-db job", "type": "string" }, + "dbCreationJobBackoffLimit": { + "default": 2, + "title": "Number of retries for the create-db job if it fails", + "type": "integer" + }, "dataIndexImage": { "additionalProperties": false, "title": "Image for the container used by the sonataflow data index", diff --git a/charts/backstage/values.schema.tmpl.json b/charts/backstage/values.schema.tmpl.json index 00bd3dd9..420ae8ab 100644 --- a/charts/backstage/values.schema.tmpl.json +++ b/charts/backstage/values.schema.tmpl.json @@ -568,6 +568,11 @@ "type": "string", "additionalProperties": false }, + "dbCreationJobBackoffLimit": { + "default": 2, + "title": "Number of retries for the create-db job if it fails", + "type": "integer" + }, "jobServiceImage": { "title": "Image for the container used by the sonataflow jobs service", "type": "string", From d17d0254e616ce5207717386173aae4ac8b11035 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Tue, 19 May 2026 08:40:51 +0100 Subject: [PATCH 06/18] Update README and values.schema.json for Backstage chart version 5.13.3, adding dbCreationJobBackoffLimit parameter. 
--- charts/backstage/README.md | 5 +++-- charts/backstage/values.schema.json | 10 +++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/charts/backstage/README.md b/charts/backstage/README.md index 2666cd49..16b66233 100644 --- a/charts/backstage/README.md +++ b/charts/backstage/README.md @@ -1,7 +1,7 @@ # RHDH Backstage Helm Chart for OpenShift -![Version: 5.13.2](https://img.shields.io/badge/Version-5.13.2-informational?style=flat-square) +![Version: 5.13.3](https://img.shields.io/badge/Version-5.13.3-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) A Helm chart for deploying Red Hat Developer Hub, which is a Red Hat supported version of Backstage. @@ -29,7 +29,7 @@ For the **Generally Available** version of this chart, see: helm repo add bitnami https://charts.bitnami.com/bitnami helm repo add redhat-developer https://redhat-developer.github.io/rhdh-chart -helm install my-backstage redhat-developer/backstage --version 5.13.2 +helm install my-backstage redhat-developer/backstage --version 5.13.3 ``` ## Introduction @@ -211,6 +211,7 @@ Kubernetes: `>= 1.27.0-0` | orchestrator.serverlessOperator.enabled | | bool | `true` | | orchestrator.sonataflowPlatform.createDBJobImage | Image for the container used by the create-db job | string | `"{{ .Values.upstream.postgresql.image.registry }}/{{ .Values.upstream.postgresql.image.repository }}:{{ .Values.upstream.postgresql.image.tag }}"` | | orchestrator.sonataflowPlatform.dataIndexImage | Image for the container used by the sonataflow data index, optional and used for disconnected environments | string | `""` | +| orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit | Number of retries for the create-db job if it fails | int | `2` | | orchestrator.sonataflowPlatform.eventing.broker.name | | string | `""` | | orchestrator.sonataflowPlatform.eventing.broker.namespace | | string | `""` | | 
orchestrator.sonataflowPlatform.externalDBHost | Host for the user-configured external Database | string | `""` | diff --git a/charts/backstage/values.schema.json b/charts/backstage/values.schema.json index 783da841..c6337eb0 100644 --- a/charts/backstage/values.schema.json +++ b/charts/backstage/values.schema.json @@ -470,16 +470,16 @@ "title": "Image for the container used by the create-db job", "type": "string" }, - "dbCreationJobBackoffLimit": { - "default": 2, - "title": "Number of retries for the create-db job if it fails", - "type": "integer" - }, "dataIndexImage": { "additionalProperties": false, "title": "Image for the container used by the sonataflow data index", "type": "string" }, + "dbCreationJobBackoffLimit": { + "default": 2, + "title": "Number of retries for the create-db job if it fails", + "type": "integer" + }, "eventing": { "additionalProperties": false, "properties": { From 202bbc66eeed53aa4169d3a3294c9d097135f240 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Tue, 19 May 2026 12:05:33 +0100 Subject: [PATCH 07/18] qodo suggestions: Enhance dbCreationJobBackoffLimit in values.schema.json and values.schema.tmpl.json with minimum and maximum constraints. Update sonataflows.yaml to simplify database creation command. 
Signed-off-by: Fortune-Ndlovu --- charts/backstage/templates/sonataflows.yaml | 4 ++-- charts/backstage/values.schema.json | 2 ++ charts/backstage/values.schema.tmpl.json | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/charts/backstage/templates/sonataflows.yaml b/charts/backstage/templates/sonataflows.yaml index 8007b165..6b2d68a3 100644 --- a/charts/backstage/templates/sonataflows.yaml +++ b/charts/backstage/templates/sonataflows.yaml @@ -198,8 +198,8 @@ spec: {{- else }} args: - | - psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -d {{ .Values.orchestrator.sonataflowPlatform.externalDBName }} -c 'CREATE DATABASE sonataflow;' 2>&1 || { - if psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -d {{ .Values.orchestrator.sonataflowPlatform.externalDBName }} -tc "SELECT 1 FROM pg_database WHERE datname='sonataflow'" | grep -q 1; then + psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -c 'CREATE DATABASE sonataflow;' 2>&1 || { + if psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -tc "SELECT 1 FROM pg_database WHERE datname='sonataflow'" | grep -q 1; then echo "Database 'sonataflow' already exists, skipping creation." else echo "ERROR: Failed to create database 'sonataflow'." 
diff --git a/charts/backstage/values.schema.json b/charts/backstage/values.schema.json index c6337eb0..36b4ce2a 100644 --- a/charts/backstage/values.schema.json +++ b/charts/backstage/values.schema.json @@ -477,6 +477,8 @@ }, "dbCreationJobBackoffLimit": { "default": 2, + "minimum": 0, + "maximum": 10, "title": "Number of retries for the create-db job if it fails", "type": "integer" }, diff --git a/charts/backstage/values.schema.tmpl.json b/charts/backstage/values.schema.tmpl.json index 420ae8ab..9141de7d 100644 --- a/charts/backstage/values.schema.tmpl.json +++ b/charts/backstage/values.schema.tmpl.json @@ -570,6 +570,8 @@ }, "dbCreationJobBackoffLimit": { "default": 2, + "minimum": 0, + "maximum": 10, "title": "Number of retries for the create-db job if it fails", "type": "integer" }, From 635fd182e540cf6388a50e3ac906fab1eabad977 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Tue, 19 May 2026 12:08:39 +0100 Subject: [PATCH 08/18] add max and min values Signed-off-by: Fortune-Ndlovu --- charts/backstage/values.schema.json | 2 +- charts/backstage/values.schema.tmpl.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/backstage/values.schema.json b/charts/backstage/values.schema.json index 36b4ce2a..55c75d20 100644 --- a/charts/backstage/values.schema.json +++ b/charts/backstage/values.schema.json @@ -477,8 +477,8 @@ }, "dbCreationJobBackoffLimit": { "default": 2, - "minimum": 0, "maximum": 10, + "minimum": 0, "title": "Number of retries for the create-db job if it fails", "type": "integer" }, diff --git a/charts/backstage/values.schema.tmpl.json b/charts/backstage/values.schema.tmpl.json index 9141de7d..679b1772 100644 --- a/charts/backstage/values.schema.tmpl.json +++ b/charts/backstage/values.schema.tmpl.json @@ -570,8 +570,8 @@ }, "dbCreationJobBackoffLimit": { "default": 2, - "minimum": 0, "maximum": 10, + "minimum": 0, "title": "Number of retries for the create-db job if it fails", "type": "integer" }, From 
8f6b80f3266408c6380055b83723624f8de7a250 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Tue, 19 May 2026 12:17:46 +0100 Subject: [PATCH 09/18] connect to -d postgres the standard maintenance database that always exists in PostgreSQL and matches the behavior of the internal branch Signed-off-by: Fortune-Ndlovu --- charts/backstage/templates/sonataflows.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/backstage/templates/sonataflows.yaml b/charts/backstage/templates/sonataflows.yaml index 6b2d68a3..6b28a711 100644 --- a/charts/backstage/templates/sonataflows.yaml +++ b/charts/backstage/templates/sonataflows.yaml @@ -198,8 +198,8 @@ spec: {{- else }} args: - | - psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -c 'CREATE DATABASE sonataflow;' 2>&1 || { - if psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -tc "SELECT 1 FROM pg_database WHERE datname='sonataflow'" | grep -q 1; then + psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -d postgres -c 'CREATE DATABASE sonataflow;' 2>&1 || { + if psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -d postgres -tc "SELECT 1 FROM pg_database WHERE datname='sonataflow'" | grep -q 1; then echo "Database 'sonataflow' already exists, skipping creation." else echo "ERROR: Failed to create database 'sonataflow'." 
From a8ad5066b6c1d2032c6cc57035420faedfcb1a97 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Tue, 19 May 2026 14:24:07 +0100 Subject: [PATCH 10/18] add Helm hook annotations to sonataflow DB creation Job Signed-off-by: Fortune-Ndlovu --- charts/backstage/templates/sonataflows.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/charts/backstage/templates/sonataflows.yaml b/charts/backstage/templates/sonataflows.yaml index 6b28a711..50470e33 100644 --- a/charts/backstage/templates/sonataflows.yaml +++ b/charts/backstage/templates/sonataflows.yaml @@ -87,6 +87,9 @@ kind: Job metadata: name: {{ .Release.Name }}-create-sonataflow-database namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-delete-policy": before-hook-creation spec: activeDeadlineSeconds: 120 template: From 642a9800e129ca810181473e0612df5aec570966 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Tue, 19 May 2026 17:16:12 +0100 Subject: [PATCH 11/18] Refactor sonataflow database Job name and add TTL for job completion Updated the Job name to use a shorter format and added a TTL of 300 seconds after the job finishes. Removed Helm hook annotations for cleaner configuration. 
--- charts/backstage/templates/sonataflows.yaml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/charts/backstage/templates/sonataflows.yaml b/charts/backstage/templates/sonataflows.yaml index 50470e33..66b99b1c 100644 --- a/charts/backstage/templates/sonataflows.yaml +++ b/charts/backstage/templates/sonataflows.yaml @@ -85,12 +85,10 @@ spec: apiVersion: batch/v1 kind: Job metadata: - name: {{ .Release.Name }}-create-sonataflow-database + name: {{ .Release.Name }}-create-sonataflow-db namespace: {{ .Release.Namespace }} - annotations: - "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-delete-policy": before-hook-creation spec: + ttlSecondsAfterFinished: 300 activeDeadlineSeconds: 120 template: spec: From 4f9e0a792dd14ec54b1675da594d66b730def149 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Wed, 20 May 2026 15:44:08 +0100 Subject: [PATCH 12/18] update Signed-off-by: Fortune-Ndlovu --- charts/backstage/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/backstage/README.md b/charts/backstage/README.md index 2943abc4..99e08e0a 100644 --- a/charts/backstage/README.md +++ b/charts/backstage/README.md @@ -1,7 +1,7 @@ # RHDH Backstage Helm Chart for OpenShift -![Version: 5.14.1](https://img.shields.io/badge/Version-5.14.0-informational?style=flat-square) +![Version: 5.14.1](https://img.shields.io/badge/Version-5.14.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) A Helm chart for deploying Red Hat Developer Hub, which is a Red Hat supported version of Backstage. 
From c466cb98d0c89bdc429333b9ca861b28a3082d09 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Sun, 24 May 2026 16:38:36 +0100 Subject: [PATCH 13/18] keep job name create-sonataflow-database Signed-off-by: Fortune-Ndlovu --- charts/backstage/templates/sonataflows.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/backstage/templates/sonataflows.yaml b/charts/backstage/templates/sonataflows.yaml index 66b99b1c..b91e13a5 100644 --- a/charts/backstage/templates/sonataflows.yaml +++ b/charts/backstage/templates/sonataflows.yaml @@ -85,7 +85,7 @@ spec: apiVersion: batch/v1 kind: Job metadata: - name: {{ .Release.Name }}-create-sonataflow-db + name: {{ .Release.Name }}-create-sonataflow-database namespace: {{ .Release.Namespace }} spec: ttlSecondsAfterFinished: 300 From 958b0acd04eb4b7307a70431966a3a2597e4bb24 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Wed, 27 May 2026 12:12:02 +0100 Subject: [PATCH 14/18] fix(orchestrator): add Helm hook annotations to DB creation Job for upgrade compatibility The CI "Test Latest Release" check fails because helm upgrade tries to patch the existing Job's spec.template, which Kubernetes rejects as immutable. The old chart created the Job without ttlSecondsAfterFinished, so it persists indefinitely and blocks the upgrade. Adding helm.sh/hook and helm.sh/hook-delete-policy annotations makes Helm delete the old Job before creating the new one on upgrade. 
--- charts/backstage/templates/sonataflows.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/charts/backstage/templates/sonataflows.yaml b/charts/backstage/templates/sonataflows.yaml index b91e13a5..dccecec5 100644 --- a/charts/backstage/templates/sonataflows.yaml +++ b/charts/backstage/templates/sonataflows.yaml @@ -87,6 +87,9 @@ kind: Job metadata: name: {{ .Release.Name }}-create-sonataflow-database namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-delete-policy": before-hook-creation spec: ttlSecondsAfterFinished: 300 activeDeadlineSeconds: 120 From 41fbb3e466d5c371919ffaded2bb50d52dbe8015 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Wed, 27 May 2026 12:29:02 +0100 Subject: [PATCH 15/18] fix(orchestrator): preserve failed DB creation Jobs for debugging Add hook-succeeded to the Helm hook delete policy so that successful Jobs are cleaned up immediately while failed Jobs are kept for log inspection. TTL still handles cleanup for ArgoCD users after 5 minutes.
--- charts/backstage/templates/sonataflows.yaml | 2 +- pr-info.md | 742 ++++++++++++++++++++ 2 files changed, 743 insertions(+), 1 deletion(-) create mode 100644 pr-info.md diff --git a/charts/backstage/templates/sonataflows.yaml b/charts/backstage/templates/sonataflows.yaml index dccecec5..cc38e8d0 100644 --- a/charts/backstage/templates/sonataflows.yaml +++ b/charts/backstage/templates/sonataflows.yaml @@ -89,7 +89,7 @@ metadata: namespace: {{ .Release.Namespace }} annotations: "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-delete-policy": before-hook-creation + "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded spec: ttlSecondsAfterFinished: 300 activeDeadlineSeconds: 120 diff --git a/pr-info.md b/pr-info.md new file mode 100644 index 00000000..31ea822b --- /dev/null +++ b/pr-info.md @@ -0,0 +1,742 @@ +https://redhat.atlassian.net/browse/RHDHBUGS-2577 + +# RHDHBUGS-2577: Orchestrator DB Creation Job Fix + +## Background + +When the Orchestrator is enabled, the Helm chart creates a Kubernetes Job called +`create-sonataflow-database`. This Job runs a `psql` command to create the +`sonataflow` database in PostgreSQL. The Job has a `backoffLimit` that tells +Kubernetes how many times to retry if it fails. + +There were three problems with this Job. This document explains all of them. + +--- + +## Problem 1: The Job silently swallows errors + +### What was happening + +The old command looked like this: + +```sh +psql -h -p 5432 -U postgres -c 'CREATE DATABASE sonataflow;' || echo WARNING: Could not create database +``` + +The `|| echo WARNING...` at the end is the problem. In shell, `||` means +"if the previous command fails, run this instead." The `echo` command always +succeeds (exit code 0), so the overall command always exits 0 — even when `psql` +fails. 
+ +Kubernetes checks the exit code to decide if the Job succeeded or failed: + +- Exit code 0 = success +- Any other exit code = failure (retry up to `backoffLimit` times) + +Because the exit code was always 0, Kubernetes always thought the Job succeeded. +The `backoffLimit` was set to 2, but it never triggered because the Job never +"failed" from Kubernetes' perspective. + +### Real-world impact + +If `psql` couldn't connect (wrong password, network issue, PostgreSQL not ready), +the Job would log `WARNING: Could not create database` and report success. +The `sonataflow` database would not exist, and downstream services (Data Index, +Job Service) would fail later with confusing errors that were hard to trace back +to a missing database. + +### What we changed: the conditional error handling logic + +The new command: + +```sh +psql -h -p 5432 -U postgres -c 'CREATE DATABASE sonataflow;' 2>&1 || { + if psql -h -p 5432 -U postgres -tc "SELECT 1 FROM pg_database WHERE datname='sonataflow'" | grep -q 1; then + echo "Database 'sonataflow' already exists, skipping creation." + else + echo "ERROR: Failed to create database 'sonataflow'." + exit 1 + fi +} +``` + +Here is exactly what this does, step by step: + +1. **Try to create the database.** `psql ... -c 'CREATE DATABASE sonataflow;'` + runs the SQL command. The `2>&1` redirects stderr into stdout so we capture + all output. + +2. **If it succeeds (exit code 0):** The `||` block is skipped entirely. The Job + exits 0. Kubernetes marks it as successful. Done. + +3. **If it fails (any non-zero exit code):** The `|| { ... }` block runs. But + "failed" could mean two very different things: + - The database already exists (PostgreSQL returns an error for `CREATE DATABASE` + when the database is already there). This is fine — it's the expected case + on `helm upgrade` when the database was created on the first install. + - An actual failure (wrong password, connection refused, PostgreSQL is down, + network timeout, etc.). 
This is a real problem that needs retrying. + +4. **Distinguish between those two cases.** The `if` block runs a second `psql` + command that queries the `pg_database` system catalog: + ```sql + SELECT 1 FROM pg_database WHERE datname='sonataflow' + ``` + - If this returns `1`, the database exists. The `grep -q 1` succeeds, and we + print `"Database 'sonataflow' already exists, skipping creation."` and exit 0. + Kubernetes sees success. The Job is done. + - If this returns nothing (or the second `psql` itself fails because the server + is unreachable), `grep -q 1` fails, and we fall to the `else` branch: + print `"ERROR: Failed to create database 'sonataflow'."` and `exit 1`. + Kubernetes sees a failure and retries up to `backoffLimit` times. + +There are two versions of this logic in the template — one for the built-in +PostgreSQL (`upstream.postgresql.enabled = true`) that connects directly with +`-U postgres`, and one for external databases that uses environment variables +(`${POSTGRES_HOST}`, `${POSTGRES_PORT}`, `${POSTGRES_USER}`) and connects to +`-d postgres` (the standard maintenance database that always exists on any +PostgreSQL server). + +We also made `backoffLimit` configurable via +`orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit` (default: 2, range: +0-10) so users can tune retry behavior for their environment. + +--- + +## Problem 2: Upgrading breaks because Kubernetes Jobs are immutable + +### What was happening + +Kubernetes Jobs have an immutable `spec.template` — once a Job is created, you +cannot change its pod template. When we changed the `args` (from the old +`|| echo WARNING` to the new error handling), `helm upgrade` tries to patch +the existing Job with the new spec. Kubernetes rejects this: + +``` +Job.batch "my-backstage-create-sonataflow-database" is invalid: +spec.template: Invalid value: ... 
field is immutable +``` + +This means any user upgrading from the old chart to the new chart would hit this +error and the upgrade would fail. + +### What we changed: `ttlSecondsAfterFinished` + +We added `ttlSecondsAfterFinished: 300` to the Job spec: + +```yaml +spec: + ttlSecondsAfterFinished: 300 + activeDeadlineSeconds: 120 +``` + +This tells Kubernetes to automatically delete the Job 300 seconds (5 minutes) +after it completes. This is a native Kubernetes feature — the TTL controller +watches for completed Jobs and garbage-collects them after the specified time. +No deployment tool needs to understand or honor it; Kubernetes does it on its own. + +The flow on upgrade: + +1. User installs the chart — Job runs, creates the database, completes +2. 5 minutes later — Kubernetes deletes the completed Job automatically +3. User runs `helm upgrade` (or ArgoCD syncs) — no old Job exists, so the new + Job is created fresh with the updated spec. No immutability error. + +### Why TTL matters for ArgoCD and other GitOps tools + +ArgoCD and other GitOps platforms (Flux, etc.) work differently from Helm CLI. +They render Helm templates into plain YAML, then apply that YAML to the cluster +using `kubectl apply`. They do **not** use Helm's lifecycle features like hooks +(explained in Problem 3 below). This means ArgoCD has no built-in way to delete +an old Job before creating a new one. + +`ttlSecondsAfterFinished` solves this because it works at the Kubernetes level, +not the deployment tool level. After 5 minutes, the Job is gone regardless of +whether you used Helm, ArgoCD, Flux, or `kubectl apply` directly. When the next +sync happens, the Job doesn't exist, so ArgoCD creates it fresh — no conflict. + +### Why this worked fine in our OCP testing + +When we tested on OCP, we ran 5 tests: + +1. **Fresh install** — Created the Job fresh with the new chart. No old Job + existed, so there was nothing to conflict with. +2. **TTL cleanup** — The Job auto-deleted after 5 minutes. 
Confirmed working. +3. **Upgrade after TTL** — By the time we ran `helm upgrade`, the TTL had + already cleaned up the old Job. The upgrade created a new Job from scratch. +4. **Failure retry** — We manually deleted the Job, then applied a new one with + bad credentials. Again, no old Job to conflict with. +5. **Schema validation** — Just tested `helm template`, no cluster interaction. + +In every case, the old Job was **gone** before the new one was created. We never +actually tested the scenario where an old Job (from the old chart, without TTL) +is still sitting on the cluster when the upgrade runs. That's the scenario CI +tests, and it's the one that fails. + +### Upgrade note for users on release 1.8/1.9 + +The old chart did NOT have `ttlSecondsAfterFinished`, so the old completed Job +will still be sitting on the cluster indefinitely. On the **first** upgrade to +this version, users need to manually delete the old Job: + +```bash +kubectl delete job <release-name>-create-sonataflow-database -n <namespace> +``` + +After that first upgrade, TTL handles cleanup automatically for all future +upgrades. + +--- + +## Problem 3: CI was failing — the old-to-new upgrade race condition + +### What the CI test does + +The CI uses a tool called `chart-testing` (`ct`) that tests backward +compatibility. It does this: + +1. Install the **old** chart (from the `main` branch, version 5.14.0) +2. Run tests to verify the install works +3. Upgrade to the **new** chart (from the PR branch, version 5.14.1) +4. Run tests to verify the upgrade works + +The goal is to catch breaking changes — if an upgrade from the current released +version to the new version fails, that's a problem for real users. + +### Why it failed + +Here's the exact timeline from the CI logs: + +- **15:54:06** — `helm install` with the old chart (v5.14.0). This creates the + Job with the old `|| echo WARNING` args and **no `ttlSecondsAfterFinished`**. +- **15:56:55** — `helm upgrade` with the new chart (v5.14.1).
Only ~3 minutes + later. Helm tries to patch the existing Job with the new multi-line error + handling args. +- **15:56:57** — Kubernetes rejects: `spec.template: Invalid value: ... field + is immutable` + +Two things made TTL unable to help here: + +1. **The old Job has no TTL.** `ttlSecondsAfterFinished` is set at Job creation + time. The old chart didn't have it, so the Job created in step 1 will sit + there forever until someone manually deletes it. The new chart adding TTL + doesn't retroactively apply to Jobs created by the old chart. + +2. **Even if TTL were set, 300 seconds > 170 seconds.** The CI upgrade happens + ~3 minutes after install. Even if the old Job had `ttlSecondsAfterFinished: + 300`, it wouldn't have been cleaned up in time. + +### What we changed: Helm hook annotations + +We added two annotations to the Job: + +```yaml +metadata: + name: {{ .Release.Name }}-create-sonataflow-database + namespace: {{ .Release.Namespace }} + annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-delete-policy": before-hook-creation +``` + +### What are annotations? + +Annotations are key-value metadata you can attach to any Kubernetes resource. +They look like this in YAML: + +```yaml +metadata: + annotations: + "some-key": "some-value" +``` + +Unlike labels (which Kubernetes uses for selecting and grouping resources), +annotations are purely informational — Kubernetes itself ignores them. They're +used by external tools to attach instructions to resources. In this case, Helm +reads these specific annotation keys to decide how to handle the resource. + +### What are Helm hooks? + +Normally, when you run `helm install` or `helm upgrade`, Helm creates all the +resources in your chart at the same time — Deployments, Services, ConfigMaps, +Jobs, everything goes to the cluster together. + +Helm hooks change this. 
When a resource has the `helm.sh/hook` annotation, Helm +treats it differently — it pulls it out of the normal release and runs it at a +specific point in Helm's lifecycle. Think of it like telling Helm: "Don't deploy +this with everything else. Instead, run it at this specific moment." + +### The two annotations explained + +**`"helm.sh/hook": post-install,post-upgrade`** + +This tells Helm: "This Job is a hook. Run it at these specific moments: +- `post-install` — after all normal resources are created during `helm install` +- `post-upgrade` — after all normal resources are updated during `helm upgrade` + +Without this annotation, the Job is just another resource in the chart. Helm +would try to create it on install and patch it on upgrade (which fails because +of immutability). With this annotation, Helm knows to handle it specially. + +**`"helm.sh/hook-delete-policy": before-hook-creation`** + +This tells Helm: "Before creating this hook resource, delete any previous +version of it that might still exist on the cluster." + +This is the key part that fixes the CI failure. The sequence becomes: + +1. `helm install` (old chart) — Job is created with old args +2. `helm upgrade` (new chart) — Helm sees the `before-hook-creation` policy, + **deletes the old Job first**, then creates the new Job with the new args + +No patching. No immutability error. The old Job is gone before the new one +is created. + +### Why annotations don't help ArgoCD + +ArgoCD renders Helm templates into plain YAML (`helm template`) and then applies +that YAML to the cluster using its own sync mechanism. It does **not** run +`helm install` or `helm upgrade`. 
This means: + +- ArgoCD never reads `helm.sh/hook` — it doesn't know what Helm hooks are +- ArgoCD treats the Job as a regular resource, just like a Deployment or Service +- When the Job spec changes, ArgoCD tries to apply the new spec to the existing + Job — and hits the same immutability error + +This is why we need **both** mechanisms: +- Annotations for Helm CLI users and CI +- TTL for ArgoCD and other GitOps tools + +--- + +## How we're catering for everything + +We now have a layered solution that covers every deployment scenario: + +### Layer 1: Correct error handling (the core fix) + +The conditional `psql` logic replaces `|| echo WARNING` with proper error +detection. The Job now: +- Succeeds on fresh database creation (exit 0) +- Succeeds when the database already exists (exit 0) +- Fails on actual errors — wrong credentials, connection refused, etc. (exit 1) +- Retries up to `backoffLimit` times on failure (configurable, default 2) + +This is what the Jira ticket (RHDHBUGS-2577) asked for. + +### Layer 2: Helm hook annotations (for Helm CLI and CI) + +```yaml +annotations: + "helm.sh/hook": post-install,post-upgrade + "helm.sh/hook-delete-policy": before-hook-creation +``` + +Helm deletes the old Job before creating the new one on every upgrade. This: +- Fixes the CI `chart-testing` upgrade test +- Makes `helm upgrade` work for all Helm CLI users, even immediately after + install (no need to wait for TTL) +- Handles the old-to-new upgrade path (from chart versions that didn't have TTL) + +### Layer 3: TTL auto-cleanup (for ArgoCD and other GitOps tools) + +```yaml +spec: + ttlSecondsAfterFinished: 300 +``` + +Kubernetes deletes the completed Job after 5 minutes. 
This: +- Handles ArgoCD, Flux, and any tool that ignores Helm hooks +- Works at the Kubernetes level — no deployment tool needs to understand it +- Prevents stale Jobs from accumulating on the cluster +- Makes all future syncs/upgrades work automatically (as long as 5+ minutes + have passed since the last Job completed) + +### Layer 4: Configurable backoffLimit (user flexibility) + +```yaml +orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit: 2 # default, range 0-10 +``` + +Users can tune how many times Kubernetes retries the Job on failure. The JSON +schema validates the range (0-10) and type (integer). + +### Layer 5: Documentation (for the one-time old-to-new upgrade) + +For users upgrading from charts that had neither TTL nor hooks (releases 1.8/1.9), +the upgrade instructions document the one manual step needed: + +```bash +kubectl delete job <release-name>-create-sonataflow-database -n <namespace> +``` + +This is only needed once. After that first upgrade, layers 2 and 3 handle +everything automatically.
+ +--- + +## Summary of all changes + +| Change | Who it helps | +| -------------------------------------------------------------- | ----------------------------------------------- | +| Replace `\|\| echo WARNING` with conditional error handling | Everyone — Job now fails on real errors | +| Distinguish "database already exists" from actual failures | Everyone — upgrades succeed when DB exists | +| Make `backoffLimit` configurable (`dbCreationJobBackoffLimit`) | Users who need custom retry behavior | +| Add `ttlSecondsAfterFinished: 300` | ArgoCD / GitOps users — auto-cleanup on upgrade | +| Add `helm.sh/hook` annotations | Helm CLI users and CI — immediate upgrade support | +| Connect to `-d postgres` for external DB | External DB users — uses maintenance DB | + +--- + +## Upgrade instructions for users on release 1.8/1.9 + +### Before upgrading + +The old chart does not have `ttlSecondsAfterFinished` or Helm hook annotations, +so the completed Job from your previous install is still on the cluster. Helm +CLI users with the new hook annotations will have this handled automatically. +ArgoCD users must delete the old Job manually before syncing. + +**Step 1: Check if the old Job exists** + +```bash +kubectl get job <release-name>-create-sonataflow-database -n <namespace> +``` + +Replace `<release-name>` with your Helm release name (e.g. `my-backstage`) and +`<namespace>` with your deployment namespace. + +If you see output like this, the old Job exists and must be deleted: + +``` +NAME STATUS COMPLETIONS DURATION AGE +my-backstage-create-sonataflow-database Complete 1/1 27s 5d +``` + +If you get `NotFound`, the Job is already gone and you can skip to Step 3. + +**Step 2: Delete the old Job** + +```bash +kubectl delete job <release-name>-create-sonataflow-database -n <namespace> +``` + +This is safe — the Job already completed its work (the database was created). +Deleting the Job removes the Job object and its completed pods from the cluster. +It does NOT affect the database.
+ +**Step 3: Upgrade the chart** + +```bash +helm upgrade <release-name> redhat-developer/backstage \ + -n <namespace> \ + --reuse-values +``` + +Or if you're using a local checkout: + +```bash +helm upgrade <release-name> ./charts/backstage \ + -n <namespace> \ + --reuse-values +``` + +The new chart will create a fresh Job with the updated error handling. After this +upgrade, both `ttlSecondsAfterFinished: 300` and the Helm hook annotations will +be set, so all future upgrades will work automatically without manual +intervention. + +### For ArgoCD users + +If ArgoCD manages your deployment: + +1. Delete the old Job manually (Step 2 above) +2. Trigger a sync in ArgoCD + +ArgoCD will create the new Job with the updated spec. The `ttlSecondsAfterFinished` +will ensure the Job is cleaned up automatically after future syncs. + +If you miss the manual deletion step, ArgoCD will show a sync error +(`field is immutable`). Simply delete the Job and re-sync. + +--- + +## OCP test results (2026-05-27) + +All 5 tests passed on OCP 4.20 (Kubernetes v1.33): + +| Test | Result | Details | +| -------------------------------------- | ------ | ------------------------------------------------------------------------------------------------- | +| 1. Fresh install | PASS | Job completed 1/1, logs show `CREATE DATABASE`, sonataflow DB confirmed in PostgreSQL | +| 2. TTL auto-cleanup | PASS | Job auto-deleted after 5 minutes (`ttlSecondsAfterFinished: 300` working) | +| 3. Upgrade — DB already exists | PASS | Upgrade succeeded, logs show `Database 'sonataflow' already exists, skipping creation.` | +| 4. Failure retry with bad credentials | PASS | 3 pods created (backoffLimit=2), each logged `ERROR: Failed to create database`, Job status=Failed | +| 5. Schema validation | PASS | Helm rejects `-1` (minimum), `11` (maximum), and `abc` (wrong type) for backoffLimit | + +Note: These tests did not cover the old-to-new upgrade path (installing from +`main` then upgrading to the PR branch without deleting the Job).
That's the +scenario the CI tests, and it required the Helm hook annotations to fix. + +--- + +## Step-by-step manual testing guide + +These steps let you reproduce all three problems and verify the fixes on any +Kubernetes or OpenShift cluster. + +### Prerequisites + +- A Kubernetes (1.27+) or OpenShift (4.14+) cluster +- `helm` v3.x installed +- `kubectl` or `oc` CLI logged in with cluster-admin access +- SonataFlow and Knative CRDs installed (the Orchestrator requires them) + +**Install CRDs if not present:** + +```bash +# Knative CRDs (from the chart repo) +for crdDir in charts/orchestrator-infra/crds/*; do + kubectl apply -f "${crdDir}" +done + +# SonataFlow CRDs +SONATAFLOW_OPERATOR_VERSION="10.1.0" +curl -sL "https://github.com/apache/incubator-kie-tools/releases/download/${SONATAFLOW_OPERATOR_VERSION}/apache-kie-${SONATAFLOW_OPERATOR_VERSION}-incubating-sonataflow-operator.yaml" \ + | kubectl apply --server-side --force-conflicts -f - +``` + +**Create a test namespace:** + +```bash +kubectl create namespace sf-test +``` + +--- + +### Test 1: Fresh install — verify error handling works + +This test verifies the Job creates the database successfully on a fresh install. + +```bash +# 1. Build chart dependencies +helm dependency build ./charts/backstage/ + +# 2. Install the chart with orchestrator enabled +helm install my-backstage ./charts/backstage \ + --namespace sf-test \ + --set orchestrator.enabled=true \ + --set route.enabled=false \ + --set upstream.postgresql.primary.persistence.enabled=true \ + --timeout 500s \ + --wait + +# 3. Check the Job completed +kubectl get jobs -n sf-test +# Expected: STATUS = Complete, COMPLETIONS = 1/1 + +# 4. Check the logs +kubectl logs -n sf-test -l job-name=my-backstage-create-sonataflow-database -c psql +# Expected: "CREATE DATABASE" + +# 5. 
Verify the database exists in PostgreSQL +kubectl exec -n sf-test my-backstage-postgresql-0 -- \ + psql -U postgres -tc "SELECT datname FROM pg_database WHERE datname='sonataflow';" +# Expected: "sonataflow" + +# 6. Check TTL is set +kubectl get job my-backstage-create-sonataflow-database -n sf-test \ + -o jsonpath='{.spec.ttlSecondsAfterFinished}' +# Expected: 300 + +# 7. Check backoffLimit is configurable +kubectl get job my-backstage-create-sonataflow-database -n sf-test \ + -o jsonpath='{.spec.backoffLimit}' +# Expected: 2 (default) + +# 8. Check Helm hook annotations are set +kubectl get job my-backstage-create-sonataflow-database -n sf-test \ + -o jsonpath='{.metadata.annotations}' +# Expected: contains "helm.sh/hook":"post-install,post-upgrade" and +# "helm.sh/hook-delete-policy":"before-hook-creation" +``` + +--- + +### Test 2: TTL auto-cleanup + +This test verifies the Job is automatically deleted after 5 minutes. + +```bash +# 1. Check when the Job completed +kubectl get job my-backstage-create-sonataflow-database -n sf-test \ + -o jsonpath='{.status.completionTime}' +# Note the time + +# 2. Wait 5 minutes from the completion time, then check +kubectl get jobs -n sf-test +# Expected: "No resources found" — the Job was garbage-collected +``` + +--- + +### Test 3: Upgrade — "database already exists" path + +This test verifies that after TTL cleans up the old Job, an upgrade creates +a new Job that gracefully handles the existing database. + +```bash +# 1. Make sure the Job was cleaned up by TTL (wait 5 min if needed) +kubectl get jobs -n sf-test +# Expected: "No resources found" + +# 2. Run helm upgrade +helm upgrade my-backstage ./charts/backstage \ + --namespace sf-test \ + --set orchestrator.enabled=true \ + --set route.enabled=false \ + --set upstream.postgresql.primary.persistence.enabled=true \ + --timeout 500s \ + --wait +# Expected: Upgrade succeeds + +# 3. 
Check the new Job completed +kubectl get jobs -n sf-test +# Expected: STATUS = Complete, COMPLETIONS = 1/1 + +# 4. Check the logs — should show "already exists" message +kubectl logs -n sf-test -l job-name=my-backstage-create-sonataflow-database -c psql +# Expected: +# ERROR: database "sonataflow" already exists +# Database 'sonataflow' already exists, skipping creation. +``` + +--- + +### Test 4: Upgrade from old chart (simulates 1.8/1.9 user) + +This is the most important test. It simulates a user who has the old chart +(with `|| echo WARNING`) and upgrades to the new chart. With the Helm hook +annotations, this should now work automatically. + +```bash +# 1. Clean up from previous tests +helm uninstall my-backstage -n sf-test 2>/dev/null +kubectl delete jobs --all -n sf-test 2>/dev/null + +# 2. Install the OLD chart from the main branch +git stash # save your current changes +git checkout main -- charts/backstage/ # get the old chart files +helm dependency build ./charts/backstage/ + +helm install my-backstage ./charts/backstage \ + --namespace sf-test \ + --set orchestrator.enabled=true \ + --set route.enabled=false \ + --set upstream.postgresql.primary.persistence.enabled=true \ + --timeout 500s \ + --wait + +# 3. Verify the old Job exists with the old pattern +kubectl get job my-backstage-create-sonataflow-database -n sf-test \ + -o jsonpath='{.spec.template.spec.containers[0].args[0]}' +# Expected: contains "|| echo WARNING: Could not create database" + +# 4. Restore your branch +git checkout HEAD -- charts/backstage/ # restore the new chart files +git stash pop # restore your local changes +helm dependency build ./charts/backstage/ + +# 5. 
Upgrade — with hooks, this should now succeed immediately +helm upgrade my-backstage ./charts/backstage \ + --namespace sf-test \ + --set orchestrator.enabled=true \ + --set route.enabled=false \ + --set upstream.postgresql.primary.persistence.enabled=true \ + --timeout 500s \ + --wait +# Expected: Upgrade SUCCEEDS (Helm deletes old Job via before-hook-creation) + +# 6. Verify the new Job has the correct spec +kubectl get job my-backstage-create-sonataflow-database -n sf-test \ + -o jsonpath='{.spec.template.spec.containers[0].args[0]}' +# Expected: contains "SELECT 1 FROM pg_database" (new error handling) + +# 7. Check the logs +kubectl logs -n sf-test -l job-name=my-backstage-create-sonataflow-database -c psql +# Expected: "Database 'sonataflow' already exists, skipping creation." +# OR "CREATE DATABASE" if the DB was lost during pod restart +``` + +--- + +### Test 5: Verify retry on actual failure + +This test verifies that the Job properly fails and retries when there is a +real error (not just "database already exists"). + +```bash +# 1. Clean up from previous tests +helm uninstall my-backstage -n sf-test 2>/dev/null +kubectl delete jobs --all -n sf-test 2>/dev/null + +# 2. Install with a custom backoffLimit to see retries +helm install my-backstage ./charts/backstage \ + --namespace sf-test \ + --set orchestrator.enabled=true \ + --set route.enabled=false \ + --set upstream.postgresql.primary.persistence.enabled=true \ + --set orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit=3 \ + --timeout 500s \ + --wait + +# 3. Verify the backoffLimit was applied +kubectl get job my-backstage-create-sonataflow-database -n sf-test \ + -o jsonpath='{.spec.backoffLimit}' +# Expected: 3 +``` + +To test an actual failure scenario with retries, you would need to make +PostgreSQL unreachable during the Job run (e.g. by scaling down the PostgreSQL +StatefulSet temporarily, or by providing wrong credentials). 
This is harder to +set up in a simple test but the mechanism is: + +- Job fails with exit 1 (the `exit 1` in our error handler) +- Kubernetes creates a new pod to retry +- This repeats up to `backoffLimit` times +- After all retries are exhausted, the Job is marked as Failed + +--- + +### Test 6: Schema validation + +This test verifies the JSON schema rejects invalid values for +`dbCreationJobBackoffLimit`. + +```bash +# Negative value — should fail +helm template my-backstage ./charts/backstage \ + --set orchestrator.enabled=true \ + --set orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit=-1 +# Expected: "minimum: got -1, want 0" + +# Over maximum — should fail +helm template my-backstage ./charts/backstage \ + --set orchestrator.enabled=true \ + --set orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit=11 +# Expected: "maximum: got 11, want 10" + +# Wrong type — should fail +helm template my-backstage ./charts/backstage \ + --set orchestrator.enabled=true \ + --set orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit=abc +# Expected: "got string, want integer" +``` + +--- + +### Cleanup + +```bash +helm uninstall my-backstage -n sf-test 2>/dev/null +kubectl delete namespace sf-test +``` From 333fd0ccfbd307aa787c047db22c8ac775f03e63 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Wed, 27 May 2026 12:32:56 +0100 Subject: [PATCH 16/18] cleanup --- pr-info.md | 742 ----------------------------------------------------- 1 file changed, 742 deletions(-) delete mode 100644 pr-info.md diff --git a/pr-info.md b/pr-info.md deleted file mode 100644 index 31ea822b..00000000 --- a/pr-info.md +++ /dev/null @@ -1,742 +0,0 @@ -https://redhat.atlassian.net/browse/RHDHBUGS-2577 - -# RHDHBUGS-2577: Orchestrator DB Creation Job Fix - -## Background - -When the Orchestrator is enabled, the Helm chart creates a Kubernetes Job called -`create-sonataflow-database`. This Job runs a `psql` command to create the -`sonataflow` database in PostgreSQL. 
The Job has a `backoffLimit` that tells -Kubernetes how many times to retry if it fails. - -There were three problems with this Job. This document explains all of them. - ---- - -## Problem 1: The Job silently swallows errors - -### What was happening - -The old command looked like this: - -```sh -psql -h <host> -p 5432 -U postgres -c 'CREATE DATABASE sonataflow;' || echo WARNING: Could not create database -``` - -The `|| echo WARNING...` at the end is the problem. In shell, `||` means -"if the previous command fails, run this instead." The `echo` command always -succeeds (exit code 0), so the overall command always exits 0 — even when `psql` -fails. - -Kubernetes checks the exit code to decide if the Job succeeded or failed: - -- Exit code 0 = success -- Any other exit code = failure (retry up to `backoffLimit` times) - -Because the exit code was always 0, Kubernetes always thought the Job succeeded. -The `backoffLimit` was set to 2, but it never triggered because the Job never -"failed" from Kubernetes' perspective. - -### Real-world impact - -If `psql` couldn't connect (wrong password, network issue, PostgreSQL not ready), -the Job would log `WARNING: Could not create database` and report success. -The `sonataflow` database would not exist, and downstream services (Data Index, -Job Service) would fail later with confusing errors that were hard to trace back -to a missing database. - -### What we changed: the conditional error handling logic - -The new command: - -```sh -psql -h <host> -p 5432 -U postgres -c 'CREATE DATABASE sonataflow;' 2>&1 || { - if psql -h <host> -p 5432 -U postgres -tc "SELECT 1 FROM pg_database WHERE datname='sonataflow'" | grep -q 1; then - echo "Database 'sonataflow' already exists, skipping creation." - else - echo "ERROR: Failed to create database 'sonataflow'." - exit 1 - fi -} -``` - -Here is exactly what this does, step by step: - -1. **Try to create the database.** `psql ... -c 'CREATE DATABASE sonataflow;'` - runs the SQL command.
The `2>&1` redirects stderr into stdout so we capture - all output. - -2. **If it succeeds (exit code 0):** The `||` block is skipped entirely. The Job - exits 0. Kubernetes marks it as successful. Done. - -3. **If it fails (any non-zero exit code):** The `|| { ... }` block runs. But - "failed" could mean two very different things: - - The database already exists (PostgreSQL returns an error for `CREATE DATABASE` - when the database is already there). This is fine — it's the expected case - on `helm upgrade` when the database was created on the first install. - - An actual failure (wrong password, connection refused, PostgreSQL is down, - network timeout, etc.). This is a real problem that needs retrying. - -4. **Distinguish between those two cases.** The `if` block runs a second `psql` - command that queries the `pg_database` system catalog: - ```sql - SELECT 1 FROM pg_database WHERE datname='sonataflow' - ``` - - If this returns `1`, the database exists. The `grep -q 1` succeeds, and we - print `"Database 'sonataflow' already exists, skipping creation."` and exit 0. - Kubernetes sees success. The Job is done. - - If this returns nothing (or the second `psql` itself fails because the server - is unreachable), `grep -q 1` fails, and we fall to the `else` branch: - print `"ERROR: Failed to create database 'sonataflow'."` and `exit 1`. - Kubernetes sees a failure and retries up to `backoffLimit` times. - -There are two versions of this logic in the template — one for the built-in -PostgreSQL (`upstream.postgresql.enabled = true`) that connects directly with -`-U postgres`, and one for external databases that uses environment variables -(`${POSTGRES_HOST}`, `${POSTGRES_PORT}`, `${POSTGRES_USER}`) and connects to -`-d postgres` (the standard maintenance database that always exists on any -PostgreSQL server). 
- -We also made `backoffLimit` configurable via -`orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit` (default: 2, range: -0-10) so users can tune retry behavior for their environment. - ---- - -## Problem 2: Upgrading breaks because Kubernetes Jobs are immutable - -### What was happening - -Kubernetes Jobs have an immutable `spec.template` — once a Job is created, you -cannot change its pod template. When we changed the `args` (from the old -`|| echo WARNING` to the new error handling), `helm upgrade` tries to patch -the existing Job with the new spec. Kubernetes rejects this: - -``` -Job.batch "my-backstage-create-sonataflow-database" is invalid: -spec.template: Invalid value: ... field is immutable -``` - -This means any user upgrading from the old chart to the new chart would hit this -error and the upgrade would fail. - -### What we changed: `ttlSecondsAfterFinished` - -We added `ttlSecondsAfterFinished: 300` to the Job spec: - -```yaml -spec: - ttlSecondsAfterFinished: 300 - activeDeadlineSeconds: 120 -``` - -This tells Kubernetes to automatically delete the Job 300 seconds (5 minutes) -after it completes. This is a native Kubernetes feature — the TTL controller -watches for completed Jobs and garbage-collects them after the specified time. -No deployment tool needs to understand or honor it; Kubernetes does it on its own. - -The flow on upgrade: - -1. User installs the chart — Job runs, creates the database, completes -2. 5 minutes later — Kubernetes deletes the completed Job automatically -3. User runs `helm upgrade` (or ArgoCD syncs) — no old Job exists, so the new - Job is created fresh with the updated spec. No immutability error. - -### Why TTL matters for ArgoCD and other GitOps tools - -ArgoCD and other GitOps platforms (Flux, etc.) work differently from Helm CLI. -They render Helm templates into plain YAML, then apply that YAML to the cluster -using `kubectl apply`. 
They do **not** use Helm's lifecycle features like hooks -(explained in Problem 3 below). This means ArgoCD has no built-in way to delete -an old Job before creating a new one. - -`ttlSecondsAfterFinished` solves this because it works at the Kubernetes level, -not the deployment tool level. After 5 minutes, the Job is gone regardless of -whether you used Helm, ArgoCD, Flux, or `kubectl apply` directly. When the next -sync happens, the Job doesn't exist, so ArgoCD creates it fresh — no conflict. - -### Why this worked fine in our OCP testing - -When we tested on OCP, we ran 5 tests: - -1. **Fresh install** — Created the Job fresh with the new chart. No old Job - existed, so there was nothing to conflict with. -2. **TTL cleanup** — The Job auto-deleted after 5 minutes. Confirmed working. -3. **Upgrade after TTL** — By the time we ran `helm upgrade`, the TTL had - already cleaned up the old Job. The upgrade created a new Job from scratch. -4. **Failure retry** — We manually deleted the Job, then applied a new one with - bad credentials. Again, no old Job to conflict with. -5. **Schema validation** — Just tested `helm template`, no cluster interaction. - -In every case, the old Job was **gone** before the new one was created. We never -actually tested the scenario where an old Job (from the old chart, without TTL) -is still sitting on the cluster when the upgrade runs. That's the scenario CI -tests, and it's the one that fails. - -### Upgrade note for users on release 1.8/1.9 - -The old chart did NOT have `ttlSecondsAfterFinished`, so the old completed Job -will still be sitting on the cluster indefinitely. On the **first** upgrade to -this version, users need to manually delete the old Job: - -```bash -kubectl delete job -create-sonataflow-database -n -``` - -After that first upgrade, TTL handles cleanup automatically for all future -upgrades. 
- ---- - -## Problem 3: CI was failing — the old-to-new upgrade race condition - -### What the CI test does - -The CI uses a tool called `chart-testing` (`ct`) that tests backward -compatibility. It does this: - -1. Install the **old** chart (from the `main` branch, version 5.14.0) -2. Run tests to verify the install works -3. Upgrade to the **new** chart (from the PR branch, version 5.14.1) -4. Run tests to verify the upgrade works - -The goal is to catch breaking changes — if an upgrade from the current released -version to the new version fails, that's a problem for real users. - -### Why it failed - -Here's the exact timeline from the CI logs: - -- **15:54:06** — `helm install` with the old chart (v5.14.0). This creates the - Job with the old `|| echo WARNING` args and **no `ttlSecondsAfterFinished`**. -- **15:56:55** — `helm upgrade` with the new chart (v5.14.1). Only ~3 minutes - later. Helm tries to patch the existing Job with the new multi-line error - handling args. -- **15:56:57** — Kubernetes rejects: `spec.template: Invalid value: ... field - is immutable` - -Two things made TTL unable to help here: - -1. **The old Job has no TTL.** `ttlSecondsAfterFinished` is set at Job creation - time. The old chart didn't have it, so the Job created in step 1 will sit - there forever until someone manually deletes it. The new chart adding TTL - doesn't retroactively apply to Jobs created by the old chart. - -2. **Even if TTL were set, 300 seconds > 170 seconds.** The CI upgrade happens - ~3 minutes after install. Even if the old Job had `ttlSecondsAfterFinished: - 300`, it wouldn't have been cleaned up in time. - -### What we changed: Helm hook annotations - -We added two annotations to the Job: - -```yaml -metadata: - name: {{ .Release.Name }}-create-sonataflow-database - namespace: {{ .Release.Namespace }} - annotations: - "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-delete-policy": before-hook-creation -``` - -### What are annotations? 
- -Annotations are key-value metadata you can attach to any Kubernetes resource. -They look like this in YAML: - -```yaml -metadata: - annotations: - "some-key": "some-value" -``` - -Unlike labels (which Kubernetes uses for selecting and grouping resources), -annotations are purely informational — Kubernetes itself ignores them. They're -used by external tools to attach instructions to resources. In this case, Helm -reads these specific annotation keys to decide how to handle the resource. - -### What are Helm hooks? - -Normally, when you run `helm install` or `helm upgrade`, Helm creates all the -resources in your chart at the same time — Deployments, Services, ConfigMaps, -Jobs, everything goes to the cluster together. - -Helm hooks change this. When a resource has the `helm.sh/hook` annotation, Helm -treats it differently — it pulls it out of the normal release and runs it at a -specific point in Helm's lifecycle. Think of it like telling Helm: "Don't deploy -this with everything else. Instead, run it at this specific moment." - -### The two annotations explained - -**`"helm.sh/hook": post-install,post-upgrade`** - -This tells Helm: "This Job is a hook. Run it at these specific moments: -- `post-install` — after all normal resources are created during `helm install` -- `post-upgrade` — after all normal resources are updated during `helm upgrade` - -Without this annotation, the Job is just another resource in the chart. Helm -would try to create it on install and patch it on upgrade (which fails because -of immutability). With this annotation, Helm knows to handle it specially. - -**`"helm.sh/hook-delete-policy": before-hook-creation`** - -This tells Helm: "Before creating this hook resource, delete any previous -version of it that might still exist on the cluster." - -This is the key part that fixes the CI failure. The sequence becomes: - -1. `helm install` (old chart) — Job is created with old args -2. 
`helm upgrade` (new chart) — Helm sees the `before-hook-creation` policy, - **deletes the old Job first**, then creates the new Job with the new args - -No patching. No immutability error. The old Job is gone before the new one -is created. - -### Why annotations don't help ArgoCD - -ArgoCD renders Helm templates into plain YAML (`helm template`) and then applies -that YAML to the cluster using its own sync mechanism. It does **not** run -`helm install` or `helm upgrade`. This means: - -- ArgoCD never reads `helm.sh/hook` — it doesn't know what Helm hooks are -- ArgoCD treats the Job as a regular resource, just like a Deployment or Service -- When the Job spec changes, ArgoCD tries to apply the new spec to the existing - Job — and hits the same immutability error - -This is why we need **both** mechanisms: -- Annotations for Helm CLI users and CI -- TTL for ArgoCD and other GitOps tools - ---- - -## How we're catering for everything - -We now have a layered solution that covers every deployment scenario: - -### Layer 1: Correct error handling (the core fix) - -The conditional `psql` logic replaces `|| echo WARNING` with proper error -detection. The Job now: -- Succeeds on fresh database creation (exit 0) -- Succeeds when the database already exists (exit 0) -- Fails on actual errors — wrong credentials, connection refused, etc. (exit 1) -- Retries up to `backoffLimit` times on failure (configurable, default 2) - -This is what the Jira ticket (RHDHBUGS-2577) asked for. - -### Layer 2: Helm hook annotations (for Helm CLI and CI) - -```yaml -annotations: - "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-delete-policy": before-hook-creation -``` - -Helm deletes the old Job before creating the new one on every upgrade. 
This: -- Fixes the CI `chart-testing` upgrade test -- Makes `helm upgrade` work for all Helm CLI users, even immediately after - install (no need to wait for TTL) -- Handles the old-to-new upgrade path (from chart versions that didn't have TTL) - -### Layer 3: TTL auto-cleanup (for ArgoCD and other GitOps tools) - -```yaml -spec: - ttlSecondsAfterFinished: 300 -``` - -Kubernetes deletes the completed Job after 5 minutes. This: -- Handles ArgoCD, Flux, and any tool that ignores Helm hooks -- Works at the Kubernetes level — no deployment tool needs to understand it -- Prevents stale Jobs from accumulating on the cluster -- Makes all future syncs/upgrades work automatically (as long as 5+ minutes - have passed since the last Job completed) - -### Layer 4: Configurable backoffLimit (user flexibility) - -```yaml -orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit: 2 # default, range 0-10 -``` - -Users can tune how many times Kubernetes retries the Job on failure. The JSON -schema validates the range (0-10) and type (integer). - -### Layer 5: Documentation (for the one-time old-to-new upgrade) - -For users upgrading from charts that had neither TTL nor hooks (releases 1.8/1.9), -the upgrade instructions document the one manual step needed: - -```bash -kubectl delete job -create-sonataflow-database -n -``` - -This is only needed once. After that first upgrade, layers 2 and 3 handle -everything automatically. 
- ---- - -## Summary of all changes - -| Change | Who it helps | -| -------------------------------------------------------------- | ----------------------------------------------- | -| Replace `\|\| echo WARNING` with conditional error handling | Everyone — Job now fails on real errors | -| Distinguish "database already exists" from actual failures | Everyone — upgrades succeed when DB exists | -| Make `backoffLimit` configurable (`dbCreationJobBackoffLimit`) | Users who need custom retry behavior | -| Add `ttlSecondsAfterFinished: 300` | ArgoCD / GitOps users — auto-cleanup on upgrade | -| Add `helm.sh/hook` annotations | Helm CLI users and CI — immediate upgrade support | -| Connect to `-d postgres` for external DB | External DB users — uses maintenance DB | - ---- - -## Upgrade instructions for users on release 1.8/1.9 - -### Before upgrading - -The old chart does not have `ttlSecondsAfterFinished` or Helm hook annotations, -so the completed Job from your previous install is still on the cluster. Helm -CLI users with the new hook annotations will have this handled automatically. -ArgoCD users must delete the old Job manually before syncing. - -**Step 1: Check if the old Job exists** - -```bash -kubectl get job <release-name>-create-sonataflow-database -n <namespace> -``` - -Replace `<release-name>` with your Helm release name (e.g. `my-backstage`) and -`<namespace>` with your deployment namespace. - -If you see output like this, the old Job exists and must be deleted: - -``` -NAME STATUS COMPLETIONS DURATION AGE -my-backstage-create-sonataflow-database Complete 1/1 27s 5d -``` - -If you get `NotFound`, the Job is already gone and you can skip to Step 3. - -**Step 2: Delete the old Job** - -```bash -kubectl delete job <release-name>-create-sonataflow-database -n <namespace> -``` - -This is safe — the Job already completed its work (the database was created). -Deleting the Job removes the Job object and its completed pods from the cluster. -It does NOT affect the database.
- -**Step 3: Upgrade the chart** - -```bash -helm upgrade <release-name> redhat-developer/backstage \ - -n <namespace> \ - --reuse-values -``` - -Or if you're using a local checkout: - -```bash -helm upgrade <release-name> ./charts/backstage \ - -n <namespace> \ - --reuse-values -``` - -The new chart will create a fresh Job with the updated error handling. After this -upgrade, both `ttlSecondsAfterFinished: 300` and the Helm hook annotations will -be set, so all future upgrades will work automatically without manual -intervention. - -### For ArgoCD users - -If ArgoCD manages your deployment: - -1. Delete the old Job manually (Step 2 above) -2. Trigger a sync in ArgoCD - -ArgoCD will create the new Job with the updated spec. The `ttlSecondsAfterFinished` -will ensure the Job is cleaned up automatically after future syncs. - -If you miss the manual deletion step, ArgoCD will show a sync error -(`field is immutable`). Simply delete the Job and re-sync. - ---- - -## OCP test results (2026-05-27) - -All 5 tests passed on OCP 4.20 (Kubernetes v1.33): - -| Test | Result | Details | -| -------------------------------------- | ------ | ------------------------------------------------------------------------------------------------- | -| 1. Fresh install | PASS | Job completed 1/1, logs show `CREATE DATABASE`, sonataflow DB confirmed in PostgreSQL | -| 2. TTL auto-cleanup | PASS | Job auto-deleted after 5 minutes (`ttlSecondsAfterFinished: 300` working) | -| 3. Upgrade — DB already exists | PASS | Upgrade succeeded, logs show `Database 'sonataflow' already exists, skipping creation.` | -| 4. Failure retry with bad credentials | PASS | 3 pods created (backoffLimit=2), each logged `ERROR: Failed to create database`, Job status=Failed | -| 5. Schema validation | PASS | Helm rejects `-1` (minimum), `11` (maximum), and `abc` (wrong type) for backoffLimit | - -Note: These tests did not cover the old-to-new upgrade path (installing from -`main` then upgrading to the PR branch without deleting the Job).
That's the -scenario the CI tests, and it required the Helm hook annotations to fix. - ---- - -## Step-by-step manual testing guide - -These steps let you reproduce all three problems and verify the fixes on any -Kubernetes or OpenShift cluster. - -### Prerequisites - -- A Kubernetes (1.27+) or OpenShift (4.14+) cluster -- `helm` v3.x installed -- `kubectl` or `oc` CLI logged in with cluster-admin access -- SonataFlow and Knative CRDs installed (the Orchestrator requires them) - -**Install CRDs if not present:** - -```bash -# Knative CRDs (from the chart repo) -for crdDir in charts/orchestrator-infra/crds/*; do - kubectl apply -f "${crdDir}" -done - -# SonataFlow CRDs -SONATAFLOW_OPERATOR_VERSION="10.1.0" -curl -sL "https://github.com/apache/incubator-kie-tools/releases/download/${SONATAFLOW_OPERATOR_VERSION}/apache-kie-${SONATAFLOW_OPERATOR_VERSION}-incubating-sonataflow-operator.yaml" \ - | kubectl apply --server-side --force-conflicts -f - -``` - -**Create a test namespace:** - -```bash -kubectl create namespace sf-test -``` - ---- - -### Test 1: Fresh install — verify error handling works - -This test verifies the Job creates the database successfully on a fresh install. - -```bash -# 1. Build chart dependencies -helm dependency build ./charts/backstage/ - -# 2. Install the chart with orchestrator enabled -helm install my-backstage ./charts/backstage \ - --namespace sf-test \ - --set orchestrator.enabled=true \ - --set route.enabled=false \ - --set upstream.postgresql.primary.persistence.enabled=true \ - --timeout 500s \ - --wait - -# 3. Check the Job completed -kubectl get jobs -n sf-test -# Expected: STATUS = Complete, COMPLETIONS = 1/1 - -# 4. Check the logs -kubectl logs -n sf-test -l job-name=my-backstage-create-sonataflow-database -c psql -# Expected: "CREATE DATABASE" - -# 5. 
Verify the database exists in PostgreSQL -kubectl exec -n sf-test my-backstage-postgresql-0 -- \ - psql -U postgres -tc "SELECT datname FROM pg_database WHERE datname='sonataflow';" -# Expected: "sonataflow" - -# 6. Check TTL is set -kubectl get job my-backstage-create-sonataflow-database -n sf-test \ - -o jsonpath='{.spec.ttlSecondsAfterFinished}' -# Expected: 300 - -# 7. Check backoffLimit is configurable -kubectl get job my-backstage-create-sonataflow-database -n sf-test \ - -o jsonpath='{.spec.backoffLimit}' -# Expected: 2 (default) - -# 8. Check Helm hook annotations are set -kubectl get job my-backstage-create-sonataflow-database -n sf-test \ - -o jsonpath='{.metadata.annotations}' -# Expected: contains "helm.sh/hook":"post-install,post-upgrade" and -# "helm.sh/hook-delete-policy":"before-hook-creation" -``` - ---- - -### Test 2: TTL auto-cleanup - -This test verifies the Job is automatically deleted after 5 minutes. - -```bash -# 1. Check when the Job completed -kubectl get job my-backstage-create-sonataflow-database -n sf-test \ - -o jsonpath='{.status.completionTime}' -# Note the time - -# 2. Wait 5 minutes from the completion time, then check -kubectl get jobs -n sf-test -# Expected: "No resources found" — the Job was garbage-collected -``` - ---- - -### Test 3: Upgrade — "database already exists" path - -This test verifies that after TTL cleans up the old Job, an upgrade creates -a new Job that gracefully handles the existing database. - -```bash -# 1. Make sure the Job was cleaned up by TTL (wait 5 min if needed) -kubectl get jobs -n sf-test -# Expected: "No resources found" - -# 2. Run helm upgrade -helm upgrade my-backstage ./charts/backstage \ - --namespace sf-test \ - --set orchestrator.enabled=true \ - --set route.enabled=false \ - --set upstream.postgresql.primary.persistence.enabled=true \ - --timeout 500s \ - --wait -# Expected: Upgrade succeeds - -# 3. 
Check the new Job completed -kubectl get jobs -n sf-test -# Expected: STATUS = Complete, COMPLETIONS = 1/1 - -# 4. Check the logs — should show "already exists" message -kubectl logs -n sf-test -l job-name=my-backstage-create-sonataflow-database -c psql -# Expected: -# ERROR: database "sonataflow" already exists -# Database 'sonataflow' already exists, skipping creation. -``` - ---- - -### Test 4: Upgrade from old chart (simulates 1.8/1.9 user) - -This is the most important test. It simulates a user who has the old chart -(with `|| echo WARNING`) and upgrades to the new chart. With the Helm hook -annotations, this should now work automatically. - -```bash -# 1. Clean up from previous tests -helm uninstall my-backstage -n sf-test 2>/dev/null -kubectl delete jobs --all -n sf-test 2>/dev/null - -# 2. Install the OLD chart from the main branch -git stash # save your current changes -git checkout main -- charts/backstage/ # get the old chart files -helm dependency build ./charts/backstage/ - -helm install my-backstage ./charts/backstage \ - --namespace sf-test \ - --set orchestrator.enabled=true \ - --set route.enabled=false \ - --set upstream.postgresql.primary.persistence.enabled=true \ - --timeout 500s \ - --wait - -# 3. Verify the old Job exists with the old pattern -kubectl get job my-backstage-create-sonataflow-database -n sf-test \ - -o jsonpath='{.spec.template.spec.containers[0].args[0]}' -# Expected: contains "|| echo WARNING: Could not create database" - -# 4. Restore your branch -git checkout HEAD -- charts/backstage/ # restore the new chart files -git stash pop # restore your local changes -helm dependency build ./charts/backstage/ - -# 5. 
Upgrade — with hooks, this should now succeed immediately -helm upgrade my-backstage ./charts/backstage \ - --namespace sf-test \ - --set orchestrator.enabled=true \ - --set route.enabled=false \ - --set upstream.postgresql.primary.persistence.enabled=true \ - --timeout 500s \ - --wait -# Expected: Upgrade SUCCEEDS (Helm deletes old Job via before-hook-creation) - -# 6. Verify the new Job has the correct spec -kubectl get job my-backstage-create-sonataflow-database -n sf-test \ - -o jsonpath='{.spec.template.spec.containers[0].args[0]}' -# Expected: contains "SELECT 1 FROM pg_database" (new error handling) - -# 7. Check the logs -kubectl logs -n sf-test -l job-name=my-backstage-create-sonataflow-database -c psql -# Expected: "Database 'sonataflow' already exists, skipping creation." -# OR "CREATE DATABASE" if the DB was lost during pod restart -``` - ---- - -### Test 5: Verify retry on actual failure - -This test verifies that the Job properly fails and retries when there is a -real error (not just "database already exists"). - -```bash -# 1. Clean up from previous tests -helm uninstall my-backstage -n sf-test 2>/dev/null -kubectl delete jobs --all -n sf-test 2>/dev/null - -# 2. Install with a custom backoffLimit to see retries -helm install my-backstage ./charts/backstage \ - --namespace sf-test \ - --set orchestrator.enabled=true \ - --set route.enabled=false \ - --set upstream.postgresql.primary.persistence.enabled=true \ - --set orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit=3 \ - --timeout 500s \ - --wait - -# 3. Verify the backoffLimit was applied -kubectl get job my-backstage-create-sonataflow-database -n sf-test \ - -o jsonpath='{.spec.backoffLimit}' -# Expected: 3 -``` - -To test an actual failure scenario with retries, you would need to make -PostgreSQL unreachable during the Job run (e.g. by scaling down the PostgreSQL -StatefulSet temporarily, or by providing wrong credentials). 
This is harder to -set up in a simple test but the mechanism is: - -- Job fails with exit 1 (the `exit 1` in our error handler) -- Kubernetes creates a new pod to retry -- This repeats up to `backoffLimit` times -- After all retries are exhausted, the Job is marked as Failed - ---- - -### Test 6: Schema validation - -This test verifies the JSON schema rejects invalid values for -`dbCreationJobBackoffLimit`. - -```bash -# Negative value — should fail -helm template my-backstage ./charts/backstage \ - --set orchestrator.enabled=true \ - --set orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit=-1 -# Expected: "minimum: got -1, want 0" - -# Over maximum — should fail -helm template my-backstage ./charts/backstage \ - --set orchestrator.enabled=true \ - --set orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit=11 -# Expected: "maximum: got 11, want 10" - -# Wrong type — should fail -helm template my-backstage ./charts/backstage \ - --set orchestrator.enabled=true \ - --set orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit=abc -# Expected: "got string, want integer" -``` - ---- - -### Cleanup - -```bash -helm uninstall my-backstage -n sf-test 2>/dev/null -kubectl delete namespace sf-test -``` From 9c3d496a97efae1be548e31846bd502cdd9ebc6e Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Thu, 28 May 2026 13:29:43 +0100 Subject: [PATCH 17/18] refactor(orchestrator): use versioned Job name instead of Helm hooks Signed-off-by: Fortune-Ndlovu --- charts/backstage/Chart.yaml | 2 +- charts/backstage/README.md | 4 ++-- charts/backstage/templates/sonataflows.yaml | 9 +++------ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/charts/backstage/Chart.yaml b/charts/backstage/Chart.yaml index 7bcbfc75..9ebd837a 100644 --- a/charts/backstage/Chart.yaml +++ b/charts/backstage/Chart.yaml @@ -47,4 +47,4 @@ sources: [] # Versions are expected to follow Semantic Versioning (https://semver.org/) # Note that when this chart is published to 
https://github.com/openshift-helm-charts/charts # it will follow the RHDH versioning 1.y.z -version: 5.14.1 +version: 6.0.0 diff --git a/charts/backstage/README.md b/charts/backstage/README.md index 99e08e0a..9f09af1d 100644 --- a/charts/backstage/README.md +++ b/charts/backstage/README.md @@ -1,7 +1,7 @@ # RHDH Backstage Helm Chart for OpenShift -![Version: 5.14.1](https://img.shields.io/badge/Version-5.14.1-informational?style=flat-square) +![Version: 6.0.0](https://img.shields.io/badge/Version-6.0.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) A Helm chart for deploying Red Hat Developer Hub, which is a Red Hat supported version of Backstage. @@ -29,7 +29,7 @@ For the **Generally Available** version of this chart, see: helm repo add bitnami https://charts.bitnami.com/bitnami helm repo add redhat-developer https://redhat-developer.github.io/rhdh-chart -helm install my-backstage redhat-developer/backstage --version 5.14.1 +helm install my-backstage redhat-developer/backstage --version 6.0.0 ``` ## Introduction diff --git a/charts/backstage/templates/sonataflows.yaml b/charts/backstage/templates/sonataflows.yaml index cc38e8d0..e7301052 100644 --- a/charts/backstage/templates/sonataflows.yaml +++ b/charts/backstage/templates/sonataflows.yaml @@ -85,11 +85,8 @@ spec: apiVersion: batch/v1 kind: Job metadata: - name: {{ .Release.Name }}-create-sonataflow-database + name: {{ .Release.Name }}-create-sonataflow-database-{{ .Chart.Version }} namespace: {{ .Release.Namespace }} - annotations: - "helm.sh/hook": post-install,post-upgrade - "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded spec: ttlSecondsAfterFinished: 300 activeDeadlineSeconds: 120 @@ -202,8 +199,8 @@ spec: {{- else }} args: - | - psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -d postgres -c 'CREATE DATABASE sonataflow;' 2>&1 || { - if psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U 
${POSTGRES_USER} -d postgres -tc "SELECT 1 FROM pg_database WHERE datname='sonataflow'" | grep -q 1; then + psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -d {{ .Values.orchestrator.sonataflowPlatform.externalDBName }} -c 'CREATE DATABASE sonataflow;' 2>&1 || { + if psql -h ${POSTGRES_HOST} -p ${POSTGRES_PORT} -U ${POSTGRES_USER} -d {{ .Values.orchestrator.sonataflowPlatform.externalDBName }} -tc "SELECT 1 FROM pg_database WHERE datname='sonataflow'" | grep -q 1; then echo "Database 'sonataflow' already exists, skipping creation." else echo "ERROR: Failed to create database 'sonataflow'." From 56b08b4c71b48978bb71b0e26a7d0acccc381d19 Mon Sep 17 00:00:00 2001 From: Fortune-Ndlovu Date: Fri, 29 May 2026 11:49:00 +0100 Subject: [PATCH 18/18] Address rm3l's review feedback: shorten Job name to avoid long resource names and replace dots with dashes, expose ttlSecondsAfterFinished and activeDeadlineSeconds in values.yaml so users can configure them. Signed-off-by: Fortune-Ndlovu --- charts/backstage/README.md | 2 ++ charts/backstage/templates/sonataflows.yaml | 6 +++--- charts/backstage/values.schema.json | 12 ++++++++++++ charts/backstage/values.schema.tmpl.json | 12 ++++++++++++ charts/backstage/values.yaml | 4 ++++ 5 files changed, 33 insertions(+), 3 deletions(-) diff --git a/charts/backstage/README.md b/charts/backstage/README.md index 9f09af1d..49e20ef2 100644 --- a/charts/backstage/README.md +++ b/charts/backstage/README.md @@ -211,7 +211,9 @@ Kubernetes: `>= 1.27.0-0` | orchestrator.serverlessOperator.enabled | | bool | `true` | | orchestrator.sonataflowPlatform.createDBJobImage | Image for the container used by the create-db job | string | `"{{ .Values.upstream.postgresql.image.registry }}/{{ .Values.upstream.postgresql.image.repository }}:{{ .Values.upstream.postgresql.image.tag }}"` | | orchestrator.sonataflowPlatform.dataIndexImage | Image for the container used by the sonataflow data index, optional and used for disconnected environments | 
string | `""` | +| orchestrator.sonataflowPlatform.dbCreationJobActiveDeadlineSeconds | Maximum time in seconds for the create-db Job to complete before being terminated | int | `120` | | orchestrator.sonataflowPlatform.dbCreationJobBackoffLimit | Number of retries for the create-db job if it fails | int | `2` | +| orchestrator.sonataflowPlatform.dbCreationJobTTLSecondsAfterFinished | Time in seconds after which a finished create-db Job is automatically deleted | int | `300` | | orchestrator.sonataflowPlatform.eventing.broker.name | | string | `""` | | orchestrator.sonataflowPlatform.eventing.broker.namespace | | string | `""` | | orchestrator.sonataflowPlatform.externalDBHost | Host for the user-configured external Database | string | `""` | diff --git a/charts/backstage/templates/sonataflows.yaml b/charts/backstage/templates/sonataflows.yaml index e7301052..d76b7c8a 100644 --- a/charts/backstage/templates/sonataflows.yaml +++ b/charts/backstage/templates/sonataflows.yaml @@ -85,11 +85,11 @@ spec: apiVersion: batch/v1 kind: Job metadata: - name: {{ .Release.Name }}-create-sonataflow-database-{{ .Chart.Version }} + name: {{ .Release.Name }}-create-sf-db-{{ .Chart.Version | replace "." 
"-" }} namespace: {{ .Release.Namespace }} spec: - ttlSecondsAfterFinished: 300 - activeDeadlineSeconds: 120 + ttlSecondsAfterFinished: {{ .Values.orchestrator.sonataflowPlatform.dbCreationJobTTLSecondsAfterFinished }} + activeDeadlineSeconds: {{ .Values.orchestrator.sonataflowPlatform.dbCreationJobActiveDeadlineSeconds }} template: spec: initContainers: diff --git a/charts/backstage/values.schema.json b/charts/backstage/values.schema.json index 55c75d20..94a2d6ad 100644 --- a/charts/backstage/values.schema.json +++ b/charts/backstage/values.schema.json @@ -475,6 +475,12 @@ "title": "Image for the container used by the sonataflow data index", "type": "string" }, + "dbCreationJobActiveDeadlineSeconds": { + "default": 120, + "minimum": 1, + "title": "Maximum time in seconds for the create-db Job to complete before being terminated", + "type": "integer" + }, "dbCreationJobBackoffLimit": { "default": 2, "maximum": 10, @@ -482,6 +488,12 @@ "title": "Number of retries for the create-db job if it fails", "type": "integer" }, + "dbCreationJobTTLSecondsAfterFinished": { + "default": 300, + "minimum": 0, + "title": "Time in seconds after which a finished create-db Job is automatically deleted", + "type": "integer" + }, "eventing": { "additionalProperties": false, "properties": { diff --git a/charts/backstage/values.schema.tmpl.json b/charts/backstage/values.schema.tmpl.json index 679b1772..d47fdf80 100644 --- a/charts/backstage/values.schema.tmpl.json +++ b/charts/backstage/values.schema.tmpl.json @@ -575,6 +575,18 @@ "title": "Number of retries for the create-db job if it fails", "type": "integer" }, + "dbCreationJobTTLSecondsAfterFinished": { + "default": 300, + "minimum": 0, + "title": "Time in seconds after which a finished create-db Job is automatically deleted", + "type": "integer" + }, + "dbCreationJobActiveDeadlineSeconds": { + "default": 120, + "minimum": 1, + "title": "Maximum time in seconds for the create-db Job to complete before being terminated", + "type": 
"integer" + }, "jobServiceImage": { "title": "Image for the container used by the sonataflow jobs service", "type": "string", diff --git a/charts/backstage/values.yaml b/charts/backstage/values.yaml index 4f75f6e4..0be42f06 100644 --- a/charts/backstage/values.yaml +++ b/charts/backstage/values.yaml @@ -535,6 +535,10 @@ orchestrator: createDBJobImage: "{{ .Values.upstream.postgresql.image.registry }}/{{ .Values.upstream.postgresql.image.repository }}:{{ .Values.upstream.postgresql.image.tag }}" # -- Number of retries for the create-db job if it fails dbCreationJobBackoffLimit: 2 + # -- Time in seconds after which a finished create-db Job is automatically deleted + dbCreationJobTTLSecondsAfterFinished: 300 + # -- Maximum time in seconds for the create-db Job to complete before being terminated + dbCreationJobActiveDeadlineSeconds: 120 # -- Image for the container used by the sonataflow jobs service, optional and used for disconnected environments jobServiceImage: "" # -- Image for the container used by the sonataflow data index, optional and used for disconnected environments