diff --git a/.github/workflows/e2e-bootstrap-k8s.yml b/.github/workflows/e2e-bootstrap-k8s.yml index 3276888e6..6aaa789f5 100755 --- a/.github/workflows/e2e-bootstrap-k8s.yml +++ b/.github/workflows/e2e-bootstrap-k8s.yml @@ -699,7 +699,7 @@ jobs: echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" - name: Collect Graylog/OpenSearch logs - if: always() + if: '!cancelled()' timeout-minutes: 240 shell: bash run: | diff --git a/.github/workflows/e2e-bootstrap.yml b/.github/workflows/e2e-bootstrap.yml index ed787eafe..1a1b2d2e6 100755 --- a/.github/workflows/e2e-bootstrap.yml +++ b/.github/workflows/e2e-bootstrap.yml @@ -1129,7 +1129,7 @@ jobs: PY - name: Collect Graylog/OpenSearch logs - if: always() + if: '!cancelled()' timeout-minutes: 240 shell: bash run: | diff --git a/.github/workflows/e2e-docker.yml b/.github/workflows/e2e-docker.yml index d4f68a695..5d3ba1ee5 100755 --- a/.github/workflows/e2e-docker.yml +++ b/.github/workflows/e2e-docker.yml @@ -148,7 +148,7 @@ jobs: echo "TEST_TIME_SECS=$TEST_TIME_SECS" >> $GITHUB_ENV - name: Collect Graylog/OpenSearch logs - if: always() + if: '!cancelled()' timeout-minutes: 240 env: MNODES: "${{ needs.deploy.outputs.mnodes }}" diff --git a/.github/workflows/k8s-native-e2e-add-node.yaml b/.github/workflows/k8s-native-e2e-add-node.yaml index 0f5211366..fbe656626 100755 --- a/.github/workflows/k8s-native-e2e-add-node.yaml +++ b/.github/workflows/k8s-native-e2e-add-node.yaml @@ -1061,7 +1061,7 @@ jobs: cid=$(echo "$output" | awk 'NR==4{print $2}') csecret=$(echo "$output" | awk 'NR==4{print $NF}') if [ -z "$cid" ] || [ "$cid" = "+" ]; then - echo "Table parsing failed, trying JSON..." + echo "Table parsing failed, trying JSON..." >&2 local json_out json_out=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \ sbctl cluster list --json 2>&1) || true @@ -1071,7 +1071,7 @@ jobs: if [ -n "$cid" ] && [ "$cid" != "+" ]; then echo "CLUSTER_ID=${cid}" >> $GITHUB_ENV echo "CLUSTER_SECRET=${csecret}" >> $GITHUB_ENV - echo "Extracted CLUSTER_ID=${cid}" + echo "Extracted CLUSTER_ID=${cid}" >&2 fi echo "$cid" } @@ -1196,9 +1196,14 @@ jobs: echo "TEST_TIME_MINS=$TEST_TIME_MINS" >> $GITHUB_ENV echo "TEST_TIME_SECS=$TEST_TIME_SECS" >> $GITHUB_ENV + # Log collection timeout: half the test runtime, minimum 30 minutes + LOG_COLLECT_TIMEOUT_MINS=$(( (TEST_TIME + 119) / 120 )) + [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30 + echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> $GITHUB_ENV + - name: Collect Graylog/OpenSearch logs - if: always() - timeout-minutes: 240 + if: '!cancelled()' + timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }} run: | set +e echo "=== Collecting Graylog/OpenSearch logs (per-hour chunks) ===" diff --git a/.github/workflows/k8s-native-e2e-node-migration.yaml b/.github/workflows/k8s-native-e2e-node-migration.yaml index 95f3317c5..089c53aa3 100755 --- a/.github/workflows/k8s-native-e2e-node-migration.yaml +++ b/.github/workflows/k8s-native-e2e-node-migration.yaml @@ -1059,7 +1059,7 @@ jobs: cid=$(echo "$output" | awk 'NR==4{print $2}') csecret=$(echo "$output" | awk 'NR==4{print $NF}') if [ -z "$cid" ] || [ "$cid" = "+" ]; then - echo "Table parsing failed, trying JSON..." + echo "Table parsing failed, trying JSON..." >&2 local json_out json_out=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \ sbctl cluster list --json 2>&1) || true @@ -1069,7 +1069,7 @@ jobs: if [ -n "$cid" ] && [ "$cid" != "+" ]; then echo "CLUSTER_ID=${cid}" >> $GITHUB_ENV echo "CLUSTER_SECRET=${csecret}" >> $GITHUB_ENV - echo "Extracted CLUSTER_ID=${cid}" + echo "Extracted CLUSTER_ID=${cid}" >&2 fi echo "$cid" } @@ -1194,9 +1194,14 @@ jobs: echo "TEST_TIME_MINS=$TEST_TIME_MINS" >> $GITHUB_ENV echo "TEST_TIME_SECS=$TEST_TIME_SECS" >> $GITHUB_ENV + # Log collection timeout: half the test runtime, minimum 30 minutes + LOG_COLLECT_TIMEOUT_MINS=$(( (TEST_TIME + 119) / 120 )) + [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30 + echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> $GITHUB_ENV + - name: Collect Graylog/OpenSearch logs - if: always() - timeout-minutes: 240 + if: '!cancelled()' + timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }} run: | set +e echo "=== Collecting Graylog/OpenSearch logs (per-hour chunks) ===" diff --git a/.github/workflows/k8s-native-e2e.yaml b/.github/workflows/k8s-native-e2e.yaml index 02595ca71..ef680bc78 100755 --- a/.github/workflows/k8s-native-e2e.yaml +++ b/.github/workflows/k8s-native-e2e.yaml @@ -1212,7 +1212,7 @@ jobs: cid=$(echo "$output" | awk 'NR==4{print $2}') csecret=$(echo "$output" | awk 'NR==4{print $NF}') if [ -z "$cid" ] || [ "$cid" = "+" ]; then - echo "Table parsing failed, trying JSON..." + echo "Table parsing failed, trying JSON..." >&2 local json_out json_out=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \ sbctl cluster list --json 2>&1) || true @@ -1222,7 +1222,7 @@ jobs: if [ -n "$cid" ] && [ "$cid" != "+" ]; then echo "CLUSTER_ID=${cid}" >> $GITHUB_ENV echo "CLUSTER_SECRET=${csecret}" >> $GITHUB_ENV - echo "Extracted CLUSTER_ID=${cid}" + echo "Extracted CLUSTER_ID=${cid}" >&2 fi echo "$cid" } @@ -1350,9 +1350,14 @@ jobs: echo "TEST_TIME_MINS=$TEST_TIME_MINS" >> $GITHUB_ENV echo "TEST_TIME_SECS=$TEST_TIME_SECS" >> $GITHUB_ENV + # Log collection timeout: half the test runtime, minimum 30 minutes + LOG_COLLECT_TIMEOUT_MINS=$(( (TEST_TIME + 119) / 120 )) + [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30 + echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> $GITHUB_ENV + - name: Collect Graylog/OpenSearch logs - if: always() - timeout-minutes: 240 + if: '!cancelled()' + timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }} run: | set +e echo "=== Collecting Graylog/OpenSearch logs (per-hour chunks) ===" diff --git a/.github/workflows/k8s-native-stress.yaml b/.github/workflows/k8s-native-stress.yaml index 4536e9438..4f096cf98 100755 --- a/.github/workflows/k8s-native-stress.yaml +++ b/.github/workflows/k8s-native-stress.yaml @@ -1162,7 +1162,7 @@ jobs: cid=$(echo "$output" | awk 'NR==4{print $2}') csecret=$(echo "$output" | awk 'NR==4{print $NF}') if [ -z "$cid" ] || [ "$cid" = "+" ]; then - echo "Table parsing failed, trying JSON..." + echo "Table parsing failed, trying JSON..." >&2 local json_out json_out=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \ sbctl cluster list --json 2>&1) || true @@ -1172,7 +1172,7 @@ jobs: if [ -n "$cid" ] && [ "$cid" != "+" ]; then echo "CLUSTER_ID=${cid}" >> $GITHUB_ENV echo "CLUSTER_SECRET=${csecret}" >> $GITHUB_ENV - echo "Extracted CLUSTER_ID=${cid}" + echo "Extracted CLUSTER_ID=${cid}" >&2 fi echo "$cid" } @@ -1303,9 +1303,14 @@ jobs: echo "TEST_TIME_MINS=$TEST_TIME_MINS" >> $GITHUB_ENV echo "TEST_TIME_SECS=$TEST_TIME_SECS" >> $GITHUB_ENV + # Log collection timeout: half the test runtime, minimum 30 minutes + LOG_COLLECT_TIMEOUT_MINS=$(( (TEST_TIME + 119) / 120 )) + [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30 + echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> $GITHUB_ENV + - name: Collect Graylog/OpenSearch logs - if: always() - timeout-minutes: 240 + if: '!cancelled()' + timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }} run: | set +e echo "=== Collecting Graylog/OpenSearch logs (per-hour chunks) ===" diff --git a/.github/workflows/monitoring-suite-docker.yaml b/.github/workflows/monitoring-suite-docker.yaml index 95a7dee2e..86bf3b987 100755 --- a/.github/workflows/monitoring-suite-docker.yaml +++ b/.github/workflows/monitoring-suite-docker.yaml @@ -605,6 +605,14 @@ jobs: echo "TEST_END_EPOCH=$(date +%s)" >> "$GITHUB_ENV" echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" + # Log collection timeout: half the test runtime, minimum 30 minutes + if [ -n "${TEST_START_EPOCH:-}" ]; then + _elapsed=$(( $(date +%s) - TEST_START_EPOCH )) + LOG_COLLECT_TIMEOUT_MINS=$(( (_elapsed + 119) / 120 )) + [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30 + echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> "$GITHUB_ENV" + fi + # ============================================================ # POST-TEST CLEANUP + LOG COLLECTION # ============================================================ @@ -687,8 +695,8 @@ jobs: done - name: Collect Graylog/OpenSearch logs - if: always() - timeout-minutes: 240 + if: '!cancelled()' + timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }} shell: bash run: | set +e diff --git a/.github/workflows/monitoring-suite-k8s-native.yaml b/.github/workflows/monitoring-suite-k8s-native.yaml index 39e8ef9a1..835a0fbe9 100755 --- a/.github/workflows/monitoring-suite-k8s-native.yaml +++ b/.github/workflows/monitoring-suite-k8s-native.yaml @@ -111,6 +111,14 @@ on: options: - 'false' - 'true' + use_existing_cluster: + description: 'Skip cluster cleanup and setup, reuse existing cluster' + required: false + default: 'false' + type: choice + options: + - 'false' + - 'true' send_slack_notification: description: 'Send Slack notification?' required: false @@ -315,6 +323,7 @@ jobs: # CLEANUP OLD DEPLOYMENT # ============================================================ - name: Cleanup old CSI deployment + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | set +e NAMESPACE=simplyblock @@ -458,6 +467,7 @@ jobs: kubectl delete -f $GITHUB_WORKSPACE/helm-charts/charts/simplyblock-operator/crds/ --ignore-not-found 2>/dev/null || true - name: Cleanup old cert-manager + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | set +e helm uninstall cert-manager -n cert-manager 2>/dev/null || true @@ -465,6 +475,7 @@ jobs: kubectl wait --for=delete namespace/cert-manager --timeout=120s 2>/dev/null || true - name: Cleanup old KMS + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | set +e helm uninstall openbao -n vault 2>/dev/null || true @@ -475,6 +486,7 @@ jobs: # LABEL + NAMESPACE + DEPLOY # ============================================================ - name: Label worker nodes + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | CLUSTER_ENV="${{ github.event.inputs.cluster_environment || 'local' }}" IFS=',' read -ra NODES <<< "${{ github.event.inputs.worker_nodes }}" @@ -486,6 +498,7 @@ jobs: done - name: Create namespace + pod-security labels + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | kubectl create namespace simplyblock --dry-run=client -o yaml | kubectl apply -f - kubectl label namespace simplyblock \ @@ -495,6 +508,7 @@ jobs: --overwrite - name: Create Docker registry secret + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | kubectl create secret docker-registry regcred \ --docker-server=https://index.docker.io/v1/ \ @@ -507,7 +521,7 @@ jobs: DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} - name: Configure OpenShift SCC policies - if: ${{ github.event.inputs.cluster_environment == 'aws-openshift' || github.event.inputs.cluster_environment == 'openshift-local' }} + if: ${{ github.event.inputs.use_existing_cluster != 'true' && (github.event.inputs.cluster_environment == 'aws-openshift' || github.event.inputs.cluster_environment == 'openshift-local') }} run: | oc adm policy add-scc-to-user privileged -z default -n simplyblock oc adm policy add-scc-to-user anyuid -z default -n simplyblock @@ -518,10 +532,11 @@ jobs: --overwrite - name: Wait before helm install + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: sleep 30 - name: Install cert-manager (TLS prerequisite) - if: ${{ github.event.inputs.tls_enabled == 'true' }} + if: ${{ github.event.inputs.use_existing_cluster != 'true' && github.event.inputs.tls_enabled == 'true' }} run: | helm repo add jetstack https://charts.jetstack.io helm repo update @@ -531,6 +546,7 @@ jobs: kubectl wait --for=condition=Ready pods --all -n cert-manager --timeout=120s - name: Install Helm Chart for simplyblock-operator + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | cd $GITHUB_WORKSPACE/helm-charts/charts/simplyblock-operator/ TLS_FLAGS="" @@ -562,13 +578,14 @@ jobs: $TLS_FLAGS $CSI_FLAGS - name: Grant OpenShift SCC post-helm - if: ${{ github.event.inputs.cluster_environment == 'aws-openshift' || github.event.inputs.cluster_environment == 'openshift-local' }} + if: ${{ github.event.inputs.use_existing_cluster != 'true' && (github.event.inputs.cluster_environment == 'aws-openshift' || github.event.inputs.cluster_environment == 'openshift-local') }} run: | for sa in $(oc get sa -n simplyblock -o name | cut -d/ -f2); do oc adm policy add-scc-to-user privileged -z $sa -n simplyblock done - name: Patch fluent-bit daemonset + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | NAMESPACE=simplyblock PATCHED=false @@ -587,6 +604,7 @@ jobs: echo "FLUENTBIT_PATCHED=$PATCHED" >> $GITHUB_ENV - name: Patch service accounts with imagePullSecrets + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | for sa in $(kubectl get serviceaccounts -n simplyblock --no-headers | awk '{print $1}'); do kubectl patch serviceaccount "$sa" -n simplyblock \ @@ -594,6 +612,7 @@ jobs: done - name: Delete ImagePullBackOff pods + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | NAMESPACE=simplyblock for pod in $(kubectl get pods -n $NAMESPACE --no-headers 2>/dev/null | grep ImagePullBackOff | awk '{print $1}'); do @@ -608,6 +627,7 @@ jobs: # OPERATOR CRDs # ============================================================ - name: Wait for operator pod + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | NAMESPACE=simplyblock for i in $(seq 1 60); do @@ -621,7 +641,7 @@ jobs: done - name: Setup KMS (vault) for encryption - if: ${{ github.event.inputs.tls_enabled == 'true' }} + if: ${{ github.event.inputs.use_existing_cluster != 'true' && github.event.inputs.tls_enabled == 'true' }} run: | STORAGE_CLASS=$(kubectl get sc -o jsonpath='{.items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")].metadata.name}' | awk '{print $1}') [ -z "$STORAGE_CLASS" ] && STORAGE_CLASS=$(kubectl get sc -o jsonpath='{.items[0].metadata.name}') @@ -630,6 +650,7 @@ jobs: kubectl wait --for=condition=Ready pods -l app.kubernetes.io/name=openbao -n vault --timeout=300s || true - name: Apply operator custom resources + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | NAMESPACE=simplyblock IFC_NAMES="${{ github.event.inputs.ifc_names || 'ens18:enp1s0' }}" @@ -735,6 +756,7 @@ jobs: NPCS: ${{ env.NPCS }} - name: Patch service accounts post-CRD + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | for sa in $(kubectl get serviceaccounts -n simplyblock --no-headers | awk '{print $1}'); do kubectl patch serviceaccount "$sa" -n simplyblock \ @@ -742,6 +764,7 @@ jobs: done - name: Delete ImagePullBackOff pods post-CRD + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | NAMESPACE=simplyblock for pod in $(kubectl get pods -n $NAMESPACE --no-headers 2>/dev/null | grep ImagePullBackOff | awk '{print $1}'); do @@ -753,6 +776,7 @@ jobs: done - name: Wait for storage SA + patch + restart daemonset + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} run: | NAMESPACE=simplyblock CLUSTER_ENV="${{ github.event.inputs.cluster_environment }}" @@ -834,10 +858,85 @@ jobs: fi echo "Not active yet ($i/$MAX_POLL)..."; sleep 10 done - echo "ERROR: Cluster not active" && exit 1 + echo "WARNING: Cluster did not become active within timeout — will attempt force-activate" + kubectl -n $NAMESPACE get pods + kubectl -n $NAMESPACE exec "$ADMIN_POD" -- sbctl cluster list 2>&1 || true + + - name: Verify and force-activate cluster if needed + if: ${{ github.event.inputs.use_existing_cluster != 'true' }} + run: | + NAMESPACE=simplyblock + ADMIN_POD=$(kubectl -n $NAMESPACE get pods \ + -l app=simplyblock-admin-control \ + -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || true + + if [ -z "$ADMIN_POD" ]; then + echo "ERROR: No admin pod found" + exit 1 + fi + + # Helper: extract cluster ID and secret from sbctl output and export to GITHUB_ENV + extract_cluster_info() { + local output="$1" + local cid csecret + cid=$(echo "$output" | awk 'NR==4{print $2}') + csecret=$(echo "$output" | awk 'NR==4{print $NF}') + if [ -z "$cid" ] || [ "$cid" = "+" ]; then + echo "Table parsing failed, trying JSON..." >&2 + local json_out + json_out=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \ + sbctl cluster list --json 2>&1) || true + cid=$(echo "$json_out" | jq -r '.[0].id // .[0].uuid // empty') + csecret=$(echo "$json_out" | jq -r '.[0].secret // empty') + fi + if [ -n "$cid" ] && [ "$cid" != "+" ]; then + echo "CLUSTER_ID=${cid}" >> $GITHUB_ENV + echo "CLUSTER_SECRET=${csecret}" >> $GITHUB_ENV + echo "Extracted CLUSTER_ID=${cid}" >&2 + fi + echo "$cid" + } + + echo "=== Verifying cluster activation ===" + OUTPUT=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \ + sbctl cluster list 2>&1) || true + echo "$OUTPUT" + + if echo "$OUTPUT" | grep -qi "active"; then + echo "Cluster is active, ensuring env vars are set" + extract_cluster_info "$OUTPUT" + exit 0 + fi + + echo "Cluster is NOT active, attempting forced activation..." + CID=$(extract_cluster_info "$OUTPUT") + if [ -n "$CID" ] && [ "$CID" != "+" ]; then + kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \ + sbctl -d cluster activate "${CID}" 2>&1 || true + else + echo "WARNING: Could not extract cluster ID, trying activate without ID..." + kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \ + sbctl -d cluster activate 2>&1 || true + fi + + echo "Waiting 60s for activation to take effect..." + sleep 60 + + OUTPUT=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \ + sbctl cluster list 2>&1) || true + echo "$OUTPUT" + + if echo "$OUTPUT" | grep -qi "active"; then + echo "Cluster is now active after forced activation" + extract_cluster_info "$OUTPUT" + exit 0 + fi + + echo "ERROR: Cluster is still not active after forced activation" + exit 1 - name: Patch fluent-bit post-active - if: ${{ env.FLUENTBIT_PATCHED != 'true' }} + if: ${{ github.event.inputs.use_existing_cluster != 'true' && env.FLUENTBIT_PATCHED != 'true' }} run: | NAMESPACE=simplyblock for i in $(seq 1 30); do @@ -901,6 +1000,14 @@ jobs: echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" echo "TEST_END_TIME=$(date +%s)" >> $GITHUB_ENV + # Log collection timeout: half the test runtime, minimum 30 minutes + if [ -n "${TEST_START_TIME:-}" ]; then + _elapsed=$(( $(date +%s) - TEST_START_TIME )) + LOG_COLLECT_TIMEOUT_MINS=$(( (_elapsed + 119) / 120 )) + [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30 + echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> "$GITHUB_ENV" + fi + # ============================================================ # POST-TEST: LOG COLLECTION # ============================================================ @@ -912,8 +1019,8 @@ jobs: [[ -n "${RUN_BASE_DIR}" ]] && echo "RUN_BASE_DIR=${RUN_BASE_DIR}" >> "$GITHUB_ENV" || true - name: Collect Graylog/OpenSearch logs - if: always() - timeout-minutes: 240 + if: '!cancelled()' + timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }} run: | set +e NAMESPACE=simplyblock diff --git a/.github/workflows/stress-run-bootstrap-k8s.yml b/.github/workflows/stress-run-bootstrap-k8s.yml index e03d43896..7e9153cde 100755 --- a/.github/workflows/stress-run-bootstrap-k8s.yml +++ b/.github/workflows/stress-run-bootstrap-k8s.yml @@ -714,6 +714,15 @@ jobs: echo "TEST_START_EPOCH=$(date +%s)" >> "$GITHUB_ENV" echo "TEST_START_HUMAN=$(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" + - name: Enable shared placement + shell: bash + run: | + set -euxo pipefail + admin_pod="$(kubectl get pods -n "${K8S_NAMESPACE}" --no-headers \ + -o custom-columns=:metadata.name | grep simplyblock-admin-control | head -1)" + kubectl exec -n "${K8S_NAMESPACE}" "${admin_pod}" -- \ + bash -c "${SBCLI_CMD} cluster set-shared-placement ${CLUSTER_ID} --force" || true + - name: Run stress (foreground; runs until failure) shell: bash working-directory: sbcli/e2e @@ -760,7 +769,7 @@ jobs: echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" - name: Collect Graylog/OpenSearch logs - if: always() + if: '!cancelled()' timeout-minutes: 240 shell: bash run: | diff --git a/.github/workflows/stress-run-bootstrap-v2.yml b/.github/workflows/stress-run-bootstrap-v2.yml index 6c02f4044..05bd17f67 100755 --- a/.github/workflows/stress-run-bootstrap-v2.yml +++ b/.github/workflows/stress-run-bootstrap-v2.yml @@ -752,6 +752,15 @@ jobs: echo "TEST_START_EPOCH=$(date +%s)" >> "$GITHUB_ENV" echo "TEST_START_HUMAN=$(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" + - name: Enable shared placement + shell: bash + run: | + set -euxo pipefail + ssh_opts="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ${KEY_PATH}" + mgmt_ip="$(echo "${MNODES}" | awk '{print $1}')" + ssh ${ssh_opts} "${SSH_USER}@${mgmt_ip}" \ + "${SBCLI_CMD} cluster set-shared-placement ${CLUSTER_ID} --force" || true + - name: Run stress (foreground; runs until failure) shell: bash working-directory: sbcli/e2e @@ -822,7 +831,7 @@ jobs: echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" - name: Collect Graylog/OpenSearch logs - if: always() + if: '!cancelled()' timeout-minutes: 240 shell: bash run: | diff --git a/.github/workflows/stress-run-bootstrap.yml b/.github/workflows/stress-run-bootstrap.yml index a2cd37ad6..3af3aecce 100755 --- a/.github/workflows/stress-run-bootstrap.yml +++ b/.github/workflows/stress-run-bootstrap.yml @@ -736,6 +736,15 @@ jobs: echo "TEST_START_EPOCH=$(date +%s)" >> "$GITHUB_ENV" echo "TEST_START_HUMAN=$(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" + - name: Enable shared placement + shell: bash + run: | + set -euxo pipefail + ssh_opts="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ${KEY_PATH}" + mgmt_ip="$(echo "${MNODES}" | awk '{print $1}')" + ssh ${ssh_opts} "${SSH_USER}@${mgmt_ip}" \ + "${SBCLI_CMD} cluster set-shared-placement ${CLUSTER_ID} --force" || true + - name: Run stress (foreground; runs until failure) shell: bash working-directory: sbcli/e2e @@ -806,7 +815,7 @@ jobs: echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV" - name: Collect Graylog/OpenSearch logs - if: always() + if: '!cancelled()' timeout-minutes: 240 shell: bash run: | diff --git a/e2e/__init__.py b/e2e/__init__.py index d03818e24..7248a5953 100755 --- a/e2e/__init__.py +++ b/e2e/__init__.py @@ -27,6 +27,11 @@ from e2e_tests.ha_journal.lvol_journal_device_node_restart import TestDeviceNodeRestart from e2e_tests.data_migration.data_migration_ha_fio import FioWorkloadTest from e2e_tests.multi_node_crash_fio_clone import TestMultiFioSnapshotDowntime +from e2e_tests.test_multi_node_outage import ( + TestMultiNodeOutageDocker, + TestMultiNodeOutageK8s, + TestMultiNodeVMRebootDocker +) from e2e_tests.add_node_fio_run import ( @@ -85,8 +90,14 @@ LargeScaleLvolK8s, ) from stress_test.device_failure_migration import ( - DeviceFailureMigrationNoLoad, - DeviceFailureMigrationUnderLoad, + DeviceFailureMigrationNoLoadDocker, + DeviceFailureMigrationUnderLoadDocker, + DeviceFailureMigrationPCIeNoLoadDocker, + DeviceFailureMigrationPCIeUnderLoadDocker, + DeviceFailureMigrationNoLoadK8s, + DeviceFailureMigrationUnderLoadK8s, + DeviceFailureMigrationPCIeNoLoadK8s, + DeviceFailureMigrationPCIeUnderLoadK8s, ) from stress_test.continuous_failover_ha_security import ( RandomSecurityFailoverTest, @@ -274,8 +285,17 @@ BulkLvolHotDeleteK8s, LargeScaleLvolDocker, LargeScaleLvolK8s, - DeviceFailureMigrationNoLoad, - DeviceFailureMigrationUnderLoad, + DeviceFailureMigrationNoLoadDocker, + DeviceFailureMigrationUnderLoadDocker, + DeviceFailureMigrationPCIeNoLoadDocker, + DeviceFailureMigrationPCIeUnderLoadDocker, + DeviceFailureMigrationNoLoadK8s, + DeviceFailureMigrationUnderLoadK8s, + DeviceFailureMigrationPCIeNoLoadK8s, + DeviceFailureMigrationPCIeUnderLoadK8s, + TestMultiNodeOutageDocker, + TestMultiNodeOutageK8s, + TestMultiNodeVMRebootDocker, ] def get_all_tests(custom=True, ha_test=False): @@ -392,8 +412,14 @@ def get_stress_tests(): BulkLvolHotDeleteK8s, LargeScaleLvolDocker, LargeScaleLvolK8s, - DeviceFailureMigrationNoLoad, - DeviceFailureMigrationUnderLoad, + DeviceFailureMigrationNoLoadDocker, + DeviceFailureMigrationUnderLoadDocker, + DeviceFailureMigrationPCIeNoLoadDocker, + DeviceFailureMigrationPCIeUnderLoadDocker, + DeviceFailureMigrationNoLoadK8s, + DeviceFailureMigrationUnderLoadK8s, + DeviceFailureMigrationPCIeNoLoadK8s, + DeviceFailureMigrationPCIeUnderLoadK8s, ] return tests @@ -409,9 +435,16 @@ def get_monitoring_tests(): BulkLvolHotDeleteK8s, LargeScaleLvolDocker, LargeScaleLvolK8s, - DeviceFailureMigrationNoLoad, - DeviceFailureMigrationUnderLoad, + DeviceFailureMigrationNoLoadDocker, + DeviceFailureMigrationUnderLoadDocker, + DeviceFailureMigrationPCIeNoLoadDocker, + DeviceFailureMigrationPCIeUnderLoadDocker, + DeviceFailureMigrationNoLoadK8s, + DeviceFailureMigrationUnderLoadK8s, + DeviceFailureMigrationPCIeNoLoadK8s, + DeviceFailureMigrationPCIeUnderLoadK8s, TestLvolOutageLoadTest, + TestParallelLvolSnapshotCloneAPI, ] def get_backup_tests(): diff --git a/e2e/e2e_tests/cluster_test_base.py b/e2e/e2e_tests/cluster_test_base.py index 7237e6640..50fcb5fe7 100755 --- a/e2e/e2e_tests/cluster_test_base.py +++ b/e2e/e2e_tests/cluster_test_base.py @@ -319,6 +319,9 @@ def stop_docker_logs_collect(self): self.logger.info("All log monitoring threads stopped.") def stop_k8s_log_collect(self): + if not self.runner_k8s_log or isinstance(self.runner_k8s_log, str): + self.logger.warning("[stop_k8s_log_collect] runner_k8s_log not initialized — skipping") + return self.runner_k8s_log.stop_log_monitor() self.runner_k8s_log.stop_logging() diff --git a/e2e/e2e_tests/k8s_native_add_node.py b/e2e/e2e_tests/k8s_native_add_node.py index 428f7f39d..976ae3764 100755 --- a/e2e/e2e_tests/k8s_native_add_node.py +++ b/e2e/e2e_tests/k8s_native_add_node.py @@ -61,6 +61,7 @@ def __init__(self, **kwargs): # K8s resource naming self.STORAGE_CLASS_NAME = "simplyblock-csi-sc" + self.XFS_STORAGE_CLASS_NAME = "simplyblock-csi-sc-xfs" self.SNAPSHOT_CLASS_NAME = "simplyblock-csi-snapshotclass" self.FIO_IMAGE = "dockerpinata/fio:2.1" @@ -221,6 +222,14 @@ def run(self): ndcs=self.ndcs, npcs=self.npcs, ) + self.k8s_utils.create_storage_class( + name=self.XFS_STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + fs_type="xfs", + ) self.k8s_utils.create_volume_snapshot_class(name=self.SNAPSHOT_CLASS_NAME) # Record initial node count @@ -238,11 +247,13 @@ def run(self): pvc_name = f"add-node-pvc-{_rand_seq(4)}-{i}" job_name = f"fio-{pvc_name}" cm_name = f"fio-cfg-{pvc_name}" + sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME]) + fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4" self.k8s_utils.create_pvc( name=pvc_name, size=self.pvc_size, - storage_class=self.STORAGE_CLASS_NAME, + storage_class=sc_name, ) self.k8s_utils.wait_pvc_bound(pvc_name, timeout=300) @@ -250,6 +261,8 @@ def run(self): "job_name": job_name, "configmap_name": cm_name, "snapshots": [], + "storage_class": sc_name, + "fs_type": fs_type, } # ── Step 3: Start FIO on existing PVCs ─────────────────────────── @@ -289,10 +302,12 @@ def run(self): detail["snapshots"].append(snap_name) self.snapshot_details[snap_name] = {"pvc_name": pvc_name} + clone_sc = detail.get("storage_class", self.STORAGE_CLASS_NAME) + clone_fs_type = detail.get("fs_type", "ext4") self.k8s_utils.create_clone_pvc( name=clone_name, size=self.pvc_size, - storage_class=self.STORAGE_CLASS_NAME, + storage_class=clone_sc, snapshot_name=snap_name, ) self.k8s_utils.wait_pvc_bound(clone_name, timeout=300) @@ -312,6 +327,8 @@ def run(self): "snap_name": snap_name, "job_name": clone_job, "configmap_name": clone_cm, + "storage_class": clone_sc, + "fs_type": clone_fs_type, } sleep_n_sec(5) @@ -394,11 +411,13 @@ def run(self): pvc_name = f"new-node-pvc-{_rand_seq(4)}-{i}" job_name = f"fio-{pvc_name}" cm_name = f"fio-cfg-{pvc_name}" + sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME]) + fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4" self.k8s_utils.create_pvc( name=pvc_name, size=self.pvc_size, - storage_class=self.STORAGE_CLASS_NAME, + storage_class=sc_name, ) self.k8s_utils.wait_pvc_bound(pvc_name, timeout=300) @@ -416,6 +435,8 @@ def run(self): new_pvc_details[pvc_name] = { "job_name": job_name, "configmap_name": cm_name, + "storage_class": sc_name, + "fs_type": fs_type, } sleep_n_sec(5) diff --git a/e2e/e2e_tests/k8s_native_node_migration.py b/e2e/e2e_tests/k8s_native_node_migration.py index d41a93fc2..7037fee0c 100755 --- a/e2e/e2e_tests/k8s_native_node_migration.py +++ b/e2e/e2e_tests/k8s_native_node_migration.py @@ -56,6 +56,7 @@ def __init__(self, **kwargs): # K8s resource naming self.STORAGE_CLASS_NAME = "simplyblock-csi-sc" + self.XFS_STORAGE_CLASS_NAME = "simplyblock-csi-sc-xfs" self.SNAPSHOT_CLASS_NAME = "simplyblock-csi-snapshotclass" self.FIO_IMAGE = "dockerpinata/fio:2.1" @@ -212,6 +213,14 @@ def run(self): ndcs=self.ndcs, npcs=self.npcs, ) + self.k8s_utils.create_storage_class( + name=self.XFS_STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + fs_type="xfs", + ) self.k8s_utils.create_volume_snapshot_class(name=self.SNAPSHOT_CLASS_NAME) # Record nodes @@ -226,11 +235,13 @@ def run(self): pvc_name = f"mig-pvc-{_rand_seq(4)}-{i}" job_name = f"fio-{pvc_name}" cm_name = f"fio-cfg-{pvc_name}" + sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME]) + fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4" self.k8s_utils.create_pvc( name=pvc_name, size=self.pvc_size, - storage_class=self.STORAGE_CLASS_NAME, + storage_class=sc_name, ) self.k8s_utils.wait_pvc_bound(pvc_name, timeout=300) @@ -249,6 +260,8 @@ def run(self): "job_name": job_name, "configmap_name": cm_name, "snapshots": [], + "storage_class": sc_name, + "fs_type": fs_type, } sleep_n_sec(5) @@ -273,10 +286,12 @@ def run(self): detail["snapshots"].append(snap_name) self.snapshot_details[snap_name] = {"pvc_name": pvc_name} + clone_sc = detail.get("storage_class", self.STORAGE_CLASS_NAME) + clone_fs_type = detail.get("fs_type", "ext4") self.k8s_utils.create_clone_pvc( name=clone_name, size=self.pvc_size, - storage_class=self.STORAGE_CLASS_NAME, + storage_class=clone_sc, snapshot_name=snap_name, ) self.k8s_utils.wait_pvc_bound(clone_name, timeout=300) @@ -296,6 +311,8 @@ def run(self): "snap_name": snap_name, "job_name": clone_job, "configmap_name": clone_cm, + "storage_class": clone_sc, + "fs_type": clone_fs_type, } sleep_n_sec(5) diff --git a/e2e/e2e_tests/test_multi_node_outage.py b/e2e/e2e_tests/test_multi_node_outage.py new file mode 100755 index 000000000..65a913c97 --- /dev/null +++ b/e2e/e2e_tests/test_multi_node_outage.py @@ -0,0 +1,815 @@ +"""E2E Multi-Node Outage Test with Data Integrity Verification. + +Tests cluster resilience when 3 out of 4 storage nodes experience +simultaneous outage (random mix of SPDK crash and network disconnect). + +Flow: + 1. Create 3 lvols per storage node, run FIO on all. + 2. Wait for 1 FIO per node to complete (short write), keep 2 running. + 3. Compute md5sum on completed lvols, take pre-outage snapshots+clones. + 4. Trigger simultaneous outage on 3 random nodes for ~3 minutes. + 5. Wait for recovery: all nodes online, cluster Active. + 6. Verify md5sum on completed lvols (data integrity). + 7. Create 1 new lvol per node + run FIO (basic functionality). + 8. Take post-outage snapshots+clones (snapshot/clone functionality). + +Two variants: + - TestMultiNodeOutageDocker: SSH-based (k8s_run=False) + - TestMultiNodeOutageK8s: K8s sbcli via kubectl (k8s_run=True) +""" + +import os +import random +import threading +import time + +from e2e_tests.cluster_test_base import TestClusterBase, generate_random_sequence +from logger_config import setup_logger +from utils.common_utils import sleep_n_sec + + +class _TestMultiNodeOutageBase(TestClusterBase): + """Shared logic for Docker and K8s multi-node outage tests.""" + + def __init__(self, k8s_run=False, **kwargs): + super().__init__(k8s_run=k8s_run, **kwargs) + self.logger = setup_logger(__name__) + + # Test parameters + self.lvol_size = "5G" + self.fio_size = "1G" + self.short_fio_runtime = 120 # seconds — short FIO should complete well within this + self.long_fio_runtime = 1000 # seconds — long FIO runs during outage + self.outage_duration = 180 # 3 minutes + self.num_lvols_per_node = 3 + self.num_outage_nodes = 3 + + # Internal state + self._node_info = {} # node_uuid -> {ip, rpc_port, data_nics, if_names} + self._lvol_info = {} # lvol_name -> {node_uuid, device, mount_path, fio_name} + self._completed_lvols = [] # lvol names where short FIO completed + self._running_lvols = [] # lvol names where long FIO is still running + self._pre_checksums = {} # lvol_name -> {filepath: md5} + self._outage_plan = {} # node_uuid -> "spdk_crash" | "network_outage" + self._outage_threads = [] + + # ── Snapshot/clone helpers (branched by k8s_test) ──────────────── + + def _create_snapshot(self, lvol_id, snap_name): + if self.k8s_test: + self.sbcli_utils.add_snapshot(lvol_id=lvol_id, snapshot_name=snap_name) + else: + self.ssh_obj.add_snapshot( + node=self.mgmt_nodes[0], lvol_id=lvol_id, snapshot_name=snap_name + ) + + def _get_snapshot_id(self, snap_name): + if self.k8s_test: + return self.sbcli_utils.get_snapshot_id(snap_name=snap_name) + else: + return self.ssh_obj.get_snapshot_id( + node=self.mgmt_nodes[0], snapshot_name=snap_name + ) + + def _create_clone(self, snap_id, clone_name): + if self.k8s_test: + self.sbcli_utils.add_clone(snapshot_id=snap_id, clone_name=clone_name) + else: + self.ssh_obj.add_clone( + node=self.mgmt_nodes[0], snapshot_id=snap_id, clone_name=clone_name + ) + + # ── SPDK crash helper (branched by k8s_test) ──────────────────── + + def _trigger_spdk_crash(self, node_uuid, node_ip, rpc_port): + if self.k8s_test: + k8s = getattr(self.sbcli_utils, "k8s", None) + if k8s: + k8s.stop_spdk_pod(node_ip) + else: + self.logger.warning( + f"k8s_utils not available — falling back to SSH spdk_process_kill" + ) + self.ssh_obj.stop_spdk_process(node_ip, rpc_port, self.cluster_id) + else: + self.ssh_obj.stop_spdk_process(node_ip, rpc_port, self.cluster_id) + + # ── NVMe connect/reconnect helpers ────────────────────────────── + + def _connect_lvol(self, client, lvol_name): + """Run NVMe connect commands for a lvol on the given client.""" + connect_ls = self.sbcli_utils.get_lvol_connect_str(lvol_name=lvol_name) + if not connect_ls: + raise RuntimeError(f"No connect strings for lvol {lvol_name}") + for connect_str in connect_ls: + self.ssh_obj.exec_command(node=client, command=connect_str) + + def _detect_new_device(self, client, initial_devices): + """Return the first new device that appeared since initial_devices.""" + final_devices = self.ssh_obj.get_devices(node=client) + for device in final_devices: + if device not in initial_devices: + return f"/dev/{device.strip()}" + return None + + def _reconnect_lvol(self, client, lvol_name, mount_path): + """Reconnect NVMe, detect device, mount without format. Returns device path.""" + # Unmount if still mounted (may fail — that's ok) + self.ssh_obj.exec_command( + node=client, command=f"sudo umount {mount_path} 2>/dev/null || true" + ) + + # Disconnect existing NVMe paths for this lvol + lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=lvol_name) + if lvol_id: + subsystems = self.ssh_obj.get_nvme_subsystems(node=client, nqn_filter=lvol_id) + for subsys in subsystems: + self.ssh_obj.disconnect_nvme(node=client, nqn_grep=subsys) + sleep_n_sec(3) + + # Re-connect NVMe + initial_devices = self.ssh_obj.get_devices(node=client) + self._connect_lvol(client, lvol_name) + sleep_n_sec(5) + + device = self._detect_new_device(client, initial_devices) + if not device: + # Device might have reconnected with same name — try the old device + old_device = self._lvol_info.get(lvol_name, {}).get("device") + if old_device: + self.logger.info( + f"No new device detected for {lvol_name}, trying old device {old_device}" + ) + device = old_device + else: + raise RuntimeError(f"Could not detect device for {lvol_name} after reconnect") + + # Mount (no format — data must be preserved) + self.ssh_obj.exec_command( + node=client, command=f"sudo mkdir -p {mount_path}" + ) + self.ssh_obj.mount_path(node=client, device=device, mount_path=mount_path) + return device + + # ── FIO wait helper ───────────────────────────────────────────── + + def _wait_fio_complete(self, client, fio_name, timeout=300): + """Poll until the tmux session for this FIO exits.""" + deadline = time.time() + timeout + session = f"fio_{fio_name}" + while time.time() < deadline: + out, _ = self.ssh_obj.exec_command( + node=client, + command=f"sudo tmux has-session -t {session} 2>&1 && echo RUNNING || echo DONE", + max_retries=1, + ) + if "DONE" in out: + self.logger.info(f"FIO session '{session}' completed on {client}") + return True + sleep_n_sec(10) + self.logger.warning(f"FIO session '{session}' did not complete within {timeout}s") + return False + + def _kill_fio_session(self, client, fio_name): + """Kill a tmux FIO session if still running.""" + session = f"fio_{fio_name}" + self.ssh_obj.exec_command( + node=client, + command=f"sudo tmux kill-session -t {session} 2>/dev/null || true", + max_retries=1, + ) + + # ── Outage + recovery (overridable by subclasses) ────────────── + + def _execute_outage_and_recovery(self, node_uuids, client): + """Steps 9-11: plan outage, execute, wait for recovery. + + Subclasses can override this to change the outage mechanism + (e.g. VM reboot instead of SPDK crash / network disconnect). + """ + # ── Step 9: Plan and execute multi-node outage ────────────── + self.logger.info("[step-9] Planning multi-node outage") + outage_nodes = random.sample(node_uuids, self.num_outage_nodes) + for node_uuid in outage_nodes: + outage_type = random.choice(["spdk_crash", "network_outage"]) + self._outage_plan[node_uuid] = outage_type + + self.logger.info("[step-9] Outage plan:") + for node_uuid, otype in self._outage_plan.items(): + ip = self._node_info[node_uuid]["ip"] + self.logger.info(f" Node {node_uuid[:8]} ({ip}): {otype}") + + # Collect pre-outage diagnostics + self.logger.info("[step-9] Collecting pre-outage diagnostics") + try: + self.collect_management_details(suffix="_pre_outage") + except Exception as e: + self.logger.warning(f"Pre-outage diagnostics failed: {e}") + + # Execute outages simultaneously + self.logger.info("[step-9] TRIGGERING OUTAGES ON 3 NODES") + self._outage_threads = [] + for node_uuid, outage_type in self._outage_plan.items(): + ninfo = self._node_info[node_uuid] + node_ip = ninfo["ip"] + + if outage_type == "spdk_crash": + t = threading.Thread( + target=self._trigger_spdk_crash, + args=(node_uuid, node_ip, ninfo["rpc_port"]), + daemon=True, + ) + else: # network_outage + if_names = ninfo["if_names"] + if not if_names: + self.logger.warning( + f"No interface names for {node_uuid} — " + f"falling back to get_active_interfaces" + ) + if_names = self.ssh_obj.get_active_interfaces(node_ip) + t = threading.Thread( + target=self.ssh_obj.disconnect_all_active_interfaces, + args=(node_ip, if_names, self.outage_duration), + daemon=True, + ) + + self._outage_threads.append(t) + t.start() + self.logger.info( + f" Outage thread started for {node_uuid[:8]} ({outage_type})" + ) + + # ── Step 10: Wait for outage to pass ──────────────────────── + self.logger.info("[step-10] Waiting for cluster to become Suspended or Degraded") + try: + self.sbcli_utils.wait_for_cluster_status( + status=["suspended", "degraded"], timeout=600 + ) + self.logger.info("[step-10] Cluster is Suspended/Degraded (outage confirmed)") + except TimeoutError: + cluster_status = self.sbcli_utils.get_cluster_status() + self.logger.warning( + f"Cluster did not reach Suspended/Degraded — " + f"current status: {cluster_status}" + ) + + wait_secs = self.outage_duration + 60 # extra buffer + self.logger.info(f"[step-10] Waiting {wait_secs}s for outage period to pass") + sleep_n_sec(wait_secs) + + # Join outage threads (network disconnect threads block for duration) + for t in self._outage_threads: + t.join(timeout=120) + + # ── Step 11: Wait for recovery ────────────────────────────── + self.logger.info("[step-11] Waiting for all nodes to come back online") + for node_uuid in outage_nodes: + try: + self.sbcli_utils.wait_for_storage_node_status( + node_uuid, status=["online"], timeout=600 + ) + self.logger.info(f" Node {node_uuid[:8]} is online") + except TimeoutError: + self.logger.error( + f" Node {node_uuid[:8]} did NOT come back online within 600s" + ) + raise + + self.logger.info("[step-11] Waiting for cluster to become Active") + try: + self.sbcli_utils.wait_for_cluster_status( + status=["active"], timeout=600 + ) + self.logger.info("[step-11] Cluster is Active") + except TimeoutError: + self.logger.warning("Cluster did not reach Active") + cluster_status = self.sbcli_utils.get_cluster_status() + self.logger.info(f"Current cluster status: {cluster_status}") + raise + + # Collect post-recovery diagnostics + try: + self.collect_management_details(suffix="_post_recovery") + except Exception as e: + self.logger.warning(f"Post-recovery diagnostics failed: {e}") + + sleep_n_sec(30) # settle time after recovery + + # ── Main test flow ────────────────────────────────────────────── + + def run(self): + self.logger.info("=" * 70) + self.logger.info("Starting Multi-Node Outage E2E Test") + self.logger.info("=" * 70) + + client = self.fio_node[0] + + # K8s mode: establish SSH to storage nodes (needed for network outage) + if self.k8s_test: + for node in self.storage_nodes: + self.logger.info(f"[setup] SSH-connecting to storage node {node}") + self.ssh_obj.connect( + address=node, bastion_server_address=self.bastion_server + ) + sleep_n_sec(1) + + # ── Step 1: Discover storage nodes ────────────────────────── + self.logger.info("[step-1] Discovering storage nodes") + storage_nodes_data = self.sbcli_utils.get_storage_nodes() + node_uuids = [] + for result in storage_nodes_data["results"]: + if not result.get("is_secondary_node", False): + uuid = result["uuid"] + node_uuids.append(uuid) + self._node_info[uuid] = { + "ip": result["mgmt_ip"], + "rpc_port": result.get("rpc_port", ""), + "data_nics": result.get("data_nics", []), + "if_names": [ + nic["if_name"] + for nic in result.get("data_nics", []) + if nic.get("if_name") + ], + } + + num_nodes = len(node_uuids) + self.logger.info(f"[step-1] Found {num_nodes} primary storage nodes: {node_uuids}") + assert num_nodes >= 4, ( + f"Need at least 4 storage nodes for this test, found {num_nodes}" + ) + + # ── Step 2: Create pool ───────────────────────────────────── + self.logger.info("[step-2] Creating storage pool") + self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + pools = self.sbcli_utils.list_storage_pools() + assert self.pool_name in pools, f"Pool {self.pool_name} not created" + sleep_n_sec(5) + + # ── Step 3: Create 3 lvols per node ───────────────────────── + self.logger.info("[step-3] Creating lvols") + node_lvol_names = {} # uuid -> [lvol_name, ...] + for node_uuid in node_uuids: + short_id = node_uuid[:6] + node_lvol_names[node_uuid] = [] + for i in range(self.num_lvols_per_node): + lvol_name = f"mno-{short_id}-{i}" + self.logger.info( + f" Creating lvol {lvol_name} on node {node_uuid} ({self._node_info[node_uuid]['ip']})" + ) + self.sbcli_utils.add_lvol( + lvol_name=lvol_name, + pool_name=self.pool_name, + size=self.lvol_size, + host_id=node_uuid, + distr_ndcs=self.ndcs, + distr_npcs=self.npcs, + distr_bs=self.bs, + distr_chunk_bs=self.chunk_bs, + ) + node_lvol_names[node_uuid].append(lvol_name) + self._lvol_info[lvol_name] = { + "node_uuid": node_uuid, + "device": None, + "mount_path": f"/mnt/mno_{lvol_name}", + "fio_name": None, + } + + total_lvols = sum(len(v) for v in node_lvol_names.values()) + self.logger.info(f"[step-3] Created {total_lvols} lvols across {num_nodes} nodes") + + # ── Step 4: Connect, format, mount all lvols ──────────────── + self.logger.info("[step-4] Connecting, formatting, and mounting all lvols") + for lvol_name, info in self._lvol_info.items(): + initial_devices = self.ssh_obj.get_devices(node=client) + self._connect_lvol(client, lvol_name) + sleep_n_sec(3) + + device = self._detect_new_device(client, initial_devices) + if not device: + raise RuntimeError(f"No new device detected after connecting {lvol_name}") + + info["device"] = device + mount_path = info["mount_path"] + + self.ssh_obj.unmount_path(node=client, device=device) + self.ssh_obj.format_disk(node=client, device=device, fs_type="ext4") + self.ssh_obj.mount_path(node=client, device=device, mount_path=mount_path) + self.logger.info(f" {lvol_name}: {device} → {mount_path}") + + # ── Step 5: Run short FIO (1 per node) and wait ───────────── + self.logger.info("[step-5] Running short FIO on 1 lvol per node (write 1G)") + for node_uuid in node_uuids: + lvol_name = node_lvol_names[node_uuid][0] # first lvol per node + info = self._lvol_info[lvol_name] + fio_name = f"short_{lvol_name}" + info["fio_name"] = fio_name + + self.ssh_obj.run_fio_test( + node=client, + directory=info["mount_path"], + log_file=os.path.join(self.log_path, f"{fio_name}.log"), + name=fio_name, + rw="write", + bs="1M", + size=self.fio_size, + numjobs=1, + nrfiles=4, + runtime=self.short_fio_runtime, + time_based=False, + use_latency=False, + ) + self._completed_lvols.append(lvol_name) + + # Wait for all short FIOs to complete + self.logger.info("[step-5] Waiting for short FIOs to complete") + for lvol_name in self._completed_lvols: + fio_name = self._lvol_info[lvol_name]["fio_name"] + ok = self._wait_fio_complete(client, fio_name, timeout=self.short_fio_runtime + 120) + if not ok: + self.logger.warning(f"Short FIO {fio_name} may not have completed cleanly") + + sleep_n_sec(5) + + # ── Step 6: Compute pre-outage md5sum on completed lvols ──── + self.logger.info("[step-6] Computing pre-outage md5sum checksums") + for lvol_name in self._completed_lvols: + mount_path = self._lvol_info[lvol_name]["mount_path"] + files = self.ssh_obj.find_files(client, directory=mount_path) + if not files or files == [""]: + self.logger.warning(f"No files found in {mount_path} for {lvol_name}") + continue + checksums = self.ssh_obj.generate_checksums(client, files) + self._pre_checksums[lvol_name] = checksums + self.logger.info( + f" {lvol_name}: {len(checksums)} files checksummed" + ) + + assert self._pre_checksums, "No pre-outage checksums computed — aborting" + + # ── Step 7: Pre-outage snapshots + clones ─────────────────── + self.logger.info("[step-7] Creating pre-outage snapshots and clones") + for lvol_name in self._completed_lvols: + lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=lvol_name) + if not lvol_id: + self.logger.warning(f"Cannot find lvol_id for {lvol_name} — skipping snapshot") + continue + + snap_name = f"{lvol_name}_snap_pre" + clone_name = f"{lvol_name}_clone_pre" + self.logger.info(f" Snapshot: {snap_name}, Clone: {clone_name}") + + self._create_snapshot(lvol_id, snap_name) + snap_id = self._get_snapshot_id(snap_name) + if snap_id: + self._create_clone(snap_id, clone_name) + else: + self.logger.warning(f"Could not get snapshot ID for {snap_name}") + + # ── Step 8: Start long FIO on remaining 2 lvols per node ──── + self.logger.info("[step-8] Starting long FIO on remaining lvols") + for node_uuid in node_uuids: + for lvol_name in node_lvol_names[node_uuid][1:]: # lvols 1 and 2 + info = self._lvol_info[lvol_name] + fio_name = f"long_{lvol_name}" + info["fio_name"] = fio_name + + self.ssh_obj.run_fio_test( + node=client, + directory=info["mount_path"], + log_file=os.path.join(self.log_path, f"{fio_name}.log"), + name=fio_name, + rw="randrw", + bs="4K", + size=self.fio_size, + numjobs=4, + iodepth=16, + runtime=self.long_fio_runtime, + time_based=True, + rwmixread=70, + ) + self._running_lvols.append(lvol_name) + + self.logger.info(f"[step-8] {len(self._running_lvols)} long FIOs started") + sleep_n_sec(30) # let FIOs establish + + # ── Steps 9-11: Outage + recovery (overridable) ────────── + self._execute_outage_and_recovery(node_uuids, client) + + # ── Step 12: Kill remaining long FIOs (they may have errored) ─ + self.logger.info("[step-12] Killing remaining long FIO sessions") + for lvol_name in self._running_lvols: + fio_name = self._lvol_info[lvol_name].get("fio_name") + if fio_name: + self._kill_fio_session(client, fio_name) + + sleep_n_sec(10) + + # ── Step 13: Verify md5sum on completed lvols ─────────────── + self.logger.info("[step-13] Verifying data integrity (md5sum) on completed lvols") + checksum_failures = [] + for lvol_name in self._completed_lvols: + if lvol_name not in self._pre_checksums: + self.logger.warning(f"No pre-outage checksum for {lvol_name} — skipping") + continue + + mount_path = self._lvol_info[lvol_name]["mount_path"] + self.logger.info(f" Reconnecting {lvol_name}") + + try: + device = self._reconnect_lvol(client, lvol_name, mount_path) + self._lvol_info[lvol_name]["device"] = device + except Exception as e: + self.logger.error(f" Failed to reconnect {lvol_name}: {e}") + checksum_failures.append(lvol_name) + continue + + files = self.ssh_obj.find_files(client, directory=mount_path) + if not files or files == [""]: + self.logger.error(f" No files found in {mount_path} after recovery") + checksum_failures.append(lvol_name) + continue + + post_checksums = self.ssh_obj.generate_checksums(client, files) + pre_set = set(self._pre_checksums[lvol_name].values()) + post_set = set(post_checksums.values()) + + if pre_set == post_set: + self.logger.info( + f" {lvol_name}: CHECKSUM OK ({len(post_checksums)} files verified)" + ) + else: + self.logger.error( + f" {lvol_name}: CHECKSUM MISMATCH!\n" + f" Pre: {self._pre_checksums[lvol_name]}\n" + f" Post: {post_checksums}" + ) + checksum_failures.append(lvol_name) + + if checksum_failures: + raise AssertionError( + f"Data integrity check failed on {len(checksum_failures)} lvols: {checksum_failures}" + ) + self.logger.info("[step-13] All checksum verifications passed") + + # ── Step 14: Create 1 new lvol per node + run FIO ─────────── + self.logger.info("[step-14] Creating new lvols post-recovery and running FIO") + new_lvol_names = [] + for node_uuid in node_uuids: + short_id = node_uuid[:6] + new_name = f"mno-new-{short_id}" + self.logger.info( + f" Creating {new_name} on node {node_uuid[:8]} ({self._node_info[node_uuid]['ip']})" + ) + self.sbcli_utils.add_lvol( + lvol_name=new_name, + pool_name=self.pool_name, + size=self.lvol_size, + host_id=node_uuid, + distr_ndcs=self.ndcs, + distr_npcs=self.npcs, + distr_bs=self.bs, + distr_chunk_bs=self.chunk_bs, + ) + + # Connect, format, mount + initial_devices = self.ssh_obj.get_devices(node=client) + self._connect_lvol(client, new_name) + sleep_n_sec(3) + device = self._detect_new_device(client, initial_devices) + if not device: + raise RuntimeError(f"No new device for post-recovery lvol {new_name}") + + new_mount = f"/mnt/mno_{new_name}" + self.ssh_obj.unmount_path(node=client, device=device) + self.ssh_obj.format_disk(node=client, device=device, fs_type="ext4") + self.ssh_obj.mount_path(node=client, device=device, mount_path=new_mount) + + # Run short FIO + fio_name = f"post_{new_name}" + self.ssh_obj.run_fio_test( + node=client, + directory=new_mount, + log_file=os.path.join(self.log_path, f"{fio_name}.log"), + name=fio_name, + rw="write", + bs="1M", + size=self.fio_size, + numjobs=1, + nrfiles=4, + runtime=self.short_fio_runtime, + time_based=False, + use_latency=False, + ) + new_lvol_names.append(new_name) + self._lvol_info[new_name] = { + "node_uuid": node_uuid, + "device": device, + "mount_path": new_mount, + "fio_name": fio_name, + } + + # Wait for new FIOs to complete + self.logger.info("[step-14] Waiting for post-recovery FIOs to complete") + for new_name in new_lvol_names: + fio_name = self._lvol_info[new_name]["fio_name"] + ok = self._wait_fio_complete(client, fio_name, timeout=self.short_fio_runtime + 120) + assert ok, f"Post-recovery FIO {fio_name} did not complete" + + self.logger.info("[step-14] All post-recovery FIOs completed successfully") + + # ── Step 15: Post-outage snapshots + clones ───────────────── + self.logger.info("[step-15] Creating post-outage snapshots and clones") + for lvol_name in self._completed_lvols: + lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=lvol_name) + if not lvol_id: + self.logger.warning(f"Cannot find lvol_id for {lvol_name} — skipping") + continue + + snap_name = f"{lvol_name}_snap_post" + clone_name = f"{lvol_name}_clone_post" + self.logger.info(f" Snapshot: {snap_name}, Clone: {clone_name}") + + self._create_snapshot(lvol_id, snap_name) + snap_id = self._get_snapshot_id(snap_name) + if snap_id: + self._create_clone(snap_id, clone_name) + else: + self.logger.warning(f"Could not get snapshot ID for {snap_name}") + + self.logger.info("=" * 70) + self.logger.info("Multi-Node Outage E2E Test PASSED") + self.logger.info("=" * 70) + + +class _TestMultiNodeVMRebootBase(_TestMultiNodeOutageBase): + """VM reboot variant — reboots 3 nodes instead of SPDK crash / network outage.""" + + def _execute_outage_and_recovery(self, node_uuids, client): + """Override: reboot VMs, verify offline + degraded/suspended, wait for recovery.""" + # ── Step 9: Select and reboot nodes ─────────────────────────── + self.logger.info("[step-9] Planning VM reboot outage") + outage_nodes = random.sample(node_uuids, self.num_outage_nodes) + for node_uuid in outage_nodes: + self._outage_plan[node_uuid] = "vm_reboot" + ip = self._node_info[node_uuid]["ip"] + self.logger.info(f" Node {node_uuid[:8]} ({ip}): vm_reboot") + + # Collect pre-outage diagnostics + self.logger.info("[step-9] Collecting pre-outage diagnostics") + try: + self.collect_management_details(suffix="_pre_outage") + except Exception as e: + self.logger.warning(f"Pre-outage diagnostics failed: {e}") + + # Trigger reboots — just send `sudo reboot` and close SSH, + # do NOT wait for reconnect yet (we need to verify offline first). + self.logger.info("[step-9] TRIGGERING VM REBOOTS ON 3 NODES") + for node_uuid in outage_nodes: + node_ip = self._node_info[node_uuid]["ip"] + try: + self.ssh_obj.exec_command( + node=node_ip, command="sudo reboot", max_retries=1 + ) + except Exception: + pass # Expected — connection drops during reboot + # Close SSH connection so subsequent checks don't reuse stale socket + if node_ip in self.ssh_obj.ssh_connections: + try: + self.ssh_obj.ssh_connections[node_ip].close() + except Exception: + pass + del self.ssh_obj.ssh_connections[node_ip] + self.logger.info(f" Reboot triggered for {node_uuid[:8]} ({node_ip})") + + sleep_n_sec(15) # Give nodes time to go down + + # ── Step 10a: Verify nodes are NOT online ───────────────────── + self.logger.info("[step-10] Verifying nodes are offline/unreachable") + for node_uuid in outage_nodes: + try: + self.sbcli_utils.wait_for_storage_node_status( + node_uuid, + status=["offline", "unreachable"], + timeout=120, + ) + self.logger.info(f" Node {node_uuid[:8]} is offline/unreachable (good)") + except TimeoutError: + try: + details = self.sbcli_utils.get_storage_node_details( + storage_node_id=node_uuid + ) + node_status = details[0]["status"] if details else "unknown" + except Exception: + node_status = "unknown" + self.logger.warning( + f" Node {node_uuid[:8]} did not go offline within 120s " + f"(current: {node_status})" + ) + + # ── Step 10b: Verify cluster is degraded or suspended ───────── + self.logger.info("[step-10] Waiting for cluster to become Suspended or Degraded") + try: + self.sbcli_utils.wait_for_cluster_status( + status=["suspended", "degraded"], timeout=600 + ) + self.logger.info("[step-10] Cluster is Suspended/Degraded (outage confirmed)") + except TimeoutError: + cluster_status = self.sbcli_utils.get_cluster_status() + self.logger.warning( + f"Cluster did not reach Suspended/Degraded — " + f"current status: {cluster_status}" + ) + + # ── Step 11: Wait for nodes to come back online ─────────────── + self.logger.info("[step-11] Waiting for all nodes to come back online after reboot") + for node_uuid in outage_nodes: + node_ip = self._node_info[node_uuid]["ip"] + # Poll SSH until the node is reachable again + self.logger.info(f" Waiting for SSH on {node_uuid[:8]} ({node_ip})") + start_time = time.time() + ssh_ok = False + while time.time() - start_time < 600: + try: + self.ssh_obj.connect( + address=node_ip, + bastion_server_address=getattr(self, "bastion_server", None), + ) + self.logger.info(f" SSH reconnected to {node_uuid[:8]} ({node_ip})") + ssh_ok = True + break + except Exception: + sleep_n_sec(10) + if not ssh_ok: + self.logger.error( + f" SSH reconnect failed for {node_uuid[:8]} ({node_ip}) " + f"after 600s" + ) + + # Wait for storage node status to become online + for node_uuid in outage_nodes: + try: + self.sbcli_utils.wait_for_storage_node_status( + node_uuid, status=["online"], timeout=600 + ) + self.logger.info(f" Node {node_uuid[:8]} is online") + except TimeoutError: + self.logger.error( + f" Node {node_uuid[:8]} did NOT come back online within 600s" + ) + raise + + self.logger.info("[step-11] Waiting for cluster to become Active") + try: + self.sbcli_utils.wait_for_cluster_status( + status=["active"], timeout=600 + ) + self.logger.info("[step-11] Cluster is Active") + except TimeoutError: + self.logger.warning("Cluster did not reach Active") + cluster_status = self.sbcli_utils.get_cluster_status() + self.logger.info(f"Current cluster status: {cluster_status}") + raise + + # Collect post-recovery diagnostics + try: + self.collect_management_details(suffix="_post_recovery") + except Exception as e: + self.logger.warning(f"Post-recovery diagnostics failed: {e}") + + sleep_n_sec(30) # settle time after recovery + + +class TestMultiNodeVMRebootDocker(_TestMultiNodeVMRebootBase): + """Docker SSH-based multi-node VM reboot test.""" + + def __init__(self, **kwargs): + kwargs.pop("k8s_run", None) + super().__init__(k8s_run=False, **kwargs) + self.test_name = "multi_node_vm_reboot_docker" + + +class TestMultiNodeVMRebootK8s(_TestMultiNodeVMRebootBase): + """K8s-based multi-node VM reboot test.""" + + def __init__(self, **kwargs): + kwargs.pop("k8s_run", None) + super().__init__(k8s_run=True, **kwargs) + self.test_name = "multi_node_vm_reboot_k8s" + + +class TestMultiNodeOutageDocker(_TestMultiNodeOutageBase): + """Docker SSH-based multi-node outage test.""" + + def __init__(self, **kwargs): + kwargs.pop("k8s_run", None) + super().__init__(k8s_run=False, **kwargs) + self.test_name = "multi_node_outage_docker" + + +class TestMultiNodeOutageK8s(_TestMultiNodeOutageBase): + """K8s-based multi-node outage test (sbcli via kubectl exec).""" + + def __init__(self, **kwargs): + kwargs.pop("k8s_run", None) + super().__init__(k8s_run=True, **kwargs) + self.test_name = "multi_node_outage_k8s" diff --git a/e2e/stress_test/continuous_bulk_lvol_delete.py b/e2e/stress_test/continuous_bulk_lvol_delete.py index 0b8c6a0f3..5449ee782 100755 --- a/e2e/stress_test/continuous_bulk_lvol_delete.py +++ b/e2e/stress_test/continuous_bulk_lvol_delete.py @@ -21,6 +21,7 @@ from __future__ import annotations +import os import random import string import threading @@ -175,6 +176,13 @@ def _wait_lvol_deleted(self, lvol_name, timeout=300): ) return False + def _validate_fio_batch(self, iteration, names): + """Validate FIO liveness + collect logs before deletion. + + Override in Docker/K8s subclasses. Returns failure count. + """ + return 0 + def _run_bulk_iterations(self): results = [] for iteration in range(1, self.NUM_ITERATIONS + 1): @@ -189,14 +197,19 @@ def _run_bulk_iterations(self): ) sleep_n_sec(self.WAIT_AFTER_CREATE) + # Validate FIO before deletion + fio_failures = self._validate_fio_batch(iteration, names) + t_del = time.time() result = self._bulk_delete_sequential(iteration, names) result["delete_duration"] = time.time() - t_del + result["fio_validation_failures"] = fio_failures results.append(result) self.logger.info( f"Iteration {iteration} done: " f"created={result['created']} deleted={result['deleted']} " f"failed={result['failed']} stale={result['stale']} " + f"fio_failures={fio_failures} " f"delete_time={result['delete_duration']:.1f}s" ) @@ -209,6 +222,9 @@ def _run_bulk_iterations(self): total_core_dumps = sum( r.get("core_dumps_detected", 0) for r in results ) + total_fio_failures = sum( + r.get("fio_validation_failures", 0) for r in results + ) if total_core_dumps > 0: raise RuntimeError( @@ -216,6 +232,12 @@ def _run_bulk_iterations(self): f"on storage nodes across {self.NUM_ITERATIONS} iterations" ) + if total_fio_failures > 0: + raise RuntimeError( + f"Bulk delete test detected {total_fio_failures} FIO " + f"validation failures across {self.NUM_ITERATIONS} iterations" + ) + if total_failed > 0: raise RuntimeError( f"Bulk delete test had {total_failed} total failures across " @@ -231,16 +253,21 @@ def _print_bulk_summary(self, results): self.logger.info("=== Bulk Lvol Delete Test Summary ===") self.logger.info( f"{'Iter':>4} | {'Created':>7} | {'Deleted':>7} | " - f"{'Failed':>6} | {'Stale':>5}" + f"{'Failed':>6} | {'Stale':>5} | {'FIO Err':>7}" ) for r in results: + fio_f = r.get("fio_validation_failures", 0) self.logger.info( f"{r['iteration']:>4} | {r['created']:>7} | {r['deleted']:>7} | " - f"{r['failed']:>6} | {r['stale']:>5}" + f"{r['failed']:>6} | {r['stale']:>5} | {fio_f:>7}" ) total_f = sum(r["failed"] for r in results) total_s = sum(r["stale"] for r in results) - self.logger.info(f"Total failures: {total_f} Total stale: {total_s}") + total_fio = sum(r.get("fio_validation_failures", 0) for r in results) + self.logger.info( + f"Total failures: {total_f} Total stale: {total_s} " + f"Total FIO errors: {total_fio}" + ) def _write_monitoring_json(self, results): """Write standardised timing JSON for monitoring suite aggregation.""" @@ -259,16 +286,18 @@ def _write_monitoring_json(self, results): avg_delete = round( sum(t["delete_sec"] for t in per_lvol) / len(per_lvol), 3 ) + fio_f = r.get("fio_validation_failures", 0) phases.append({ "name": f"iteration_{r['iteration']}", "duration_sec": round(r.get("delete_duration", 0), 2), - "status": "ok" if r["failed"] + r["stale"] == 0 else "degraded", + "status": "ok" if r["failed"] + r["stale"] + fio_f == 0 else "degraded", "details": { "created": r["created"], "deleted": r["deleted"], "failed": r["failed"], "stale": r["stale"], "core_dumps_detected": cd, + "fio_validation_failures": fio_f, "avg_delete_sec": avg_delete, "per_lvol_times": per_lvol, }, @@ -466,7 +495,12 @@ def __init__(self, **kwargs): self._run_id = _rand_seq(8) def run(self): - self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + actual_pool = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + if actual_pool and actual_pool != self.pool_name: + self.logger.info( + f"[run] Pool name changed: {self.pool_name} -> {actual_pool}" + ) + self.pool_name = actual_pool storage_nodes = self.sbcli_utils.get_storage_nodes() for result in storage_nodes["results"]: @@ -615,6 +649,112 @@ def _bulk_create(self, iteration): return names + # ── FIO validation ──────────────────────────────────────────────────── + + def _validate_fio_batch(self, iteration, names): + """Check FIO thread liveness + collect and validate FIO logs.""" + self.logger.info( + f"[validate {iteration}] Checking FIO status for " + f"{len(names)} lvols" + ) + failures = 0 + + # 1. Check thread liveness + alive = sum(1 for t in self.fio_threads if t.is_alive()) + dead = len(self.fio_threads) - alive + self.logger.info( + f"[validate {iteration}] FIO threads: {alive} alive, " + f"{dead} dead" + ) + if dead > 0: + failures += dead + self.logger.error( + f"[validate {iteration}] {dead} FIO threads died " + f"during wait" + ) + + # 2. Collect FIO logs from remote clients + validate + log_dir = os.path.join("logs", "ClientLogs") + os.makedirs(log_dir, exist_ok=True) + saved = 0 + for lvol_name in names: + details = self.lvol_mount_details.get(lvol_name, {}) + log_file = details.get("Log") + client = details.get("Client") + if not log_file or not client: + continue + # Save FIO stdout log locally + try: + file_data = self.ssh_obj.read_file(client, log_file) + if file_data: + local_path = os.path.join( + log_dir, f"{lvol_name}_fio.log" + ) + with open(local_path, "w") as f: + f.write(file_data) + saved += 1 + except Exception as e: + self.logger.warning( + f"[collect {iteration}] Failed to save FIO log for " + f"{lvol_name} on {client} (remote: {log_file}): {e}" + ) + # Validate log contents for error keywords + try: + self.common_utils.validate_fio_test(client, log_file) + except RuntimeError as e: + failures += 1 + self.logger.error( + f"[validate {iteration}] FIO error in " + f"{lvol_name} on {client}: {e}" + ) + except Exception: + pass + + # 3. Collect FIO perf logs (iolog, bw, lat, iops files) + for lvol_name in names: + details = self.lvol_mount_details.get(lvol_name, {}) + client = details.get("Client") + iolog_base = details.get("iolog_base_path") + if not client or not iolog_base: + continue + perf_dir = os.path.join(log_dir, f"{lvol_name}_perf") + try: + out, _ = self.ssh_obj.exec_command( + node=client, + command=( + f"bash -lc 'ls {iolog_base}* " + f"2>/dev/null || true'" + ), + ) + perf_files = [ + f.strip() for f in (out or "").splitlines() + if f.strip() + ] + if perf_files: + os.makedirs(perf_dir, exist_ok=True) + for src in perf_files: + fname = os.path.basename(src) + dest = os.path.join(perf_dir, fname) + try: + data = self.ssh_obj.read_file(client, src) + if data: + with open(dest, "w") as f: + f.write(data) + except Exception as e: + self.logger.warning( + f"[validate {iteration}] Failed to collect " + f"perf file for {lvol_name} on {client}: " + f"{src} -> {dest}: {e}" + ) + except Exception: + pass + + self.logger.info( + f"[validate {iteration}] Collected {saved} FIO logs, " + f"{failures} failures" + ) + return failures + # ── Delete (sequential, one-by-one) ────────────────────────────────── def _bulk_delete_sequential(self, iteration, names): @@ -820,6 +960,14 @@ def run(self): ndcs=self.ndcs, npcs=self.npcs, ) + self.k8s_utils.create_storage_class( + name=self.XFS_STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=self.pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + fs_type="xfs", + ) self._run_bulk_iterations() @@ -836,13 +984,16 @@ def _bulk_create(self, iteration): f"({i+1}/{self.NUM_LVOLS})" ) + sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME]) + pvc_fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4" + # Snapshot lvol IDs before PVC creation (for client mode mapping) if self.use_client_fio: old_lvol_ids = self._snapshot_lvol_ids() try: self.k8s_utils.create_pvc( - pvc_name, self.PVC_SIZE, self.STORAGE_CLASS_NAME, + pvc_name, self.PVC_SIZE, sc_name, ) self.k8s_utils.wait_pvc_bound(pvc_name, timeout=300) except Exception as exc: @@ -920,7 +1071,7 @@ def _bulk_create(self, iteration): "client": client, "log_file": log_file, "fs_type": fs_type, - "storage_class": self.STORAGE_CLASS_NAME, + "storage_class": sc_name, } self.lvol_mount_details[lvol_name] = { "ID": lvol_id, @@ -968,7 +1119,8 @@ def _bulk_create(self, iteration): "configmap_name": cm_name, "snapshots": [], "node_id": node_id, - "storage_class": self.STORAGE_CLASS_NAME, + "storage_class": sc_name, + "fs_type": pvc_fs_type, } self.logger.info( @@ -983,6 +1135,125 @@ def _bulk_create(self, iteration): return names + # ── FIO validation ──────────────────────────────────────────────────── + + def _validate_fio_batch(self, iteration, names): + """Check FIO liveness + collect and validate FIO logs.""" + self.logger.info( + f"[validate {iteration}] Checking FIO status for " + f"{len(names)} PVCs" + ) + failures = 0 + log_dir = os.path.join("logs", "ClientLogs") + os.makedirs(log_dir, exist_ok=True) + saved = 0 + + if self.use_client_fio: + # ── Client SSH FIO path ── + for pvc_name in names: + pvc_info = self.pvc_details.get(pvc_name, {}) + log_file = pvc_info.get("log_file") + client = pvc_info.get("client") + if not log_file or not client: + continue + # Save FIO stdout log locally + try: + file_data = self.ssh_obj.read_file(client, log_file) + if file_data: + local_path = os.path.join( + log_dir, f"{pvc_name}_fio.log" + ) + with open(local_path, "w") as f: + f.write(file_data) + saved += 1 + except Exception as e: + self.logger.warning( + f"[validate {iteration}] Unable to save FIO log for " + f"{pvc_name} on {client} ({log_file}): {e}" + ) + # Validate log contents + try: + self.common_utils.validate_fio_test(client, log_file) + except RuntimeError as e: + failures += 1 + self.logger.error( + f"[validate {iteration}] FIO error in " + f"{pvc_name} on {client}: {e}" + ) + except Exception: + pass + else: + # ── K8s Job FIO path ── + fail_words = ["error", "fail", "interrupt", "terminate"] + for pvc_name in names: + pvc_info = self.pvc_details.get(pvc_name, {}) + job_name = pvc_info.get("job_name") + if not job_name: + continue + try: + # Save pod logs + pod_name = self.k8s_utils.get_job_pod_name(job_name) + if not pod_name: + continue + logs = self.k8s_utils.get_pod_logs( + pod_name, tail=2000 + ) + if logs: + local_path = os.path.join( + log_dir, f"{pvc_name}_fio.log" + ) + with open(local_path, "w") as f: + f.write(logs) + saved += 1 + + # Copy FIO perf logs from pod + try: + self._save_fio_pod_logs( + job_name, pvc_name, pvc_name=pvc_name + ) + except Exception: + pass + + # Check pod status — Failed/Error means FIO crashed + status_out, _ = self.k8s_utils._exec_kubectl( + f"get pod {pod_name} " + f"-o jsonpath='{{.status.phase}}'", + supress_logs=True, + ) + pod_phase = (status_out or "").strip() + if pod_phase in ("Failed", "Error"): + failures += 1 + self.logger.error( + f"[validate {iteration}] FIO pod " + f"{pod_name} phase={pod_phase} for " + f"{pvc_name}" + ) + continue + + # Check pod logs for error keywords + if logs: + logs_lower = logs.lower() + for word in fail_words: + if word in logs_lower: + failures += 1 + self.logger.error( + f"[validate {iteration}] FIO " + f"pod logs for {pvc_name} " + f"contain '{word}'" + ) + break + except Exception as exc: + self.logger.warning( + f"[validate {iteration}] Could not check " + f"FIO for {pvc_name}: {exc}" + ) + + self.logger.info( + f"[validate {iteration}] Collected {saved} FIO logs, " + f"{failures} failures" + ) + return failures + # ── Delete (sequential, one-by-one) ────────────────────────────────── def _bulk_delete_sequential(self, iteration, names): diff --git a/e2e/stress_test/continuous_failover_ha_multi_outage_all_nodes.py b/e2e/stress_test/continuous_failover_ha_multi_outage_all_nodes.py old mode 100644 new mode 100755 index 168b890fc..14945b5a5 --- a/e2e/stress_test/continuous_failover_ha_multi_outage_all_nodes.py +++ b/e2e/stress_test/continuous_failover_ha_multi_outage_all_nodes.py @@ -1,3 +1,4 @@ +import os import random import threading import time @@ -147,6 +148,39 @@ def run(self): ) self.logger.info( - f"max_fault_tolerance={max_fault_tolerance} — proceeding with all-nodes outage test." + f"max_fault_tolerance={max_fault_tolerance} — proceeding " + f"with all-nodes outage test." ) - super().run() + + # Start full pcap capture on all nodes for network diagnostics + all_node_ips = set( + self.storage_nodes + self.mgmt_nodes + self.fio_node + ) + self.logger.info( + f"Starting full pcap capture on {len(all_node_ips)} nodes" + ) + for node_ip in all_node_ips: + try: + node_log_dir = os.path.join( + self.docker_logs_path, node_ip, + ) + self.ssh_obj.make_directory( + node=node_ip, dir_name=node_log_dir, + ) + self.ssh_obj.start_full_pcap_capture( + node_ip, node_log_dir, + ) + except Exception as exc: + self.logger.warning( + f"Failed to start pcap on {node_ip}: {exc}" + ) + + try: + super().run() + finally: + # Stop pcap capture on all nodes + for node_ip in all_node_ips: + try: + self.ssh_obj.stop_full_pcap_capture(node_ip) + except Exception: + pass diff --git a/e2e/stress_test/continuous_k8s_native_failover.py b/e2e/stress_test/continuous_k8s_native_failover.py index ab5ccfe77..035c62590 100755 --- a/e2e/stress_test/continuous_k8s_native_failover.py +++ b/e2e/stress_test/continuous_k8s_native_failover.py @@ -69,6 +69,7 @@ def __init__(self, **kwargs): # K8s resource naming self.STORAGE_CLASS_NAME = "simplyblock-csi-sc" + self.XFS_STORAGE_CLASS_NAME = "simplyblock-csi-sc-xfs" self.CRYPTO_STORAGE_CLASS_NAME = "simplyblock-csi-sc-crypto" self.CRYPTO_POOL_NAME = "encryption-pool" self.SNAPSHOT_CLASS_NAME = "simplyblock-csi-snapshotclass" @@ -1192,16 +1193,17 @@ def create_pvcs_with_fio(self, count: int, node_ids: list[str] = None, pvc_name = f"pvc-{_rand_seq(12)}" target_node = node_ids[i] if node_ids and i < len(node_ids) else None - # Determine StorageClass: explicit > 50/50 alternation > regular + # Determine StorageClass: explicit > TLS alternation > random ext4/xfs if storage_class: sc_name = storage_class elif self.tls_enabled and (existing_count + i) % 2 == 1: sc_name = self.CRYPTO_STORAGE_CLASS_NAME else: - sc_name = self.STORAGE_CLASS_NAME + sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME]) + fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4" self.logger.info( - f"[create_pvc] Creating PVC {pvc_name} ({i+1}/{count}) SC={sc_name}" + f"[create_pvc] Creating PVC {pvc_name} ({i+1}/{count}) SC={sc_name} fs={fs_type}" + (f" pinned to node {target_node}" if target_node else "") ) @@ -1358,10 +1360,11 @@ def create_pvcs_with_fio(self, count: int, node_ids: list[str] = None, "snapshots": [], "node_id": node_id, "storage_class": sc_name, + "fs_type": fs_type, } self.logger.info( - f"[create_pvc] PVC {pvc_name} on node {node_id} with FIO Job {job_name} SC={sc_name}" + f"[create_pvc] PVC {pvc_name} on node {node_id} with FIO Job {job_name} SC={sc_name} fs={fs_type}" ) if node_id: @@ -1431,8 +1434,9 @@ def create_snapshots_and_clones(self): # Snapshot lvol IDs before clone PVC (for client mode mapping) old_lvol_ids = self._snapshot_lvol_ids() if self.use_client_fio else set() - # Create clone PVC — use same StorageClass as source PVC + # Create clone PVC — use same StorageClass/fs_type as source PVC clone_sc = self.pvc_details.get(pvc_name, {}).get("storage_class", self.STORAGE_CLASS_NAME) + clone_fs_type = self.pvc_details.get(pvc_name, {}).get("fs_type", "ext4") sleep_n_sec(10) try: self.k8s_utils.create_clone_pvc( @@ -1487,6 +1491,7 @@ def create_snapshots_and_clones(self): "client": client, "log_file": None, "storage_class": clone_sc, + "fs_type": clone_fs_type, } continue @@ -1512,6 +1517,7 @@ def create_snapshots_and_clones(self): "client": client, "log_file": log_file, "storage_class": clone_sc, + "fs_type": clone_fs_type, } self.clone_mount_details[clone_lvol_name] = { "ID": clone_lvol_id, @@ -1551,6 +1557,7 @@ def create_snapshots_and_clones(self): "job_name": clone_job, "configmap_name": clone_cm, "storage_class": clone_sc, + "fs_type": clone_fs_type, } # Resize source PVC and clone PVC @@ -2754,6 +2761,14 @@ def run(self): ndcs=self.ndcs, npcs=self.npcs, ) + self.k8s_utils.create_storage_class( + name=self.XFS_STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=self.pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + fs_type="xfs", + ) if self.tls_enabled: self.logger.info("TLS enabled — ensuring encryption pool exists") self.sbcli_utils.ensure_pool_exists( @@ -2960,8 +2975,9 @@ def create_snapshots_and_clones_with_cleanup(self, count: int = None): # Snapshot lvol IDs before clone PVC (for client mode mapping) old_lvol_ids = self._snapshot_lvol_ids() if self.use_client_fio else set() - # Create clone PVC — use same StorageClass as source PVC + # Create clone PVC — use same StorageClass/fs_type as source PVC clone_sc = self.pvc_details.get(pvc_name, {}).get("storage_class", self.STORAGE_CLASS_NAME) + clone_fs_type = self.pvc_details.get(pvc_name, {}).get("fs_type", "ext4") sleep_n_sec(10) try: self.k8s_utils.create_clone_pvc( @@ -3060,6 +3076,7 @@ def create_snapshots_and_clones_with_cleanup(self, count: int = None): "job_name": clone_job, "configmap_name": clone_cm, "storage_class": clone_sc, + "fs_type": clone_fs_type, } # Resize source PVC and clone PVC @@ -3134,6 +3151,14 @@ def run(self): ndcs=self.ndcs, npcs=self.npcs, ) + self.k8s_utils.create_storage_class( + name=self.XFS_STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=self.pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + fs_type="xfs", + ) self.k8s_utils.delete_volume_snapshot_class(self.SNAPSHOT_CLASS_NAME) self.k8s_utils.create_volume_snapshot_class(self.SNAPSHOT_CLASS_NAME) sleep_n_sec(5) @@ -3321,13 +3346,14 @@ def _create_pvcs_deferred(self, count: int): self._ensure_k8s_utils() for i in range(count): pvc_name = f"pvc-{_rand_seq(12)}" + sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME]) self.logger.info( f"[deferred_create] Creating PVC {pvc_name} " - f"({i+1}/{count}) — will bind after recovery" + f"({i+1}/{count}) SC={sc_name} — will bind after recovery" ) try: self.k8s_utils.create_pvc( - pvc_name, self.pvc_size, self.STORAGE_CLASS_NAME, + pvc_name, self.pvc_size, sc_name, ) except Exception as exc: self.logger.warning( @@ -3579,10 +3605,11 @@ def _create_permanent_snapshots_and_clones(self): self._snapshot_lvol_ids() if self.use_client_fio else set() ) - # Create clone PVC — use same StorageClass as source PVC + # Create clone PVC — use same StorageClass/fs_type as source PVC clone_sc = self.pvc_details.get(pvc_name, {}).get( "storage_class", self.STORAGE_CLASS_NAME ) + clone_fs_type = self.pvc_details.get(pvc_name, {}).get("fs_type", "ext4") sleep_n_sec(10) try: self.k8s_utils.create_clone_pvc( @@ -3659,6 +3686,7 @@ def _create_permanent_snapshots_and_clones(self): "client": client, "log_file": log_file, "storage_class": clone_sc, + "fs_type": clone_fs_type, } self.clone_mount_details[clone_lvol_name] = { "ID": clone_lvol_id, @@ -3702,6 +3730,7 @@ def _create_permanent_snapshots_and_clones(self): "job_name": clone_job, "configmap_name": clone_cm, "storage_class": clone_sc, + "fs_type": clone_fs_type, } self.logger.info( @@ -4120,6 +4149,14 @@ def run(self): ndcs=self.ndcs, npcs=self.npcs, ) + self.k8s_utils.create_storage_class( + name=self.XFS_STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=self.pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + fs_type="xfs", + ) if self.tls_enabled: self.logger.info("TLS enabled — ensuring encryption pool exists") self.sbcli_utils.ensure_pool_exists( diff --git a/e2e/stress_test/continuous_parallel_lvol_snapshot_clone.py b/e2e/stress_test/continuous_parallel_lvol_snapshot_clone.py old mode 100644 new mode 100755 index 7285b2354..96d6a7689 --- a/e2e/stress_test/continuous_parallel_lvol_snapshot_clone.py +++ b/e2e/stress_test/continuous_parallel_lvol_snapshot_clone.py @@ -1,8 +1,11 @@ +import json as _json import os import time import threading from collections import deque from concurrent.futures import ThreadPoolExecutor +from datetime import datetime, timezone +from pathlib import Path from e2e_tests.cluster_test_base import TestClusterBase, generate_random_sequence from utils.common_utils import sleep_n_sec @@ -94,6 +97,11 @@ def __init__(self, **kwargs): # clone_registry[clone_name] = { id, client, mount_path, snap_name, delete_state } self._clone_registry = {} + # Per-operation timing: list of (wall_ts, op_type, duration_sec, ok) + self._op_events: list[tuple] = [] + # Inventory timeline: list of (wall_ts, lvols, snapshots, clones) + self._inventory_timeline: list[tuple] = [] + # Metrics self._metrics = { "start_ts": None, @@ -158,6 +166,33 @@ def _inc(self, bucket: str, key: str, n: int = 1): with self._lock: self._metrics[bucket][key] += n + def _record_op(self, op: str, duration: float, ok: bool): + """Append a timing event (thread-safe).""" + with self._lock: + self._op_events.append((time.time(), op, duration, ok)) + + def _snapshot_inventory(self): + """Record current inventory counts (thread-safe).""" + with self._lock: + self._inventory_timeline.append(( + time.time(), + len(self._lvol_registry), + len(self._snap_registry), + len(self._clone_registry), + )) + + def _timed(self, op: str, fn, *args, **kwargs): + """Wrap a task function with timing collection.""" + t0 = time.time() + ok = True + try: + return fn(*args, **kwargs) + except Exception: + ok = False + raise + finally: + self._record_op(op, time.time() - t0, ok) + def _set_failure(self, op: str, exc: Exception, details: str = "", ctx: dict = None, api_err: dict = None): with self._lock: if self._metrics["failure_info"] is None: @@ -1028,7 +1063,7 @@ def _submit_creates(self, ex, create_f: dict, idx_counter: dict): idx = idx_counter["idx"] idx_counter["idx"] += 1 lvol_name = f"lvl{generate_random_sequence(15)}_{idx}_{int(time.time())}" - f = ex.submit(lambda i=idx, n=lvol_name: self._task_create_lvol(i, n)) + f = ex.submit(lambda i=idx, n=lvol_name: self._timed("create_lvol", self._task_create_lvol, i, n)) create_f[f] = time.time() def _submit_snapshots(self, ex, snap_f: dict): @@ -1053,7 +1088,7 @@ def _submit_snapshots(self, ex, snap_f: dict): lvol_name, lvol_id = candidate snap_name = f"snap{generate_random_sequence(15)}_{int(time.time())}" - f = ex.submit(lambda ln=lvol_name, lid=lvol_id, sn=snap_name: self._task_create_snapshot(ln, lid, sn)) + f = ex.submit(lambda ln=lvol_name, lid=lvol_id, sn=snap_name: self._timed("create_snapshot", self._task_create_snapshot, ln, lid, sn)) snap_f[f] = time.time() def _submit_clones(self, ex, clone_f: dict): @@ -1079,7 +1114,7 @@ def _submit_clones(self, ex, clone_f: dict): snap_name, snap_id = candidate idx = int(time.time()) clone_name = f"cln{generate_random_sequence(15)}_{idx}_{int(time.time())}" - f = ex.submit(lambda s=snap_name, sid=snap_id, i=idx, cn=clone_name: self._task_create_clone(s, sid, i, cn)) + f = ex.submit(lambda s=snap_name, sid=snap_id, i=idx, cn=clone_name: self._timed("create_clone", self._task_create_clone, s, sid, i, cn)) clone_f[f] = time.time() def _submit_snapshot_delete_trees(self, ex, snap_del_f: dict): @@ -1088,7 +1123,7 @@ def _submit_snapshot_delete_trees(self, ex, snap_del_f: dict): if not self._snapshot_delete_tree_q: return sn = self._snapshot_delete_tree_q.popleft() - f = ex.submit(lambda sn=sn: self._task_delete_snapshot_tree(sn)) + f = ex.submit(lambda sn=sn: self._timed("delete_snapshot_tree", self._task_delete_snapshot_tree, sn)) snap_del_f[f] = time.time() def _submit_lvol_delete_trees(self, ex, lvol_del_f: dict): @@ -1097,7 +1132,7 @@ def _submit_lvol_delete_trees(self, ex, lvol_del_f: dict): if not self._lvol_delete_tree_q: return ln = self._lvol_delete_tree_q.popleft() - f = ex.submit(lambda ln=ln: self._task_delete_lvol_tree(ln)) + f = ex.submit(lambda ln=ln: self._timed("delete_lvol_tree", self._task_delete_lvol_tree, ln)) lvol_del_f[f] = time.time() def _update_peaks(self, create_f, snap_f, clone_f, snap_del_f, lvol_del_f): @@ -1194,6 +1229,269 @@ def _print_summary(self): self.logger.info("===========================================================") + # ---------------------------- + # Monitoring JSON + Charts + # ---------------------------- + def _write_monitoring_json(self): + """Persist metrics, per-op timing, and inventory timeline to JSON.""" + out_dir = Path("logs") + out_dir.mkdir(parents=True, exist_ok=True) + + with self._lock: + start_ts = self._metrics["start_ts"] or time.time() + end_ts = self._metrics["end_ts"] or time.time() + dur = end_ts - start_ts + + # Build per-operation latency summaries + op_latencies: dict[str, list[float]] = {} + for _, op, duration, ok in self._op_events: + if ok: + op_latencies.setdefault(op, []).append(duration) + + op_summary = {} + for op, lats in op_latencies.items(): + lats_sorted = sorted(lats) + n = len(lats_sorted) + op_summary[op] = { + "count": n, + "min": round(lats_sorted[0], 2) if n else 0, + "max": round(lats_sorted[-1], 2) if n else 0, + "avg": round(sum(lats_sorted) / n, 2) if n else 0, + "p50": round(lats_sorted[n // 2], 2) if n else 0, + "p90": round(lats_sorted[int(n * 0.9)], 2) if n else 0, + "p99": round(lats_sorted[int(n * 0.99)], 2) if n else 0, + } + + # Throughput: ops/min buckets + if self._op_events: + bucket_size = 60 # 1-minute buckets + throughput_buckets: dict[int, dict[str, int]] = {} + for ts, op, _, ok in self._op_events: + if ok: + bucket = int((ts - start_ts) // bucket_size) + throughput_buckets.setdefault(bucket, {}) + throughput_buckets[bucket][op] = throughput_buckets[bucket].get(op, 0) + 1 + throughput_timeline = [ + {"minute": b, **counts} + for b, counts in sorted(throughput_buckets.items()) + ] + else: + throughput_timeline = [] + + report = { + "test_class": self.__class__.__name__, + "timestamp": datetime.now(timezone.utc).isoformat(), + "status": "passed" if not self._metrics["failure_info"] else "failed", + "duration_sec": round(dur, 2), + "geometry": {"ndcs": self.ndcs, "npcs": self.npcs}, + "config": { + "create_inflight": self.CREATE_INFLIGHT, + "snapshot_inflight": self.SNAPSHOT_INFLIGHT, + "clone_inflight": self.CLONE_INFLIGHT, + "total_inventory_max": self.TOTAL_INVENTORY_MAX, + "total_delete_threshold": self.TOTAL_DELETE_THRESHOLD, + "lvol_size": self.LVOL_SIZE, + }, + "counts": dict(self._metrics["counts"]), + "attempts": dict(self._metrics["attempts"]), + "success": dict(self._metrics["success"]), + "failures": dict(self._metrics["failures"]), + "peak_inflight": dict(self._metrics["peak_inflight"]), + "op_latency_summary": op_summary, + "throughput_per_minute": throughput_timeline, + "op_events": [ + {"ts": round(ts - start_ts, 2), "op": op, + "duration": round(d, 2), "ok": ok} + for ts, op, d, ok in self._op_events + ], + "inventory_timeline": [ + {"ts": round(ts - start_ts, 2), "lvols": lv, + "snapshots": sn, "clones": cl} + for ts, lv, sn, cl in self._inventory_timeline + ], + } + + out_path = out_dir / "parallel_lvol_snapshot_clone_timing.json" + with open(out_path, "w") as f: + _json.dump(report, f, indent=2) + self.logger.info(f"Monitoring JSON written to {out_path}") + + def _generate_charts(self): + """Generate performance charts from collected timing data.""" + out_dir = Path("logs") + out_dir.mkdir(parents=True, exist_ok=True) + + try: + import matplotlib + matplotlib.use("Agg") + import matplotlib.pyplot as plt + except ImportError: + self.logger.warning("matplotlib not available — skipping charts") + return + + with self._lock: + start_ts = self._metrics["start_ts"] or 0 + op_events = list(self._op_events) + inv_timeline = list(self._inventory_timeline) + counts = dict(self._metrics["counts"]) + + class_name = self.__class__.__name__ + + # --- Chart 1: Operation latency scatter --- + try: + if op_events: + fig, ax = plt.subplots(figsize=(14, 6)) + op_colors = { + "create_lvol": "#3498db", + "create_snapshot": "#2ecc71", + "create_clone": "#f39c12", + "delete_snapshot_tree": "#e74c3c", + "delete_lvol_tree": "#9b59b6", + } + for op, color in op_colors.items(): + pts = [(ts - start_ts, d) for ts, o, d, ok in op_events if o == op and ok] + if pts: + xs, ys = zip(*pts) + ax.scatter(xs, ys, c=color, alpha=0.5, s=12, label=op) + ax.set_xlabel("Time (seconds since start)") + ax.set_ylabel("Duration (seconds)") + ax.set_title(f"{class_name} — Operation Latency Over Time") + ax.legend(fontsize=8, loc="upper right") + ax.grid(True, alpha=0.3) + plt.tight_layout() + fig.savefig(str(out_dir / "op_latency_scatter.png"), dpi=150) + plt.close(fig) + self.logger.info("Chart saved: op_latency_scatter.png") + except Exception as exc: + self.logger.warning(f"Latency scatter chart failed: {exc}") + + # --- Chart 2: Inventory timeline (stacked area) --- + try: + if inv_timeline: + ts_vals = [t - start_ts for t, _, _, _ in inv_timeline] + lvols = [lv for _, lv, _, _ in inv_timeline] + snaps = [sn for _, _, sn, _ in inv_timeline] + clones = [cl for _, _, _, cl in inv_timeline] + + fig, ax = plt.subplots(figsize=(14, 5)) + ax.stackplot(ts_vals, lvols, snaps, clones, + labels=["LVols", "Snapshots", "Clones"], + colors=["#3498db", "#2ecc71", "#f39c12"], alpha=0.7) + ax.axhline(y=self.TOTAL_INVENTORY_MAX, color="red", + linestyle="--", alpha=0.6, label=f"Max ({self.TOTAL_INVENTORY_MAX})") + ax.axhline(y=self.TOTAL_DELETE_THRESHOLD, color="orange", + linestyle="--", alpha=0.6, label=f"Delete threshold ({self.TOTAL_DELETE_THRESHOLD})") + ax.set_xlabel("Time (seconds since start)") + ax.set_ylabel("Count") + ax.set_title(f"{class_name} — Inventory Over Time") + ax.legend(fontsize=8, loc="upper left") + ax.grid(True, alpha=0.3) + plt.tight_layout() + fig.savefig(str(out_dir / "inventory_timeline.png"), dpi=150) + plt.close(fig) + self.logger.info("Chart saved: inventory_timeline.png") + except Exception as exc: + self.logger.warning(f"Inventory timeline chart failed: {exc}") + + # --- Chart 3: Throughput (ops/min bar chart) --- + try: + if op_events: + bucket_size = 60 + buckets: dict[int, dict[str, int]] = {} + for ts, op, _, ok in op_events: + if ok: + b = int((ts - start_ts) // bucket_size) + buckets.setdefault(b, {}) + buckets[b][op] = buckets[b].get(op, 0) + 1 + + if buckets: + max_bucket = max(buckets.keys()) + minutes = list(range(max_bucket + 1)) + op_types = sorted({op for c in buckets.values() for op in c}) + op_colors_list = ["#3498db", "#2ecc71", "#f39c12", "#e74c3c", "#9b59b6"] + + fig, ax = plt.subplots(figsize=(14, 5)) + bottom = [0] * len(minutes) + for i, op in enumerate(op_types): + vals = [buckets.get(m, {}).get(op, 0) for m in minutes] + color = op_colors_list[i % len(op_colors_list)] + ax.bar(minutes, vals, bottom=bottom, label=op, + color=color, alpha=0.8, width=0.8) + bottom = [b + v for b, v in zip(bottom, vals)] + ax.set_xlabel("Minute") + ax.set_ylabel("Completed Operations") + ax.set_title(f"{class_name} — Throughput (ops/min)") + ax.legend(fontsize=8, loc="upper right") + ax.grid(True, axis="y", alpha=0.3) + plt.tight_layout() + fig.savefig(str(out_dir / "throughput_per_minute.png"), dpi=150) + plt.close(fig) + self.logger.info("Chart saved: throughput_per_minute.png") + except Exception as exc: + self.logger.warning(f"Throughput chart failed: {exc}") + + # --- Chart 4: Operations summary (total counts bar) --- + try: + creates = [ + ("LVols created", counts.get("lvols_created", 0)), + ("Snapshots created", counts.get("snapshots_created", 0)), + ("Clones created", counts.get("clones_created", 0)), + ] + deletes = [ + ("LVols deleted", counts.get("lvols_deleted", 0)), + ("Snapshots deleted", counts.get("snapshots_deleted", 0)), + ("Clones deleted", counts.get("clones_deleted", 0)), + ] + labels = [c[0] for c in creates] + [d[0] for d in deletes] + values = [c[1] for c in creates] + [d[1] for d in deletes] + colors = ["#3498db", "#2ecc71", "#f39c12", "#e74c3c", "#c0392b", "#d35400"] + + fig, ax = plt.subplots(figsize=(10, 5)) + bars = ax.bar(range(len(labels)), values, color=colors, alpha=0.8) + ax.set_xticks(range(len(labels))) + ax.set_xticklabels(labels, rotation=30, ha="right", fontsize=9) + ax.set_ylabel("Count") + ax.set_title(f"{class_name} — Total Operations") + for b, v in zip(bars, values): + if v > 0: + ax.text(b.get_x() + b.get_width() / 2, + b.get_height() + max(values) * 0.02, + str(v), ha="center", va="bottom", fontsize=9) + ax.grid(True, axis="y", alpha=0.3) + plt.tight_layout() + fig.savefig(str(out_dir / "operations_summary.png"), dpi=150) + plt.close(fig) + self.logger.info("Chart saved: operations_summary.png") + except Exception as exc: + self.logger.warning(f"Operations summary chart failed: {exc}") + + # --- Chart 5: Latency box plot per operation --- + try: + op_latencies: dict[str, list[float]] = {} + for _, op, d, ok in op_events: + if ok: + op_latencies.setdefault(op, []).append(d) + + if op_latencies: + fig, ax = plt.subplots(figsize=(10, 5)) + ops = sorted(op_latencies.keys()) + data = [op_latencies[op] for op in ops] + bp = ax.boxplot(data, tick_labels=ops, patch_artist=True) + box_colors = ["#3498db", "#2ecc71", "#f39c12", "#e74c3c", "#9b59b6"] + for i, patch in enumerate(bp["boxes"]): + patch.set_facecolor(box_colors[i % len(box_colors)]) + patch.set_alpha(0.7) + ax.set_ylabel("Duration (seconds)") + ax.set_title(f"{class_name} — Latency Distribution Per Operation") + ax.tick_params(axis="x", rotation=30) + ax.grid(True, axis="y", alpha=0.3) + plt.tight_layout() + fig.savefig(str(out_dir / "latency_boxplot.png"), dpi=150) + plt.close(fig) + self.logger.info("Chart saved: latency_boxplot.png") + except Exception as exc: + self.logger.warning(f"Latency box plot failed: {exc}") + # ---------------------------- # Main # ---------------------------- @@ -1248,6 +1546,9 @@ def run(self): self._submit_snapshot_delete_trees(ex, snap_del_f) self._submit_lvol_delete_trees(ex, lvol_del_f) + # Record inventory snapshot every loop iteration + self._snapshot_inventory() + # Update peaks and harvest self._update_peaks(create_f, snap_f, clone_f, snap_del_f, lvol_del_f) self._harvest_fail_fast(create_f) @@ -1270,6 +1571,8 @@ def run(self): finally: self._print_summary() + self._write_monitoring_json() + self._generate_charts() with self._lock: failure_info = self._metrics["failure_info"] diff --git a/e2e/stress_test/continuous_parallel_namespace_lvol.py b/e2e/stress_test/continuous_parallel_namespace_lvol.py index cef2a8f8d..a6c3f8b50 100755 --- a/e2e/stress_test/continuous_parallel_namespace_lvol.py +++ b/e2e/stress_test/continuous_parallel_namespace_lvol.py @@ -1,10 +1,14 @@ """ Parallel Namespace LVol Stress Test (Docker + K8s) -Creates 300 parent lvols each with 6 namespace partitions (1800 total), -takes 2 snapshots per lvol (3600 total), clones 1 picked snapshot 1500 times, -then deletes everything in parallel — with verified deletion. Repeats for -NUM_ITERATIONS cycles to measure latency degradation over time. +Creates 100 parent lvols each with 50 namespace children (5100 total lvols), +writes 10 MB data to each parent, takes 2 snapshots per parent (+ 1 random +child), clones 1 picked snapshot 1500 times, verifies everything, then deletes +in parallel — with verified deletion. Repeats for NUM_ITERATIONS cycles to +measure latency degradation over time. + +**Sequential per-parent flow**: for each parent, all 50 children are created +and verified before moving to the next parent. Any failure aborts the test. Two variants: - TestParallelNamespaceLvolDocker: sbcli API (add_lvol with namespace=) @@ -23,8 +27,10 @@ import time from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime, timezone from e2e_tests.cluster_test_base import TestClusterBase from utils.common_utils import sleep_n_sec +from utils.ssh_utils import RunnerK8sLog try: import requests @@ -47,12 +53,12 @@ def __init__(self, **kwargs): super().__init__(**kwargs) # ── Scale ────────────────────────────────────────────────────────── - self.NUM_PARENTS = 300 - self.NAMESPACES_PER_PARENT = 100 # max_namespace_per_subsys - self.CHILDREN_PER_PARENT = 5 # 300 × 5 = 1500 children - self.SNAPSHOTS_PER_LVOL = 2 # per parent + 1 random child + self.NUM_PARENTS = 20 + self.NAMESPACES_PER_PARENT = 26 # max_namespace_per_subsys (parent + 25 children) + self.CHILDREN_PER_PARENT = 25 # 20 × 25 = 500 children + self.SNAPSHOTS_PER_LVOL = 2 # per parent + 1 random child → ~42 total self.NUM_CLONES = 1500 # from 1 picked snapshot - self.NUM_ITERATIONS = 20 + self.NUM_ITERATIONS = 1 # ── Sizing ───────────────────────────────────────────────────────── self.LVOL_SIZE = "1G" @@ -63,14 +69,18 @@ def __init__(self, **kwargs): self.MAX_WORKERS_DELETE = 30 self.BATCH_SIZE = 50 self.TASK_TIMEOUT = 300 + self.PARALLEL_PARENTS = 10 # concurrent parents during child creation + self.CLONE_BATCH_SIZE = 250 # clone creation batch size for stats + self.CLONE_BIND_TIMEOUT = 3600 # 1 hour — large clone batches queue in CSI # ── Retry ───────────────────────────────────────────────────────── self.RETRY_MAX = 10 - self.RETRY_INTERVAL = 5 + self.RETRY_INTERVAL = 30 # ── Thread-safe state ───────────────────────────────────────────── self._lock = threading.Lock() self._stop_event = threading.Event() + self._clones_binding = 0 # how many clones waiting for Bound right now # parent_name -> {id, children: [child_name], snapshots: [snap_name]} self._parent_registry = {} @@ -83,8 +93,10 @@ def __init__(self, **kwargs): # ── Timing samples ──────────────────────────────────────────────── self._timing_samples = [] # list of dicts + self._batch_timings = [] # batch-level summaries for graphs self._iteration_timings = [] # per-iteration phase durations self._current_iteration = 0 + self._snapshot_child = None # pre-selected child for snapshot (set in write_data) # ── Metrics ─────────────────────────────────────────────────────── self._metrics = { @@ -133,15 +145,55 @@ def _snapshot_inventory(self) -> dict: "clones": clones, "total": lvols + snaps + clones, } - def _record_timing(self, op: str, name: str, elapsed: float, inventory: dict): + def _record_timing(self, op: str, name: str, elapsed: float, + inventory: dict, api_elapsed: float = None): with self._lock: - self._timing_samples.append({ + sample = { "iteration": self._current_iteration, "op": op, "name": name, "elapsed_sec": round(elapsed, 4), "inventory": inventory, "timestamp": time.time(), + } + if api_elapsed is not None: + sample["api_elapsed_sec"] = round(api_elapsed, 4) + self._timing_samples.append(sample) + + def _log_op_stats(self, op: str, batch_label: str = "", + batch_elapsed: float = 0, count: int = 0): + """Log avg/p50/p95 stats for a given op in the current iteration.""" + with self._lock: + samples = [ + s["elapsed_sec"] for s in self._timing_samples + if s["iteration"] == self._current_iteration and s["op"] == op + ] + if not samples: + return + samples_sorted = sorted(samples) + n = len(samples_sorted) + avg = sum(samples_sorted) / n + p50 = samples_sorted[n // 2] + p95 = samples_sorted[min(int(n * 0.95), n - 1)] + mn, mx = samples_sorted[0], samples_sorted[-1] + tag = f" ({batch_label})" if batch_label else "" + self.logger.info( + f"[{op}]{tag}: {count or n} ops in {batch_elapsed:.1f}s — " + f"avg={avg:.2f}s p50={p50:.2f}s p95={p95:.2f}s " + f"min={mn:.2f}s max={mx:.2f}s" + ) + with self._lock: + self._batch_timings.append({ + "iteration": self._current_iteration, + "op": op, + "batch_label": batch_label, + "batch_elapsed": round(batch_elapsed, 2), + "count": count or n, + "avg": round(avg, 4), + "p50": round(p50, 4), + "p95": round(p95, 4), + "min": round(mn, 4), + "max": round(mx, 4), }) # ── API error helpers (reused from existing parallel test) ──────────── @@ -184,6 +236,14 @@ def _is_sync_deletion_error(self, api_err: dict) -> bool: msg = (api_err.get("msg") or "").lower() return "lvol sync deletion found" in text or "lvol sync deletion found" in msg + def _is_already_exists_error(self, api_err: dict) -> bool: + """Detect 'LVol name must be unique' — resource was created by a + prior attempt that appeared to fail but actually succeeded.""" + text = (api_err.get("text") or "").lower() + msg = (api_err.get("msg") or "").lower() + return ("must be unique" in text or "must be unique" in msg + or "already exists" in text or "already exists" in msg) + def _api_retry(self, op: str, fn, ctx: dict = None): """Call fn() with retry. Returns fn() result on success.""" ctx = ctx or {} @@ -196,6 +256,14 @@ def _api_retry(self, op: str, fn, ctx: dict = None): self._inc("failures", op) self.logger.warning(f"[max_lvols] op={op} ctx={ctx}") raise + # "Name must be unique" means a prior attempt actually + # succeeded — treat as success, not failure + if self._is_already_exists_error(api_err): + self.logger.info( + f"[retry] op={op} resource already exists " + f"(prior attempt succeeded): ctx={ctx}" + ) + return None # treat as success if attempt < self.RETRY_MAX: self.logger.warning( f"[retry] op={op} attempt {attempt}/{self.RETRY_MAX} " @@ -251,6 +319,230 @@ def _wait_snapshot_gone(self, snap_name: str, timeout: int = 120) -> float: self.logger.warning(f"snapshot {snap_name} still exists after {timeout}s") return time.time() - start + # ── Verification helpers ────────────────────────────────────────────── + + def _verify_all_lvols_exist(self): + """Verify registered parents and children exist in lvol list. + + Retries up to 30 minutes to allow resources to settle. + Warns for missing, only fails if >50% missing. + """ + max_wait = 1800 # 30 minutes + poll_interval = 30 + waited = 0 + + with self._lock: + total = len(self._parent_registry) + len(self._child_registry) + + while waited <= max_wait: + all_lvols = self.sbcli_utils.list_lvols() + missing = [] + with self._lock: + for name in self._parent_registry: + if name not in all_lvols: + missing.append(("parent", name)) + for name in self._child_registry: + if name not in all_lvols: + missing.append(("child", name)) + + miss_pct = len(missing) * 100 / max(total, 1) + if miss_pct <= 50: + break # Within tolerance + + if waited < max_wait: + self.logger.info( + f"[verify_lvols] {len(missing)}/{total} ({miss_pct:.1f}%) " + f"lvols missing, waiting {poll_interval}s... " + f"(waited {waited}s/{max_wait}s)" + ) + sleep_n_sec(poll_interval) + waited += poll_interval + else: + break # Exhausted wait time + + miss_pct = len(missing) * 100 / max(total, 1) + if missing: + self.logger.warning( + f"[verify_lvols] {len(missing)}/{total} ({miss_pct:.1f}%) " + f"lvols missing from API after {waited}s wait: " + f"{missing[:10]}{'...' if len(missing) > 10 else ''}" + ) + if miss_pct > 50: + raise RuntimeError( + f"[verify_lvols] {miss_pct:.1f}% lvols missing exceeds " + f"50% threshold — {len(missing)}/{total}" + ) + self.logger.info( + f"[verify_lvols] {total - len(missing)}/{total} lvols " + f"confirmed in API" + ) + + def _verify_all_snapshots_exist(self): + """Verify registered snapshots exist in snapshot list. + + Retries up to 30 minutes to allow resources to settle. + Warns for missing, only fails if >50% missing. + """ + max_wait = 1800 # 30 minutes + poll_interval = 30 + waited = 0 + + with self._lock: + total = len(self._snap_registry) + + while waited <= max_wait: + all_snaps = self.sbcli_utils.list_snapshots() + missing = [] + with self._lock: + for name in self._snap_registry: + if name not in all_snaps: + missing.append(name) + + miss_pct = len(missing) * 100 / max(total, 1) + if miss_pct <= 50: + break # Within tolerance + + if waited < max_wait: + self.logger.info( + f"[verify_snapshots] {len(missing)}/{total} ({miss_pct:.1f}%) " + f"snapshots missing, waiting {poll_interval}s... " + f"(waited {waited}s/{max_wait}s)" + ) + sleep_n_sec(poll_interval) + waited += poll_interval + else: + break # Exhausted wait time + + miss_pct = len(missing) * 100 / max(total, 1) + if missing: + self.logger.warning( + f"[verify_snapshots] {len(missing)}/{total} ({miss_pct:.1f}%) " + f"snapshots missing after {waited}s wait: " + f"{missing[:10]}{'...' if len(missing) > 10 else ''}" + ) + if miss_pct > 50: + raise RuntimeError( + f"[verify_snapshots] {miss_pct:.1f}% snapshots missing " + f"exceeds 50% threshold — {len(missing)}/{total}" + ) + self.logger.info( + f"[verify_snapshots] {total - len(missing)}/{total} snapshots " + f"confirmed in API" + ) + + def _verify_all_clones_exist(self): + """Verify registered clones exist in lvol list. + + Retries up to 30 minutes to allow resources to settle. + Warns for missing, only fails if >50% missing. + """ + max_wait = 1800 # 30 minutes + poll_interval = 30 + waited = 0 + + with self._lock: + total = len(self._clone_registry) + + while waited <= max_wait: + all_lvols = self.sbcli_utils.list_lvols() + missing = [] + with self._lock: + for name in self._clone_registry: + if name not in all_lvols: + missing.append(name) + + miss_pct = len(missing) * 100 / max(total, 1) + if miss_pct <= 50: + break # Within tolerance + + if waited < max_wait: + self.logger.info( + f"[verify_clones] {len(missing)}/{total} ({miss_pct:.1f}%) " + f"clones missing, waiting {poll_interval}s... " + f"(waited {waited}s/{max_wait}s)" + ) + sleep_n_sec(poll_interval) + waited += poll_interval + else: + break # Exhausted wait time + + miss_pct = len(missing) * 100 / max(total, 1) + if missing: + self.logger.warning( + f"[verify_clones] {len(missing)}/{total} ({miss_pct:.1f}%) " + f"clones missing from API after {waited}s wait: " + f"{missing[:10]}{'...' if len(missing) > 10 else ''}" + ) + if miss_pct > 50: + raise RuntimeError( + f"[verify_clones] {miss_pct:.1f}% clones missing exceeds " + f"50% threshold — {len(missing)}/{total}" + ) + self.logger.info( + f"[verify_clones] {total - len(missing)}/{total} clones " + f"confirmed in API" + ) + + def _phase_mount_verify_clones(self): + """Mount 20 random clones and run short FIO read to verify accessibility. + + Picks up to 20 random clones from the registry, connects/mounts each, + runs a 4 MB FIO read, checks for errors, and disconnects. Fails the + phase if any clone verification fails. + """ + with self._lock: + clone_names = list(self._clone_registry.keys()) + sample_size = min(20, len(clone_names)) + if sample_size == 0: + self.logger.info("[mount_verify] No clones to verify, skipping") + return + selected = random.sample(clone_names, sample_size) + self.logger.info( + f"[mount_verify] Verifying {sample_size} clones with FIO read" + ) + ok, fail = self._batch_parallel( + [{"clone_name": c} for c in selected], + self._mount_verify_single_clone, + min(sample_size, self.MAX_WORKERS_CREATE), + "mount_verify", + ) + self.logger.info( + f"[mount_verify] {ok}/{sample_size} OK, {fail} failed" + ) + if fail > 0: + raise RuntimeError( + f"[mount_verify] {fail}/{sample_size} clone mount+FIO " + f"verifications failed. Check logs for FIO err= or " + f"connect failures." + ) + + def _mount_verify_single_clone(self, item): + """Subclass must implement: connect/mount clone, FIO read, verify.""" + raise NotImplementedError + + def _verify_nodes_healthy(self): + """Verify all storage nodes are online and healthy.""" + nodes_data = self.sbcli_utils.get_storage_nodes() + unhealthy = [] + for node in nodes_data.get("results", []): + node_id = node.get("id", "?") + hostname = node.get("hostname", "?") + status = node.get("status", "unknown") + health = node.get("health_check", None) + if status != "online" or health is not True: + unhealthy.append( + f"{hostname}(id={node_id}, status={status}, " + f"health={health})" + ) + if unhealthy: + raise RuntimeError( + f"[verify_nodes] Unhealthy nodes: {', '.join(unhealthy)}" + ) + total = len(nodes_data.get("results", [])) + self.logger.info( + f"[verify_nodes] All {total} storage nodes online and healthy" + ) + # ── Batch parallel execution ────────────────────────────────────────── def _batch_parallel(self, items, task_fn, max_workers: int, op_name: str): @@ -306,17 +598,76 @@ def _run_phase(self, name: str, fn): except Exception as e: self.logger.error(f"[{name}] Phase failed: {e}") self._set_failure(name, e, f"Phase {name} failed") + self._stop_event.set() finally: dur = time.time() - start self.logger.info(f"=== Phase {name} done in {dur:.1f}s ===") + # Flush timing data after every phase so data survives cancellation + try: + self._flush_timing_data() + except Exception: + pass return dur # used for iteration timing + def _flush_timing_data(self): + """Write intermediate timing JSON to disk (fast, no graphs). + + Called after every phase so data survives if the test is killed. + """ + try: + out_dir = self._get_log_dir() + except Exception: + return + report = { + "config": { + "NUM_PARENTS": self.NUM_PARENTS, + "NAMESPACES_PER_PARENT": self.NAMESPACES_PER_PARENT, + "CHILDREN_PER_PARENT": self.CHILDREN_PER_PARENT, + "SNAPSHOTS_PER_LVOL": self.SNAPSHOTS_PER_LVOL, + "NUM_CLONES": self.NUM_CLONES, + "NUM_ITERATIONS": self.NUM_ITERATIONS, + "BATCH_SIZE": self.BATCH_SIZE, + "MAX_WORKERS_CREATE": self.MAX_WORKERS_CREATE, + "CLONE_BATCH_SIZE": self.CLONE_BATCH_SIZE, + }, + "iterations": self._iteration_timings, + "samples": self._timing_samples, + "batch_timings": self._batch_timings, + "metrics": self._metrics, + "mappings": self._get_registry_mappings(), + } + path = os.path.join(out_dir, "namespace_stress_timings.json") + try: + with open(path, "w") as f: + json.dump(report, f, indent=2, default=str) + except Exception: + pass + + def _get_registry_mappings(self) -> dict: + """Snapshot current registry relationships for graph generation.""" + with self._lock: + child_to_parent = { + cn: ci.get("parent_name", "unknown") + for cn, ci in self._child_registry.items() + } + clone_to_snap = { + cn: ci.get("snap_name", "unknown") + for cn, ci in self._clone_registry.items() + } + parent_list = list(self._parent_registry.keys()) + return { + "child_to_parent": child_to_parent, + "clone_to_snap": clone_to_snap, + "parent_list": parent_list, + } + def _clear_registries(self): with self._lock: self._parent_registry.clear() self._child_registry.clear() self._snap_registry.clear() self._clone_registry.clear() + self._snapshot_child = None # ── Abstract-like methods (subclasses override) ─────────────────────── @@ -326,10 +677,12 @@ def _phase_setup(self): def _phase_cleanup(self): raise NotImplementedError - def _create_parent_impl(self, params: dict): + def _phase_create_subsystems(self): + """Sequential per-parent: create parent + children + verify.""" raise NotImplementedError - def _create_child_impl(self, params: dict): + def _phase_write_data(self): + """Write 10 MB to each parent lvol before snapshotting.""" raise NotImplementedError def _create_snapshot_impl(self, params: dict): @@ -350,6 +703,38 @@ def _delete_child_impl(self, child_name: str): def _delete_parent_impl(self, parent_name: str): raise NotImplementedError + def _phase_verify_cleanup(self): + """Verify all test resources are gone before next iteration.""" + all_lvols = self.sbcli_utils.list_lvols() + if all_lvols: + self.logger.warning( + f"[verify_cleanup] {len(all_lvols)} lvols still present " + f"— retrying cleanup" + ) + try: + self.sbcli_utils.delete_all_clones() + except Exception: + pass + try: + self.sbcli_utils.delete_all_snapshots() + except Exception as e: + self.logger.warning( + "[verify_cleanup] delete_all_snapshots failed during retry: %s", + e, + ) + try: + self.sbcli_utils.delete_all_lvols() + except Exception: + pass + sleep_n_sec(10) + remaining = self.sbcli_utils.list_lvols() + if remaining: + raise RuntimeError( + f"Cleanup verification failed: " + f"{len(remaining)} lvols still exist" + ) + self.logger.info("[verify_cleanup] All resources confirmed deleted") + # ── Timed wrappers (called by _batch_parallel) ─────────────────────── def _timed_create_parent(self, params: dict): @@ -376,9 +761,12 @@ def _timed_create_snapshot(self, params: dict): def _timed_create_clone(self, params: dict): inv = self._snapshot_inventory() t0 = time.time() - self._create_clone_impl(params) + api_elapsed = self._create_clone_impl(params) elapsed = time.time() - t0 - self._record_timing("create_clone", params["name"], elapsed, inv) + self._record_timing( + "create_clone", params["name"], elapsed, inv, + api_elapsed=api_elapsed, + ) def _timed_delete_clone(self, clone_name: str): inv = self._snapshot_inventory() @@ -410,35 +798,6 @@ def _timed_delete_parent(self, parent_name: str): # ── Phase implementations ───────────────────────────────────────────── - def _phase_create_parents(self): - items = [] - for i in range(self.NUM_PARENTS): - name = f"ns-par-{_rand_seq(6)}-{i:04d}" - items.append({"name": name, "idx": i}) - self._batch_parallel( - items, self._timed_create_parent, - self.MAX_WORKERS_CREATE, "create_parents", - ) - - def _phase_create_children(self): - """Create CHILDREN_PER_PARENT child namespace lvols per parent.""" - items = [] - with self._lock: - parents = list(self._parent_registry.items()) - for parent_name, pinfo in parents: - parent_id = pinfo["id"] - for c in range(self.CHILDREN_PER_PARENT): - child_name = f"ns-ch-{_rand_seq(6)}-{parent_name[-4:]}-{c}" - items.append({ - "name": child_name, - "parent_name": parent_name, - "parent_id": parent_id, - }) - self._batch_parallel( - items, self._timed_create_child, - self.MAX_WORKERS_CREATE, "create_children", - ) - def _phase_create_snapshots(self): """Create SNAPSHOTS_PER_LVOL snapshots for each parent + 1 random child.""" items = [] @@ -447,10 +806,12 @@ def _phase_create_snapshots(self): snap_lvols = [] for pname, pinfo in self._parent_registry.items(): snap_lvols.append((pname, pinfo["id"])) - # Pick 1 random child (if any) + # Use pre-selected child (from write_data) or pick a random one + chosen_child = getattr(self, "_snapshot_child", None) child_names = list(self._child_registry.keys()) - if child_names: + if not chosen_child and child_names: chosen_child = random.choice(child_names) + if chosen_child and chosen_child in self._child_registry: cinfo = self._child_registry[chosen_child] snap_lvols.append((chosen_child, cinfo["id"])) self.logger.info( @@ -469,13 +830,30 @@ def _phase_create_snapshots(self): f"[create_snapshots] Creating {len(items)} snapshots " f"({len(snap_lvols)} lvols × {self.SNAPSHOTS_PER_LVOL})" ) - self._batch_parallel( + snap_t0 = time.time() + _ok, fail = self._batch_parallel( items, self._timed_create_snapshot, self.MAX_WORKERS_CREATE, "create_snapshots", ) + snap_elapsed = time.time() - snap_t0 + self._log_op_stats( + "create_snapshot", batch_label="all snapshots", + batch_elapsed=snap_elapsed, + ) + snap_fail_pct = fail * 100 / max(len(items), 1) + if fail > 0: + self.logger.warning( + f"[create_snapshots] {fail}/{len(items)} " + f"({snap_fail_pct:.1f}%) snapshots failed" + ) + if snap_fail_pct > 50: + raise RuntimeError( + f"[create_snapshots] {snap_fail_pct:.1f}% snapshot failures " + f"exceeds 50% threshold — {fail}/{len(items)}" + ) def _phase_create_clones(self): - """Pick 1 random snapshot and create NUM_CLONES clones from it.""" + """Pick 1 random snapshot and create NUM_CLONES clones in batches.""" with self._lock: snap_names = list(self._snap_registry.keys()) if not snap_names: @@ -484,66 +862,273 @@ def _phase_create_clones(self): chosen_snap = random.choice(snap_names) with self._lock: snap_id = self._snap_registry[chosen_snap]["snap_id"] + snap_parent = self._snap_registry[chosen_snap].get("lvol_name", "") + clone_sc = self._parent_registry.get(snap_parent, {}).get( + "storage_class", self.STORAGE_CLASS_NAME + ) self.logger.info( f"[create_clones] Chosen snapshot: {chosen_snap} (id={snap_id})" ) - items = [] + all_items = [] for i in range(self.NUM_CLONES): clone_name = f"cln-{_rand_seq(6)}-{i:04d}" - items.append({ + all_items.append({ "name": clone_name, "snap_name": chosen_snap, "snap_id": snap_id, + "sc_name": clone_sc, }) - self._batch_parallel( - items, self._timed_create_clone, - self.MAX_WORKERS_CREATE, "create_clones", + + total_batches = ( + (len(all_items) + self.CLONE_BATCH_SIZE - 1) + // self.CLONE_BATCH_SIZE + ) + overall_t0 = time.time() + total_clone_fail = 0 + + for batch_idx in range(0, len(all_items), self.CLONE_BATCH_SIZE): + batch = all_items[batch_idx:batch_idx + self.CLONE_BATCH_SIZE] + batch_num = batch_idx // self.CLONE_BATCH_SIZE + 1 + self.logger.info( + f"[create_clones] Batch {batch_num}/{total_batches}: " + f"{len(batch)} clones" + ) + batch_t0 = time.time() + _ok, batch_fail = self._batch_parallel( + batch, self._timed_create_clone, + self.MAX_WORKERS_CREATE, + f"create_clones_b{batch_num}", + ) + batch_elapsed = time.time() - batch_t0 + total_clone_fail += batch_fail + with self._lock: + still_binding = self._clones_binding + if batch_fail > 0: + self.logger.warning( + f"[create_clones] Batch {batch_num}: " + f"{batch_fail}/{len(batch)} clones failed " + f"(still_binding={still_binding})" + ) + # Per-batch stats (only for clones created in this batch) + with self._lock: + batch_samples = [ + s["elapsed_sec"] for s in self._timing_samples + if (s["iteration"] == self._current_iteration + and s["op"] == "create_clone" + and s["timestamp"] >= batch_t0) + ] + if batch_samples: + bs = sorted(batch_samples) + n = len(bs) + throughput = n / batch_elapsed if batch_elapsed > 0 else 0 + effective_per_clone = batch_elapsed / n if n > 0 else 0 + self.logger.info( + f"[create_clones] Batch {batch_num} stats: " + f"{n} ops in {batch_elapsed:.1f}s — " + f"avg_wall={sum(bs)/n:.2f}s " + f"p50={bs[n//2]:.2f}s " + f"p95={bs[min(int(n*0.95), n-1)]:.2f}s " + f"min={bs[0]:.2f}s max={bs[-1]:.2f}s | " + f"throughput={throughput:.2f} clones/s " + f"effective_per_clone={effective_per_clone:.2f}s" + ) + with self._lock: + self._batch_timings.append({ + "iteration": self._current_iteration, + "op": "create_clone", + "batch_label": f"batch {batch_num}/{total_batches}", + "batch_elapsed": round(batch_elapsed, 2), + "count": n, + "avg_wall": round(sum(bs) / n, 4), + "p50": round(bs[n // 2], 4), + "p95": round(bs[min(int(n * 0.95), n - 1)], 4), + "min": round(bs[0], 4), + "max": round(bs[-1], 4), + "throughput_per_sec": round(throughput, 4), + "effective_per_clone": round(effective_per_clone, 4), + }) + + overall_elapsed = time.time() - overall_t0 + self._log_op_stats( + "create_clone", batch_label="all clones", + batch_elapsed=overall_elapsed, ) + # Overall clone failure check + clone_fail_pct = total_clone_fail * 100 / max(len(all_items), 1) + if total_clone_fail > 0: + self.logger.warning( + f"[create_clones] Total: {total_clone_fail}/{len(all_items)} " + f"({clone_fail_pct:.1f}%) clones failed across all batches" + ) + if clone_fail_pct > 50: + raise RuntimeError( + f"[create_clones] {clone_fail_pct:.1f}% clone failures " + f"exceeds 50% threshold — " + f"{total_clone_fail}/{len(all_items)}" + ) + def _phase_delete_all(self): """Delete: clones → snapshots → children → parents (ordered).""" + total_failures = 0 + # Step 1: clones with self._lock: clone_names = list(self._clone_registry.keys()) if clone_names: self.logger.info(f"[delete_all] Deleting {len(clone_names)} clones") - self._batch_parallel( + t0 = time.time() + _ok, fail = self._batch_parallel( clone_names, self._timed_delete_clone, self.MAX_WORKERS_DELETE, "delete_clones", ) + self._log_op_stats( + "delete_clone", batch_label="all clones", + batch_elapsed=time.time() - t0, count=len(clone_names), + ) + if fail > 0: + self.logger.warning( + f"[delete_all] {fail}/{len(clone_names)} clone " + f"deletions failed" + ) + total_failures += fail # Step 2: snapshots with self._lock: snap_names = list(self._snap_registry.keys()) if snap_names: self.logger.info(f"[delete_all] Deleting {len(snap_names)} snapshots") - self._batch_parallel( + t0 = time.time() + _ok, fail = self._batch_parallel( snap_names, self._timed_delete_snapshot, self.MAX_WORKERS_DELETE, "delete_snapshots", ) + self._log_op_stats( + "delete_snapshot", batch_label="all snapshots", + batch_elapsed=time.time() - t0, count=len(snap_names), + ) + if fail > 0: + self.logger.warning( + f"[delete_all] {fail}/{len(snap_names)} snapshot " + f"deletions failed" + ) + total_failures += fail # Step 3: children with self._lock: child_names = list(self._child_registry.keys()) if child_names: self.logger.info(f"[delete_all] Deleting {len(child_names)} children") - self._batch_parallel( + t0 = time.time() + _ok, fail = self._batch_parallel( child_names, self._timed_delete_child, self.MAX_WORKERS_DELETE, "delete_children", ) + self._log_op_stats( + "delete_child", batch_label="all children", + batch_elapsed=time.time() - t0, count=len(child_names), + ) + if fail > 0: + self.logger.warning( + f"[delete_all] {fail}/{len(child_names)} child " + f"deletions failed" + ) + total_failures += fail # Step 4: parents with self._lock: parent_names = list(self._parent_registry.keys()) if parent_names: self.logger.info(f"[delete_all] Deleting {len(parent_names)} parents") - self._batch_parallel( + t0 = time.time() + _ok, fail = self._batch_parallel( parent_names, self._timed_delete_parent, self.MAX_WORKERS_DELETE, "delete_parents", ) + self._log_op_stats( + "delete_parent", batch_label="all parents", + batch_elapsed=time.time() - t0, count=len(parent_names), + ) + if fail > 0: + self.logger.warning( + f"[delete_all] {fail}/{len(parent_names)} parent " + f"deletions failed" + ) + total_failures += fail + + if total_failures > 0: + self.logger.warning( + f"[delete_all] Total: {total_failures} deletion failures — " + f"verify_cleanup phase will retry" + ) # ── Reporting ───────────────────────────────────────────────────────── + def _compute_per_iteration_summary(self): + """Compute per-iteration avg/min/max/p50/p95 for create operations. + + Uses api_elapsed_sec when available (Docker — API-only time), + otherwise falls back to elapsed_sec (K8s — time to PVC Bound). + """ + summary = {} + with self._lock: + all_samples = list(self._timing_samples) + if not all_samples: + return summary + iterations = sorted(set(s["iteration"] for s in all_samples)) + create_ops = [ + "create_parent", "create_child", "create_clone", + ] + for it in iterations: + it_key = str(it) + summary[it_key] = {} + for op in create_ops: + samples = [ + s for s in all_samples + if s["iteration"] == it and s["op"] == op + ] + if not samples: + continue + times = [ + s.get("api_elapsed_sec", s["elapsed_sec"]) + for s in samples + ] + times_sorted = sorted(times) + n = len(times_sorted) + op_summary = { + "count": n, + "avg_wall": round(sum(times_sorted) / n, 4), + "min": round(times_sorted[0], 4), + "max": round(times_sorted[-1], 4), + "p50": round(times_sorted[n // 2], 4), + "p95": round( + times_sorted[min(int(n * 0.95), n - 1)], 4 + ), + } + # For clone ops, compute throughput from batch timings + if op == "create_clone": + with self._lock: + it_batches = [ + b for b in self._batch_timings + if b["iteration"] == it and b["op"] == op + ] + if it_batches: + total_elapsed = sum( + b["batch_elapsed"] for b in it_batches + ) + total_count = sum( + b["count"] for b in it_batches + ) + if total_elapsed > 0: + op_summary["throughput_per_sec"] = round( + total_count / total_elapsed, 4 + ) + op_summary["effective_per_clone"] = round( + total_elapsed / total_count, 4 + ) + summary[it_key][op] = op_summary + return summary + def _get_log_dir(self) -> str: """Return the directory for timing/graph output.""" d = getattr(self, "docker_logs_path", None) @@ -562,10 +1147,16 @@ def _write_timing_report(self): "SNAPSHOTS_PER_LVOL": self.SNAPSHOTS_PER_LVOL, "NUM_CLONES": self.NUM_CLONES, "NUM_ITERATIONS": self.NUM_ITERATIONS, + "BATCH_SIZE": self.BATCH_SIZE, + "MAX_WORKERS_CREATE": self.MAX_WORKERS_CREATE, + "CLONE_BATCH_SIZE": self.CLONE_BATCH_SIZE, }, "iterations": self._iteration_timings, + "per_iteration_summary": self._compute_per_iteration_summary(), "samples": self._timing_samples, + "batch_timings": self._batch_timings, "metrics": self._metrics, + "mappings": self._get_registry_mappings(), } path = os.path.join(out_dir, "namespace_stress_timings.json") try: @@ -615,19 +1206,22 @@ def _generate_graphs(self): except Exception as exc: self.logger.warning(f"Graph 1 failed: {exc}") - # ── 2. Latency per iteration (box plot) ────────────────────────── + # ── 2. Latency per iteration (box plot with legend) ────────────── try: + from matplotlib.patches import Patch create_ops = [ "create_parent", "create_child", "create_snapshot", "create_clone", ] + op_labels = ["parent", "child", "snapshot", "clone"] iterations = sorted(set(s["iteration"] for s in samples)) fig, ax = plt.subplots(figsize=(14, 8)) positions = [] labels = [] data_groups = [] + op_indices = [] # track which op each box belongs to for it in iterations: - for op in create_ops: + for oi, op in enumerate(create_ops): vals = [ s["elapsed_sec"] for s in samples if s["iteration"] == it and s["op"] == op @@ -639,11 +1233,12 @@ def _generate_graphs(self): + create_ops.index(op) ) labels.append(f"i{it}_{op.split('_')[-1]}") + op_indices.append(oi) if data_groups: bp = ax.boxplot(data_groups, positions=positions, widths=0.6, patch_artist=True, showfliers=False) for j, patch in enumerate(bp["boxes"]): - c_idx = j % len(create_ops) + c_idx = op_indices[j] if j < len(op_indices) else j patch.set_facecolor(colors[c_idx % len(colors)]) ax.set_xlabel("Iteration / Operation") ax.set_ylabel("Latency (sec)") @@ -653,6 +1248,12 @@ def _generate_graphs(self): [f"iter {it}" for it in iterations], rotation=45, fontsize=7, ) + # Add explicit legend mapping colors to operations + legend_patches = [ + Patch(facecolor=colors[i % len(colors)], label=op_labels[i]) + for i in range(len(create_ops)) + ] + ax.legend(handles=legend_patches, fontsize=8, loc="upper left") fig.tight_layout() fig.savefig(os.path.join(out_dir, "latency_per_iteration.png"), dpi=150) @@ -664,8 +1265,9 @@ def _generate_graphs(self): # ── 3. Phase duration per iteration (stacked bar) ──────────────── try: phase_names = [ - "create_parents", "create_children", + "create_subsystems", "write_data", "create_snapshots", "create_clones", "delete_all", + "verify_cleanup", ] fig, ax = plt.subplots(figsize=(12, 6)) x_pos = list(range(len(self._iteration_timings))) @@ -694,7 +1296,7 @@ def _generate_graphs(self): except Exception as exc: self.logger.warning(f"Graph 3 failed: {exc}") - # ── 4. Clone latency vs clone index (per iteration) ────────────── + # ── 4. Clone latency vs clone index with batch boundaries ──────── try: fig, ax = plt.subplots(figsize=(14, 8)) for it in iterations: @@ -709,9 +1311,27 @@ def _generate_graphs(self): [s["elapsed_sec"] for s in clone_samples], label=f"iter {it}", alpha=0.7, linewidth=0.8, ) + # Mark batch boundaries (CLONE_BATCH_SIZE) + cbs = self.CLONE_BATCH_SIZE + for bi in range(cbs, len(clone_samples), cbs): + ax.axvline( + x=bi, color="gray", linestyle="--", + alpha=0.4, linewidth=0.6, + ) + # Mark _batch_parallel BATCH_SIZE boundaries too + bs = self.BATCH_SIZE + for bi in range(bs, len(clone_samples), bs): + ax.axvline( + x=bi, color="red", linestyle=":", + alpha=0.3, linewidth=0.5, + ) ax.set_xlabel("Clone index (creation order)") ax.set_ylabel("Latency (sec)") - ax.set_title("Clone Creation Latency vs Clone Count") + ax.set_title( + f"Clone Creation Latency vs Clone Count " + f"(gray=clone batch/{self.CLONE_BATCH_SIZE}, " + f"red=submit batch/{self.BATCH_SIZE})" + ) ax.legend(fontsize=7) fig.tight_layout() fig.savefig( @@ -751,6 +1371,333 @@ def _generate_graphs(self): except Exception as exc: self.logger.warning(f"Graph 5 failed: {exc}") + # ── 6. Batch timing stats (bar chart) ──────────────────────────── + try: + bt = self._batch_timings + if bt: + clone_batches = [ + b for b in bt + if b["op"] == "create_clone" + and b["batch_label"].startswith("batch ") + ] + if clone_batches: + fig, ax = plt.subplots(figsize=(14, 8)) + labels = [b["batch_label"] for b in clone_batches] + avgs = [b["avg_wall"] for b in clone_batches] + p50s = [b["p50"] for b in clone_batches] + p95s = [b["p95"] for b in clone_batches] + effs = [ + b.get("effective_per_clone", 0) + for b in clone_batches + ] + x = range(len(labels)) + width = 0.2 + ax.bar( + [i - 1.5 * width for i in x], avgs, width, + label="avg wall", color=colors[0], + ) + ax.bar( + [i - 0.5 * width for i in x], p50s, width, + label="p50", color=colors[1], + ) + ax.bar( + [i + 0.5 * width for i in x], p95s, width, + label="p95", color=colors[2], + ) + ax.bar( + [i + 1.5 * width for i in x], effs, width, + label="effective/clone", color=colors[3 % len(colors)], + ) + # Annotate throughput on each batch + for idx, b in enumerate(clone_batches): + tp = b.get("throughput_per_sec", 0) + if tp > 0: + ax.text( + idx, max(avgs[idx], p95s[idx]) + 0.5, + f"{tp:.2f}/s", + ha="center", fontsize=6, color="black", + ) + ax.set_xlabel("Clone Batch") + ax.set_ylabel("Latency (sec)") + ax.set_title( + "Clone Creation — Per-Batch Latency " + "(wall vs effective vs throughput)" + ) + ax.set_xticks(list(x)) + ax.set_xticklabels(labels, rotation=45, fontsize=7) + ax.legend(fontsize=7) + fig.tight_layout() + fig.savefig( + os.path.join( + out_dir, "clone_batch_latency_stats.png" + ), + dpi=150, + ) + plt.close(fig) + self.logger.info( + "Generated clone_batch_latency_stats.png" + ) + except Exception as exc: + self.logger.warning(f"Graph 6 failed: {exc}") + + # ── 7. Creation timeline — latency over wall-clock time ─────── + try: + create_ops_ordered = [ + "create_parent", "create_child", + "create_snapshot", "create_clone", + ] + fig, ax = plt.subplots(figsize=(16, 8)) + t0_global = min(s["timestamp"] for s in samples) + for i, op in enumerate(create_ops_ordered): + pts = sorted( + [s for s in samples if s["op"] == op], + key=lambda s: s["timestamp"], + ) + if pts: + x = [(p["timestamp"] - t0_global) / 60.0 for p in pts] + y = [p["elapsed_sec"] for p in pts] + ax.plot(x, y, label=op, alpha=0.7, linewidth=0.8, + color=colors[i % len(colors)]) + ax.set_xlabel("Time since test start (minutes)") + ax.set_ylabel("Latency (sec)") + ax.set_title("Creation Latency Over Time") + ax.legend(fontsize=7) + fig.tight_layout() + fig.savefig( + os.path.join(out_dir, "creation_latency_timeline.png"), + dpi=150, + ) + plt.close(fig) + self.logger.info("Generated creation_latency_timeline.png") + except Exception as exc: + self.logger.warning(f"Graph 7 failed: {exc}") + + # ── 8. Per-parent child creation duration (bar chart) ───────── + try: + child_samples = [ + s for s in samples if s["op"] == "create_child" + ] + if child_samples: + # Build child→parent mapping from registry or saved JSON + with self._lock: + child_to_parent = { + cn: ci.get("parent_name", "unknown") + for cn, ci in self._child_registry.items() + } + # Fall back to saved mappings if registry was cleared + if not child_to_parent: + try: + rpath = os.path.join( + out_dir, "namespace_stress_timings.json" + ) + with open(rpath) as rf: + saved = json.load(rf) + child_to_parent = saved.get( + "mappings", {} + ).get("child_to_parent", {}) + except Exception: + pass + + parent_durations = {} + for s in child_samples: + pname = child_to_parent.get(s["name"], "unknown") + parent_durations.setdefault(pname, []).append( + s["elapsed_sec"] + ) + parents_sorted = sorted(parent_durations.keys()) + fig, ax = plt.subplots(figsize=(14, 6)) + x = range(len(parents_sorted)) + totals = [ + sum(parent_durations[p]) for p in parents_sorted + ] + avgs = [ + sum(parent_durations[p]) / len(parent_durations[p]) + for p in parents_sorted + ] + counts = [ + len(parent_durations[p]) for p in parents_sorted + ] + ax.bar(x, totals, color=colors[0], alpha=0.7, + label="total (sec)") + ax2 = ax.twinx() + ax2.plot(list(x), avgs, "ro-", markersize=4, + label="avg per child (sec)") + ax.set_xlabel("Parent subsystem") + ax.set_ylabel("Total creation time (sec)") + ax2.set_ylabel("Avg per child (sec)") + ax.set_title( + f"Child Creation Duration per Parent " + f"({len(parents_sorted)} parents, " + f"{len(child_samples)} children)" + ) + ax.set_xticks(list(x)) + ax.set_xticklabels( + [f"{p[-8:]}({counts[i]})" for i, p in enumerate(parents_sorted)], + rotation=45, fontsize=7, + ) + ax.legend(loc="upper left", fontsize=7) + ax2.legend(loc="upper right", fontsize=7) + fig.tight_layout() + fig.savefig( + os.path.join( + out_dir, "child_creation_per_parent.png" + ), + dpi=150, + ) + plt.close(fig) + self.logger.info( + "Generated child_creation_per_parent.png" + ) + except Exception as exc: + self.logger.warning(f"Graph 8 failed: {exc}") + + # ── 9-12. Individual per-op latency over time (one graph each) ── + individual_ops = [ + ("create_parent", "Parent LVol Creation Latency Over Time"), + ("create_child", "Child LVol Creation Latency Over Time"), + ("create_snapshot", "Snapshot Creation Latency Over Time"), + ("create_clone", "Clone Creation Latency Over Time"), + ] + for op_name, title in individual_ops: + try: + op_samples = sorted( + [s for s in samples if s["op"] == op_name], + key=lambda s: s["timestamp"], + ) + if not op_samples: + continue + fig, ax = plt.subplots(figsize=(14, 8)) + t0_global = min(s["timestamp"] for s in samples) + x = [(s["timestamp"] - t0_global) / 60.0 + for s in op_samples] + y = [s["elapsed_sec"] for s in op_samples] + + ax.scatter(x, y, alpha=0.5, s=12, + color=colors[0], label="latency") + # Rolling average (window=20) + if len(y) >= 20: + window = 20 + rolling = [ + sum(y[max(0, i - window):i]) / min(i, window) + for i in range(1, len(y) + 1) + ] + ax.plot(x, rolling, color="red", linewidth=1.5, + alpha=0.8, label=f"rolling avg (w={window})") + + # Mark batch boundaries + bs = self.BATCH_SIZE + for bi in range(bs, len(op_samples), bs): + ax.axvline( + x=x[bi] if bi < len(x) else x[-1], + color="gray", linestyle="--", + alpha=0.3, linewidth=0.5, + ) + + ax.set_xlabel("Time since test start (minutes)") + ax.set_ylabel("Latency (sec)") + ax.set_title( + f"{title} ({len(op_samples)} ops, " + f"batch_size={bs}, workers={self.MAX_WORKERS_CREATE})" + ) + ax.legend(fontsize=8) + fig.tight_layout() + fname = f"{op_name}_latency_over_time.png" + fig.savefig(os.path.join(out_dir, fname), dpi=150) + plt.close(fig) + self.logger.info(f"Generated {fname}") + except Exception as exc: + self.logger.warning( + f"Graph {op_name}_latency_over_time failed: {exc}" + ) + + # ── 13. Per-iteration average create time (grouped bar) ──────── + try: + per_it = self._compute_per_iteration_summary() + if per_it: + create_ops_bar = [ + "create_parent", "create_child", "create_clone", + ] + op_labels_bar = ["parent", "child", "clone"] + it_keys = sorted(per_it.keys(), key=int) + fig, ax = plt.subplots(figsize=(14, 8)) + n_its = len(it_keys) + n_ops = len(create_ops_bar) + width = 0.8 / max(n_ops, 1) + has_data = False + + for oi, (op, label) in enumerate( + zip(create_ops_bar, op_labels_bar) + ): + avgs = [] + mins = [] + maxs = [] + x_pos = [] + eff_times = [] # effective per-clone (throughput-based) + for xi, it_key in enumerate(it_keys): + stats = per_it[it_key].get(op) + if stats: + avgs.append(stats["avg_wall"]) + mins.append(stats["min"]) + maxs.append(stats["max"]) + eff_times.append( + stats.get("effective_per_clone") + ) + x_pos.append(xi) + if avgs: + has_data = True + offsets = [ + x + (oi - n_ops / 2 + 0.5) * width + for x in x_pos + ] + err_lo = [a - m for a, m in zip(avgs, mins)] + err_hi = [m - a for a, m in zip(avgs, maxs)] + ax.bar( + offsets, avgs, width, + label=f"{label} (avg wall)", + color=colors[oi % len(colors)], + alpha=0.8, + yerr=[err_lo, err_hi], + capsize=3, + error_kw={"linewidth": 0.8}, + ) + # Annotate counts + effective time + for j, xi in enumerate(x_pos): + cnt = per_it[it_keys[xi]][op]["count"] + ann = f"n={cnt}" + if eff_times[j] is not None: + ann += f"\neff={eff_times[j]:.1f}s" + ax.text( + offsets[j], avgs[j] + err_hi[j] + 0.3, + ann, ha="center", fontsize=6, + ) + + if has_data: + ax.set_xlabel("Iteration") + ax.set_ylabel("Create time (sec)") + ax.set_title( + "Per-Iteration Average Create Time " + "(API time for Docker, PVC Bound for K8s)" + ) + ax.set_xticks(range(n_its)) + ax.set_xticklabels( + [f"iter {k}" for k in it_keys], fontsize=8, + ) + ax.legend(fontsize=8) + fig.tight_layout() + fig.savefig( + os.path.join( + out_dir, + "per_iteration_avg_create_time.png", + ), + dpi=150, + ) + self.logger.info( + "Generated per_iteration_avg_create_time.png" + ) + plt.close(fig) + except Exception as exc: + self.logger.warning(f"Graph 13 failed: {exc}") + def _print_summary(self): self.logger.info("=" * 60) self.logger.info(" PARALLEL NAMESPACE LVOL STRESS — SUMMARY") @@ -797,11 +1744,18 @@ def run(self): phase_durations = {} for phase_name, phase_fn in [ - ("create_parents", self._phase_create_parents), - ("create_children", self._phase_create_children), + ("create_subsystems", self._phase_create_subsystems), + ("verify_lvols", self._verify_all_lvols_exist), + ("verify_nodes_healthy", self._verify_nodes_healthy), + ("write_data", self._phase_write_data), ("create_snapshots", self._phase_create_snapshots), + ("verify_snapshots", self._verify_all_snapshots_exist), ("create_clones", self._phase_create_clones), + ("verify_clones", self._verify_all_clones_exist), + ("mount_verify_clones", self._phase_mount_verify_clones), + ("verify_nodes_final", self._verify_nodes_healthy), ("delete_all", self._phase_delete_all), + ("verify_cleanup", self._phase_verify_cleanup), ]: dur = self._run_phase(phase_name, phase_fn) phase_durations[phase_name] = round(dur or 0, 2) @@ -810,7 +1764,10 @@ def run(self): "iteration": iteration, "phase_durations_sec": phase_durations, }) - self._clear_registries() + # Only clear registries if iteration succeeded — graphs + # need the mappings and they run in the finally block + if not self._stop_event.is_set(): + self._clear_registries() finally: self._metrics["end_ts"] = time.time() @@ -842,11 +1799,17 @@ def __init__(self, **kwargs): # ── Setup / Cleanup ─────────────────────────────────────────────────── def _phase_setup(self): - self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + actual_pool = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + if actual_pool and actual_pool != self.pool_name: + self.logger.info( + f"[setup] Pool name changed: {self.pool_name} -> {actual_pool}" + ) + self.pool_name = actual_pool sleep_n_sec(2) def _phase_cleanup(self): - self.logger.info("[cleanup] Bulk delete safety net") + self.logger.info("[cleanup] Bulk delete safety net (ns-* only)") + # Delete only test resources by prefix, not all lvols try: self.sbcli_utils.delete_all_clones() except Exception: @@ -856,7 +1819,23 @@ def _phase_cleanup(self): except Exception: pass try: - self.sbcli_utils.delete_all_lvols() + all_lvols = self.sbcli_utils.list_lvols() + test_lvols = [ + name for name in all_lvols + if name.startswith("ns-") or name.startswith("cln-") + or name.startswith("snap-") + ] + self.logger.info( + f"[cleanup] Deleting {len(test_lvols)}/{len(all_lvols)} " + f"test lvols" + ) + for lv_name in test_lvols: + try: + self.sbcli_utils.delete_lvol( + lvol_name=lv_name, skip_error=True + ) + except Exception: + pass except Exception: pass try: @@ -864,14 +1843,184 @@ def _phase_cleanup(self): except Exception: pass - # ── Create implementations ──────────────────────────────────────────── + # ── Two-phase subsystem creation: parents then parallel children ──── - def _create_parent_impl(self, params: dict): - name = params["name"] - self._inc("attempts", "create_parent") - self._api_retry("create_parent", lambda: self.sbcli_utils.add_lvol( - lvol_name=name, - pool_name=self.pool_name, + def _phase_create_subsystems(self): + """Sub-phase 1: create all parents in parallel. + Sub-phase 2: create ALL children in parallel (flat list). + 50% failure threshold with detailed name logging.""" + pvcs_per_subsys = 1 + self.CHILDREN_PER_PARENT + total_expected = self.NUM_PARENTS * pvcs_per_subsys + self.logger.info( + f"[create_subsystems] {self.NUM_PARENTS} parents × " + f"{pvcs_per_subsys} lvols = {total_expected} total " + f"(parallel, workers={self.MAX_WORKERS_CREATE})" + ) + + # ── Sub-phase 1: Create all parents (parallel) ───────────── + parent_items = [] + parent_names = [] + for i in range(self.NUM_PARENTS): + pname = f"ns-par-{_rand_seq(6)}-{i:04d}" + parent_items.append({"name": pname, "idx": i}) + parent_names.append(pname) + + self.logger.info( + f"[create_subsystems][sub1] Creating {self.NUM_PARENTS} parents " + f"(parallel, workers={self.MAX_WORKERS_CREATE})" + ) + parents_t0 = time.time() + _ok, parent_fail = self._batch_parallel( + parent_items, + self._create_single_parent_docker, + self.MAX_WORKERS_CREATE, + "create_parents", + ) + parents_elapsed = time.time() - parents_t0 + self._log_op_stats( + "create_parent", batch_label="all parents", + batch_elapsed=parents_elapsed, + ) + + # Remove failed parents + failed_parents = [] + if parent_fail > 0: + created_parents = set(self._parent_registry.keys()) + for pname in list(parent_names): + if pname not in created_parents: + failed_parents.append(pname) + parent_names.remove(pname) + + self.logger.info( + f"[create_subsystems][sub1] {len(parent_names)} parents " + f"created in {parents_elapsed:.1f}s" + f"{f', {len(failed_parents)} FAILED: {failed_parents}' if failed_parents else ''}" + ) + + # ── Sub-phase 2: Create ALL children in parallel ─────────── + total_children = len(parent_names) * self.CHILDREN_PER_PARENT + self.logger.info( + f"[create_subsystems][sub2] Creating {total_children} children " + f"in parallel (workers={self.MAX_WORKERS_CREATE})" + ) + child_items = [] + for pname in parent_names: + pinfo = self._parent_registry[pname] + for c in range(self.CHILDREN_PER_PARENT): + child_items.append({ + "name": f"ns-ch-{_rand_seq(6)}-{pname[-4:]}-{c:02d}", + "parent_name": pname, + "parent_id": pinfo["id"], + "parent_node_id": pinfo.get("node_id"), + }) + children_t0 = time.time() + _ok, child_fail = self._batch_parallel( + child_items, + self._create_single_child_docker, + self.MAX_WORKERS_CREATE, + "create_children", + ) + children_elapsed = time.time() - children_t0 + self._log_op_stats( + "create_child", batch_label="all children", + batch_elapsed=children_elapsed, + ) + + # Identify failed children + failed_children = [] + if child_fail > 0: + created_children = set(self._child_registry.keys()) + for item in child_items: + if item["name"] not in created_children: + failed_children.append( + f"{item['name']} (parent={item['parent_name']})" + ) + + # ── Failure summary ────────────────────────────────────────── + total_attempted = self.NUM_PARENTS + total_children + total_failed = len(failed_parents) + len(failed_children) + fail_pct = (total_failed * 100 / max(total_attempted, 1)) + + if total_failed > 0: + self.logger.warning( + f"[create_subsystems] FAILED lvols: {total_failed}/" + f"{total_attempted} ({fail_pct:.1f}%)" + ) + if failed_parents: + self.logger.warning( + f" Failed PARENTS ({len(failed_parents)}): " + f"{failed_parents}" + ) + if failed_children: + self.logger.warning( + f" Failed CHILDREN ({len(failed_children)}): " + f"{failed_children[:20]}" + f"{'...' if len(failed_children) > 20 else ''}" + ) + + if fail_pct > 50: + raise RuntimeError( + f"[create_subsystems] {fail_pct:.1f}% failure rate " + f"exceeds 50% threshold — {total_failed}/{total_attempted} " + f"(parents={len(failed_parents)}, " + f"children={len(failed_children)})" + ) + + # ── Bulk verify ────────────────────────────────────────────── + all_lvols = self.sbcli_utils.list_lvols() + expected_created = total_attempted - total_failed + if len(all_lvols) < expected_created: + self.logger.warning( + f"[create_subsystems] lvol count {len(all_lvols)} < " + f"expected {expected_created}" + ) + + self.logger.info( + f"[create_subsystems] Done: {len(self._parent_registry)} parents, " + f"{len(self._child_registry)} children" + f"{f' ({total_failed} failures tolerated)' if total_failed else ''}" + ) + + def _create_single_parent_docker(self, item): + """Create a single parent lvol. Called from _batch_parallel.""" + name = item["name"] + t0 = time.time() + api_elapsed = self._create_parent(name) + self._record_timing( + "create_parent", name, + time.time() - t0, self._snapshot_inventory(), + api_elapsed=api_elapsed, + ) + + def _create_single_child_docker(self, item): + """Create a single child lvol and register under its parent. + + Called from _batch_parallel with MAX_WORKERS_CREATE concurrency — + all children for all parents run in parallel.""" + child_name = item["name"] + parent_name = item["parent_name"] + parent_id = item["parent_id"] + parent_node_id = item["parent_node_id"] + t0 = time.time() + api_elapsed = self._create_child( + child_name, parent_name, parent_id, parent_node_id, + ) + self._record_timing( + "create_child", child_name, + time.time() - t0, self._snapshot_inventory(), + api_elapsed=api_elapsed, + ) + + def _create_parent(self, name: str): + """Create a single parent lvol + register. Raises on failure. + + Returns the API-only elapsed time (seconds) for timing reports. + """ + self._inc("attempts", "create_parent") + api_t0 = time.time() + self._api_retry("create_parent", lambda: self.sbcli_utils.add_lvol( + lvol_name=name, + pool_name=self.pool_name, size=self.LVOL_SIZE, distr_ndcs=self.ndcs, distr_npcs=self.npcs, @@ -880,20 +2029,36 @@ def _create_parent_impl(self, params: dict): max_namespace_per_subsys=self.NAMESPACES_PER_PARENT, retry=1, ), ctx={"name": name}) + api_elapsed = time.time() - api_t0 lvol_id = self._wait_lvol_id(name) + node_id = None + try: + details = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id) + if details: + node_id = details[0].get("node_id") + except Exception as ex: + self.logger.warning( + f"[create_parent] {name}: could not get node_id: {ex}" + ) with self._lock: self._parent_registry[name] = { - "id": lvol_id, "children": [], "snapshots": [], + "id": lvol_id, "node_id": node_id, + "children": [], "snapshots": [], } self._metrics["counts"]["parents_created"] += 1 - self._inc("attempts", "create_parent", 0) # already counted - self.logger.info(f"[create_parent] {name} -> {lvol_id}") + self.logger.info( + f"[create_parent] {name} -> {lvol_id} (node={node_id})" + ) + return api_elapsed - def _create_child_impl(self, params: dict): - name = params["name"] - parent_name = params["parent_name"] - parent_id = params["parent_id"] + def _create_child(self, name: str, parent_name: str, + parent_id: str, parent_node_id: str): + """Create a single child namespace lvol. Raises on failure. + + Returns the API-only elapsed time (seconds) for timing reports. + """ self._inc("attempts", "create_child") + api_t0 = time.time() self._api_retry("create_child", lambda: self.sbcli_utils.add_lvol( lvol_name=name, pool_name=self.pool_name, @@ -902,18 +2067,199 @@ def _create_child_impl(self, params: dict): distr_npcs=self.npcs, distr_bs=self.bs, distr_chunk_bs=self.chunk_bs, + host_id=parent_node_id, namespace=parent_id, retry=1, ), ctx={"name": name, "parent": parent_name}) + api_elapsed = time.time() - api_t0 child_id = self._wait_lvol_id(name) with self._lock: self._child_registry[name] = { "id": child_id, "parent_name": parent_name, } - if parent_name in self._parent_registry: - self._parent_registry[parent_name]["children"].append(name) + self._parent_registry[parent_name]["children"].append(name) self._metrics["counts"]["children_created"] += 1 - self.logger.info(f"[create_child] {name} -> {child_id} (parent={parent_name})") + self.logger.info( + f"[create_child] {name} -> {child_id} (parent={parent_name})" + ) + return api_elapsed + + # ── Write data (parallel FIO per parent group) ───────────────────── + + def _phase_write_data(self): + """Parallel FIO: one thread per parent group. + + Each thread NVMe-connects the parent + all its children, runs + FIO (100 MB sequential write) on each device, then disconnects. + Also pre-selects the snapshot child so _phase_create_snapshots + reuses it. + """ + # Pre-select snapshot child + with self._lock: + child_names = list(self._child_registry.keys()) + if child_names: + self._snapshot_child = random.choice(child_names) + self.logger.info( + f"[write_data] Pre-selected child for snapshot: " + f"{self._snapshot_child}" + ) + else: + self._snapshot_child = None + + # Build per-parent groups: parent + all its children + parent_items = [] + with self._lock: + for pname, pinfo in self._parent_registry.items(): + lvols = [(pname, pinfo["id"])] + for cname in pinfo.get("children", []): + cinfo = self._child_registry.get(cname) + if cinfo: + lvols.append((cname, cinfo["id"])) + parent_items.append({ + "parent_name": pname, + "lvols": lvols, + }) + + total_lvols = sum(len(item["lvols"]) for item in parent_items) + self.logger.info( + f"[write_data] Running parallel FIO (100 MB) on {total_lvols} " + f"lvols across {len(parent_items)} parent groups " + f"(workers={self.MAX_WORKERS_CREATE})" + ) + + write_t0 = time.time() + _ok, fail = self._batch_parallel( + parent_items, self._fio_parent_group_docker, + self.MAX_WORKERS_CREATE, "write_data", + ) + write_elapsed = time.time() - write_t0 + self.logger.info( + f"[write_data] Done: {_ok}/{len(parent_items)} groups OK, " + f"{fail} failed in {write_elapsed:.1f}s" + ) + if fail > 0: + self.logger.warning( + f"[write_data] {fail}/{len(parent_items)} FIO groups failed" + ) + + def _extract_nqn(self, connect_strs): + """Extract NQN from nvme connect command strings.""" + for cs in connect_strs: + for part in cs.split(): + if part.startswith("--nqn="): + return part.split("=", 1)[1] + if part.startswith("-n ") or part == "-n": + continue + return None + + def _find_device_by_nqn(self, client, nqn): + """Find NVMe block device for a given NQN via nvme list-subsys.""" + import json as _json + out, _ = self.ssh_obj.exec_command( + client, + "sudo nvme list-subsys -o json 2>/dev/null || echo '[]'", + supress_logs=True, + ) + try: + subsys_data = _json.loads(out) + if isinstance(subsys_data, list) and subsys_data: + subsys_data = subsys_data[0] + for ss in subsys_data.get("Subsystems", []): + if ss.get("NQN") == nqn: + for path in ss.get("Paths", []): + dev_name = path.get("Name") + if dev_name: + return f"/dev/{dev_name}" + except Exception: + pass + return None + + def _fio_parent_group_docker(self, item): + """Connect all lvols in a parent group, run FIO on each, disconnect. + + Each parent thread owns its NVMe connections exclusively — no shared + connect strings across threads. + """ + client = self.fio_node[0] + parent_name = item["parent_name"] + lvols = item["lvols"] # [(name, id), ...] + connected_nqns = [] + t0_group = time.time() + + try: + # ── Step 1: NVMe-connect all lvols in this group ───────── + nqn_map = {} # lvol_name -> nqn + for lvol_name, lvol_id in lvols: + try: + connect_strs = self.sbcli_utils.get_lvol_connect_str( + lvol_name + ) + if not connect_strs: + self.logger.warning( + f"[write_data] No connect strings for {lvol_name}" + ) + continue + nqn = self._extract_nqn(connect_strs) + for cs in connect_strs: + self.ssh_obj.exec_command(client, cs) + if nqn: + nqn_map[lvol_name] = nqn + connected_nqns.append(nqn) + except Exception as exc: + self.logger.warning( + f"[write_data] Connect failed for {lvol_name}: {exc}" + ) + + sleep_n_sec(3) + + # ── Step 2: Discover devices and run FIO on each ───────── + fio_ok = 0 + for lvol_name, nqn in nqn_map.items(): + try: + device = self._find_device_by_nqn(client, nqn) + if not device: + self.logger.warning( + f"[write_data] No device found for " + f"{lvol_name} (nqn={nqn})" + ) + continue + t0 = time.time() + self.ssh_obj.exec_command( + client, + f"sudo fio --name=write-{lvol_name[:20]} " + f"--filename={device} --size=100M --bs=1M " + f"--rw=write --direct=1 --ioengine=libaio " + f"--iodepth=1 --numjobs=1", + ) + elapsed = time.time() - t0 + self._record_timing( + "write_data", lvol_name, elapsed, + self._snapshot_inventory(), + ) + fio_ok += 1 + except Exception as exc: + self.logger.warning( + f"[write_data] FIO failed for {lvol_name}: {exc}" + ) + + group_elapsed = time.time() - t0_group + self.logger.info( + f"[write_data] Group {parent_name}: " + f"{fio_ok}/{len(lvols)} lvols written " + f"in {group_elapsed:.1f}s" + ) + + finally: + # ── Step 3: NVMe-disconnect all ────────────────────────── + for nqn in connected_nqns: + try: + self.ssh_obj.exec_command( + client, f"sudo nvme disconnect -n {nqn}", + ) + except Exception: + pass + + # ── Create implementations ──────────────────────────────────────────── def _create_snapshot_impl(self, params: dict): snap_name = params["name"] @@ -943,11 +2289,13 @@ def _create_clone_impl(self, params: dict): snap_name = params["snap_name"] snap_id = params["snap_id"] self._inc("attempts", "create_clone") + api_t0 = time.time() self._api_retry("create_clone", lambda: self.sbcli_utils.add_clone( snapshot_id=snap_id, clone_name=clone_name, retry=1, ), ctx={"clone": clone_name, "snap": snap_name}) + api_elapsed = time.time() - api_t0 clone_id = self._wait_lvol_id(clone_name) with self._lock: self._clone_registry[clone_name] = { @@ -957,6 +2305,134 @@ def _create_clone_impl(self, params: dict): self._snap_registry[snap_name]["clones"].append(clone_name) self._metrics["counts"]["clones_created"] += 1 self.logger.info(f"[create_clone] {clone_name} -> {clone_id}") + return api_elapsed + + # ── Clone mount verification ───────────────────────────────────────── + + def _mount_verify_single_clone(self, item): + """Connect a clone via NVMe, run short FIO read, check for errors.""" + clone_name = item["clone_name"] + client = self.fio_node[0] + nqn = None + t0 = time.time() + + try: + # 1. Get connect strings (works for clones — they are lvols) + connect_strs = self.sbcli_utils.get_lvol_connect_str(clone_name) + if not connect_strs: + raise RuntimeError( + f"No connect strings returned for clone {clone_name}" + ) + nqn = self._extract_nqn(connect_strs) + + # 2. Record devices before connect + initial_devices = set(self.ssh_obj.get_devices(node=client)) + + # 3. NVMe connect + for cs in connect_strs: + self.ssh_obj.exec_command(client, cs) + sleep_n_sec(3) + + # 4. Detect new device (namespace lvols may add namespace to + # existing controller rather than creating a new one) + final_devices = set(self.ssh_obj.get_devices(node=client)) + new_devices = list(final_devices - initial_devices) + + device = None + if new_devices: + device = f"/dev/{new_devices[0]}" + else: + # Namespace lvol: try ns-rescan on existing controllers + out, _ = self.ssh_obj.exec_command( + client, + "ls /dev/nvme[0-9]* 2>/dev/null | grep -oP 'nvme\\d+$' " + "| sort -u", + supress_logs=True, + ) + for ctrl in (out or "").strip().splitlines(): + ctrl = ctrl.strip() + if ctrl: + self.ssh_obj.exec_command( + client, + f"sudo nvme ns-rescan /dev/{ctrl}", + supress_logs=True, + ) + sleep_n_sec(2) + rescan_devices = set(self.ssh_obj.get_devices(node=client)) + new_after_rescan = list(rescan_devices - initial_devices) + if new_after_rescan: + device = f"/dev/{new_after_rescan[0]}" + + if not device: + # Fall back: find any device for this NQN + device = self._find_device_by_nqn(client, nqn) + + if not device: + raise RuntimeError( + f"Could not find block device for clone {clone_name} " + f"after NVMe connect (NQN={nqn})" + ) + + self.logger.info( + f"[mount_verify] Clone {clone_name} -> device {device}" + ) + + # 5. Run short FIO read with output capture + fio_log = f"/tmp/fio_verify_{clone_name}.log" + fio_cmd = ( + f"sudo fio --name=verify-{clone_name[:20]} " + f"--filename={device} --size=4M --bs=4K " + f"--rw=read --direct=1 --ioengine=libaio " + f"--iodepth=1 --numjobs=1 " + f"--output={fio_log}" + ) + self.ssh_obj.exec_command(client, fio_cmd) + + # 6. Check FIO log for errors + fio_output, _ = self.ssh_obj.exec_command( + client, f"cat {fio_log}", supress_logs=True, + ) + fio_output = fio_output or "" + + # Parse err= from FIO output + err_found = False + for line in fio_output.splitlines(): + if "err=" in line: + # Extract err value: "err= 5" or "err=5" + import re + m = re.search(r"err=\s*(\d+)", line) + if m and int(m.group(1)) != 0: + err_found = True + break + + if err_found: + self.logger.error( + f"[mount_verify] FIO reported error on clone " + f"{clone_name}:\n{fio_output}" + ) + raise RuntimeError( + f"FIO read error on clone {clone_name}: {fio_output[:200]}" + ) + + elapsed = time.time() - t0 + self.logger.info( + f"[mount_verify] Clone {clone_name} verified OK " + f"({elapsed:.1f}s)" + ) + self._record_timing( + "mount_verify", clone_name, elapsed, + self._snapshot_inventory(), + ) + + finally: + # Always disconnect + if nqn: + try: + self.ssh_obj.exec_command( + client, f"sudo nvme disconnect -n {nqn}", + ) + except Exception: + pass # ── Delete implementations (with verification) ──────────────────────── @@ -1031,9 +2507,67 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self.test_name = "parallel_namespace_lvol_k8s" self.STORAGE_CLASS_NAME = "simplyblock-ns-stress-sc" + self.XFS_STORAGE_CLASS_NAME = "simplyblock-ns-stress-sc-xfs" self.SNAPSHOT_CLASS_NAME = "simplyblock-csi-snapshotclass" self.k8s_utils = None + def setup(self): + """K8s-native setup: no SSH client machines needed — FIO runs as K8s Jobs.""" + self.logger.info("Inside TestParallelNamespaceLvolK8s.setup()") + + retry = 30 + while retry > 0: + try: + self.logger.info("Getting all storage nodes") + self.mgmt_nodes, self.storage_nodes = self.sbcli_utils.get_all_nodes_ip() + self.sbcli_utils.list_lvols() + self.sbcli_utils.list_storage_pools() + break + except Exception as e: + self.logger.debug(f"API call failed with error: {e}") + retry -= 1 + if retry == 0: + self.logger.info(f"Retry attempt exhausted. API failed with: {e}. Exiting") + raise e + self.logger.info(f"Retrying Base APIs before starting tests. Attempt: {30 - retry + 1}") + sleep_n_sec(10) + + # No client machines needed — FIO runs as K8s Jobs + self.client_machines = [] + self.fio_node = [] + + # Record UTC start time for Graylog log export at teardown + self.test_start_time_utc = datetime.now(timezone.utc) + + # Initialize k8s_utils early so it's available even if _phase_setup fails + self._init_k8s_utils() + + # Set up log directories + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + log_base = self.nfs_log_base + try: + os.makedirs(log_base, exist_ok=True) + except OSError: + log_base = os.path.join(os.path.expanduser("~"), "e2e-logs") + os.makedirs(log_base, exist_ok=True) + self.docker_logs_path = os.path.join(log_base, f"{self.test_name}-{timestamp}") + self.log_path = os.path.join(self.docker_logs_path, "ClientLogs") + os.makedirs(self.log_path, exist_ok=True) + os.makedirs(self.docker_logs_path, exist_ok=True) + + run_file = os.getenv("RUN_DIR_FILE", None) + if run_file: + with open(run_file, "w") as f: + f.write(self.docker_logs_path) + + # Start K8s log monitor + self.runner_k8s_log = RunnerK8sLog( + log_dir=self.docker_logs_path, + test_name=self.test_name, + ) + self.runner_k8s_log.start_logging() + self.runner_k8s_log.monitor_pod_logs() + # ── K8s helpers ─────────────────────────────────────────────────────── def _init_k8s_utils(self): @@ -1078,10 +2612,15 @@ def _wait_snapshot_k8s_gone(self, snap_name: str, timeout: int = 120) -> float: def _phase_setup(self): self._init_k8s_utils() # Create pool via sbcli - self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + actual_pool = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + if actual_pool and actual_pool != self.pool_name: + self.logger.info( + f"[setup] Pool name changed: {self.pool_name} -> {actual_pool}" + ) + self.pool_name = actual_pool sleep_n_sec(2) - # Create StorageClass with namespace support + # Create StorageClasses with namespace support (ext4 + xfs) cluster_id = self.cluster_id or os.environ.get("CLUSTER_ID", "") self.k8s_utils.create_storage_class( name=self.STORAGE_CLASS_NAME, @@ -1091,6 +2630,15 @@ def _phase_setup(self): npcs=self.npcs, max_namespace_per_subsys=self.NAMESPACES_PER_PARENT, ) + self.k8s_utils.create_storage_class( + name=self.XFS_STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=self.pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + fs_type="xfs", + max_namespace_per_subsys=self.NAMESPACES_PER_PARENT, + ) self.k8s_utils.create_volume_snapshot_class( name=self.SNAPSHOT_CLASS_NAME, ) @@ -1099,31 +2647,40 @@ def _phase_cleanup(self): self.logger.info("[cleanup] K8s bulk cleanup") ns = self.k8s_utils.namespace if self.k8s_utils else "default" if self.k8s_utils: - # Delete all PVCs with our label + # Delete FIO/write-data jobs with our label try: self.k8s_utils._exec_kubectl( - f"kubectl delete pvc -l test=ns-stress -n {ns} " + f"kubectl delete job -l test=ns-stress -n {ns} " f"--wait=false --ignore-not-found 2>/dev/null || true" ) except Exception: pass - # Delete all volume snapshots + # Delete all PVCs with our label try: self.k8s_utils._exec_kubectl( - f"kubectl delete volumesnapshot -l test=ns-stress -n {ns} " + f"kubectl delete pvc -l test=ns-stress -n {ns} " f"--wait=false --ignore-not-found 2>/dev/null || true" ) except Exception: pass - # Delete StorageClass + # Delete all volume snapshots try: self.k8s_utils._exec_kubectl( - f"kubectl delete storageclass {self.STORAGE_CLASS_NAME} " - f"--ignore-not-found 2>/dev/null || true" + f"kubectl delete volumesnapshot -l test=ns-stress -n {ns} " + f"--wait=false --ignore-not-found 2>/dev/null || true" ) except Exception: pass - # Bulk sbcli cleanup + # Delete StorageClasses + for sc in [self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME]: + try: + self.k8s_utils._exec_kubectl( + f"kubectl delete storageclass {sc} " + f"--ignore-not-found 2>/dev/null || true" + ) + except Exception: + pass + # Targeted sbcli cleanup — only test resources try: self.sbcli_utils.delete_all_clones() except Exception: @@ -1133,7 +2690,23 @@ def _phase_cleanup(self): except Exception: pass try: - self.sbcli_utils.delete_all_lvols() + all_lvols = self.sbcli_utils.list_lvols() + test_lvols = [ + name for name in all_lvols + if name.startswith("ns-") or name.startswith("cln-") + or name.startswith("snap-") + ] + self.logger.info( + f"[cleanup] Deleting {len(test_lvols)}/{len(all_lvols)} " + f"test lvols" + ) + for lv_name in test_lvols: + try: + self.sbcli_utils.delete_lvol( + lvol_name=lv_name, skip_error=True + ) + except Exception: + pass except Exception: pass try: @@ -1141,35 +2714,512 @@ def _phase_cleanup(self): except Exception: pass - # ── Phase overrides ─────────────────────────────────────────────────── + def _phase_verify_cleanup(self): + """K8s override: also verify no test PVCs remain.""" + ns = self.k8s_utils.namespace if self.k8s_utils else "default" + # Check K8s PVCs with test label + if self.k8s_utils: + try: + output = self.k8s_utils._exec_kubectl( + f"kubectl get pvc -l test=ns-stress -n {ns} " + f"--no-headers 2>/dev/null || true" + ) + if output and output.strip(): + lines = [ + ln for ln in output.strip().split("\n") + if ln.strip() + ] + self.logger.warning( + f"[verify_cleanup] {len(lines)} test PVCs still " + f"present — force deleting" + ) + self.k8s_utils._exec_kubectl( + f"kubectl delete pvc -l test=ns-stress -n {ns} " + f"--wait=false --ignore-not-found 2>/dev/null || true" + ) + sleep_n_sec(10) + except Exception: + pass + # Delegate to base for sbcli-level verification + super()._phase_verify_cleanup() - def _phase_create_parents(self): - """In K8s, create ALL PVCs (NUM_PARENTS × NAMESPACES_PER_PARENT). - CSI driver groups into subsystems automatically.""" - total = self.NUM_PARENTS * self.NAMESPACES_PER_PARENT - items = [] - for i in range(total): - pvc_name = f"ns-pvc-{_rand_seq(6)}-{i:04d}" - items.append({"name": pvc_name, "idx": i}) - self._batch_parallel( - items, self._timed_create_parent, - self.MAX_WORKERS_CREATE, "create_pvcs", + # ── K8s verification overrides ──────────────────────────────────────── + # PVC names != API lvol names (CSI driver uses its own naming), so + # verify via K8s PVC status + API lvol count instead of name matching. + + def _verify_all_lvols_exist(self): + """K8s override: verify PVCs are Bound and PV names exist in API. + + PVC names (ns-pvc-xxx) don't match API lvol names. The PV name + (VOLUME column in ``kubectl get pvc``) matches the lvol name in the + API (``sbctl lvol list``). We verify both: PVC Bound + PV in API. + + Retries up to 30 minutes to allow stragglers to settle after creation. + """ + ns = self.k8s_utils.namespace + with self._lock: + all_pvc_names = set( + list(self._parent_registry.keys()) + + list(self._child_registry.keys()) + ) + expected = len(all_pvc_names) + + # Retry loop: wait for PVCs to settle (some may still be binding) + max_wait = 1800 # 30 minutes + poll_interval = 30 + waited = 0 + not_bound = [] + pv_names = [] + found_pvcs = set() + + while waited <= max_wait: + not_bound = [] + pv_names = [] + found_pvcs = set() + + # Bulk fetch all test PVCs via -o json (avoids jsonpath quoting issues) + out, _ = self.k8s_utils._exec_kubectl( + f"kubectl get pvc -l test=ns-stress -n {ns} " + f"-o json 2>/dev/null || echo '{{\"items\":[]}}'", + supress_logs=True, + ) + + try: + data = json.loads(out or '{"items":[]}') + for item in data.get("items", []): + pvc_name = item.get("metadata", {}).get("name", "") + phase = item.get("status", {}).get("phase", "") + pv_name = item.get("spec", {}).get("volumeName", "") + if pvc_name not in all_pvc_names: + continue + found_pvcs.add(pvc_name) + if phase != "Bound": + not_bound.append((pvc_name, phase)) + elif pv_name: + pv_names.append((pvc_name, pv_name)) + except (json.JSONDecodeError, TypeError): + self.logger.warning( + f"[verify_lvols] Failed to parse kubectl JSON output " + f"(len={len(out or '')})" + ) + + # Check for PVCs not found in K8s at all + missing_pvcs = all_pvc_names - found_pvcs + if missing_pvcs: + not_bound.extend( + (name, "not-found") for name in list(missing_pvcs)[:50] + ) + + not_bound_pct = len(not_bound) * 100 / max(expected, 1) + if not not_bound or not_bound_pct <= 50: + break # All Bound or within 50% tolerance + + if waited < max_wait: + self.logger.info( + f"[verify_lvols] {len(not_bound)}/{expected} PVCs " + f"({not_bound_pct:.1f}%) not yet Bound, waiting " + f"{poll_interval}s... (waited {waited}s/{max_wait}s)" + ) + sleep_n_sec(poll_interval) + waited += poll_interval + else: + break # Exhausted wait time + + # Final assessment after wait + not_bound_pct = len(not_bound) * 100 / max(expected, 1) + if not_bound: + self.logger.warning( + f"[verify_lvols] {len(not_bound)}/{expected} PVCs " + f"({not_bound_pct:.1f}%) not Bound/found after " + f"{waited}s wait: " + f"{not_bound[:10]}{'...' if len(not_bound) > 10 else ''}" + ) + if not_bound_pct > 50: + raise RuntimeError( + f"[verify_lvols] {not_bound_pct:.1f}% PVCs not Bound " + f"exceeds 50% threshold — {len(not_bound)}/{expected}" + ) + + # Cross-check: PV names (VOLUME column) should exist in API lvol list + all_lvols = self.sbcli_utils.list_lvols() + lvol_names = set(all_lvols.keys()) if isinstance(all_lvols, dict) else set(all_lvols) + missing_in_api = [] + for pvc_name, pv_name in pv_names: + if pv_name not in lvol_names: + missing_in_api.append((pvc_name, pv_name)) + + if missing_in_api: + self.logger.warning( + f"[verify_lvols] {len(missing_in_api)}/{expected} PVCs Bound " + f"but PV not in API: " + f"{missing_in_api[:10]}{'...' if len(missing_in_api) > 10 else ''}" + ) + + bound_count = len(found_pvcs) - len(not_bound) + self.logger.info( + f"[verify_lvols] {bound_count}/{expected} PVCs Bound, " + f"{len(pv_names)} PVs found in API " + f"(lvol count={len(all_lvols)})" ) - def _phase_create_children(self): - """No-op in K8s — CSI groups namespaces automatically.""" + def _verify_all_snapshots_exist(self): + """K8s override: verify VolumeSnapshots are readyToUse. + + Uses ``-o json`` instead of jsonpath to avoid shell-quoting issues + when _exec_kubectl runs through bash -c or SSH layers. + + Retries up to 30 minutes to allow snapshots to become ready. + Warns for not-ready, only fails if >50% not ready. + """ + ns = self.k8s_utils.namespace + with self._lock: + snap_names = list(self._snap_registry.keys()) + if not snap_names: + self.logger.info("[verify_snapshots] No snapshots to verify") + return + + total = len(snap_names) + max_wait = 1800 # 30 minutes + poll_interval = 30 + waited = 0 + not_ready = [] + + while waited <= max_wait: + not_ready = [] + # Use -o json for reliable parsing (jsonpath has shell-quoting issues) + out, _ = self.k8s_utils._exec_kubectl( + f"kubectl get volumesnapshot -l test=ns-stress -n {ns} " + f"-o json 2>/dev/null || echo '{{\"items\":[]}}'", + supress_logs=True, + ) + found_snaps = {} + try: + data = json.loads(out or '{"items":[]}') + for item in data.get("items", []): + name = item.get("metadata", {}).get("name", "") + ready = item.get("status", {}).get("readyToUse", False) + found_snaps[name] = str(ready).lower() + except (json.JSONDecodeError, TypeError): + self.logger.warning( + f"[verify_snapshots] Failed to parse kubectl JSON output " + f"(len={len(out or '')})" + ) + + for snap_name in snap_names: + ready = found_snaps.get(snap_name, "not-found") + if ready != "true": + not_ready.append((snap_name, ready)) + + not_ready_pct = len(not_ready) * 100 / max(total, 1) + if not not_ready or not_ready_pct <= 50: + break # All ready or within 50% tolerance + + if waited < max_wait: + self.logger.info( + f"[verify_snapshots] {len(not_ready)}/{total} " + f"({not_ready_pct:.1f}%) snapshots not ready, " + f"waiting {poll_interval}s... " + f"(waited {waited}s/{max_wait}s)" + ) + sleep_n_sec(poll_interval) + waited += poll_interval + else: + break # Exhausted wait time + + not_ready_pct = len(not_ready) * 100 / max(total, 1) + if not_ready: + self.logger.warning( + f"[verify_snapshots] {len(not_ready)}/{total} " + f"({not_ready_pct:.1f}%) snapshots not ready after " + f"{waited}s wait: " + f"{not_ready[:10]}{'...' if len(not_ready) > 10 else ''}" + ) + if not_ready_pct > 50: + raise RuntimeError( + f"[verify_snapshots] {not_ready_pct:.1f}% snapshots not " + f"ready exceeds 50% threshold — " + f"{len(not_ready)}/{total}" + ) self.logger.info( - "[K8s] Children phase is no-op; CSI driver groups " - "PVCs into subsystems automatically" + f"[verify_snapshots] {total - len(not_ready)}/{total} " + f"snapshots confirmed readyToUse" ) - # ── Create implementations ──────────────────────────────────────────── + def _verify_all_clones_exist(self): + """K8s override: verify clone PVCs are Bound. - def _create_parent_impl(self, params: dict): - name = params["name"] - self._inc("attempts", "create_parent") + Uses ``-o json`` instead of jsonpath to avoid shell-quoting issues. + + Retries up to 30 minutes to allow clone PVCs to bind. + Warns for not-bound, only fails if >50% not bound. + """ + ns = self.k8s_utils.namespace + with self._lock: + clone_names = list(self._clone_registry.keys()) + if not clone_names: + self.logger.info("[verify_clones] No clones to verify") + return + + total = len(clone_names) + max_wait = 1800 # 30 minutes + poll_interval = 30 + waited = 0 + not_bound = [] + + while waited <= max_wait: + not_bound = [] + # Use -o json for reliable parsing + out, _ = self.k8s_utils._exec_kubectl( + f"kubectl get pvc -l test=ns-stress -n {ns} " + f"-o json 2>/dev/null || echo '{{\"items\":[]}}'", + supress_logs=True, + ) + found_pvcs = {} + try: + data = json.loads(out or '{"items":[]}') + for item in data.get("items", []): + name = item.get("metadata", {}).get("name", "") + phase = item.get("status", {}).get("phase", "") + found_pvcs[name] = phase + except (json.JSONDecodeError, TypeError): + self.logger.warning( + f"[verify_clones] Failed to parse kubectl JSON output " + f"(len={len(out or '')})" + ) + + for clone_name in clone_names: + phase = found_pvcs.get(clone_name, "not-found") + if phase != "Bound": + not_bound.append((clone_name, phase)) + + not_bound_pct = len(not_bound) * 100 / max(total, 1) + if not not_bound or not_bound_pct <= 50: + break # All Bound or within 50% tolerance + + if waited < max_wait: + self.logger.info( + f"[verify_clones] {len(not_bound)}/{total} " + f"({not_bound_pct:.1f}%) clone PVCs not Bound, " + f"waiting {poll_interval}s... " + f"(waited {waited}s/{max_wait}s)" + ) + sleep_n_sec(poll_interval) + waited += poll_interval + else: + break # Exhausted wait time + + not_bound_pct = len(not_bound) * 100 / max(total, 1) + if not_bound: + self.logger.warning( + f"[verify_clones] {len(not_bound)}/{total} " + f"({not_bound_pct:.1f}%) clone PVCs not Bound after " + f"{waited}s wait: " + f"{not_bound[:10]}{'...' if len(not_bound) > 10 else ''}" + ) + if not_bound_pct > 50: + raise RuntimeError( + f"[verify_clones] {not_bound_pct:.1f}% clone PVCs not " + f"Bound exceeds 50% threshold — " + f"{len(not_bound)}/{total}" + ) + self.logger.info( + f"[verify_clones] {total - len(not_bound)}/{total} clone " + f"PVCs confirmed Bound" + ) + + # ── Two-phase subsystem creation: parents then parallel children ──── + + def _phase_create_subsystems(self): + """Sub-phase 1: create all parent PVCs sequentially. + Sub-phase 2: create children for PARALLEL_PARENTS subsystems + concurrently.""" + pvcs_per_subsys = 1 + self.CHILDREN_PER_PARENT + total = self.NUM_PARENTS * pvcs_per_subsys + self.logger.info( + f"[create_subsystems] {self.NUM_PARENTS} subsystems × " + f"{pvcs_per_subsys} PVCs = {total} total " + f"(parallel={self.PARALLEL_PARENTS})" + ) + + # ── Sub-phase 1: Create all parent PVCs (parallel) ───────── + parent_items = [] + parent_names = [] + for i in range(self.NUM_PARENTS): + pname = f"ns-pvc-{_rand_seq(6)}-{i:04d}" + sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME]) + fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4" + parent_items.append({"name": pname, "idx": i, "sc_name": sc_name}) + parent_names.append(pname) + # Pre-register so children can reference parents + self._parent_registry[pname] = { + "id": pname, + "children": [], + "snapshots": [], + "start_child_idx": i * pvcs_per_subsys + 1, + "storage_class": sc_name, + "fs_type": fs_type, + } + self.logger.info( + f"[create_subsystems][sub1] Creating {self.NUM_PARENTS} parent " + f"PVCs (parallel, workers={self.MAX_WORKERS_CREATE})" + ) + parents_t0 = time.time() + _ok, parent_fail = self._batch_parallel( + parent_items, + self._create_single_parent_k8s, + self.MAX_WORKERS_CREATE, + "create_parents", + ) + parents_elapsed = time.time() - parents_t0 + self._log_op_stats( + "create_parent", batch_label="all parents", + batch_elapsed=parents_elapsed, + ) + + # Remove failed parents from registry (they were pre-registered) + failed_parents = [] + if parent_fail > 0: + created_parents = { + s["name"] for s in self._timing_samples + if s["op"] == "create_parent" + } + for pname in list(parent_names): + if pname not in created_parents: + failed_parents.append(pname) + parent_names.remove(pname) + self._parent_registry.pop(pname, None) + + self.logger.info( + f"[create_subsystems][sub1] {len(parent_names)} parents " + f"created in {parents_elapsed:.1f}s" + f"{f', {len(failed_parents)} FAILED: {failed_parents}' if failed_parents else ''}" + ) + + # ── Sub-phase 2: Create ALL child PVCs in parallel ───────── + total_children = len(parent_names) * self.CHILDREN_PER_PARENT + self.logger.info( + f"[create_subsystems][sub2] Creating {total_children} child " + f"PVCs in parallel (workers={self.MAX_WORKERS_CREATE})" + ) + # Build flat list of all children with parent assignment + child_items = [] + for pi, pname in enumerate(parent_names): + for c in range(self.CHILDREN_PER_PARENT): + child_idx = pi * pvcs_per_subsys + 1 + c + child_items.append({ + "name": f"ns-pvc-{_rand_seq(6)}-{child_idx:04d}", + "parent_name": pname, + }) + children_t0 = time.time() + _ok, child_fail = self._batch_parallel( + child_items, + self._create_single_child_k8s, + self.MAX_WORKERS_CREATE, + "create_children", + ) + children_elapsed = time.time() - children_t0 + self._log_op_stats( + "create_child", batch_label="all children", + batch_elapsed=children_elapsed, + ) + + # Identify failed children + failed_children = [] + if child_fail > 0: + created_children = set(self._child_registry.keys()) + for item in child_items: + if item["name"] not in created_children: + failed_children.append( + f"{item['name']} (parent={item['parent_name']})" + ) + + # ── Failure summary ────────────────────────────────────────── + total_attempted = self.NUM_PARENTS + total_children + total_failed = len(failed_parents) + len(failed_children) + fail_pct = (total_failed * 100 / max(total_attempted, 1)) + + if total_failed > 0: + self.logger.warning( + f"[create_subsystems] FAILED PVCs: {total_failed}/" + f"{total_attempted} ({fail_pct:.1f}%)" + ) + if failed_parents: + self.logger.warning( + f" Failed PARENTS ({len(failed_parents)}): " + f"{failed_parents}" + ) + if failed_children: + self.logger.warning( + f" Failed CHILDREN ({len(failed_children)}): " + f"{failed_children}" + ) + + if fail_pct > 50: + raise RuntimeError( + f"[create_subsystems] {fail_pct:.1f}% failure rate " + f"exceeds 50% threshold — {total_failed}/{total_attempted} " + f"PVCs failed (parents={len(failed_parents)}, " + f"children={len(failed_children)})" + ) + + # ── Bulk verify ────────────────────────────────────────────── + all_lvols = self.sbcli_utils.list_lvols() + expected_created = total_attempted - total_failed + if len(all_lvols) < expected_created: + self.logger.warning( + f"[create_subsystems] lvol count {len(all_lvols)} < " + f"expected {expected_created}" + ) + + self.logger.info( + f"[create_subsystems] Done: {len(self._parent_registry)} " + f"parents, {len(self._child_registry)} children" + f"{f' ({total_failed} failures tolerated)' if total_failed else ''}" + ) + + def _create_single_parent_k8s(self, item): + """Create a single parent PVC. Called from _batch_parallel.""" + name = item["name"] + sc_name = item.get("sc_name", self.STORAGE_CLASS_NAME) + t0 = time.time() + self._create_pvc(name, sc_name=sc_name) + self._record_timing( + "create_parent", name, + time.time() - t0, self._snapshot_inventory(), + ) + self._inc("counts", "parents_created") + + def _create_single_child_k8s(self, item): + """Create a single child PVC and register it under its parent. + + Called from _batch_parallel with MAX_WORKERS_CREATE concurrency — + all children for all parents run in parallel.""" + child_name = item["name"] + parent_name = item["parent_name"] + # Children inherit StorageClass (and thus fs_type) from parent + sc_name = self._parent_registry.get(parent_name, {}).get( + "storage_class", self.STORAGE_CLASS_NAME + ) + t0 = time.time() + self._create_pvc(child_name, sc_name=sc_name) + elapsed = time.time() - t0 + self._record_timing( + "create_child", child_name, + elapsed, self._snapshot_inventory(), + ) + with self._lock: + self._child_registry[child_name] = { + "id": child_name, "parent_name": parent_name, + } + self._parent_registry[parent_name]["children"].append( + child_name + ) + self._inc("counts", "children_created") + + def _create_pvc(self, name: str, sc_name: str = None): + """Create a single PVC with label and wait for Bound.""" + sc = sc_name or self.STORAGE_CLASS_NAME ns = self.k8s_utils.namespace - # Create PVC with label for easy cleanup yaml_content = ( f"apiVersion: v1\n" f"kind: PersistentVolumeClaim\n" @@ -1180,7 +3230,7 @@ def _create_parent_impl(self, params: dict): f"spec:\n" f" accessModes:\n" f" - ReadWriteOnce\n" - f" storageClassName: {self.STORAGE_CLASS_NAME}\n" + f" storageClassName: {sc}\n" f" resources:\n" f" requests:\n" f" storage: {self.PVC_SIZE}\n" @@ -1188,16 +3238,127 @@ def _create_parent_impl(self, params: dict): self.k8s_utils.apply_yaml(yaml_content, namespace=ns) if not self.k8s_utils.wait_pvc_bound(name, timeout=300, namespace=ns): raise TimeoutError(f"PVC {name} not Bound within 300s") + + # ── Write data (parallel FIO) to snapshot-target PVCs ────────────── + + def _phase_write_data(self): + """Run parallel FIO (100 MB write) on all PVCs that will be snapshotted. + + Snapshot targets = all parents + 1 random child. The chosen child is + stored in self._snapshot_child so _phase_create_snapshots reuses it. + """ + parents = list(self._parent_registry.keys()) + + # Pick the random child now so we FIO it and snapshot it later with self._lock: - self._parent_registry[name] = { - "id": name, "children": [], "snapshots": [], - } - self._metrics["counts"]["parents_created"] += 1 - self.logger.info(f"[create_pvc] {name} Bound") + child_names = list(self._child_registry.keys()) + if child_names: + self._snapshot_child = random.choice(child_names) + self.logger.info( + f"[write_data] Pre-selected child for snapshot: " + f"{self._snapshot_child}" + ) + else: + self._snapshot_child = None + + targets = list(parents) + if self._snapshot_child: + targets.append(self._snapshot_child) - def _create_child_impl(self, params: dict): - """No-op in K8s.""" - pass + child_label = " + 1 child" if self._snapshot_child else "" + self.logger.info( + f"[write_data] Running parallel FIO (100 MB) on " + f"{len(targets)} PVCs ({len(parents)} parents" + f"{child_label}) via K8s Jobs" + ) + + fio_items = [{"pvc_name": pvc} for pvc in targets] + write_t0 = time.time() + _ok, fail = self._batch_parallel( + fio_items, self._run_fio_job_k8s, + self.MAX_WORKERS_CREATE, "write_data", + ) + write_elapsed = time.time() - write_t0 + self.logger.info( + f"[write_data] Done: {_ok}/{len(targets)} OK, " + f"{fail} failed in {write_elapsed:.1f}s" + ) + if fail > 0: + self.logger.warning( + f"[write_data] {fail}/{len(targets)} FIO jobs failed" + ) + + def _run_fio_job_k8s(self, item): + """Create a K8s Job running FIO 100 MB sequential write on a PVC.""" + pvc_name = item["pvc_name"] + ns = self.k8s_utils.namespace + job_name = f"fio-{pvc_name[:40]}-{_rand_seq(4)}" + t0 = time.time() + + yaml_content = ( + f"apiVersion: batch/v1\n" + f"kind: Job\n" + f"metadata:\n" + f" name: {job_name}\n" + f" labels:\n" + f" test: ns-stress\n" + f" purpose: write-data\n" + f"spec:\n" + f" backoffLimit: 0\n" + f" template:\n" + f" spec:\n" + f" restartPolicy: Never\n" + f" containers:\n" + f" - name: fio\n" + f" image: dockerpinata/fio:2.1\n" + f" command:\n" + f" - fio\n" + f" args:\n" + f" - --name=write-{pvc_name[:20]}\n" + f" - --filename=/data/testfile\n" + f" - --size=100M\n" + f" - --bs=1M\n" + f" - --rw=write\n" + f" - --direct=1\n" + f" - --ioengine=libaio\n" + f" - --iodepth=1\n" + f" - --numjobs=1\n" + f" volumeMounts:\n" + f" - name: vol\n" + f" mountPath: /data\n" + f" volumes:\n" + f" - name: vol\n" + f" persistentVolumeClaim:\n" + f" claimName: {pvc_name}\n" + ) + self.k8s_utils.apply_yaml(yaml_content, namespace=ns) + result = self.k8s_utils.wait_job_complete( + job_name, timeout=300, namespace=ns, + ) + elapsed = time.time() - t0 + if result != "succeeded": + self.logger.error( + f"[write_data] FIO job {job_name} for PVC {pvc_name} " + f"ended with: {result} ({elapsed:.1f}s)" + ) + raise RuntimeError( + f"FIO job {job_name} for PVC {pvc_name} " + f"ended with: {result}" + ) + # Clean up the completed job + try: + self.k8s_utils.delete_resource("job", job_name, namespace=ns) + except Exception: + pass + self._record_timing( + "write_data", pvc_name, elapsed, + self._snapshot_inventory(), + ) + self.logger.info( + f"[write_data] {pvc_name} OK ({elapsed:.1f}s)" + ) + + # ── Create implementations ──────────────────────────────────────────── def _create_snapshot_impl(self, params: dict): snap_name = params["name"] @@ -1236,6 +3397,7 @@ def _create_snapshot_impl(self, params: dict): def _create_clone_impl(self, params: dict): clone_name = params["name"] snap_name = params["snap_name"] + sc_name = params.get("sc_name", self.STORAGE_CLASS_NAME) self._inc("attempts", "create_clone") ns = self.k8s_utils.namespace # Clone PVC from VolumeSnapshot with label @@ -1249,7 +3411,7 @@ def _create_clone_impl(self, params: dict): f"spec:\n" f" accessModes:\n" f" - ReadWriteOnce\n" - f" storageClassName: {self.STORAGE_CLASS_NAME}\n" + f" storageClassName: {sc_name}\n" f" resources:\n" f" requests:\n" f" storage: {self.PVC_SIZE}\n" @@ -1259,8 +3421,26 @@ def _create_clone_impl(self, params: dict): f" apiGroup: snapshot.storage.k8s.io\n" ) self.k8s_utils.apply_yaml(yaml_content, namespace=ns) - if not self.k8s_utils.wait_pvc_bound(clone_name, timeout=300, namespace=ns): - raise TimeoutError(f"Clone PVC {clone_name} not Bound within 300s") + with self._lock: + self._clones_binding += 1 + concurrent = self._clones_binding + self.logger.info( + f"[create_clone] {clone_name} waiting for Bound " + f"(concurrent_binding={concurrent})" + ) + bind_t0 = time.time() + try: + if not self.k8s_utils.wait_pvc_bound( + clone_name, timeout=self.CLONE_BIND_TIMEOUT, namespace=ns + ): + raise TimeoutError( + f"Clone PVC {clone_name} not Bound " + f"within {self.CLONE_BIND_TIMEOUT}s" + ) + finally: + with self._lock: + self._clones_binding -= 1 + bind_elapsed = time.time() - bind_t0 with self._lock: self._clone_registry[clone_name] = { "id": clone_name, "snap_name": snap_name, @@ -1268,7 +3448,128 @@ def _create_clone_impl(self, params: dict): if snap_name in self._snap_registry: self._snap_registry[snap_name]["clones"].append(clone_name) self._metrics["counts"]["clones_created"] += 1 - self.logger.info(f"[create_clone] {clone_name} Bound (snap={snap_name})") + self.logger.info( + f"[create_clone] {clone_name} Bound in {bind_elapsed:.1f}s " + f"(snap={snap_name})" + ) + + # ── Clone mount verification ───────────────────────────────────────── + + def _mount_verify_single_clone(self, item): + """Create a K8s FIO Job mounting the clone PVC, run read, check errors.""" + clone_name = item["clone_name"] + ns = self.k8s_utils.namespace + job_name = f"verify-{clone_name[:40]}-{_rand_seq(4)}" + t0 = time.time() + + try: + # 1. Create FIO Job that mounts the clone PVC and reads 4 MB + yaml_content = ( + f"apiVersion: batch/v1\n" + f"kind: Job\n" + f"metadata:\n" + f" name: {job_name}\n" + f" namespace: {ns}\n" + f" labels:\n" + f" test: ns-stress\n" + f" purpose: mount-verify\n" + f"spec:\n" + f" backoffLimit: 0\n" + f" template:\n" + f" spec:\n" + f" restartPolicy: Never\n" + f" containers:\n" + f" - name: fio\n" + f" image: dockerpinata/fio:2.1\n" + f" command:\n" + f" - fio\n" + f" args:\n" + f" - --name=verify-{clone_name[:20]}\n" + f" - --filename=/data/testfile\n" + f" - --size=4M\n" + f" - --bs=4K\n" + f" - --rw=read\n" + f" - --direct=1\n" + f" - --ioengine=libaio\n" + f" - --iodepth=1\n" + f" - --numjobs=1\n" + f" volumeMounts:\n" + f" - name: vol\n" + f" mountPath: /data\n" + f" volumes:\n" + f" - name: vol\n" + f" persistentVolumeClaim:\n" + f" claimName: {clone_name}\n" + ) + self.k8s_utils.apply_yaml(yaml_content, namespace=ns) + + # 2. Wait for job completion + result = self.k8s_utils.wait_job_complete( + job_name, timeout=300, namespace=ns, + ) + elapsed = time.time() - t0 + + # 3. Fetch pod logs for FIO output + fio_output = "" + try: + # Find the pod created by this job + pod_out, _ = self.k8s_utils._exec_kubectl( + f"kubectl get pods -n {ns} -l job-name={job_name} " + f"-o jsonpath='{{.items[0].metadata.name}}' 2>/dev/null", + supress_logs=True, + ) + pod_name = (pod_out or "").strip() + if pod_name: + fio_output = self.k8s_utils.get_pod_logs( + pod_name, namespace=ns, tail=100, + ) + except Exception: + pass + + # 4. Check for errors + if result != "succeeded": + self.logger.error( + f"[mount_verify] FIO job {job_name} for clone " + f"{clone_name} ended with: {result} ({elapsed:.1f}s)" + f"\nFIO output:\n{fio_output}" + ) + raise RuntimeError( + f"FIO verify job for clone {clone_name} failed: " + f"{result}" + ) + + # 5. Parse FIO output for err= + import re + for line in (fio_output or "").splitlines(): + if "err=" in line: + m = re.search(r"err=\s*(\d+)", line) + if m and int(m.group(1)) != 0: + self.logger.error( + f"[mount_verify] FIO reported error on clone " + f"{clone_name}:\n{fio_output}" + ) + raise RuntimeError( + f"FIO read error on clone {clone_name}: " + f"{line.strip()}" + ) + + self.logger.info( + f"[mount_verify] Clone {clone_name} verified OK " + f"({elapsed:.1f}s)" + ) + self._record_timing( + "mount_verify", clone_name, elapsed, + self._snapshot_inventory(), + ) + + finally: + # Always clean up the job + try: + self.k8s_utils.delete_resource( + "job", job_name, namespace=ns, + ) + except Exception: + pass # ── Delete implementations (with verification) ──────────────────────── @@ -1297,8 +3598,17 @@ def _delete_snapshot_impl(self, snap_name: str): self._metrics["counts"]["snapshots_deleted"] += 1 def _delete_child_impl(self, child_name: str): - """No-op in K8s — no separate children.""" - pass + """Delete child PVC in K8s.""" + self._inc("attempts", "delete_child") + ns = self.k8s_utils.namespace + self.k8s_utils._exec_kubectl( + f"kubectl delete pvc {child_name} -n {ns} " + f"--ignore-not-found --wait=false 2>/dev/null || true" + ) + self._wait_pvc_gone(child_name) + with self._lock: + self._child_registry.pop(child_name, None) + self._metrics["counts"]["children_deleted"] += 1 def _delete_parent_impl(self, parent_name: str): self._inc("attempts", "delete_parent") diff --git a/e2e/stress_test/device_failure_migration.py b/e2e/stress_test/device_failure_migration.py index ab43efe8d..93a76aba3 100755 --- a/e2e/stress_test/device_failure_migration.py +++ b/e2e/stress_test/device_failure_migration.py @@ -2,21 +2,43 @@ Device Failure Migration Stress Test Measures the time it takes to complete failure migration on a single device. -Two variants: - - DeviceFailureMigrationNoLoad: - Fill device to 65 %, fail it, measure migration time (no IO load). - - DeviceFailureMigrationUnderLoad: - Fill device to 65 %, start IO on every cluster node, fail device, - measure migration time while IO is running. +Variants: -Both tests are Docker-mode only (sbcli + SSH FIO). They work with any -cluster geometry (ndcs/npcs) and require at least one client node -(CLIENT_IP env var or mgmt node fallback). + Docker (sbcli + SSH FIO): + - DeviceFailureMigrationNoLoadDocker — API removal, no IO load + - DeviceFailureMigrationUnderLoadDocker — API removal, IO load running + - DeviceFailureMigrationPCIeNoLoadDocker — PCIe sysfs removal, no IO load + - DeviceFailureMigrationPCIeUnderLoadDocker — PCIe sysfs removal, IO load running + + K8s-native (PVC + FIO K8s Jobs): + - DeviceFailureMigrationNoLoadK8s — API removal, no IO load + - DeviceFailureMigrationUnderLoadK8s — API removal, IO load running + - DeviceFailureMigrationPCIeNoLoadK8s — PCIe sysfs removal, no IO load + - DeviceFailureMigrationPCIeUnderLoadK8s— PCIe sysfs removal, IO load running + +Failure modes: + - "api" : Logical removal via REST API + set-failed-device CLI + - "pcie" : Physical removal via /sys/bus/pci/devices//remove + +All tests work with any cluster geometry (ndcs/npcs) and require at least +one storage node with a device. + +Invocation: + # Docker + python3 stress.py --testname DeviceFailureMigrationNoLoadDocker --ndcs 2 --npcs 2 + python3 stress.py --testname DeviceFailureMigrationPCIeNoLoad --ndcs 2 --npcs 2 + + # K8s + python3 stress.py --testname DeviceFailureMigrationNoLoadK8s --ndcs 2 --npcs 2 --run_k8s True + python3 stress.py --testname DeviceFailureMigrationPCIeUnderLoadK8s --ndcs 2 --npcs 2 --run_k8s True """ import json import math +import os +import random +import string import threading import time from datetime import datetime, timezone @@ -28,8 +50,14 @@ from utils.common_utils import sleep_n_sec +def _rand_seq(length: int = 8) -> str: + first = random.choice(string.ascii_lowercase) + rest = "".join(random.choices(string.ascii_lowercase + string.digits, k=length - 1)) + return first + rest + + # ═══════════════════════════════════════════════════════════════════════════════ -# Mixin — shared orchestration for both variants +# Mixin — shared orchestration for all variants # ═══════════════════════════════════════════════════════════════════════════════ class _DeviceFailureMigrationBase: @@ -59,28 +87,50 @@ def _init_migration_state(self): self._load_fio_threads = [] self._sn_nodes = [] self._with_io_load = False + self._failure_mode = "api" + self._pre_migration_checksums = {} # {lvol_name: {filepath: md5}} # ── Main flow ──────────────────────────────────────────────────────────── - def _run_migration_test(self, with_io_load=False): - """Main flow: setup → fill → [start IO] → fail → migrate → cleanup.""" + def _run_migration_test(self, with_io_load=False, failure_mode="api"): + """Main flow: setup -> fill -> [checksum] -> [start IO] -> fail -> migrate -> validate -> recover -> cleanup. + + NoLoad: fill → md5sum → fail device → migrate → verify md5 + FIO fill logs → recover device → cleanup + UnderLoad: fill → start FIO (verify=md5) → fail device → migrate → check FIO OK → wait FIO complete → recover → cleanup + """ self._with_io_load = with_io_load + self._failure_mode = failure_mode + self._test_passed = False t0 = time.time() try: self._phase_setup_pool_and_lvols() self._phase_fill_devices() + if not with_io_load: + self._phase_compute_checksums() if with_io_load: self._phase_start_io_load() - self._phase_fail_and_migrate() + if failure_mode == "pcie": + self._phase_fail_and_migrate_pcie() + else: + self._phase_fail_and_migrate() + self._phase_validate() + if with_io_load: + # Wait for FIO to finish naturally — do NOT kill it + self._phase_wait_fio_completion() + self._phase_validate_fio() + self._test_passed = True finally: if with_io_load: - self._phase_stop_io_load() + self._phase_stop_io_load() # kill FIO only if still running (failure path) + self._phase_recover_device() self._phase_cleanup() self._timing["total_duration"] = time.time() - t0 self._print_migration_summary() self._write_timing_json() self._generate_charts() + self.logger.info("TEST CASE PASSED !!!") + # ── Phase 1: create pool, lvols, connect, format, mount ────────────────── def _phase_setup_pool_and_lvols(self): @@ -102,11 +152,20 @@ def _phase_setup_pool_and_lvols(self): raise RuntimeError( f"No devices found on target node {self._target_node_id}" ) - self._target_device_info = devices[0] - self._target_device_id = devices[0]["id"] + # Filter for online devices only — old failed_and_migrated devices + # remain in the list after recovery and must be skipped + online_devices = [d for d in devices if d.get("status") == "online"] + if not online_devices: + raise RuntimeError( + f"No online devices found on target node {self._target_node_id}. " + f"Device statuses: {[d.get('status') for d in devices]}" + ) + self._target_device_info = online_devices[0] + self._target_device_id = online_devices[0]["id"] self.logger.info( f"Target node: {self._target_node_id}, " - f"Target device: {self._target_device_id}" + f"Target device: {self._target_device_id} " + f"(selected from {len(online_devices)} online / {len(devices)} total devices)" ) # Get node capacity to calculate how many lvols to create @@ -227,9 +286,16 @@ def _phase_fill_devices(self): t.start() threads.append(t) - # Wait for all fills to complete + # Wait for FIO launch threads to return (they return after verifying + # FIO is running in tmux, but FIO itself is still writing) for t in threads: - t.join(timeout=3600) + t.join(timeout=60) + + # Wait for actual FIO processes to finish on the remote node + self.logger.info("Waiting for FIO fill processes to complete on remote node ...") + self.common_utils.manage_fio_threads( + node=client, threads=[], timeout=3600 + ) # Verify fill level sleep_n_sec(5) @@ -244,6 +310,153 @@ def _phase_fill_devices(self): f"Fill complete ({self._timing['fill_duration']:.1f}s)" ) + # ── Phase 2b: compute pre-migration checksums (no-load variant) ───────── + + def _phase_compute_checksums(self): + """Compute MD5 checksums of all files on target lvols before migration.""" + self.logger.info("=== Phase: Compute pre-migration checksums ===") + client = self.fio_node[0] + self._pre_migration_checksums = {} + + for name in self._lvols_on_target: + info = self.lvol_mount_details.get(name) + if not info: + continue + mount = info["Mount"] + try: + files = self.ssh_obj.find_files(client, mount) + if files: + checksums = self.ssh_obj.generate_checksums(client, files) + self._pre_migration_checksums[name] = checksums + self.logger.info( + f"Captured {len(checksums)} file checksums for {name}" + ) + else: + self.logger.warning(f"No files found on {mount} for checksum") + except Exception as exc: + self.logger.warning(f"Checksum capture failed for {name}: {exc}") + + self.logger.info( + f"Pre-migration checksums captured for " + f"{len(self._pre_migration_checksums)} lvols" + ) + + def _phase_verify_checksums(self): + """Verify MD5 checksums of target lvols match pre-migration values.""" + self.logger.info("=== Verifying post-migration data integrity ===") + client = self.fio_node[0] + mismatches = 0 + + for name, expected_checksums in self._pre_migration_checksums.items(): + info = self.lvol_mount_details.get(name) + if not info: + continue + mount = info["Mount"] + try: + files = self.ssh_obj.find_files(client, mount) + self.ssh_obj.verify_checksums( + client, files, expected_checksums, + message=( + f"Data integrity check failed for lvol {name} " + f"after device migration" + ), + ) + self.logger.info(f"Checksums verified for {name}: OK") + except ValueError as exc: + self.logger.error(f"Checksum MISMATCH for {name}: {exc}") + mismatches += 1 + except Exception as exc: + self.logger.error( + f"Checksum verification error for {name}: {exc}" + ) + mismatches += 1 + + assert mismatches == 0, ( + f"Data integrity check failed: {mismatches} lvol(s) had " + f"checksum mismatches after migration" + ) + self.logger.info( + "All post-migration checksums verified — data integrity OK" + ) + + def _phase_validate_fio(self): + """Check FIO logs for errors after migration (under-load variant). + + IO errors on lvols hosted on the failed device are expected and + logged as warnings. IO errors on lvols hosted on OTHER devices + are logged as errors. + """ + self.logger.info("=== Verifying FIO logs for errors ===") + client = self.fio_node[0] + fail_words = ["error", "fail", "interrupt", "terminate"] + target_errors = [] + other_errors = [] + + all_names = self._lvols_on_target + self._lvols_on_others + for name in all_names: + info = self.lvol_mount_details.get(name) + if not info or not info.get("Log"): + continue + try: + log_data = self.ssh_obj.exec_command( + client, f"cat {info['Log']} 2>/dev/null || true" + ) + if not log_data: + self.logger.warning(f"Empty or missing FIO log for {name}") + continue + log_lower = log_data.lower() if isinstance(log_data, str) else str(log_data).lower() + found = [w for w in fail_words if w in log_lower] + if found: + msg = f"{name}: FIO log contains {found}" + if name in self._lvols_on_target: + target_errors.append(msg) + self.logger.warning( + f"[expected] FIO error on failed-device lvol {name}: {found}" + ) + else: + other_errors.append(msg) + self.logger.error( + f"FIO error on non-target lvol {name}: {found}" + ) + else: + self.logger.info(f"FIO log for {name}: no errors") + except Exception as exc: + self.logger.warning(f"Could not read FIO log for {name}: {exc}") + + if target_errors: + self.logger.warning( + f"{len(target_errors)} FIO error(s) on target-device lvols " + f"(expected during device migration)" + ) + if other_errors: + self.logger.error( + f"{len(other_errors)} FIO error(s) on non-target lvols: " + f"{other_errors}" + ) + + # ── Phase: wait for FIO to complete naturally ────────────────────────── + + def _phase_wait_fio_completion(self): + """Wait for FIO processes to finish naturally (do NOT kill them). + + Uses ``common_utils.manage_fio_threads`` to poll for active FIO + processes on the client node until none remain. + """ + self.logger.info("=== Phase: Waiting for FIO to complete naturally ===") + client = self.fio_node[0] + t0 = time.time() + timeout = self.FIO_LOAD_RUNTIME + 300 # runtime + buffer + + self.common_utils.manage_fio_threads( + node=client, threads=[], timeout=timeout + ) + + self._timing["fio_completion_duration"] = time.time() - t0 + self.logger.info( + f"All FIO processes completed " + f"({self._timing['fio_completion_duration']:.1f}s)" + ) + # ── Phase 3: start random IO on all nodes (under-load variant) ─────────── def _phase_start_io_load(self): @@ -277,19 +490,20 @@ def _phase_start_io_load(self): f"IO load started: {len(self._load_fio_threads)} FIO threads" ) - # ── Phase 4: remove device → set-failed → wait migration ──────────────── + # ── Phase 4a: API removal -> set-failed -> wait migration ──────────────── def _phase_fail_and_migrate(self): self.logger.info( - f"=== Phase: Fail device {self._target_device_id} and migrate ===" + f"=== Phase: Fail device {self._target_device_id} via API and migrate ===" ) t0 = time.time() - # Step 1: remove device (ONLINE → REMOVED) - self.logger.info(f"Removing device {self._target_device_id} …") + # Step 1: remove device (ONLINE -> REMOVED) + self.logger.info(f"Removing device {self._target_device_id} ...") self.sbcli_utils.remove_device(self._target_device_id) self.sbcli_utils.wait_for_device_status( - self._target_node_id, "removed", timeout=120 + self._target_node_id, "removed", timeout=120, + device_id=self._target_device_id, ) self._timing["remove_duration"] = time.time() - t0 self.logger.info( @@ -306,14 +520,102 @@ def _phase_fail_and_migrate(self): sleep_n_sec(5) # Step 3: wait for migration to complete - self.logger.info("Waiting for failure migration tasks to complete …") - migration_elapsed = self.sbcli_utils.wait_migration_tasks_complete( - timeout=self.MIGRATION_TIMEOUT + self._wait_migration_and_verify(t1) + + # ── Phase 4b: PCIe sysfs removal -> set-failed -> wait migration ───────── + + def _phase_fail_and_migrate_pcie(self): + self.logger.info( + f"=== Phase: Fail device {self._target_device_id} via PCIe and migrate ===" ) - self._timing["migration_duration"] = time.time() - t1 + t0 = time.time() + + # Step 1: Get node IP and PCIe address + node_details = self.sbcli_utils.get_storage_node_details( + self._target_node_id + ) + node_ip = node_details[0]["mgmt_ip"] + pcie_addr = self._target_device_info.get("pcie_address", "") + if not pcie_addr: + raise RuntimeError( + f"No pcie_address found for device {self._target_device_id}" + ) + self.logger.info( + f"PCIe hot-unplug: device {self._target_device_id} " + f"at {pcie_addr} on {node_ip}" + ) + + # Step 2: PCIe hot-unplug via sysfs + self.ssh_obj.exec_command( + node=node_ip, + command=f"echo 1 | sudo tee /sys/bus/pci/devices/{pcie_addr}/remove" + ) + self.logger.info("PCIe device removed via sysfs") + sleep_n_sec(10) + + # Step 3: Wait for control plane to detect device loss + self.sbcli_utils.wait_for_device_status( + self._target_node_id, "unavailable", timeout=120, + device_id=self._target_device_id, + ) + self._timing["remove_duration"] = time.time() - t0 + self.logger.info( + f"Device detected as unavailable ({self._timing['remove_duration']:.1f}s)" + ) + + # Step 4: Logical remove + set-failed to trigger migration + t1 = time.time() + self.sbcli_utils.remove_device(self._target_device_id) + self.sbcli_utils.wait_for_device_status( + self._target_node_id, "removed", timeout=120, + device_id=self._target_device_id, + ) + + mgmt_ip = self.mgmt_nodes[0] + cmd = f"{self.base_cmd} sn set-failed-device {self._target_device_id}" + self.logger.info(f"Setting device failed via CLI: {cmd}") + result = self.ssh_obj.exec_command(mgmt_ip, cmd) + self.logger.info(f"set-failed-device result: {result}") + sleep_n_sec(5) + + # Step 5: wait for migration to complete + self._wait_migration_and_verify(t1) + + # Step 6: Rescan PCI bus to bring device back (for future tests) + self.logger.info("Rescanning PCI bus to restore device ...") + self.ssh_obj.exec_command( + node=node_ip, + command="echo 1 | sudo tee /sys/bus/pci/rescan" + ) + sleep_n_sec(10) + self.logger.info("PCI bus rescan complete") + + # ── Shared migration wait + verify ─────────────────────────────────────── + + def _wait_migration_and_verify(self, t_start): + """Wait for migration tasks and verify final device status. + + Tries the REST-based ``wait_migration_tasks_complete`` first. + If the API is unavailable (404 etc.), falls back to polling + ``sbctl cluster list-tasks`` via CLI. + """ + self.logger.info("Waiting for failure migration tasks to complete ...") + try: + migration_elapsed = self.sbcli_utils.wait_migration_tasks_complete( + timeout=self.MIGRATION_TIMEOUT + ) + except TimeoutError: + raise + except Exception as exc: + self.logger.warning( + f"REST migration wait failed ({exc}), falling back to CLI" + ) + migration_elapsed = self._wait_migration_cli_fallback() + + self._timing["migration_duration"] = time.time() - t_start self._timing["migration_tasks_elapsed"] = migration_elapsed - # Step 4: verify device status + # Verify device status sleep_n_sec(5) devices = self.sbcli_utils.get_device_details(self._target_node_id) target_dev = None @@ -330,14 +632,188 @@ def _phase_fail_and_migrate(self): # ── Phase 5: stop IO load ──────────────────────────────────────────────── + def _phase_validate(self): + """Validate migration results: device migrated, nodes healthy, data intact.""" + self.logger.info("=== Phase: Validate migration results ===") + + # 1. Device should be in a migrated/failed state + final_status = self._timing.get("device_final_status", "unknown") + assert final_status in ("failed_and_migrated", "failed"), ( + f"Device {self._target_device_id} has unexpected final status: " + f"{final_status} (expected failed_and_migrated or failed)" + ) + self.logger.info( + f"Device {self._target_device_id} status: {final_status}" + ) + + # 2. All storage nodes should still be online and healthy + storage_nodes = self.sbcli_utils.get_storage_nodes() + for node in storage_nodes["results"]: + assert node["status"] == "online", ( + f"Node {node['id']} is not online (status={node['status']})" + ) + assert node["health_check"], ( + f"Node {node['id']} health check failed" + ) + self.logger.info( + f"All {len(storage_nodes['results'])} storage nodes online and healthy" + ) + + # 3. Other devices on target node should still be online + devices = self.sbcli_utils.get_device_details(self._target_node_id) + for d in devices: + if d["id"] == self._target_device_id: + continue + assert d["status"] == "online", ( + f"Non-target device {d['id']} on target node has " + f"unexpected status: {d['status']}" + ) + self.logger.info("All non-target devices remain online") + + # 4. Data integrity checks (NoLoad only — UnderLoad is checked after FIO completes) + if not self._with_io_load: + self._phase_verify_checksums() + def _phase_stop_io_load(self): - self.logger.info("=== Phase: Stop IO load ===") + """Kill remaining FIO processes (failure path only). + + On the success path, FIO completes naturally via + ``_phase_wait_fio_completion``. This method runs in the + ``finally`` block to ensure cleanup if the test failed early. + """ + self.logger.info("=== Phase: Stop IO load (cleanup) ===") client = self.fio_node[0] self.ssh_obj.exec_command(client, "pkill -f fio || true") for t in self._load_fio_threads: t.join(timeout=30) self.logger.info("IO load stopped") + # ── Phase: recover failed device ───────────────────────────────────────── + + def _phase_recover_device(self): + """Create a new device from the failed one and add it back. + + Runs in the finally block so it executes even if the test fails. + + Steps: + 1. ``sbctl sn new-device-from-failed `` → new device ID + 2. ``sbctl sn add-device `` + 3. Wait for ``new_device_migration`` tasks to complete + """ + if not self._target_device_id: + return + self.logger.info( + f"=== Phase: Recover device {self._target_device_id} ===" + ) + mgmt_ip = self.mgmt_nodes[0] + + # Step 1: create new device from failed device + try: + cmd = ( + f"{self.base_cmd} sn new-device-from-failed " + f"{self._target_device_id}" + ) + self.logger.info(f"Creating new device from failed: {cmd}") + result = self.ssh_obj.exec_command(mgmt_ip, cmd) + result_str = result[0] if isinstance(result, tuple) else str(result) + result_str = result_str.strip() + self.logger.info(f"new-device-from-failed result: {result_str}") + + # Check for "already added back" — device was recovered previously + if "already added back from failed" in result_str.lower(): + self.logger.info( + "Device was already recovered from a previous run, " + "skipping add-device step" + ) + return + + # Check for other errors in output + if "error" in result_str.lower() and "new device id:" not in result_str.lower(): + self.logger.error( + f"new-device-from-failed returned error: {result_str}" + ) + return + + # The last line of successful output is the bare UUID + # e.g. "5ab70b74-c8c5-4e24-b76e-dd64bdcfa39d" + new_device_id = result_str.strip().split("\n")[-1].strip() + # Validate it looks like a UUID (8-4-4-4-12 hex) + import re + if not re.match( + r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', + new_device_id + ): + self.logger.error( + f"Could not parse valid device UUID from output. " + f"Got: '{new_device_id}', full output: {result_str}" + ) + return + self.logger.info(f"New device ID: {new_device_id}") + except Exception as exc: + self.logger.error(f"new-device-from-failed failed: {exc}") + return + + # Step 2: add the new device + try: + cmd = f"{self.base_cmd} -d sn add-device {new_device_id}" + self.logger.info(f"Adding new device: {cmd}") + result = self.ssh_obj.exec_command(mgmt_ip, cmd) + self.logger.info(f"add-device result: {result}") + sleep_n_sec(5) + except Exception as exc: + self.logger.error(f"add-device failed: {exc}") + return + + # Step 3: wait for new_device_migration tasks to complete + try: + self._wait_new_device_migration( + new_device_id, timeout=self.MIGRATION_TIMEOUT + ) + self.logger.info( + f"Device recovery complete — new device {new_device_id} online" + ) + except Exception as exc: + self.logger.warning( + f"new_device_migration did not complete: {exc}" + ) + + def _wait_new_device_migration(self, new_device_id, timeout=3600): + """Wait for all new_device_migration tasks for *new_device_id* to finish.""" + self.logger.info( + f"Waiting for new_device_migration tasks for {new_device_id} ..." + ) + start = time.time() + while time.time() - start < timeout: + try: + tasks = self.sbcli_utils.list_migration_tasks( + self.sbcli_utils.cluster_id + ) + active = [ + t for t in tasks.get("results", []) + if t.get("function_name") == "new_device_migration" + and new_device_id in str(t.get("target_id", "")) + and t.get("status") not in ("done", "cancelled", "error") + ] + if not active: + elapsed = time.time() - start + self.logger.info( + f"All new_device_migration tasks complete " + f"in {elapsed:.1f}s" + ) + return elapsed + self.logger.info( + f"Waiting for {len(active)} new_device_migration " + f"task(s) ..." + ) + except Exception as exc: + self.logger.warning( + f"Error checking migration tasks: {exc}" + ) + sleep_n_sec(10) + self.logger.warning( + f"new_device_migration not complete after {timeout}s" + ) + # ── Cleanup ────────────────────────────────────────────────────────────── def _phase_cleanup(self): @@ -380,12 +856,14 @@ def _print_migration_summary(self): self.logger.info(" DEVICE FAILURE MIGRATION SUMMARY") self.logger.info("=" * 70) self.logger.info(f" Test class: {self.__class__.__name__}") + self.logger.info(f" Failure mode: {self._failure_mode}") self.logger.info(f" IO load: {'YES' if self._with_io_load else 'NO'}") self.logger.info(f" Target node: {self._target_node_id}") self.logger.info(f" Target device: {self._target_device_id}") self.logger.info(f" Fill target: {self.FILL_PERCENT}%") self.logger.info(f" Lvols on target: {len(self._lvols_on_target)}") self.logger.info(f" Lvols on others: {len(self._lvols_on_others)}") + self.logger.info(f" Result: {'PASSED' if self._test_passed else 'FAILED'}") self.logger.info("-" * 70) for key, val in self._timing.items(): if isinstance(val, float): @@ -409,12 +887,13 @@ def _write_timing_json(self): report = { "test_class": self.__class__.__name__, "timestamp": datetime.now(timezone.utc).isoformat(), - "status": "passed", + "status": "passed" if self._test_passed else "failed", "geometry": {"ndcs": self.ndcs, "npcs": self.npcs}, "config": { "fill_percent": self.FILL_PERCENT, "lvol_size": self.LVOL_SIZE, "with_io_load": self._with_io_load, + "failure_mode": self._failure_mode, "target_node": self._target_node_id, "target_device": self._target_device_id, "lvols_on_target": len(self._lvols_on_target), @@ -491,6 +970,7 @@ def _generate_charts(self): plt.suptitle( f"{class_name}\n" f"IO load: {'YES' if self._with_io_load else 'NO'} | " + f"Failure: {self._failure_mode} | " f"Fill: {self.FILL_PERCENT}% | " f"Lvols: {len(self._lvols_on_target)} target + " f"{len(self._lvols_on_others)} other", @@ -547,11 +1027,11 @@ def _parse_size(size_str): # ═══════════════════════════════════════════════════════════════════════════════ -# Concrete test classes +# Docker concrete test classes (sbcli + SSH FIO) # ═══════════════════════════════════════════════════════════════════════════════ -class DeviceFailureMigrationNoLoad(_DeviceFailureMigrationBase, TestLvolHACluster): - """Fill device to 65 %, fail it, run migration WITHOUT IO load. +class DeviceFailureMigrationNoLoadDocker(_DeviceFailureMigrationBase, TestLvolHACluster): + """Fill device to 65 %, fail it via API, run migration WITHOUT IO load. Measures: setup time, fill time, device remove time, migration time. """ @@ -567,8 +1047,8 @@ def run(self): self._run_migration_test(with_io_load=False) -class DeviceFailureMigrationUnderLoad(_DeviceFailureMigrationBase, TestLvolHACluster): - """Fill device to 65 %, start IO on all nodes, fail device, migrate UNDER LOAD. +class DeviceFailureMigrationUnderLoadDocker(_DeviceFailureMigrationBase, TestLvolHACluster): + """Fill device to 65 %, start IO on all nodes, fail device via API, migrate UNDER LOAD. Measures: setup time, fill time, device remove time, migration time. IO errors during migration are logged but do not fail the test. @@ -583,3 +1063,622 @@ def __init__(self, **kwargs): def run(self): self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) self._run_migration_test(with_io_load=True) + + +class DeviceFailureMigrationPCIeNoLoadDocker(_DeviceFailureMigrationBase, TestLvolHACluster): + """Fill device to 65 %, remove via PCIe sysfs, run migration WITHOUT IO load. + + Uses physical PCIe hot-unplug (/sys/bus/pci/devices//remove) instead + of the control-plane API. After migration, rescans PCI bus to restore device. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.logger = setup_logger(__name__) + self._init_migration_state() + self.test_name = "device_failure_migration_pcie_no_load" + + def run(self): + self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + self._run_migration_test(with_io_load=False, failure_mode="pcie") + + +class DeviceFailureMigrationPCIeUnderLoadDocker(_DeviceFailureMigrationBase, TestLvolHACluster): + """Fill device to 65 %, start IO, remove via PCIe sysfs, migrate UNDER LOAD. + + Uses physical PCIe hot-unplug (/sys/bus/pci/devices//remove) instead + of the control-plane API. After migration, rescans PCI bus to restore device. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.logger = setup_logger(__name__) + self._init_migration_state() + self.test_name = "device_failure_migration_pcie_under_load" + + def run(self): + self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + self._run_migration_test(with_io_load=True, failure_mode="pcie") + + +# ═══════════════════════════════════════════════════════════════════════════════ +# K8s-native concrete test classes (PVC + FIO K8s Jobs) +# ═══════════════════════════════════════════════════════════════════════════════ + +from stress_test.continuous_k8s_native_failover import K8sNativeFailoverTest # noqa: E402 + + +class _DeviceFailureMigrationK8s(_DeviceFailureMigrationBase): + """K8s-native overrides for setup, fill, IO load, and cleanup phases. + + Uses PVCs for storage provisioning and K8s FIO Jobs for workload + generation instead of sbcli + SSH. + + The device failure and migration phases are identical to Docker + (they operate at the control-plane / sysfs level, not the data path). + """ + + # K8s-specific sizing + K8S_PVC_SIZE = "50Gi" + K8S_FIO_FILL_SIZE = "45G" + K8S_FIO_LOAD_SIZE = "1G" + + def _init_migration_state(self): + super()._init_migration_state() + self._pvc_details = {} # pvc_name -> {job_name, configmap_name, node_id} + self._fill_jobs = [] # (job_name, configmap_name) for fill FIO jobs + self._load_jobs = [] # (job_name, configmap_name) for load FIO jobs + + # ── Phase 1 override: PVC-based setup ──────────────────────────────────── + + def _phase_setup_pool_and_lvols(self): + self.logger.info("=== Phase: Setup pool and PVCs (K8s) ===") + t0 = time.time() + + # Get storage nodes + storage_nodes = self.sbcli_utils.get_storage_nodes() + for r in storage_nodes["results"]: + self._sn_nodes.append(r["uuid"]) + self.node_vs_pvc[r["uuid"]] = [] + + if len(self._sn_nodes) < 1: + raise RuntimeError("No storage nodes found") + + # Pick target node and device + self._target_node_id = self._sn_nodes[0] + devices = self.sbcli_utils.get_device_details(self._target_node_id) + if not devices: + raise RuntimeError( + f"No devices found on target node {self._target_node_id}" + ) + # Filter for online devices only — old failed_and_migrated devices + # remain in the list after recovery and must be skipped + online_devices = [d for d in devices if d.get("status") == "online"] + if not online_devices: + raise RuntimeError( + f"No online devices found on target node {self._target_node_id}. " + f"Device statuses: {[d.get('status') for d in devices]}" + ) + self._target_device_info = online_devices[0] + self._target_device_id = online_devices[0]["id"] + self.logger.info( + f"Target node: {self._target_node_id}, " + f"Target device: {self._target_device_id} " + f"(selected from {len(online_devices)} online / {len(devices)} total devices)" + ) + + # Get node capacity to calculate how many PVCs to create + capacity = self.sbcli_utils.get_node_capacity(self._target_node_id) + if isinstance(capacity, list): + capacity = capacity[0] if capacity else {} + size_total_bytes = capacity.get("size_total", 0) + if isinstance(size_total_bytes, str): + size_total_bytes = self._parse_size(size_total_bytes) + target_bytes = int(size_total_bytes * self.FILL_PERCENT / 100) + lvol_bytes = self._parse_size(self.LVOL_SIZE) + num_lvols = max(1, math.ceil(target_bytes / lvol_bytes)) + self.logger.info( + f"Node capacity: {size_total_bytes} bytes, " + f"target fill: {target_bytes} bytes, " + f"creating {num_lvols} PVCs of {self.K8S_PVC_SIZE}" + ) + + # Create PVCs pinned to target node + for i in range(num_lvols): + pvc_name = f"mig-target-{_rand_seq(4)}-{i}" + self._create_pvc(pvc_name, self._target_node_id) + self._lvols_on_target.append(pvc_name) + + # Create 1 PVC per OTHER node (for IO load variant) + other_nodes = [n for n in self._sn_nodes if n != self._target_node_id] + for idx, node_id in enumerate(other_nodes): + pvc_name = f"mig-other-{_rand_seq(4)}-{idx}" + self._create_pvc(pvc_name, node_id) + self._lvols_on_others.append(pvc_name) + + self._timing["setup_duration"] = time.time() - t0 + self.logger.info( + f"Setup complete: {len(self._lvols_on_target)} target PVCs, " + f"{len(self._lvols_on_others)} other PVCs " + f"({self._timing['setup_duration']:.1f}s)" + ) + + def _create_pvc(self, pvc_name, node_id): + """Create a PVC pinned to a specific storage node.""" + self.k8s_utils.create_pvc( + pvc_name, self.K8S_PVC_SIZE, self.STORAGE_CLASS_NAME, + node_id=node_id, + ) + self.k8s_utils.wait_pvc_bound(pvc_name, timeout=300) + sleep_n_sec(2) + + node_id_actual = self._get_pvc_node_id(pvc_name) or node_id + self._pvc_details[pvc_name] = { + "job_name": None, + "configmap_name": None, + "node_id": node_id_actual, + } + self.node_vs_pvc.setdefault(node_id_actual, []).append(pvc_name) + self.logger.info(f"PVC {pvc_name} created and bound (node={node_id_actual})") + + # ── Phase 2 override: fill via K8s FIO Jobs ────────────────────────────── + + def _phase_fill_devices(self): + self.logger.info( + f"=== Phase: Fill target device to {self.FILL_PERCENT}% (K8s FIO Jobs) ===" + ) + t0 = time.time() + + # Create fill FIO jobs for target PVCs + for pvc_name in self._lvols_on_target: + job_name = f"fio-fill-{pvc_name}" + cm_name = f"fiocfg-fill-{pvc_name}" + run_id = _rand_seq(6) + + fio_config = ( + f"[global]\n" + f"name=fill-{pvc_name}\n" + f"filename_format=/spdkvol/fio-fill-{run_id}.$jobnum\n" + f"rw=write\n" + f"bs={self.FIO_FILL_BS}\n" + f"iodepth=1\n" + f"direct=1\n" + f"ioengine=libaio\n" + f"size={self.K8S_FIO_FILL_SIZE}\n" + f"numjobs=1\n" + f"group_reporting\n" + f"\n" + f"[job1]\n" + ) + + try: + self.k8s_utils.create_fio_job( + job_name, pvc_name, cm_name, fio_config, + image=self.FIO_IMAGE, + ) + self._fill_jobs.append((job_name, cm_name)) + self.logger.info(f"Fill FIO job {job_name} created for {pvc_name}") + except Exception as exc: + self.logger.error(f"Fill FIO job failed for {pvc_name}: {exc}") + + # Wait for fill jobs to complete + self.logger.info(f"Waiting for {len(self._fill_jobs)} fill jobs to complete ...") + for job_name, _ in self._fill_jobs: + try: + self.k8s_utils.wait_job_complete(job_name, timeout=3600) + self.logger.info(f"Fill job {job_name} completed") + except Exception as exc: + self.logger.warning(f"Fill job {job_name} did not complete: {exc}") + + # Verify fill level + sleep_n_sec(5) + capacity = self.sbcli_utils.get_node_capacity(self._target_node_id) + if isinstance(capacity, list): + capacity = capacity[0] if capacity else {} + util = capacity.get("size_util", 0) + self.logger.info(f"Post-fill device utilisation: {util}%") + + # Cleanup fill jobs + for job_name, cm_name in self._fill_jobs: + try: + self.k8s_utils.delete_resource("job", job_name) + self.k8s_utils.delete_resource("configmap", cm_name) + except Exception: + pass + + self._timing["fill_duration"] = time.time() - t0 + self.logger.info( + f"Fill complete ({self._timing['fill_duration']:.1f}s)" + ) + + # ── Phase 3 override: IO load via K8s FIO Jobs ─────────────────────────── + + def _phase_start_io_load(self): + self.logger.info("=== Phase: Start IO load on all nodes (K8s FIO Jobs) ===") + all_pvc_names = self._lvols_on_target + self._lvols_on_others + + for pvc_name in all_pvc_names: + job_name = f"fio-load-{pvc_name}" + cm_name = f"fiocfg-load-{pvc_name}" + run_id = _rand_seq(6) + + fio_config = ( + f"[global]\n" + f"name=load-{pvc_name}\n" + f"filename_format=/spdkvol/fio-load-{run_id}.$jobnum\n" + f"rw=randrw\n" + f"rwmixread=50\n" + f"bs={self.FIO_LOAD_BS}\n" + f"iodepth={self.FIO_LOAD_IODEPTH}\n" + f"direct=1\n" + f"ioengine=libaio\n" + f"size={self.K8S_FIO_LOAD_SIZE}\n" + f"numjobs={self.FIO_LOAD_NUMJOBS}\n" + f"time_based\n" + f"runtime={self.FIO_LOAD_RUNTIME}\n" + f"verify=md5\n" + f"verify_dump=1\n" + f"verify_fatal=1\n" + f"verify_backlog=4096\n" + f"group_reporting\n" + f"\n" + f"[job1]\n" + ) + + try: + node_id = self._pvc_details.get(pvc_name, {}).get("node_id") + avoid = ( + self._get_k8s_node_for_storage_node(node_id) + if node_id else None + ) + self.k8s_utils.create_fio_job( + job_name, pvc_name, cm_name, fio_config, + image=self.FIO_IMAGE, + avoid_node=avoid, + ) + self._load_jobs.append((job_name, cm_name)) + self._pvc_details[pvc_name]["job_name"] = job_name + self._pvc_details[pvc_name]["configmap_name"] = cm_name + self.logger.info(f"Load FIO job {job_name} created for {pvc_name}") + except Exception as exc: + self.logger.error(f"Load FIO job failed for {pvc_name}: {exc}") + + sleep_n_sec(15) # let IO ramp up + self.logger.info( + f"IO load started: {len(self._load_jobs)} FIO jobs" + ) + + # ── Phase 2b override: checksums via K8s utility pods ─────────────────── + + def _phase_compute_checksums(self): + """Compute MD5 checksums via utility pods on target PVCs.""" + self.logger.info("=== Phase: Compute pre-migration checksums (K8s) ===") + self._pre_migration_checksums = {} + self._checksum_utility_pods = [] + + for pvc_name in self._lvols_on_target: + pod_name = f"cksum-pre-{pvc_name}" + try: + self.k8s_utils.create_utility_pod(pod_name, pvc_name) + self._checksum_utility_pods.append(pod_name) + self.k8s_utils.wait_pod_running(pod_name) + files = self.k8s_utils.find_files_in_pvc(pod_name) + if files: + checksums = self.k8s_utils.generate_checksums_in_pvc( + pod_name, files + ) + self._pre_migration_checksums[pvc_name] = checksums + self.logger.info( + f"Captured {len(checksums)} file checksums for {pvc_name}" + ) + else: + self.logger.warning( + f"No files found in PVC {pvc_name} for checksum" + ) + except Exception as exc: + self.logger.warning( + f"Checksum capture failed for {pvc_name}: {exc}" + ) + finally: + try: + self.k8s_utils.delete_pod(pod_name) + except Exception: + pass + + self.logger.info( + f"Pre-migration checksums captured for " + f"{len(self._pre_migration_checksums)} PVCs" + ) + + def _phase_verify_checksums(self): + """Verify MD5 checksums via utility pods on target PVCs.""" + self.logger.info("=== Verifying post-migration data integrity (K8s) ===") + mismatches = 0 + + for pvc_name, expected in self._pre_migration_checksums.items(): + pod_name = f"cksum-post-{pvc_name}" + try: + self.k8s_utils.create_utility_pod(pod_name, pvc_name) + self.k8s_utils.wait_pod_running(pod_name) + actual = self.k8s_utils.generate_checksums_in_pvc( + pod_name, + self.k8s_utils.find_files_in_pvc(pod_name), + ) + # Compare by filename (basename) + expected_by_name = { + os.path.basename(k): v for k, v in expected.items() + } + actual_by_name = { + os.path.basename(k): v for k, v in actual.items() + } + for fname, cksum in expected_by_name.items(): + if fname not in actual_by_name: + self.logger.error( + f"File {fname} missing in PVC {pvc_name} after migration" + ) + mismatches += 1 + elif actual_by_name[fname] != cksum: + self.logger.error( + f"Checksum MISMATCH for {fname} in {pvc_name}: " + f"expected {cksum}, got {actual_by_name[fname]}" + ) + mismatches += 1 + else: + self.logger.info(f"Checksum OK: {fname} in {pvc_name}") + except Exception as exc: + self.logger.error( + f"Checksum verification error for {pvc_name}: {exc}" + ) + mismatches += 1 + finally: + try: + self.k8s_utils.delete_pod(pod_name) + except Exception: + pass + + assert mismatches == 0, ( + f"Data integrity check failed: {mismatches} file(s) had " + f"checksum mismatches after migration" + ) + self.logger.info( + "All post-migration checksums verified — data integrity OK" + ) + + def _phase_validate_fio(self): + """Check FIO K8s Job status and pod logs for errors.""" + self.logger.info("=== Verifying FIO jobs for errors (K8s) ===") + target_errors = [] + other_errors = [] + + for job_name, _ in self._load_jobs: + # Determine if this job is on a target or other PVC + pvc_name = job_name.replace("fio-load-", "", 1) + is_target = pvc_name in self._lvols_on_target + try: + pod_name = self.k8s_utils.get_job_pod_name(job_name) + if not pod_name: + self.logger.warning( + f"Could not find pod for FIO job {job_name}" + ) + continue + logs = self.k8s_utils.get_pod_logs(pod_name, tail=500) + fail_words = ["error", "fail", "interrupt", "terminate"] + logs_lower = logs.lower() if logs else "" + found = [w for w in fail_words if w in logs_lower] + if found: + msg = f"{job_name} ({pvc_name}): pod logs contain {found}" + if is_target: + target_errors.append(msg) + self.logger.warning( + f"[expected] FIO error on failed-device PVC " + f"{pvc_name}: {found}" + ) + else: + other_errors.append(msg) + self.logger.error( + f"FIO error on non-target PVC {pvc_name}: {found}" + ) + else: + self.logger.info(f"FIO job {job_name}: no errors") + except Exception as exc: + self.logger.warning( + f"Could not check FIO job {job_name}: {exc}" + ) + + if target_errors: + self.logger.warning( + f"{len(target_errors)} FIO error(s) on target-device PVCs " + f"(expected during device migration)" + ) + if other_errors: + self.logger.error( + f"{len(other_errors)} FIO error(s) on non-target PVCs: " + f"{other_errors}" + ) + + # ── Phase: wait for FIO to complete naturally (K8s) ───────────────────── + + def _phase_wait_fio_completion(self): + """Wait for FIO K8s Jobs to complete naturally.""" + self.logger.info( + "=== Phase: Waiting for FIO K8s Jobs to complete naturally ===" + ) + t0 = time.time() + fio_timeout = self.FIO_LOAD_RUNTIME + 300 + + for job_name, _ in self._load_jobs: + try: + status = self.k8s_utils.wait_job_complete( + job_name, timeout=fio_timeout + ) + self.logger.info( + f"FIO job {job_name} completed: {status}" + ) + except Exception as exc: + self.logger.warning( + f"FIO job {job_name} did not complete: {exc}" + ) + + elapsed = time.time() - t0 + self._timing["fio_completion_duration"] = elapsed + self.logger.info( + f"All FIO jobs finished ({elapsed:.1f}s)" + ) + + # ── Phase 5 override: stop IO load (K8s) ───────────────────────────────── + + def _phase_stop_io_load(self): + """Delete remaining FIO jobs (failure path only).""" + self.logger.info("=== Phase: Stop IO load (K8s cleanup) ===") + for job_name, cm_name in self._load_jobs: + try: + self.k8s_utils.delete_resource("job", job_name) + self.k8s_utils.delete_resource("configmap", cm_name) + except Exception: + pass + self.logger.info("IO load stopped (K8s jobs deleted)") + + # ── Cleanup override (K8s) ─────────────────────────────────────────────── + + def _phase_cleanup(self): + self.logger.info("=== Phase: Cleanup (K8s) ===") + try: + # Delete all FIO jobs and configmaps + for job_name, cm_name in self._fill_jobs + self._load_jobs: + try: + self.k8s_utils.delete_resource("job", job_name) + self.k8s_utils.delete_resource("configmap", cm_name) + except Exception: + pass + + # Delete PVCs + all_pvcs = self._lvols_on_target + self._lvols_on_others + for pvc_name in all_pvcs: + try: + self.k8s_utils.delete_pvc(pvc_name) + except Exception: + pass + sleep_n_sec(10) + + # Delete storage pool + self.sbcli_utils.delete_all_storage_pools() + except Exception as e: + self.logger.error(f"Cleanup error: {e}") + + +# ── K8s concrete classes ───────────────────────────────────────────────────── + +class DeviceFailureMigrationNoLoadK8s(_DeviceFailureMigrationK8s, K8sNativeFailoverTest): + """K8s-native: fill device to 65 %, fail via API, run migration WITHOUT IO load.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.logger = setup_logger(__name__) + self._init_migration_state() + self.test_name = "device_failure_migration_no_load_k8s" + + def run(self): + storage_nodes = self.sbcli_utils.get_storage_nodes() + for result in storage_nodes["results"]: + self.sn_nodes.append(result["uuid"]) + self.node_vs_pvc[result["uuid"]] = [] + + pool_test = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + self.pool_name = self.pool_name if pool_test == self.pool_name else pool_test + + cluster_id = self.cluster_id or "" + self.k8s_utils.create_storage_class( + name=self.STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=self.pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + ) + self._run_migration_test(with_io_load=False, failure_mode="api") + + +class DeviceFailureMigrationUnderLoadK8s(_DeviceFailureMigrationK8s, K8sNativeFailoverTest): + """K8s-native: fill device to 65 %, start IO, fail via API, migrate UNDER LOAD.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.logger = setup_logger(__name__) + self._init_migration_state() + self.test_name = "device_failure_migration_under_load_k8s" + + def run(self): + storage_nodes = self.sbcli_utils.get_storage_nodes() + for result in storage_nodes["results"]: + self.sn_nodes.append(result["uuid"]) + self.node_vs_pvc[result["uuid"]] = [] + + pool_test = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + self.pool_name = self.pool_name if pool_test == self.pool_name else pool_test + + cluster_id = self.cluster_id or "" + self.k8s_utils.create_storage_class( + name=self.STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=self.pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + ) + self._run_migration_test(with_io_load=True, failure_mode="api") + + +class DeviceFailureMigrationPCIeNoLoadK8s(_DeviceFailureMigrationK8s, K8sNativeFailoverTest): + """K8s-native: fill device to 65 %, remove via PCIe sysfs, migrate WITHOUT IO load.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.logger = setup_logger(__name__) + self._init_migration_state() + self.test_name = "device_failure_migration_pcie_no_load_k8s" + + def run(self): + storage_nodes = self.sbcli_utils.get_storage_nodes() + for result in storage_nodes["results"]: + self.sn_nodes.append(result["uuid"]) + self.node_vs_pvc[result["uuid"]] = [] + + pool_test = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + self.pool_name = self.pool_name if pool_test == self.pool_name else pool_test + + cluster_id = self.cluster_id or "" + self.k8s_utils.create_storage_class( + name=self.STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=self.pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + ) + self._run_migration_test(with_io_load=False, failure_mode="pcie") + + +class DeviceFailureMigrationPCIeUnderLoadK8s(_DeviceFailureMigrationK8s, K8sNativeFailoverTest): + """K8s-native: fill device to 65 %, start IO, remove via PCIe sysfs, migrate UNDER LOAD.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.logger = setup_logger(__name__) + self._init_migration_state() + self.test_name = "device_failure_migration_pcie_under_load_k8s" + + def run(self): + storage_nodes = self.sbcli_utils.get_storage_nodes() + for result in storage_nodes["results"]: + self.sn_nodes.append(result["uuid"]) + self.node_vs_pvc[result["uuid"]] = [] + + pool_test = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + self.pool_name = self.pool_name if pool_test == self.pool_name else pool_test + + cluster_id = self.cluster_id or "" + self.k8s_utils.create_storage_class( + name=self.STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=self.pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + ) + self._run_migration_test(with_io_load=True, failure_mode="pcie") diff --git a/e2e/stress_test/large_scale_lvol_stress.py b/e2e/stress_test/large_scale_lvol_stress.py index 8d959eef0..be646c1d3 100755 --- a/e2e/stress_test/large_scale_lvol_stress.py +++ b/e2e/stress_test/large_scale_lvol_stress.py @@ -22,6 +22,7 @@ from __future__ import annotations +import json as _json import os import random import re @@ -69,6 +70,7 @@ class _LargeScaleMixin: # ── Parallelism ────────────────────────────────────────────────────────── MAX_WORKERS = 20 BATCH_SIZE = 50 + PARALLEL_PARENTS = 5 # concurrent parents/subsystems during creation # ── Internal state ─────────────────────────────────────────────────────── _phase_durations: dict @@ -86,6 +88,7 @@ def _init_mixin_state(self): def _run_large_scale_test(self): total = self.NUM_SUBSYSTEMS * self.NAMESPACES_PER_SUBSYSTEM self._init_mixin_state() + self._creation_partial = False self.logger.info( f"=== Starting {self.__class__.__name__}: " f"{self.NUM_SUBSYSTEMS} subsystems × " @@ -93,8 +96,30 @@ def _run_large_scale_test(self): ) try: t0 = time.time() - self._phase_create_subsystems() - self._phase_durations["create"] = round(time.time() - t0, 1) + try: + self._phase_create_subsystems() + except Exception as create_err: + self._creation_partial = True + self._phase_durations["create"] = round(time.time() - t0, 1) + created = self._count_created_resources() + self.logger.error( + f"[create] CREATION FAILED after {created} resources: " + f"{create_err}" + ) + self.logger.info( + f"[create] *** Max resources created: {created} / " + f"{total} ({created * 100 // max(total, 1)}%) ***" + ) + if created == 0: + raise RuntimeError( + f"No resources created — cannot proceed: {create_err}" + ) + self.logger.info( + f"[create] Proceeding with FIO on {created} existing " + f"resources" + ) + else: + self._phase_durations["create"] = round(time.time() - t0, 1) t0 = time.time() self._phase_start_fio() @@ -120,6 +145,10 @@ def _run_large_scale_test(self): f"Large-scale test had {self._fio_failures} FIO failures" ) + def _count_created_resources(self): + """Count resources available for FIO — override in subclass.""" + return self._total_created + # ── Steady state (shared) ──────────────────────────────────────────────── def _phase_steady_state(self): @@ -147,6 +176,164 @@ def _phase_validate(self): """Override in subclass for mode-specific validation.""" self.logger.info("=== Validation phase ===") + # ── FIO log collection helpers (shared) ────────────────────────────────── + + def _save_fio_pod_logs(self, job_name: str, resource_name: str, + pvc_name: str = None): + """Save FIO pod logs and performance data to local log directory.""" + try: + pod_name = self.k8s_utils.get_job_pod_name(job_name) + if not pod_name: + return + logs = self.k8s_utils.get_pod_logs(pod_name, tail=2000) + if logs: + log_file = os.path.join( + self.log_path, f"{resource_name}_fio.log" + ) + with open(log_file, "w") as f: + f.write(logs) + self.logger.info( + f"[save_fio] Saved logs for {resource_name}" + ) + self._copy_fio_perf_logs( + pod_name, resource_name, pvc_name=pvc_name + ) + except Exception as exc: + self.logger.warning( + f"[save_fio] Could not save logs for {resource_name}: {exc}" + ) + + def _list_fio_perf_files(self, pod_name: str, ns: str, + container: str = None) -> list: + """List FIO-generated perf files in /spdkvol/ of a running pod.""" + container_flag = f"-c {container} " if container else "" + try: + file_list, _ = self.k8s_utils._exec_kubectl( + f"kubectl exec {container_flag}{pod_name} -n {ns} -- " + f"find /spdkvol/ -maxdepth 1 " + f"\\( -name '*fio*.log' -o -name '*-iolog.log' " + f"-o -name '*_lat.*' " + f"-o -name '*_bw.*' -o -name '*_iops.*' " + f"-o -name '*_clat.*' " + f"-o -name '*_slat.*' \\) " + f"2>/dev/null || true", + supress_logs=True, + ) + return [ + f.strip() for f in file_list.strip().splitlines() + if f.strip() + ] + except Exception: + return [] + + def _create_copier_pod(self, copier_name: str, pvc_name: str, + node_name: str, ns: str): + """Create a lightweight busybox pod mounting a PVC for log copy.""" + yaml_spec = ( + f"apiVersion: v1\n" + f"kind: Pod\n" + f"metadata:\n" + f" name: {copier_name}\n" + f" namespace: {ns}\n" + f" labels:\n" + f" app: fio-copier\n" + f"spec:\n" + f" nodeName: {node_name}\n" + f" tolerations:\n" + f" - operator: Exists\n" + f" containers:\n" + f" - name: copier\n" + f" image: busybox\n" + f" command: ['sleep', '300']\n" + f" volumeMounts:\n" + f" - mountPath: /spdkvol\n" + f" name: vol\n" + f" volumes:\n" + f" - name: vol\n" + f" persistentVolumeClaim:\n" + f" claimName: {pvc_name}\n" + f" restartPolicy: Never\n" + ) + self.k8s_utils._exec_kubectl( + f"cat <<'COPIER_EOF' | kubectl apply -f -\n" + f"{yaml_spec}COPIER_EOF", + ) + self.k8s_utils._exec_kubectl( + f"kubectl wait pod/{copier_name} -n {ns} " + f"--for=condition=Ready --timeout=120s", + ) + + def _copy_fio_perf_logs(self, pod_name: str, resource_name: str, + pvc_name: str = None): + """Copy FIO perf log files from /spdkvol/ in the pod to local dir.""" + ns = self.k8s_utils.namespace + perf_dir = os.path.join(self.log_path, f"{resource_name}_perf") + copier_name = None + copy_from_pod = pod_name + container = None + + try: + files = self._list_fio_perf_files(pod_name, ns) + + if not files and pvc_name: + node_name = self.k8s_utils.get_pod_node_name(pod_name) + if node_name: + copier_name = f"fio-cp-{_rand_seq(8)}" + self.logger.info( + f"[perf_copy] Creating copier pod {copier_name} " + f"on {node_name} for PVC {pvc_name}" + ) + try: + self._create_copier_pod( + copier_name, pvc_name, node_name, ns + ) + files = self._list_fio_perf_files( + copier_name, ns, container="copier" + ) + copy_from_pod = copier_name + container = "copier" + except Exception as exc: + self.logger.warning( + f"[perf_copy] Copier pod failed for " + f"{resource_name}: {exc}" + ) + files = [] + + if not files: + return + + os.makedirs(perf_dir, exist_ok=True) + container_flag = f" -c {container}" if container else "" + for src_path in files: + fname = os.path.basename(src_path) + dest = os.path.join(perf_dir, fname) + self.k8s_utils._exec_kubectl( + f"kubectl cp " + f"{ns}/{copy_from_pod}:{src_path} {dest}" + f"{container_flag} " + f"2>/dev/null || true", + supress_logs=True, + ) + self.logger.info( + f"[perf_copy] Copied {len(files)} perf log(s) " + f"for {resource_name}" + ) + except Exception as exc: + self.logger.warning( + f"[perf_copy] Could not copy perf logs for " + f"{resource_name}: {exc}" + ) + finally: + if copier_name: + try: + self.k8s_utils._exec_kubectl( + f"kubectl delete pod {copier_name} -n {ns} " + f"--force --grace-period=0 2>/dev/null || true", + supress_logs=True, + ) + except Exception: + pass + # ── Summary (shared) ───────────────────────────────────────────────────── def _print_large_scale_summary(self): @@ -326,6 +513,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self.test_name = "large_scale_lvol_docker" self.fio_threads: list[threading.Thread] = [] + self.sn_nodes: list[str] = [] # parent_name → {id, client, ctrl_dev, nqn, devices: [dev_path]} self._parent_registry: dict[str, dict] = {} @@ -386,7 +574,12 @@ def _wait_until_namespace_device_gone(self, node: str, ctrl_dev: str, # ── run() ──────────────────────────────────────────────────────────────── def run(self): - self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + actual_pool = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + if actual_pool and actual_pool != self.pool_name: + self.logger.info( + f"[run] Pool name changed: {self.pool_name} -> {actual_pool}" + ) + self.pool_name = actual_pool storage_nodes = self.sbcli_utils.get_storage_nodes() for result in storage_nodes["results"]: self.sn_nodes.append(result["uuid"]) @@ -396,169 +589,217 @@ def run(self): def _phase_create_subsystems(self): self.logger.info("=== Phase: Create Subsystems (Docker) ===") - - # Sub-phase 1: Create 100 parent lvols in parallel + total_expected = self.NUM_SUBSYSTEMS * self.NAMESPACES_PER_SUBSYSTEM self.logger.info( - f"[create] Sub-phase 1: Creating {self.NUM_SUBSYSTEMS} parents" + f"[create] {self.NUM_SUBSYSTEMS} parents × " + f"{self.NAMESPACES_PER_SUBSYSTEM} ns = {total_expected} lvols " + f"(parallel={self.PARALLEL_PARENTS})" ) - parent_items = [] - for i in range(self.NUM_SUBSYSTEMS): - name = f"lss-par-{_rand_seq(6)}-{i:03d}" - parent_items.append({"name": name, "idx": i}) - self._batch_exec(parent_items, self._create_parent, "create_parents") - - parent_count = len(self._parent_registry) - self.logger.info(f"[create] {parent_count} parents created") - if parent_count == 0: - raise RuntimeError("No parents created — cannot continue") - - # Sub-phase 2: NVMe connect all parents + format/mount parent device + # ── Sub-phase 1: Create all parent lvols in parallel ──────────── + parent_names = [ + f"lss-par-{_rand_seq(6)}-{i:03d}" + for i in range(self.NUM_SUBSYSTEMS) + ] self.logger.info( - f"[create] Sub-phase 2: NVMe connecting {parent_count} parents" + f"[create][sub1] Creating {len(parent_names)} parent lvols " + f"(parallel, workers={self.MAX_WORKERS})" ) - parent_names = list(self._parent_registry.keys()) - self._batch_exec( - parent_names, self._connect_parent, "connect_parents" + ok, fail = self._batch_exec( + [{"name": n} for n in parent_names], + self._create_parent, + "create_parents", + ) + if fail > 0: + self._total_created = len(self._device_registry) + raise RuntimeError( + f"[create][sub1] {fail} parent creations failed" + ) + # Verify all parents are registered + for pn in parent_names: + if pn not in self._parent_registry: + raise RuntimeError( + f"[create][sub1] Parent {pn} not in registry after create" + ) + self.logger.info( + f"[create][sub1] All {ok} parents created successfully" ) - connected = sum( - 1 for p in self._parent_registry.values() if p.get("ctrl_dev") + # ── Sub-phase 2: NVMe-connect all parents (sequential) ───────── + # Sequential to avoid device-detection races on same client. + self.logger.info( + f"[create][sub2] Connecting {len(parent_names)} parents " + f"(sequential)" + ) + for idx, pn in enumerate(parent_names): + # Pre-assign client round-robin + self._parent_registry[pn]["client"] = ( + self.fio_node[idx % len(self.fio_node)] + ) + self._connect_parent(pn) + pinfo = self._parent_registry[pn] + if not pinfo.get("ctrl_dev"): + raise RuntimeError( + f"[create][sub2] Parent {pn} NVMe connect failed" + ) + if (idx + 1) % 10 == 0 or idx == len(parent_names) - 1: + self.logger.info( + f"[create][sub2] Connected {idx+1}/" + f"{len(parent_names)}" + ) + self.logger.info( + f"[create][sub2] All {len(parent_names)} parents connected" ) - self.logger.info(f"[create] {connected} parents connected") - # Sub-phase 3: Create namespace children per parent - # (sequential within a parent, parallel across parents) - total_children = (self.NAMESPACES_PER_SUBSYSTEM - 1) * connected + # ── Sub-phase 3: Create children (PARALLEL_PARENTS concurrent) ── self.logger.info( - f"[create] Sub-phase 3: Creating {total_children} namespace " - f"children ({self.NAMESPACES_PER_SUBSYSTEM - 1} per parent)" + f"[create][sub3] Creating children for {len(parent_names)} " + f"parents (parallel, workers={self.PARALLEL_PARENTS})" ) - connected_parents = [ - pname for pname, pinfo in self._parent_registry.items() - if pinfo.get("ctrl_dev") - ] - # Each parent creates 31 children sequentially (~130s each worst case) - self._batch_exec( - connected_parents, + child_timeout = self.NAMESPACES_PER_SUBSYSTEM * 180 + ok, fail = self._batch_exec( + parent_names, self._create_children_for_parent, "create_children", - per_item_timeout=5400, # 90 min per parent + per_item_timeout=child_timeout, + max_workers=self.PARALLEL_PARENTS, ) + if fail > 0: + self._total_created = len(self._device_registry) + raise RuntimeError( + f"[create][sub3] {fail} parent child-creation batches failed" + ) + + # Verify child counts + for pn in parent_names: + children_done = sum( + 1 for c in self._child_registry.values() + if c["parent_name"] == pn + ) + expected = self.NAMESPACES_PER_SUBSYSTEM - 1 + if children_done < expected: + raise RuntimeError( + f"Parent {pn}: only {children_done}/{expected} " + f"children created — aborting" + ) - child_count = len(self._child_registry) self._total_created = len(self._device_registry) self.logger.info( - f"[create] {child_count} children created, " - f"{self._total_created} total devices formatted + mounted" + f"[create] All done: {len(self._parent_registry)} parents, " + f"{len(self._child_registry)} children, " + f"{self._total_created} total devices mounted" ) + def _count_created_resources(self): + """Count devices available for FIO from the device registry.""" + return len(self._device_registry) + def _create_parent(self, params: dict): name = params["name"] + self.sbcli_utils.add_lvol( + lvol_name=name, + pool_name=self.pool_name, + size=self.LVOL_SIZE, + distr_ndcs=self.ndcs, + distr_npcs=self.npcs, + distr_bs=self.bs, + distr_chunk_bs=self.chunk_bs, + max_namespace_per_subsys=self.NAMESPACES_PER_SUBSYSTEM, + retry=3, + ) + sleep_n_sec(2) + lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=name) + if not lvol_id: + raise RuntimeError(f"[create_parent] {name}: ID not found") + # Get the node_id so children can target the same node via host_id + node_id = None try: - self.sbcli_utils.add_lvol( - lvol_name=name, - pool_name=self.pool_name, - size=self.LVOL_SIZE, - distr_ndcs=self.ndcs, - distr_npcs=self.npcs, - distr_bs=self.bs, - distr_chunk_bs=self.chunk_bs, - max_namespace_per_subsys=self.NAMESPACES_PER_SUBSYSTEM, - retry=3, - ) - sleep_n_sec(2) - lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=name) - if not lvol_id: - self.logger.error(f"[create_parent] {name}: ID not found") - return - self._parent_registry[name] = { - "id": lvol_id, - "client": None, - "ctrl_dev": None, - "nqn": None, - "devices": [], - } - self.logger.info(f"[create_parent] {name} -> {lvol_id}") - except Exception as e: - self.logger.error(f"[create_parent] {name} failed: {e}") + details = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id) + if details: + node_id = details[0].get("node_id") + except Exception as ex: + self.logger.warning(f"[create_parent] {name}: could not get node_id: {ex}") + self._parent_registry[name] = { + "id": lvol_id, + "node_id": node_id, + "client": None, + "ctrl_dev": None, + "nqn": None, + "devices": [], + } + self.logger.info(f"[create_parent] {name} -> {lvol_id} (node={node_id})") def _connect_parent(self, parent_name: str): """NVMe-connect parent, detect device, format + mount the parent - namespace (nsid=1).""" + namespace (nsid=1). Raises on any failure.""" pinfo = self._parent_registry.get(parent_name) if not pinfo: - return - try: - connect_ls = self.sbcli_utils.get_lvol_connect_str( - lvol_name=parent_name + raise RuntimeError(f"{parent_name}: not in registry") + + connect_ls = self.sbcli_utils.get_lvol_connect_str( + lvol_name=parent_name + ) + if not connect_ls: + raise RuntimeError( + f"[connect] {parent_name}: no connect strings" ) - if not connect_ls: - self.logger.error( - f"[connect] {parent_name}: no connect strings" - ) - return - # Round-robin across client nodes - client = self.fio_node[ - list(self._parent_registry.keys()).index(parent_name) - % len(self.fio_node) - ] - pinfo["client"] = client + # Use pre-assigned client if set (sub-phase 2), otherwise fall back + if not pinfo.get("client"): + idx = list(self._parent_registry.keys()).index(parent_name) + pinfo["client"] = self.fio_node[idx % len(self.fio_node)] + client = pinfo["client"] - initial_devices = self.ssh_obj.get_devices(node=client) + initial_devices = self.ssh_obj.get_devices(node=client) - for cmd in connect_ls: - self.ssh_obj.exec_command(node=client, command=cmd) - # Extract NQN for later disconnect - nqn_match = re.search(r"-n\s+(nqn\S+)", cmd) - if nqn_match: - pinfo["nqn"] = nqn_match.group(1) + for cmd in connect_ls: + self.ssh_obj.exec_command(node=client, command=cmd) + # Extract NQN for later disconnect + nqn_match = re.search(r"-n\s+(nqn\S+)", cmd) + if nqn_match: + pinfo["nqn"] = nqn_match.group(1) - sleep_n_sec(3) - final_devices = self.ssh_obj.get_devices(node=client) + sleep_n_sec(3) + final_devices = self.ssh_obj.get_devices(node=client) - parent_dev = None - for dev in final_devices: - if dev not in initial_devices: - parent_dev = f"/dev/{dev.strip()}" - break + parent_dev = None + for dev in final_devices: + if dev not in initial_devices: + parent_dev = f"/dev/{dev.strip()}" + break - if not parent_dev: - self.logger.error( - f"[connect] {parent_name}: no new device after connect" - ) - return + if not parent_dev: + raise RuntimeError( + f"[connect] {parent_name}: no new device after connect" + ) - ctrl_dev = get_parent_device(parent_dev) - pinfo["ctrl_dev"] = ctrl_dev - pinfo["devices"] = [parent_dev] + ctrl_dev = get_parent_device(parent_dev) + pinfo["ctrl_dev"] = ctrl_dev + pinfo["devices"] = [parent_dev] - # Format + mount the parent device (nsid=1) - mount_name = f"lss-{parent_name[-3:]}-ns01" - mount_point = f"{self.mount_path}/{mount_name}" - log_file = f"{self.log_path}/{mount_name}.log" - self.ssh_obj.format_disk( - node=client, device=parent_dev, fs_type="ext4" - ) - self.ssh_obj.mount_path( - node=client, device=parent_dev, mount_path=mount_point - ) - self._device_registry[parent_dev] = { - "name": mount_name, - "client": client, - "mount": mount_point, - "log": log_file, - "parent_name": parent_name, - "ctrl_dev": ctrl_dev, - "ns_idx": 1, - } - self.logger.info( - f"[connect] {parent_name}: {parent_dev} ns01 " - f"(ctrl={ctrl_dev}) on {client} -> {mount_point}" - ) - except Exception as e: - self.logger.error(f"[connect] {parent_name} failed: {e}") + # Format + mount the parent device (nsid=1) + mount_name = f"lss-{parent_name[-3:]}-ns01" + mount_point = f"{self.mount_path}/{mount_name}" + log_file = f"{self.log_path}/{mount_name}.log" + self.ssh_obj.format_disk( + node=client, device=parent_dev, fs_type="ext4" + ) + self.ssh_obj.mount_path( + node=client, device=parent_dev, mount_path=mount_point + ) + self._device_registry[parent_dev] = { + "name": mount_name, + "client": client, + "mount": mount_point, + "log": log_file, + "parent_name": parent_name, + "ctrl_dev": ctrl_dev, + "ns_idx": 1, + } + self.logger.info( + f"[connect] {parent_name}: {parent_dev} ns01 " + f"(ctrl={ctrl_dev}) on {client} -> {mount_point}" + ) def _create_children_for_parent(self, parent_name: str): """Create all namespace children for one parent sequentially. @@ -566,102 +807,89 @@ def _create_children_for_parent(self, parent_name: str): For each child: 1. add_lvol(namespace=parent_id) 2. Verify the new namespace device appears on the client - (rescan if it doesn't show up automatically) 3. Format + mount the new device + + Raises on any failure so the caller can abort immediately. """ pinfo = self._parent_registry.get(parent_name) if not pinfo or not pinfo.get("ctrl_dev"): - return + raise RuntimeError(f"{parent_name}: not connected") parent_id = pinfo["id"] client = pinfo["client"] ctrl_dev = pinfo["ctrl_dev"] # Snapshot of current namespace devices before creating children before_set = set(self._list_nvme_ns_devices(client, ctrl_dev)) - created = 0 for ns_idx in range(2, self.NAMESPACES_PER_SUBSYSTEM + 1): cname = ( f"lss-ch-{parent_name[-3:]}-ns{ns_idx:02d}-{_rand_seq(4)}" ) - try: - self.sbcli_utils.add_lvol( - lvol_name=cname, - pool_name=self.pool_name, - size=self.LVOL_SIZE, - distr_ndcs=self.ndcs, - distr_npcs=self.npcs, - distr_bs=self.bs, - distr_chunk_bs=self.chunk_bs, - namespace=parent_id, - retry=3, - ) - sleep_n_sec(2) - child_id = self.sbcli_utils.get_lvol_id(lvol_name=cname) - if not child_id: - self.logger.error( - f"[create_child] {cname}: ID not found" - ) - continue - # Wait for the new namespace device to appear on client - new_dev, new_set = self._wait_for_new_namespace_device( - node=client, - ctrl_dev=ctrl_dev, - before_set=before_set, - timeout=120, - interval=3, + self.sbcli_utils.add_lvol( + lvol_name=cname, + pool_name=self.pool_name, + size=self.LVOL_SIZE, + distr_ndcs=self.ndcs, + distr_npcs=self.npcs, + distr_bs=self.bs, + distr_chunk_bs=self.chunk_bs, + host_id=pinfo.get("node_id"), + namespace=parent_id, + retry=3, + ) + sleep_n_sec(2) + child_id = self.sbcli_utils.get_lvol_id(lvol_name=cname) + if not child_id: + raise RuntimeError( + f"[create_child] {cname}: lvol ID not found after create" ) - if not new_dev: - self.logger.error( - f"[create_child] {cname}: namespace device did not " - f"appear on {client} (ctrl={ctrl_dev})" - ) - continue - before_set = new_set - # Format + mount the new namespace device - mount_name = ( - f"lss-{parent_name[-3:]}-ns{ns_idx:02d}" - ) - mount_point = f"{self.mount_path}/{mount_name}" - log_file = f"{self.log_path}/{mount_name}.log" - self.ssh_obj.format_disk( - node=client, device=new_dev, fs_type="ext4" - ) - self.ssh_obj.mount_path( - node=client, device=new_dev, mount_path=mount_point + # Wait for the new namespace device to appear on client + new_dev, new_set = self._wait_for_new_namespace_device( + node=client, + ctrl_dev=ctrl_dev, + before_set=before_set, + timeout=120, + interval=3, + ) + if not new_dev: + raise RuntimeError( + f"[create_child] {cname}: namespace device did not " + f"appear on {client} (ctrl={ctrl_dev})" ) + before_set = new_set - self._child_registry[cname] = { - "id": child_id, - "parent_name": parent_name, - "device": new_dev, - "ns_idx": ns_idx, - } - self._device_registry[new_dev] = { - "name": mount_name, - "client": client, - "mount": mount_point, - "log": log_file, - "parent_name": parent_name, - "ctrl_dev": ctrl_dev, - "ns_idx": ns_idx, - } - created += 1 - self.logger.info( - f"[create_child] {cname} -> {child_id} " - f"ns{ns_idx:02d} device={new_dev} on {client}" - ) - except Exception as e: - self.logger.error( - f"[create_child] {cname} failed: {e}" - ) + # Format + mount the new namespace device + mount_name = f"lss-{parent_name[-3:]}-ns{ns_idx:02d}" + mount_point = f"{self.mount_path}/{mount_name}" + log_file = f"{self.log_path}/{mount_name}.log" + self.ssh_obj.format_disk( + node=client, device=new_dev, fs_type="ext4" + ) + self.ssh_obj.mount_path( + node=client, device=new_dev, mount_path=mount_point + ) - self.logger.info( - f"[create_children] {parent_name}: " - f"{created}/{self.NAMESPACES_PER_SUBSYSTEM - 1} children created" - ) + self._child_registry[cname] = { + "id": child_id, + "parent_name": parent_name, + "device": new_dev, + "ns_idx": ns_idx, + } + self._device_registry[new_dev] = { + "name": mount_name, + "client": client, + "mount": mount_point, + "log": log_file, + "parent_name": parent_name, + "ctrl_dev": ctrl_dev, + "ns_idx": ns_idx, + } + self.logger.info( + f"[create_child] {cname} -> {child_id} " + f"ns{ns_idx:02d} device={new_dev} on {client}" + ) # ── Phase 2: Start FIO ────────────────────────────────────────────────── @@ -727,6 +955,11 @@ def _log_health_status(self, elapsed: int): def _phase_validate(self): self.logger.info("=== Phase: Validate FIO (Docker) ===") + + # 1. Collect FIO logs from all clients + self._save_all_fio_logs_docker() + + # 2. Check thread liveness alive = sum(1 for t in self.fio_threads if t.is_alive()) dead = len(self.fio_threads) - alive self.logger.info( @@ -738,6 +971,82 @@ def _phase_validate(self): f"[validate] {dead} FIO threads died during test" ) + # 3. Validate FIO log contents for errors + validated = 0 + failed = 0 + for device, dinfo in self._device_registry.items(): + log_file = dinfo.get("log") + client = dinfo.get("client") + name = dinfo.get("name") + if not log_file or not client: + continue + try: + self.common_utils.validate_fio_test(client, log_file) + validated += 1 + except RuntimeError as e: + failed += 1 + self.logger.error( + f"[validate] FIO error in {name} on {client}: {e}" + ) + self.logger.info( + f"[validate] Log validation: {validated} passed, " + f"{failed} failed" + ) + self._fio_failures = max(self._fio_failures, failed) + + def _save_all_fio_logs_docker(self): + """Collect FIO log files from all clients to the local log dir.""" + saved = 0 + for device, dinfo in self._device_registry.items(): + log_file = dinfo.get("log") + client = dinfo.get("client") + name = dinfo.get("name") + if not log_file or not client: + continue + try: + file_data = self.ssh_obj.read_file(client, log_file) + if file_data: + local_path = os.path.join( + self.log_path, f"{name}_fio.log" + ) + with open(local_path, "w") as f: + f.write(file_data) + saved += 1 + except Exception: + pass + # Also collect perf logs (_bw, _lat, _iops, _iolog) + fio_log_base = log_file.replace(".log", "_fio") + perf_dir = os.path.join(self.log_path, f"{name}_perf") + try: + out, _ = self.ssh_obj.exec_command( + node=client, + command=f"bash -lc 'ls {fio_log_base}* " + f"{log_file.replace('.log', '_iolog.log')} " + f"2>/dev/null || true'", + supress_logs=True, + ) + perf_files = [ + f.strip() for f in (out or "").splitlines() + if f.strip() + ] + if perf_files: + os.makedirs(perf_dir, exist_ok=True) + for src in perf_files: + fname = os.path.basename(src) + dest = os.path.join(perf_dir, fname) + try: + data = self.ssh_obj.read_file(client, src) + if data: + with open(dest, "w") as f: + f.write(data) + except Exception: + pass + except Exception: + pass + self.logger.info( + f"[save_fio] Collected {saved} FIO logs from clients" + ) + # ── Cleanup ────────────────────────────────────────────────────────────── def _phase_cleanup(self): @@ -902,14 +1211,30 @@ def _delete_children_for_parent(self, parent_name: str, # ── Batch parallel helper ──────────────────────────────────────────────── def _batch_exec(self, items, task_fn, op_name: str, - per_item_timeout: int = 600): - """Execute task_fn(item) for each item using ThreadPoolExecutor.""" + per_item_timeout: int = 600, + max_workers: int = None, + max_failures: int = 10): + """Execute task_fn(item) for each item using ThreadPoolExecutor. + + Stops submitting new batches once failures >= max_failures. + Returns (success_count, failure_count). + """ total = len(items) success = 0 failures = 0 + workers = max_workers or self.MAX_WORKERS + stopped_early = False - with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor: + with ThreadPoolExecutor(max_workers=workers) as executor: for batch_start in range(0, total, self.BATCH_SIZE): + if failures >= max_failures: + stopped_early = True + self.logger.error( + f"[{op_name}] Stopping: {failures} failures " + f"reached max_failures={max_failures}" + ) + break + batch = items[batch_start:batch_start + self.BATCH_SIZE] futures = {} for item in batch: @@ -923,7 +1248,8 @@ def _batch_exec(self, items, task_fn, op_name: str, except Exception as exc: failures += 1 self.logger.error( - f"[{op_name}] Failed: {exc}" + f"[{op_name}] Failed ({failures}/" + f"{max_failures} max): {exc}" ) done = batch_start + len(batch) @@ -932,6 +1258,12 @@ def _batch_exec(self, items, task_fn, op_name: str, f"(ok={success} fail={failures})" ) + if stopped_early: + self.logger.info( + f"[{op_name}] Stopped early: {success} succeeded, " + f"{failures} failed, " + f"{total - success - failures} skipped" + ) return success, failures @@ -948,9 +1280,8 @@ class LargeScaleLvolK8s(_LargeScaleMixin, K8sNativeFailoverTest): def __init__(self, **kwargs): super().__init__(**kwargs) self.test_name = "large_scale_lvol_k8s" - # Override base class FIO config for lightweight load + # Match Docker: lightweight FIO load self.fio_num_jobs = self.FIO_NUMJOBS - self.FIO_RUNTIME = 7200 # ── run() ──────────────────────────────────────────────────────────────── @@ -960,7 +1291,12 @@ def run(self): self.sn_nodes.append(result["uuid"]) self.node_vs_pvc[result["uuid"]] = [] - self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + actual_pool = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name) + if actual_pool and actual_pool != self.pool_name: + self.logger.info( + f"[run] Pool name changed: {self.pool_name} -> {actual_pool}" + ) + self.pool_name = actual_pool cluster_id = self.cluster_id or os.environ.get("CLUSTER_ID", "") self.k8s_utils.create_storage_class( @@ -971,184 +1307,245 @@ def run(self): npcs=self.npcs, max_namespace_per_subsys=self.NAMESPACES_PER_SUBSYSTEM, ) + self.k8s_utils.create_storage_class( + name=self.XFS_STORAGE_CLASS_NAME, + cluster_id=cluster_id, + pool_name=self.pool_name, + ndcs=self.ndcs, + npcs=self.npcs, + fs_type="xfs", + max_namespace_per_subsys=self.NAMESPACES_PER_SUBSYSTEM, + ) self._run_large_scale_test() - # ── Phase 1: Create subsystems ─────────────────────────────────────────── + def _count_created_resources(self): + """Count PVCs available for FIO from pvc_details.""" + return len(self.pvc_details) + + # ── Phase 1: Create subsystems (parallel across subsystems) ───────── def _phase_create_subsystems(self): + """Create PVCs with PARALLEL_PARENTS subsystems processed concurrently. + + Each subsystem creates NAMESPACES_PER_SUBSYSTEM PVCs sequentially + (to preserve device detection order within a subsystem), but multiple + subsystems run in parallel to reduce total wall-clock time.""" total_pvcs = self.NUM_SUBSYSTEMS * self.NAMESPACES_PER_SUBSYSTEM self.logger.info( - f"=== Phase: Create {total_pvcs} PVCs (K8s) ===" + f"=== Phase: Create {total_pvcs} PVCs (K8s) — " + f"{self.NUM_SUBSYSTEMS} subsystems × " + f"{self.NAMESPACES_PER_SUBSYSTEM} PVCs " + f"(parallel={self.PARALLEL_PARENTS}) ===" ) - pvc_items = [] - for i in range(total_pvcs): - pvc_name = f"lss-pvc-{_rand_seq(6)}-{i:04d}" - pvc_items.append({"name": pvc_name, "idx": i}) + # Build work items: one per subsystem + work_items = [ + { + "subsys_idx": s, + "start_pvc_idx": s * self.NAMESPACES_PER_SUBSYSTEM, + } + for s in range(self.NUM_SUBSYSTEMS) + ] - if self.use_client_fio: - self._create_pvcs_client_mode(pvc_items) - else: - self._create_pvcs_job_mode(pvc_items) + subsys_timeout = self.NAMESPACES_PER_SUBSYSTEM * 60 + ok, fail = self._batch_exec_k8s( + work_items, + self._create_subsystem_pvcs, + "create_subsystems", + per_item_timeout=subsys_timeout, + max_workers=self.PARALLEL_PARENTS, + ) + if fail > 0: + self._total_created = len(self.pvc_details) + raise RuntimeError( + f"[create] {fail}/{self.NUM_SUBSYSTEMS} subsystems failed" + ) + + # Bulk verification at the end + all_lvols = self.sbcli_utils.list_lvols() + if len(all_lvols) < total_pvcs: + self.logger.warning( + f"[create] lvol count {len(all_lvols)} < " + f"expected {total_pvcs}" + ) self._total_created = len(self.pvc_details) - self.logger.info(f"[create] {self._total_created} PVCs created") + self.logger.info( + f"[create] {self._total_created} PVCs created, " + f"lvols in API: {len(all_lvols)}" + ) + + def _create_subsystem_pvcs(self, params: dict): + """Create all PVCs for one subsystem sequentially. + + Called from _batch_exec_k8s with PARALLEL_PARENTS concurrency. + PVCs within a subsystem must be sequential for device detection.""" + subsys_idx = params["subsys_idx"] + start_idx = params["start_pvc_idx"] + + self.logger.info( + f"[create] === Subsystem {subsys_idx+1}/" + f"{self.NUM_SUBSYSTEMS} ===" + ) + for ns in range(self.NAMESPACES_PER_SUBSYSTEM): + pvc_idx = start_idx + ns + pvc_name = f"lss-pvc-{_rand_seq(6)}-{pvc_idx:04d}" - def _create_pvcs_job_mode(self, items: list[dict]): - """Create PVCs in parallel (K8s Job FIO mode).""" - self._batch_exec_k8s(items, self._create_single_pvc, "create_pvcs") + if self.use_client_fio: + self._create_single_pvc_client( + {"name": pvc_name, "idx": pvc_idx} + ) + else: + self._create_single_pvc({"name": pvc_name}) + + if pvc_name not in self.pvc_details: + raise RuntimeError( + f"PVC {pvc_name} creation failed — aborting " + f"subsystem {subsys_idx+1}" + ) - def _create_pvcs_client_mode(self, items: list[dict]): - """Create PVCs + NVMe connect on clients.""" - self._batch_exec_k8s( - items, self._create_single_pvc_client, "create_pvcs_client" + self.logger.info( + f"[create] Subsystem {subsys_idx+1}/{self.NUM_SUBSYSTEMS} " + f"OK — {self.NAMESPACES_PER_SUBSYSTEM} PVCs created" ) def _create_single_pvc(self, params: dict): + """Create a single PVC and wait for Bound. Raises on failure.""" name = params["name"] - try: - self.k8s_utils.create_pvc( - name=name, - size=self.PVC_SIZE, - storage_class=self.STORAGE_CLASS_NAME, - ) - if not self.k8s_utils.wait_pvc_bound(name, timeout=300): - self.logger.error(f"[create_pvc] {name}: not Bound in 300s") - return - self.pvc_details[name] = { - "job_name": None, - "configmap_name": None, - "snapshots": [], - } - self.logger.info(f"[create_pvc] {name} Bound") - except Exception as e: - self.logger.error(f"[create_pvc] {name} failed: {e}") + sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME]) + fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4" + self.k8s_utils.create_pvc( + name=name, + size=self.PVC_SIZE, + storage_class=sc_name, + ) + if not self.k8s_utils.wait_pvc_bound(name, timeout=300): + raise TimeoutError(f"PVC {name} not Bound within 300s") + self.pvc_details[name] = { + "job_name": None, + "configmap_name": None, + "snapshots": [], + "storage_class": sc_name, + "fs_type": fs_type, + } + self.logger.info(f"[create_pvc] {name} Bound (fs={fs_type})") def _create_single_pvc_client(self, params: dict): """Create a single PVC, NVMe-connect on a client, and verify the - namespace device appears. CSI auto-groups PVCs into subsystems - based on the StorageClass max_namespace_per_subsys setting. + namespace device appears. Raises on any failure. - After NVMe connect, the device may appear as: - - A new controller + namespace (first PVC in a subsystem) - - A new namespace on an existing controller (shared subsystem) - Either way we verify a new block device is present. + CSI auto-groups PVCs into subsystems based on the StorageClass + max_namespace_per_subsys setting. After NVMe connect, the device + may appear as a new controller + namespace (first PVC in a subsystem) + or a new namespace on an existing controller (shared subsystem). """ name = params["name"] - try: - self.k8s_utils.create_pvc( - name=name, - size=self.PVC_SIZE, - storage_class=self.STORAGE_CLASS_NAME, - ) - if not self.k8s_utils.wait_pvc_bound(name, timeout=300): - self.logger.error(f"[create_pvc] {name}: not Bound in 300s") - return + sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME]) + fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4" + self.k8s_utils.create_pvc( + name=name, + size=self.PVC_SIZE, + storage_class=sc_name, + ) + if not self.k8s_utils.wait_pvc_bound(name, timeout=300): + raise TimeoutError(f"PVC {name} not Bound within 300s") - # Get lvol info for NVMe connect - lvol_id = self.k8s_utils.get_pvc_volume_handle(name) - if not lvol_id: - self.logger.error( - f"[create_pvc] {name}: no volume handle" - ) - return + # Get lvol info for NVMe connect + lvol_id = self.k8s_utils.get_pvc_volume_handle(name) + if not lvol_id: + raise RuntimeError(f"PVC {name}: no volume handle") - lvol_name = None - lvol_details = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id) - if lvol_details: - lvol_name = lvol_details[0].get("lvol_name", name) - else: - lvol_name = name + lvol_details = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id) + lvol_name = ( + lvol_details[0].get("lvol_name", name) if lvol_details else name + ) - connect_ls = self.sbcli_utils.get_lvol_connect_str( - lvol_name=lvol_name - ) + connect_ls = self.sbcli_utils.get_lvol_connect_str( + lvol_name=lvol_name + ) + + client = self.fio_node[params["idx"] % len(self.fio_node)] - client = self.fio_node[params["idx"] % len(self.fio_node)] + # Snapshot devices before connect + initial_devices = set(self.ssh_obj.get_devices(node=client)) - # Snapshot devices before connect - initial_devices = set(self.ssh_obj.get_devices(node=client)) + # Extract NQN from connect strings for namespace tracking + nqn = None + for cmd in connect_ls: + self.ssh_obj.exec_command(node=client, command=cmd) + nqn_match = re.search(r"-n\s+(nqn\S+)", cmd) + if nqn_match: + nqn = nqn_match.group(1) - # Extract NQN from connect strings for namespace tracking - nqn = None - for cmd in connect_ls: - self.ssh_obj.exec_command(node=client, command=cmd) - nqn_match = re.search(r"-n\s+(nqn\S+)", cmd) - if nqn_match: - nqn = nqn_match.group(1) + sleep_n_sec(3) - sleep_n_sec(3) + # Check for new device — could be new controller or new namespace + final_devices = set(self.ssh_obj.get_devices(node=client)) + new_devs = sorted(final_devices - initial_devices) - # Check for new device — could be new controller or new namespace + new_dev = None + if new_devs: + new_dev = f"/dev/{new_devs[-1].strip()}" + else: + # Device didn't appear automatically — try NVMe rescan + self.logger.info( + f"[create_pvc] {name}: no new device, rescanning" + ) + rescan_cmd = ( + "bash -lc 'for c in /dev/nvme*; do " + "[ -c \"$c\" ] && nvme ns-rescan $c 2>/dev/null; " + "done || true'" + ) + self.ssh_obj.exec_command( + node=client, command=rescan_cmd + ) + sleep_n_sec(5) final_devices = set(self.ssh_obj.get_devices(node=client)) new_devs = sorted(final_devices - initial_devices) - - new_dev = None if new_devs: new_dev = f"/dev/{new_devs[-1].strip()}" - else: - # Device didn't appear automatically — try NVMe rescan - # Find controller for this NQN and rescan namespaces - self.logger.info( - f"[create_pvc] {name}: no new device, rescanning" - ) - # Rescan all controllers on this client - rescan_cmd = ( - "bash -lc 'for c in /dev/nvme*; do " - "[ -c \"$c\" ] && nvme ns-rescan $c 2>/dev/null; " - "done || true'" - ) - self.ssh_obj.exec_command( - node=client, command=rescan_cmd - ) - sleep_n_sec(5) - final_devices = set(self.ssh_obj.get_devices(node=client)) - new_devs = sorted(final_devices - initial_devices) - if new_devs: - new_dev = f"/dev/{new_devs[-1].strip()}" - if not new_dev: - self.logger.error( - f"[create_pvc] {name}: no device after NVMe " - f"connect + rescan on {client}" - ) - return + if not new_dev: + raise RuntimeError( + f"PVC {name}: no device after NVMe connect + rescan " + f"on {client}" + ) - ctrl_dev = get_parent_device(new_dev) - mount_point = f"{self.mount_path}/{name}" - log_file = f"{self.log_path}/{name}.log" + ctrl_dev = get_parent_device(new_dev) + mount_point = f"{self.mount_path}/{name}" + log_file = f"{self.log_path}/{name}.log" - self.ssh_obj.format_disk( - node=client, device=new_dev, fs_type="ext4" - ) - self.ssh_obj.mount_path( - node=client, device=new_dev, mount_path=mount_point - ) + self.ssh_obj.format_disk( + node=client, device=new_dev, fs_type=fs_type + ) + self.ssh_obj.mount_path( + node=client, device=new_dev, mount_path=mount_point + ) - self.pvc_details[name] = { - "job_name": None, - "configmap_name": None, - "snapshots": [], - } - self.lvol_mount_details[lvol_name] = { - "ID": lvol_id, - "Name": lvol_name, - "Mount": mount_point, - "Device": new_dev, - "FS": "ext4", - "Log": log_file, - "Client": client, - "pvc_name": name, - "ctrl_dev": ctrl_dev, - "nqn": nqn, - } - self.logger.info( - f"[create_pvc] {name} -> {new_dev} " - f"(ctrl={ctrl_dev}) on {client}" - ) - except Exception as e: - self.logger.error(f"[create_pvc] {name} failed: {e}") + self.pvc_details[name] = { + "job_name": None, + "configmap_name": None, + "snapshots": [], + "storage_class": sc_name, + "fs_type": fs_type, + } + self.lvol_mount_details[lvol_name] = { + "ID": lvol_id, + "Name": lvol_name, + "Mount": mount_point, + "Device": new_dev, + "FS": fs_type, + "Log": log_file, + "Client": client, + "pvc_name": name, + "ctrl_dev": ctrl_dev, + "nqn": nqn, + } + self.logger.info( + f"[create_pvc] {name} -> {new_dev} " + f"(ctrl={ctrl_dev}) on {client}" + ) # ── Phase 2: Start FIO ────────────────────────────────────────────────── @@ -1304,7 +1701,13 @@ def _log_health_status(self, elapsed: int): def _phase_validate(self): self.logger.info("=== Phase: Validate FIO (K8s) ===") + + # 1. Save all FIO logs first (regardless of pass/fail) + self._save_all_fio_logs_k8s() + self._save_fio_mapping_summary_k8s() + if self.use_client_fio: + # 2a. Check thread liveness alive = sum(1 for t in self.fio_threads if t.is_alive()) dead = len(self.fio_threads) - alive self.logger.info( @@ -1315,27 +1718,123 @@ def _phase_validate(self): self.logger.error( f"[validate] {dead} FIO threads died during test" ) + + # 2b. Validate client FIO log contents + validated = 0 + failed = 0 + for lvol_name, details in self.lvol_mount_details.items(): + log_file = details.get("Log") + client = details.get("Client") + if not log_file or not client: + continue + try: + self.common_utils.validate_fio_test(client, log_file) + validated += 1 + except RuntimeError as e: + failed += 1 + self.logger.error( + f"[validate] FIO error in {lvol_name}: {e}" + ) + self.logger.info( + f"[validate] Log validation: {validated} passed, " + f"{failed} failed" + ) + self._fio_failures = max(self._fio_failures, failed) else: - # Check K8s Job statuses - try: - ns = self.k8s_utils.namespace - out, _ = self.k8s_utils._exec_kubectl( - f"kubectl get jobs -n {ns} " - f"-l app=fio " - f"-o jsonpath='{{.items[*].status.failed}}' " - f"2>/dev/null || true", - supress_logs=True, - ) - failed_counts = [ - int(x) for x in (out or "").split() if x.strip() - ] - total_failed = sum(failed_counts) - self.logger.info( - f"[validate] {total_failed} jobs have failures" + # 2c. Validate K8s Job statuses + pod logs + fio_timeout = self.FIO_RUNTIME + 300 + validated = 0 + failed = 0 + for pvc_name, pvc_info in self.pvc_details.items(): + job_name = pvc_info.get("job_name") + if not job_name: + continue + try: + self.k8s_utils.validate_fio_job( + job_name, timeout=fio_timeout + ) + validated += 1 + except RuntimeError as e: + failed += 1 + self.logger.error( + f"[validate] FIO job {job_name} failed: {e}" + ) + self.logger.info( + f"[validate] Job validation: {validated} passed, " + f"{failed} failed" + ) + self._fio_failures = failed + + def _save_all_fio_logs_k8s(self): + """Save FIO pod logs and perf files for all PVCs.""" + if self.use_client_fio: + # Client mode: collect logs via SSH + saved = 0 + for lvol_name, details in self.lvol_mount_details.items(): + log_file = details.get("Log") + client = details.get("Client") + if not log_file or not client: + continue + try: + file_data = self.ssh_obj.read_file(client, log_file) + if file_data: + local_path = os.path.join( + self.log_path, f"{lvol_name}_fio.log" + ) + with open(local_path, "w") as f: + f.write(file_data) + saved += 1 + except Exception: + pass + self.logger.info( + f"[save_fio] Collected {saved} FIO logs from clients" + ) + return + + # K8s Job mode: collect pod logs + perf files + saved = 0 + for pvc_name, pvc_info in self.pvc_details.items(): + job_name = pvc_info.get("job_name") + if job_name: + self._save_fio_pod_logs( + job_name, pvc_name, pvc_name=pvc_name ) - self._fio_failures = total_failed - except Exception as e: - self.logger.warning(f"[validate] Job check failed: {e}") + saved += 1 + self.logger.info(f"[save_fio] Saved FIO logs for {saved} PVCs") + + # Bulk cleanup leftover copier pods + try: + self.k8s_utils._exec_kubectl( + f"kubectl delete pods -l app=fio-copier " + f"-n {self.k8s_utils.namespace} " + f"--force --grace-period=0 2>/dev/null || true", + supress_logs=True, + ) + except Exception: + pass + + def _save_fio_mapping_summary_k8s(self): + """Save a JSON summary mapping PVCs to lvols, workers, FIO jobs.""" + if self.use_client_fio: + return + try: + entries = self.k8s_utils.log_fio_pvc_mapping( + self.pvc_details + ) + if not entries: + return + summary_path = os.path.join( + self.docker_logs_path, "fio_mapping_summary.json" + ) + with open(summary_path, "w") as f: + _json.dump(entries, f, indent=2, default=str) + self.logger.info( + f"[save_fio] Wrote FIO mapping summary to {summary_path}" + ) + except Exception as exc: + self.logger.warning( + f"[save_fio] Could not write mapping summary: {exc}" + ) # ── Cleanup ────────────────────────────────────────────────────────────── @@ -1506,14 +2005,31 @@ def _phase_cleanup(self): # ── Batch parallel helper ──────────────────────────────────────────────── - def _batch_exec_k8s(self, items, task_fn, op_name: str): - """Execute task_fn(item) for each item using ThreadPoolExecutor.""" + def _batch_exec_k8s(self, items, task_fn, op_name: str, + per_item_timeout: int = 600, + max_workers: int = None, + max_failures: int = 10): + """Execute task_fn(item) for each item using ThreadPoolExecutor. + + Stops submitting new batches once failures >= max_failures. + Returns (success_count, failure_count). + """ total = len(items) success = 0 failures = 0 + workers = max_workers or self.MAX_WORKERS + stopped_early = False - with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor: + with ThreadPoolExecutor(max_workers=workers) as executor: for batch_start in range(0, total, self.BATCH_SIZE): + if failures >= max_failures: + stopped_early = True + self.logger.error( + f"[{op_name}] Stopping: {failures} failures " + f"reached max_failures={max_failures}" + ) + break + batch = items[batch_start:batch_start + self.BATCH_SIZE] futures = {} for item in batch: @@ -1522,11 +2038,14 @@ def _batch_exec_k8s(self, items, task_fn, op_name: str): for f in as_completed(futures): try: - f.result(timeout=600) + f.result(timeout=per_item_timeout) success += 1 except Exception as exc: failures += 1 - self.logger.error(f"[{op_name}] Failed: {exc}") + self.logger.error( + f"[{op_name}] Failed ({failures}/" + f"{max_failures} max): {exc}" + ) done = batch_start + len(batch) self.logger.info( @@ -1534,4 +2053,10 @@ def _batch_exec_k8s(self, items, task_fn, op_name: str): f"(ok={success} fail={failures})" ) + if stopped_early: + self.logger.info( + f"[{op_name}] Stopped early: {success} succeeded, " + f"{failures} failed, " + f"{total - success - failures} skipped" + ) return success, failures diff --git a/e2e/utils/k8s_utils.py b/e2e/utils/k8s_utils.py index 19b228d18..896fba523 100755 --- a/e2e/utils/k8s_utils.py +++ b/e2e/utils/k8s_utils.py @@ -810,6 +810,8 @@ def log_fio_pvc_mapping(self, pvc_details: dict, clone_details: dict = None, except Exception: pass + fs_type = info.get("fs_type", "N/A") or "N/A" + all_entries.append({ "type": label, "name": name or "N/A", @@ -817,6 +819,7 @@ def log_fio_pvc_mapping(self, pvc_details: dict, clone_details: dict = None, "lvol_id": vol_handle or "N/A", "storage_node": storage_node, "storage_class": sc, + "fs_type": fs_type, "snap_name": snap, "parent_pvc": parent_pvc, "fio_k8s_node": fio_node, @@ -825,22 +828,22 @@ def log_fio_pvc_mapping(self, pvc_details: dict, clone_details: dict = None, if not all_entries: return - self.logger.info("=" * 180) + self.logger.info("=" * 190) self.logger.info("FIO Job → PVC/Clone → Lvol → Worker Mapping") - self.logger.info("-" * 180) + self.logger.info("-" * 190) self.logger.info( f"{'FIO Job':<30} {'PVC/Clone':<25} {'Lvol ID':<40} " f"{'Storage Node':<40} {'FIO K8s Node':<20} {'SC':<28} " - f"{'Snapshot':<20} {'Parent PVC':<25} {'Type':<6}" + f"{'FS':<6} {'Snapshot':<20} {'Parent PVC':<25} {'Type':<6}" ) - self.logger.info("-" * 180) + self.logger.info("-" * 190) for e in all_entries: self.logger.info( f"{e['job']:<30} {e['name']:<25} {e['lvol_id']:<40} " f"{e['storage_node']:<40} {e['fio_k8s_node']:<20} {e['storage_class']:<28} " - f"{e['snap_name']:<20} {e['parent_pvc']:<25} {e['type']:<6}" + f"{e['fs_type']:<6} {e['snap_name']:<20} {e['parent_pvc']:<25} {e['type']:<6}" ) - self.logger.info("=" * 180) + self.logger.info("=" * 190) return all_entries # ── VolumeSnapshot operations ──────────────────────────────────────────── diff --git a/e2e/utils/sbcli_utils.py b/e2e/utils/sbcli_utils.py index 32993378b..cbdbcd02f 100755 --- a/e2e/utils/sbcli_utils.py +++ b/e2e/utils/sbcli_utils.py @@ -480,8 +480,8 @@ def add_lvol(self, lvol_name, pool_name, size="256M", distr_ndcs=0, distr_npcs=0 body["max_namespace_per_subsys"] = int(max_namespace_per_subsys) if namespace: - # parent lvol id - body["namespace"] = namespace + # flag for auto-grouping into existing parent subsystem + body["namespaced"] = True self.post_request(api_url="/lvol", body=body, retry=retry) @@ -722,25 +722,50 @@ def all_expected_status(self, value_dict, expected_status): self.logger.info(f"Value: {value_match}") return all(value_match) - def wait_for_device_status(self, node_id, status, timeout=60): + def wait_for_device_status(self, node_id, status, timeout=60, device_id=None): + """Wait for device(s) to reach the expected status. + + Args: + node_id: Storage node UUID. + status: Expected status string or list of status strings. + timeout: Max seconds to wait. + device_id: If provided, only check this specific device. + If None, check ALL devices on the node (legacy behaviour). + """ + status = status if isinstance(status, list) else [status] device_ids = {} device_details = self.get_device_details(storage_node_id=node_id) total_devices = len(device_details) while timeout > 0: self.logger.info("Retrying Device Status check") device_details = self.get_device_details(storage_node_id=node_id) - for device in device_details: - device_ids[device['id']] = device['status'] - status = status if isinstance(status, list) else [status] + + if device_id: + # Single-device mode: only check the specified device + for device in device_details: + if device['id'] == device_id: + actual = device['status'] + self.logger.info(f"Device ID: {device_id} Expected Status: {status} / Actual Status: {actual}") + if actual in status: + return device_details + break + else: + self.logger.warning(f"Device {device_id} not found on node {node_id}") + else: + # All-devices mode (legacy): require every device to match + device_ids = {} + for device in device_details: + device_ids[device['id']] = device['status'] self.logger.info(f"Device statuses: {device_ids}") - if device['status'] in status: - if len(device_ids) == total_devices and self.all_expected_status(device_ids, status): - return device_details - self.logger.info(f"Device ID: {device['id']} Expected Status: {status} / Actual Status: {device['status']}") + if len(device_ids) == total_devices and self.all_expected_status(device_ids, status): + return device_details + for did, dstatus in device_ids.items(): + self.logger.info(f"Device ID: {did} Expected Status: {status} / Actual Status: {dstatus}") + sleep_n_sec(1) timeout -= 1 - raise TimeoutError(f"Timed out waiting for device status, Node id: {node_id}, Device id: {list(device_ids.keys())}" - f"Expected status: {status}, Actual status: {list(device_ids.values())}") + raise TimeoutError(f"Timed out waiting for device status, Node id: {node_id}, Device id: {device_id or list(device_ids.keys())}, " + f"Expected status: {status}, Actual status: {list(device_ids.values()) if not device_id else 'see above'}") def wait_for_health_status(self, node_id, status, timeout=60, device_id=None): actual_status = None @@ -782,10 +807,10 @@ def wait_for_health_status(self, node_id, status, timeout=60, device_id=None): def list_migration_tasks(self, cluster_id): """List all migration tasks for a given cluster.""" - return self.get_request(f"/cluster/list-tasks/{cluster_id}?limit=0") + return self.get_request(f"/cluster/get-tasks/{cluster_id}?limit=0") def wait_migration_tasks_complete(self, timeout=3600): - """Wait until all FN_FAILED_DEV_MIG tasks finish. + """Wait until all failed_device_migration tasks finish. Polls ``list_migration_tasks`` every 10 seconds until no active failure-migration tasks remain or *timeout* seconds elapse. @@ -803,10 +828,15 @@ def wait_migration_tasks_complete(self, timeout=3600): start = _time.time() active = [] while _time.time() - start < timeout: - tasks = self.list_migration_tasks(self.cluster_id) + try: + tasks = self.list_migration_tasks(self.cluster_id) + except Exception as exc: + self.logger.warning(f"list_migration_tasks API failed: {exc}") + sleep_n_sec(10) + continue active = [ t for t in tasks.get("results", []) - if t.get("function_name") == "FN_FAILED_DEV_MIG" + if t.get("function_name") == "failed_device_migration" and t.get("status") not in ("done", "cancelled", "error") ] if not active: diff --git a/e2e/utils/ssh_utils.py b/e2e/utils/ssh_utils.py index 627ac6a61..276eee0b6 100755 --- a/e2e/utils/ssh_utils.py +++ b/e2e/utils/ssh_utils.py @@ -2939,6 +2939,43 @@ def stop_all_tshark(self, node_ip): self.exec_command(node_ip, stop_command) self.logger.info(f"Stopped all tshark processes on {node_ip}") + def start_full_pcap_capture(self, node_ip, log_dir, interface="any", + max_size_mb=500, max_files=3): + """Start full packet capture in pcap format with file rotation. + + Captures all packets on the given interface. Files rotate at + *max_size_mb* MB, keeping at most *max_files* rotated files + (total max disk = max_size_mb * max_files per node). + + Args: + node_ip: Target node IP. + log_dir: Directory to write pcap files into. + interface: Network interface (default ``any``). + max_size_mb: Rotate file after this many MB. + max_files: Maximum number of rotated files to keep. + """ + self.check_and_install_tcpdump(node_ip) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + pcap_file = f"{log_dir}/full_capture_{node_ip}_{timestamp}.pcap" + cmd = ( + f"sudo tmux new-session -d -s full_pcap_session " + f"\"tcpdump -i {interface} -w {pcap_file} " + f"-C {max_size_mb} -W {max_files} 2>&1\"" + ) + self.exec_command(node_ip, cmd) + self.logger.info( + f"Started full pcap capture on {node_ip} -> {pcap_file} " + f"(rotate={max_size_mb}MB x{max_files})" + ) + + def stop_full_pcap_capture(self, node_ip): + """Stop the full pcap capture tmux session on a node.""" + self.exec_command( + node_ip, + "sudo tmux kill-session -t full_pcap_session 2>/dev/null || true", + ) + self.logger.info(f"Stopped full pcap capture on {node_ip}") + def get_dmesg_logs_within_iso_window(self, node_ip, start_iso, end_iso): """ Fetch dmesg logs with ISO timestamps on a remote node within a time window.