diff --git a/.github/workflows/e2e-bootstrap-k8s.yml b/.github/workflows/e2e-bootstrap-k8s.yml
index 3276888e6..6aaa789f5 100755
--- a/.github/workflows/e2e-bootstrap-k8s.yml
+++ b/.github/workflows/e2e-bootstrap-k8s.yml
@@ -699,7 +699,7 @@ jobs:
           echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV"
 
       - name: Collect Graylog/OpenSearch logs
-        if: always()
+        if: '!cancelled()'
         timeout-minutes: 240
         shell: bash
         run: |
diff --git a/.github/workflows/e2e-bootstrap.yml b/.github/workflows/e2e-bootstrap.yml
index ed787eafe..1a1b2d2e6 100755
--- a/.github/workflows/e2e-bootstrap.yml
+++ b/.github/workflows/e2e-bootstrap.yml
@@ -1129,7 +1129,7 @@ jobs:
           PY
 
       - name: Collect Graylog/OpenSearch logs
-        if: always()
+        if: '!cancelled()'
         timeout-minutes: 240
         shell: bash
         run: |
diff --git a/.github/workflows/e2e-docker.yml b/.github/workflows/e2e-docker.yml
index d4f68a695..5d3ba1ee5 100755
--- a/.github/workflows/e2e-docker.yml
+++ b/.github/workflows/e2e-docker.yml
@@ -148,7 +148,7 @@ jobs:
           echo "TEST_TIME_SECS=$TEST_TIME_SECS" >> $GITHUB_ENV
 
       - name: Collect Graylog/OpenSearch logs
-        if: always()
+        if: '!cancelled()'
         timeout-minutes: 240
         env:
           MNODES: "${{ needs.deploy.outputs.mnodes }}"
diff --git a/.github/workflows/k8s-native-e2e-add-node.yaml b/.github/workflows/k8s-native-e2e-add-node.yaml
index 0f5211366..fbe656626 100755
--- a/.github/workflows/k8s-native-e2e-add-node.yaml
+++ b/.github/workflows/k8s-native-e2e-add-node.yaml
@@ -1061,7 +1061,7 @@ jobs:
             cid=$(echo "$output" | awk 'NR==4{print $2}')
             csecret=$(echo "$output" | awk 'NR==4{print $NF}')
             if [ -z "$cid" ] || [ "$cid" = "+" ]; then
-              echo "Table parsing failed, trying JSON..."
+              echo "Table parsing failed, trying JSON..." >&2
               local json_out
               json_out=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \
                 sbctl cluster list --json 2>&1) || true
@@ -1071,7 +1071,7 @@ jobs:
             if [ -n "$cid" ] && [ "$cid" != "+" ]; then
               echo "CLUSTER_ID=${cid}" >> $GITHUB_ENV
               echo "CLUSTER_SECRET=${csecret}" >> $GITHUB_ENV
-              echo "Extracted CLUSTER_ID=${cid}"
+              echo "Extracted CLUSTER_ID=${cid}" >&2
             fi
             echo "$cid"
           }
@@ -1196,9 +1196,14 @@ jobs:
           echo "TEST_TIME_MINS=$TEST_TIME_MINS" >> $GITHUB_ENV
           echo "TEST_TIME_SECS=$TEST_TIME_SECS" >> $GITHUB_ENV
 
+          # Log collection timeout (minutes): half the test runtime, rounded up, minimum 30 minutes
+          LOG_COLLECT_TIMEOUT_MINS=$(( (TEST_TIME + 119) / 120 ))  # assumes TEST_TIME is the run duration in seconds — TODO confirm; /120 halves and converts to minutes, +119 rounds up
+          [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30  # enforce the 30-minute floor
+          echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> $GITHUB_ENV  # consumed by the log-collection step's timeout-minutes expression
+
       - name: Collect Graylog/OpenSearch logs
-        if: always()
-        timeout-minutes: 240
+        if: '!cancelled()'
+        timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }}
         run: |
           set +e
           echo "=== Collecting Graylog/OpenSearch logs (per-hour chunks) ==="
diff --git a/.github/workflows/k8s-native-e2e-node-migration.yaml b/.github/workflows/k8s-native-e2e-node-migration.yaml
index 95f3317c5..089c53aa3 100755
--- a/.github/workflows/k8s-native-e2e-node-migration.yaml
+++ b/.github/workflows/k8s-native-e2e-node-migration.yaml
@@ -1059,7 +1059,7 @@ jobs:
             cid=$(echo "$output" | awk 'NR==4{print $2}')
             csecret=$(echo "$output" | awk 'NR==4{print $NF}')
             if [ -z "$cid" ] || [ "$cid" = "+" ]; then
-              echo "Table parsing failed, trying JSON..."
+              echo "Table parsing failed, trying JSON..." >&2
               local json_out
               json_out=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \
                 sbctl cluster list --json 2>&1) || true
@@ -1069,7 +1069,7 @@ jobs:
             if [ -n "$cid" ] && [ "$cid" != "+" ]; then
               echo "CLUSTER_ID=${cid}" >> $GITHUB_ENV
               echo "CLUSTER_SECRET=${csecret}" >> $GITHUB_ENV
-              echo "Extracted CLUSTER_ID=${cid}"
+              echo "Extracted CLUSTER_ID=${cid}" >&2
             fi
             echo "$cid"
           }
@@ -1194,9 +1194,14 @@ jobs:
           echo "TEST_TIME_MINS=$TEST_TIME_MINS" >> $GITHUB_ENV
           echo "TEST_TIME_SECS=$TEST_TIME_SECS" >> $GITHUB_ENV
 
+          # Log collection timeout (minutes): half the test runtime, rounded up, minimum 30 minutes
+          LOG_COLLECT_TIMEOUT_MINS=$(( (TEST_TIME + 119) / 120 ))  # assumes TEST_TIME is the run duration in seconds — TODO confirm; /120 halves and converts to minutes, +119 rounds up
+          [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30  # enforce the 30-minute floor
+          echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> $GITHUB_ENV  # consumed by the log-collection step's timeout-minutes expression
+
       - name: Collect Graylog/OpenSearch logs
-        if: always()
-        timeout-minutes: 240
+        if: '!cancelled()'
+        timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }}
         run: |
           set +e
           echo "=== Collecting Graylog/OpenSearch logs (per-hour chunks) ==="
diff --git a/.github/workflows/k8s-native-e2e.yaml b/.github/workflows/k8s-native-e2e.yaml
index 02595ca71..ef680bc78 100755
--- a/.github/workflows/k8s-native-e2e.yaml
+++ b/.github/workflows/k8s-native-e2e.yaml
@@ -1212,7 +1212,7 @@ jobs:
             cid=$(echo "$output" | awk 'NR==4{print $2}')
             csecret=$(echo "$output" | awk 'NR==4{print $NF}')
             if [ -z "$cid" ] || [ "$cid" = "+" ]; then
-              echo "Table parsing failed, trying JSON..."
+              echo "Table parsing failed, trying JSON..." >&2
               local json_out
               json_out=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \
                 sbctl cluster list --json 2>&1) || true
@@ -1222,7 +1222,7 @@ jobs:
             if [ -n "$cid" ] && [ "$cid" != "+" ]; then
               echo "CLUSTER_ID=${cid}" >> $GITHUB_ENV
               echo "CLUSTER_SECRET=${csecret}" >> $GITHUB_ENV
-              echo "Extracted CLUSTER_ID=${cid}"
+              echo "Extracted CLUSTER_ID=${cid}" >&2
             fi
             echo "$cid"
           }
@@ -1350,9 +1350,14 @@ jobs:
           echo "TEST_TIME_MINS=$TEST_TIME_MINS" >> $GITHUB_ENV
           echo "TEST_TIME_SECS=$TEST_TIME_SECS" >> $GITHUB_ENV
 
+          # Log collection timeout (minutes): half the test runtime, rounded up, minimum 30 minutes
+          LOG_COLLECT_TIMEOUT_MINS=$(( (TEST_TIME + 119) / 120 ))  # assumes TEST_TIME is the run duration in seconds — TODO confirm; /120 halves and converts to minutes, +119 rounds up
+          [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30  # enforce the 30-minute floor
+          echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> $GITHUB_ENV  # consumed by the log-collection step's timeout-minutes expression
+
       - name: Collect Graylog/OpenSearch logs
-        if: always()
-        timeout-minutes: 240
+        if: '!cancelled()'
+        timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }}
         run: |
           set +e
           echo "=== Collecting Graylog/OpenSearch logs (per-hour chunks) ==="
diff --git a/.github/workflows/k8s-native-stress.yaml b/.github/workflows/k8s-native-stress.yaml
index 4536e9438..4f096cf98 100755
--- a/.github/workflows/k8s-native-stress.yaml
+++ b/.github/workflows/k8s-native-stress.yaml
@@ -1162,7 +1162,7 @@ jobs:
             cid=$(echo "$output" | awk 'NR==4{print $2}')
             csecret=$(echo "$output" | awk 'NR==4{print $NF}')
             if [ -z "$cid" ] || [ "$cid" = "+" ]; then
-              echo "Table parsing failed, trying JSON..."
+              echo "Table parsing failed, trying JSON..." >&2
               local json_out
               json_out=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \
                 sbctl cluster list --json 2>&1) || true
@@ -1172,7 +1172,7 @@ jobs:
             if [ -n "$cid" ] && [ "$cid" != "+" ]; then
               echo "CLUSTER_ID=${cid}" >> $GITHUB_ENV
               echo "CLUSTER_SECRET=${csecret}" >> $GITHUB_ENV
-              echo "Extracted CLUSTER_ID=${cid}"
+              echo "Extracted CLUSTER_ID=${cid}" >&2
             fi
             echo "$cid"
           }
@@ -1303,9 +1303,14 @@ jobs:
           echo "TEST_TIME_MINS=$TEST_TIME_MINS" >> $GITHUB_ENV
           echo "TEST_TIME_SECS=$TEST_TIME_SECS" >> $GITHUB_ENV
 
+          # Log collection timeout (minutes): half the test runtime, rounded up, minimum 30 minutes
+          LOG_COLLECT_TIMEOUT_MINS=$(( (TEST_TIME + 119) / 120 ))  # assumes TEST_TIME is the run duration in seconds — TODO confirm; /120 halves and converts to minutes, +119 rounds up
+          [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30  # enforce the 30-minute floor
+          echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> $GITHUB_ENV  # consumed by the log-collection step's timeout-minutes expression
+
       - name: Collect Graylog/OpenSearch logs
-        if: always()
-        timeout-minutes: 240
+        if: '!cancelled()'
+        timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }}
         run: |
           set +e
           echo "=== Collecting Graylog/OpenSearch logs (per-hour chunks) ==="
diff --git a/.github/workflows/monitoring-suite-docker.yaml b/.github/workflows/monitoring-suite-docker.yaml
index 95a7dee2e..86bf3b987 100755
--- a/.github/workflows/monitoring-suite-docker.yaml
+++ b/.github/workflows/monitoring-suite-docker.yaml
@@ -605,6 +605,14 @@ jobs:
           echo "TEST_END_EPOCH=$(date +%s)" >> "$GITHUB_ENV"
           echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV"
 
+          # Log collection timeout (minutes): half the elapsed test runtime, rounded up, minimum 30 minutes; not exported when TEST_START_EPOCH is unset
+          if [ -n "${TEST_START_EPOCH:-}" ]; then
+            _elapsed=$(( $(date +%s) - TEST_START_EPOCH ))  # seconds elapsed since TEST_START_EPOCH
+            LOG_COLLECT_TIMEOUT_MINS=$(( (_elapsed + 119) / 120 ))  # seconds / 120 = half the runtime in minutes; +119 rounds up
+            [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30  # enforce the 30-minute floor
+            echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> "$GITHUB_ENV"  # consumed by the log-collection step's timeout-minutes expression
+          fi
+
       # ============================================================
       # POST-TEST CLEANUP + LOG COLLECTION
       # ============================================================
@@ -687,8 +695,8 @@ jobs:
           done
 
       - name: Collect Graylog/OpenSearch logs
-        if: always()
-        timeout-minutes: 240
+        if: '!cancelled()'
+        timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }}
         shell: bash
         run: |
           set +e
diff --git a/.github/workflows/monitoring-suite-k8s-native.yaml b/.github/workflows/monitoring-suite-k8s-native.yaml
index 39e8ef9a1..835a0fbe9 100755
--- a/.github/workflows/monitoring-suite-k8s-native.yaml
+++ b/.github/workflows/monitoring-suite-k8s-native.yaml
@@ -111,6 +111,14 @@ on:
         options:
           - 'false'
           - 'true'
+      use_existing_cluster:
+        description: 'Skip cluster cleanup and setup, reuse existing cluster'
+        required: false
+        default: 'false'
+        type: choice
+        options:
+          - 'false'
+          - 'true'
       send_slack_notification:
         description: 'Send Slack notification?'
         required: false
@@ -315,6 +323,7 @@ jobs:
       # CLEANUP OLD DEPLOYMENT
       # ============================================================
       - name: Cleanup old CSI deployment
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           set +e
           NAMESPACE=simplyblock
@@ -458,6 +467,7 @@ jobs:
           kubectl delete -f $GITHUB_WORKSPACE/helm-charts/charts/simplyblock-operator/crds/ --ignore-not-found 2>/dev/null || true
 
       - name: Cleanup old cert-manager
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           set +e
           helm uninstall cert-manager -n cert-manager 2>/dev/null || true
@@ -465,6 +475,7 @@ jobs:
           kubectl wait --for=delete namespace/cert-manager --timeout=120s 2>/dev/null || true
 
       - name: Cleanup old KMS
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           set +e
           helm uninstall openbao -n vault 2>/dev/null || true
@@ -475,6 +486,7 @@ jobs:
       # LABEL + NAMESPACE + DEPLOY
       # ============================================================
       - name: Label worker nodes
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           CLUSTER_ENV="${{ github.event.inputs.cluster_environment || 'local' }}"
           IFS=',' read -ra NODES <<< "${{ github.event.inputs.worker_nodes }}"
@@ -486,6 +498,7 @@ jobs:
           done
 
       - name: Create namespace + pod-security labels
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           kubectl create namespace simplyblock --dry-run=client -o yaml | kubectl apply -f -
           kubectl label namespace simplyblock \
@@ -495,6 +508,7 @@ jobs:
             --overwrite
 
       - name: Create Docker registry secret
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           kubectl create secret docker-registry regcred \
             --docker-server=https://index.docker.io/v1/ \
@@ -507,7 +521,7 @@ jobs:
           DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
 
       - name: Configure OpenShift SCC policies
-        if: ${{ github.event.inputs.cluster_environment == 'aws-openshift' || github.event.inputs.cluster_environment == 'openshift-local' }}
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' && (github.event.inputs.cluster_environment == 'aws-openshift' || github.event.inputs.cluster_environment == 'openshift-local') }}
         run: |
           oc adm policy add-scc-to-user privileged -z default -n simplyblock
           oc adm policy add-scc-to-user anyuid -z default -n simplyblock
@@ -518,10 +532,11 @@ jobs:
             --overwrite
 
       - name: Wait before helm install
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: sleep 30
 
       - name: Install cert-manager (TLS prerequisite)
-        if: ${{ github.event.inputs.tls_enabled == 'true' }}
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' && github.event.inputs.tls_enabled == 'true' }}
         run: |
           helm repo add jetstack https://charts.jetstack.io
           helm repo update
@@ -531,6 +546,7 @@ jobs:
           kubectl wait --for=condition=Ready pods --all -n cert-manager --timeout=120s
 
       - name: Install Helm Chart for simplyblock-operator
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           cd $GITHUB_WORKSPACE/helm-charts/charts/simplyblock-operator/
           TLS_FLAGS=""
@@ -562,13 +578,14 @@ jobs:
             $TLS_FLAGS $CSI_FLAGS
 
       - name: Grant OpenShift SCC post-helm
-        if: ${{ github.event.inputs.cluster_environment == 'aws-openshift' || github.event.inputs.cluster_environment == 'openshift-local' }}
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' && (github.event.inputs.cluster_environment == 'aws-openshift' || github.event.inputs.cluster_environment == 'openshift-local') }}
         run: |
           for sa in $(oc get sa -n simplyblock -o name | cut -d/ -f2); do
             oc adm policy add-scc-to-user privileged -z $sa -n simplyblock
           done
 
       - name: Patch fluent-bit daemonset
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           NAMESPACE=simplyblock
           PATCHED=false
@@ -587,6 +604,7 @@ jobs:
           echo "FLUENTBIT_PATCHED=$PATCHED" >> $GITHUB_ENV
 
       - name: Patch service accounts with imagePullSecrets
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           for sa in $(kubectl get serviceaccounts -n simplyblock --no-headers | awk '{print $1}'); do
             kubectl patch serviceaccount "$sa" -n simplyblock \
@@ -594,6 +612,7 @@ jobs:
           done
 
       - name: Delete ImagePullBackOff pods
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           NAMESPACE=simplyblock
           for pod in $(kubectl get pods -n $NAMESPACE --no-headers 2>/dev/null | grep ImagePullBackOff | awk '{print $1}'); do
@@ -608,6 +627,7 @@ jobs:
       # OPERATOR CRDs
       # ============================================================
       - name: Wait for operator pod
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           NAMESPACE=simplyblock
           for i in $(seq 1 60); do
@@ -621,7 +641,7 @@ jobs:
           done
 
       - name: Setup KMS (vault) for encryption
-        if: ${{ github.event.inputs.tls_enabled == 'true' }}
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' && github.event.inputs.tls_enabled == 'true' }}
         run: |
           STORAGE_CLASS=$(kubectl get sc -o jsonpath='{.items[?(@.metadata.annotations.storageclass\.kubernetes\.io/is-default-class=="true")].metadata.name}' | awk '{print $1}')
           [ -z "$STORAGE_CLASS" ] && STORAGE_CLASS=$(kubectl get sc -o jsonpath='{.items[0].metadata.name}')
@@ -630,6 +650,7 @@ jobs:
           kubectl wait --for=condition=Ready pods -l app.kubernetes.io/name=openbao -n vault --timeout=300s || true
 
       - name: Apply operator custom resources
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           NAMESPACE=simplyblock
           IFC_NAMES="${{ github.event.inputs.ifc_names || 'ens18:enp1s0' }}"
@@ -735,6 +756,7 @@ jobs:
           NPCS: ${{ env.NPCS }}
 
       - name: Patch service accounts post-CRD
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           for sa in $(kubectl get serviceaccounts -n simplyblock --no-headers | awk '{print $1}'); do
             kubectl patch serviceaccount "$sa" -n simplyblock \
@@ -742,6 +764,7 @@ jobs:
           done
 
       - name: Delete ImagePullBackOff pods post-CRD
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           NAMESPACE=simplyblock
           for pod in $(kubectl get pods -n $NAMESPACE --no-headers 2>/dev/null | grep ImagePullBackOff | awk '{print $1}'); do
@@ -753,6 +776,7 @@ jobs:
           done
 
       - name: Wait for storage SA + patch + restart daemonset
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
         run: |
           NAMESPACE=simplyblock
           CLUSTER_ENV="${{ github.event.inputs.cluster_environment }}"
@@ -834,10 +858,85 @@ jobs:
             fi
             echo "Not active yet ($i/$MAX_POLL)..."; sleep 10
           done
-          echo "ERROR: Cluster not active" && exit 1
+          echo "WARNING: Cluster did not become active within timeout — will attempt force-activate"
+          kubectl -n $NAMESPACE get pods
+          kubectl -n $NAMESPACE exec "$ADMIN_POD" -- sbctl cluster list 2>&1 || true
+
+      - name: Verify and force-activate cluster if needed
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' }}
+        run: |
+          NAMESPACE=simplyblock
+          ADMIN_POD=$(kubectl -n $NAMESPACE get pods \
+            -l app=simplyblock-admin-control \
+            -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) || true
+          # Every later command execs into the admin pod, so a missing pod is fatal for this step.
+          if [ -z "$ADMIN_POD" ]; then
+            echo "ERROR: No admin pod found"
+            exit 1
+          fi
+
+          # Helper: parse cluster ID/secret from `sbctl cluster list` output ($1), append both to GITHUB_ENV, print only the ID on stdout (log lines go to stderr)
+          extract_cluster_info() {
+            local output="$1"
+            local cid csecret
+            cid=$(echo "$output" | awk 'NR==4{print $2}')  # line 4, field 2 of the table output
+            csecret=$(echo "$output" | awk 'NR==4{print $NF}')  # line 4, last field of the table output
+            if [ -z "$cid" ] || [ "$cid" = "+" ]; then
+              echo "Table parsing failed, trying JSON..." >&2
+              local json_out
+              json_out=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \
+                sbctl cluster list --json 2>&1) || true
+              cid=$(echo "$json_out" | jq -r '.[0].id // .[0].uuid // empty')
+              csecret=$(echo "$json_out" | jq -r '.[0].secret // empty')
+            fi
+            if [ -n "$cid" ] && [ "$cid" != "+" ]; then
+              echo "CLUSTER_ID=${cid}" >> $GITHUB_ENV
+              echo "CLUSTER_SECRET=${csecret}" >> $GITHUB_ENV
+              echo "Extracted CLUSTER_ID=${cid}" >&2
+            fi
+            echo "$cid"  # stdout carries only the ID so callers can capture it with $(...)
+          }
+
+          echo "=== Verifying cluster activation ==="
+          OUTPUT=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \
+            sbctl cluster list 2>&1) || true
+          echo "$OUTPUT"
+          # -w matches "active" only as a whole word, so a status such as "inactive" is not mistaken for active.
+          if echo "$OUTPUT" | grep -qiw "active"; then
+            echo "Cluster is active, ensuring env vars are set"
+            extract_cluster_info "$OUTPUT"
+            exit 0
+          fi
+          # Not active: try one explicit activation; failures of the activate call itself are tolerated and judged by the re-check below.
+          echo "Cluster is NOT active, attempting forced activation..."
+          CID=$(extract_cluster_info "$OUTPUT")
+          if [ -n "$CID" ] && [ "$CID" != "+" ]; then
+            kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \
+              sbctl -d cluster activate "${CID}" 2>&1 || true
+          else
+            echo "WARNING: Could not extract cluster ID, trying activate without ID..."
+            kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \
+              sbctl -d cluster activate 2>&1 || true
+          fi
+
+          echo "Waiting 60s for activation to take effect..."
+          sleep 60
+
+          OUTPUT=$(kubectl -n $NAMESPACE exec "$ADMIN_POD" -- \
+            sbctl cluster list 2>&1) || true
+          echo "$OUTPUT"
+          # Same whole-word check as above; this is the final verdict for the step.
+          if echo "$OUTPUT" | grep -qiw "active"; then
+            echo "Cluster is now active after forced activation"
+            extract_cluster_info "$OUTPUT"
+            exit 0
+          fi
+
+          echo "ERROR: Cluster is still not active after forced activation"
+          exit 1
 
       - name: Patch fluent-bit post-active
-        if: ${{ env.FLUENTBIT_PATCHED != 'true' }}
+        if: ${{ github.event.inputs.use_existing_cluster != 'true' && env.FLUENTBIT_PATCHED != 'true' }}
         run: |
           NAMESPACE=simplyblock
           for i in $(seq 1 30); do
@@ -901,6 +1000,14 @@ jobs:
           echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV"
           echo "TEST_END_TIME=$(date +%s)" >> $GITHUB_ENV
 
+          # Log collection timeout (minutes): half the elapsed test runtime, rounded up, minimum 30 minutes; not exported when TEST_START_TIME is unset
+          if [ -n "${TEST_START_TIME:-}" ]; then
+            _elapsed=$(( $(date +%s) - TEST_START_TIME ))  # seconds elapsed since TEST_START_TIME
+            LOG_COLLECT_TIMEOUT_MINS=$(( (_elapsed + 119) / 120 ))  # seconds / 120 = half the runtime in minutes; +119 rounds up
+            [ "$LOG_COLLECT_TIMEOUT_MINS" -lt 30 ] && LOG_COLLECT_TIMEOUT_MINS=30  # enforce the 30-minute floor
+            echo "LOG_COLLECT_TIMEOUT_MINS=$LOG_COLLECT_TIMEOUT_MINS" >> "$GITHUB_ENV"  # consumed by the log-collection step's timeout-minutes expression
+          fi
+
       # ============================================================
       # POST-TEST: LOG COLLECTION
       # ============================================================
@@ -912,8 +1019,8 @@ jobs:
           [[ -n "${RUN_BASE_DIR}" ]] && echo "RUN_BASE_DIR=${RUN_BASE_DIR}" >> "$GITHUB_ENV" || true
 
       - name: Collect Graylog/OpenSearch logs
-        if: always()
-        timeout-minutes: 240
+        if: '!cancelled()'
+        timeout-minutes: ${{ fromJSON(env.LOG_COLLECT_TIMEOUT_MINS || '240') }}
         run: |
           set +e
           NAMESPACE=simplyblock
diff --git a/.github/workflows/stress-run-bootstrap-k8s.yml b/.github/workflows/stress-run-bootstrap-k8s.yml
index e03d43896..7e9153cde 100755
--- a/.github/workflows/stress-run-bootstrap-k8s.yml
+++ b/.github/workflows/stress-run-bootstrap-k8s.yml
@@ -714,6 +714,15 @@ jobs:
           echo "TEST_START_EPOCH=$(date +%s)" >> "$GITHUB_ENV"
           echo "TEST_START_HUMAN=$(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV"
 
+      - name: Enable shared placement
+        shell: bash
+        run: |
+          set -euxo pipefail  # strict mode with tracing: an unset variable or a failed pod lookup (grep finds nothing) aborts the step
+          admin_pod="$(kubectl get pods -n "${K8S_NAMESPACE}" --no-headers \
+            -o custom-columns=:metadata.name | grep simplyblock-admin-control | head -1)"  # first pod whose name contains simplyblock-admin-control
+          kubectl exec -n "${K8S_NAMESPACE}" "${admin_pod}" -- \
+            bash -c "${SBCLI_CMD} cluster set-shared-placement ${CLUSTER_ID} --force" || true  # best-effort: a failing exec/CLI call does not fail the step
+
       - name: Run stress (foreground; runs until failure)
         shell: bash
         working-directory: sbcli/e2e
@@ -760,7 +769,7 @@ jobs:
           echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV"
 
       - name: Collect Graylog/OpenSearch logs
-        if: always()
+        if: '!cancelled()'
         timeout-minutes: 240
         shell: bash
         run: |
diff --git a/.github/workflows/stress-run-bootstrap-v2.yml b/.github/workflows/stress-run-bootstrap-v2.yml
index 6c02f4044..05bd17f67 100755
--- a/.github/workflows/stress-run-bootstrap-v2.yml
+++ b/.github/workflows/stress-run-bootstrap-v2.yml
@@ -752,6 +752,15 @@ jobs:
           echo "TEST_START_EPOCH=$(date +%s)" >> "$GITHUB_ENV"
           echo "TEST_START_HUMAN=$(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV"
 
+      - name: Enable shared placement
+        shell: bash
+        run: |
+          set -euxo pipefail  # strict mode with tracing: an unset variable aborts the step
+          ssh_opts="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ${KEY_PATH}"  # host-key checking off; expanded unquoted below so it splits into separate ssh arguments
+          mgmt_ip="$(echo "${MNODES}" | awk '{print $1}')"  # first whitespace-separated field of MNODES
+          ssh ${ssh_opts} "${SSH_USER}@${mgmt_ip}" \
+            "${SBCLI_CMD} cluster set-shared-placement ${CLUSTER_ID} --force" || true  # best-effort: a failing ssh/CLI call does not fail the step
+
       - name: Run stress (foreground; runs until failure)
         shell: bash
         working-directory: sbcli/e2e
@@ -822,7 +831,7 @@ jobs:
           echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV"
 
       - name: Collect Graylog/OpenSearch logs
-        if: always()
+        if: '!cancelled()'
         timeout-minutes: 240
         shell: bash
         run: |
diff --git a/.github/workflows/stress-run-bootstrap.yml b/.github/workflows/stress-run-bootstrap.yml
index a2cd37ad6..3af3aecce 100755
--- a/.github/workflows/stress-run-bootstrap.yml
+++ b/.github/workflows/stress-run-bootstrap.yml
@@ -736,6 +736,15 @@ jobs:
           echo "TEST_START_EPOCH=$(date +%s)" >> "$GITHUB_ENV"
           echo "TEST_START_HUMAN=$(date -u +'%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV"
 
+      - name: Enable shared placement
+        shell: bash
+        run: |
+          set -euxo pipefail  # strict mode with tracing: an unset variable aborts the step
+          ssh_opts="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ${KEY_PATH}"  # host-key checking off; expanded unquoted below so it splits into separate ssh arguments
+          mgmt_ip="$(echo "${MNODES}" | awk '{print $1}')"  # first whitespace-separated field of MNODES
+          ssh ${ssh_opts} "${SSH_USER}@${mgmt_ip}" \
+            "${SBCLI_CMD} cluster set-shared-placement ${CLUSTER_ID} --force" || true  # best-effort: a failing ssh/CLI call does not fail the step
+
       - name: Run stress (foreground; runs until failure)
         shell: bash
         working-directory: sbcli/e2e
@@ -806,7 +815,7 @@ jobs:
           echo "TEST_END_HUMAN=$(date -u '+%Y-%m-%d %H:%M:%S UTC')" >> "$GITHUB_ENV"
 
       - name: Collect Graylog/OpenSearch logs
-        if: always()
+        if: '!cancelled()'
         timeout-minutes: 240
         shell: bash
         run: |
diff --git a/e2e/__init__.py b/e2e/__init__.py
index d03818e24..7248a5953 100755
--- a/e2e/__init__.py
+++ b/e2e/__init__.py
@@ -27,6 +27,11 @@
 from e2e_tests.ha_journal.lvol_journal_device_node_restart import TestDeviceNodeRestart
 from e2e_tests.data_migration.data_migration_ha_fio import FioWorkloadTest
 from e2e_tests.multi_node_crash_fio_clone import TestMultiFioSnapshotDowntime
+from e2e_tests.test_multi_node_outage import (
+    TestMultiNodeOutageDocker,
+    TestMultiNodeOutageK8s,
+    TestMultiNodeVMRebootDocker
+)
 
 
 from e2e_tests.add_node_fio_run import (
@@ -85,8 +90,14 @@
     LargeScaleLvolK8s,
 )
 from stress_test.device_failure_migration import (
-    DeviceFailureMigrationNoLoad,
-    DeviceFailureMigrationUnderLoad,
+    DeviceFailureMigrationNoLoadDocker,
+    DeviceFailureMigrationUnderLoadDocker,
+    DeviceFailureMigrationPCIeNoLoadDocker,
+    DeviceFailureMigrationPCIeUnderLoadDocker,
+    DeviceFailureMigrationNoLoadK8s,
+    DeviceFailureMigrationUnderLoadK8s,
+    DeviceFailureMigrationPCIeNoLoadK8s,
+    DeviceFailureMigrationPCIeUnderLoadK8s,
 )
 from stress_test.continuous_failover_ha_security import (
     RandomSecurityFailoverTest,
@@ -274,8 +285,17 @@
     BulkLvolHotDeleteK8s,
     LargeScaleLvolDocker,
     LargeScaleLvolK8s,
-    DeviceFailureMigrationNoLoad,
-    DeviceFailureMigrationUnderLoad,
+    DeviceFailureMigrationNoLoadDocker,
+    DeviceFailureMigrationUnderLoadDocker,
+    DeviceFailureMigrationPCIeNoLoadDocker,
+    DeviceFailureMigrationPCIeUnderLoadDocker,
+    DeviceFailureMigrationNoLoadK8s,
+    DeviceFailureMigrationUnderLoadK8s,
+    DeviceFailureMigrationPCIeNoLoadK8s,
+    DeviceFailureMigrationPCIeUnderLoadK8s,
+    TestMultiNodeOutageDocker,
+    TestMultiNodeOutageK8s,
+    TestMultiNodeVMRebootDocker,
 ]
 
 def get_all_tests(custom=True, ha_test=False):
@@ -392,8 +412,14 @@ def get_stress_tests():
         BulkLvolHotDeleteK8s,
         LargeScaleLvolDocker,
         LargeScaleLvolK8s,
-        DeviceFailureMigrationNoLoad,
-        DeviceFailureMigrationUnderLoad,
+        DeviceFailureMigrationNoLoadDocker,
+        DeviceFailureMigrationUnderLoadDocker,
+        DeviceFailureMigrationPCIeNoLoadDocker,
+        DeviceFailureMigrationPCIeUnderLoadDocker,
+        DeviceFailureMigrationNoLoadK8s,
+        DeviceFailureMigrationUnderLoadK8s,
+        DeviceFailureMigrationPCIeNoLoadK8s,
+        DeviceFailureMigrationPCIeUnderLoadK8s,
     ]
     return tests
 
@@ -409,9 +435,16 @@ def get_monitoring_tests():
         BulkLvolHotDeleteK8s,
         LargeScaleLvolDocker,
         LargeScaleLvolK8s,
-        DeviceFailureMigrationNoLoad,
-        DeviceFailureMigrationUnderLoad,
+        DeviceFailureMigrationNoLoadDocker,
+        DeviceFailureMigrationUnderLoadDocker,
+        DeviceFailureMigrationPCIeNoLoadDocker,
+        DeviceFailureMigrationPCIeUnderLoadDocker,
+        DeviceFailureMigrationNoLoadK8s,
+        DeviceFailureMigrationUnderLoadK8s,
+        DeviceFailureMigrationPCIeNoLoadK8s,
+        DeviceFailureMigrationPCIeUnderLoadK8s,
         TestLvolOutageLoadTest,
+        TestParallelLvolSnapshotCloneAPI,
     ]
 
 def get_backup_tests():
diff --git a/e2e/e2e_tests/cluster_test_base.py b/e2e/e2e_tests/cluster_test_base.py
index 7237e6640..50fcb5fe7 100755
--- a/e2e/e2e_tests/cluster_test_base.py
+++ b/e2e/e2e_tests/cluster_test_base.py
@@ -319,6 +319,9 @@ def stop_docker_logs_collect(self):
         self.logger.info("All log monitoring threads stopped.")
     
     def stop_k8s_log_collect(self):
+        if not self.runner_k8s_log or isinstance(self.runner_k8s_log, str):
+            self.logger.warning("[stop_k8s_log_collect] runner_k8s_log not initialized — skipping")
+            return
         self.runner_k8s_log.stop_log_monitor()
         self.runner_k8s_log.stop_logging()
 
diff --git a/e2e/e2e_tests/k8s_native_add_node.py b/e2e/e2e_tests/k8s_native_add_node.py
index 428f7f39d..976ae3764 100755
--- a/e2e/e2e_tests/k8s_native_add_node.py
+++ b/e2e/e2e_tests/k8s_native_add_node.py
@@ -61,6 +61,7 @@ def __init__(self, **kwargs):
 
         # K8s resource naming
         self.STORAGE_CLASS_NAME = "simplyblock-csi-sc"
+        self.XFS_STORAGE_CLASS_NAME = "simplyblock-csi-sc-xfs"
         self.SNAPSHOT_CLASS_NAME = "simplyblock-csi-snapshotclass"
         self.FIO_IMAGE = "dockerpinata/fio:2.1"
 
@@ -221,6 +222,14 @@ def run(self):
             ndcs=self.ndcs,
             npcs=self.npcs,
         )
+        self.k8s_utils.create_storage_class(
+            name=self.XFS_STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+            fs_type="xfs",
+        )
         self.k8s_utils.create_volume_snapshot_class(name=self.SNAPSHOT_CLASS_NAME)
 
         # Record initial node count
@@ -238,11 +247,13 @@ def run(self):
             pvc_name = f"add-node-pvc-{_rand_seq(4)}-{i}"
             job_name = f"fio-{pvc_name}"
             cm_name = f"fio-cfg-{pvc_name}"
+            sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME])
+            fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4"
 
             self.k8s_utils.create_pvc(
                 name=pvc_name,
                 size=self.pvc_size,
-                storage_class=self.STORAGE_CLASS_NAME,
+                storage_class=sc_name,
             )
             self.k8s_utils.wait_pvc_bound(pvc_name, timeout=300)
 
@@ -250,6 +261,8 @@ def run(self):
                 "job_name": job_name,
                 "configmap_name": cm_name,
                 "snapshots": [],
+                "storage_class": sc_name,
+                "fs_type": fs_type,
             }
 
         # ── Step 3: Start FIO on existing PVCs ───────────────────────────
@@ -289,10 +302,12 @@ def run(self):
             detail["snapshots"].append(snap_name)
             self.snapshot_details[snap_name] = {"pvc_name": pvc_name}
 
+            clone_sc = detail.get("storage_class", self.STORAGE_CLASS_NAME)
+            clone_fs_type = detail.get("fs_type", "ext4")
             self.k8s_utils.create_clone_pvc(
                 name=clone_name,
                 size=self.pvc_size,
-                storage_class=self.STORAGE_CLASS_NAME,
+                storage_class=clone_sc,
                 snapshot_name=snap_name,
             )
             self.k8s_utils.wait_pvc_bound(clone_name, timeout=300)
@@ -312,6 +327,8 @@ def run(self):
                 "snap_name": snap_name,
                 "job_name": clone_job,
                 "configmap_name": clone_cm,
+                "storage_class": clone_sc,
+                "fs_type": clone_fs_type,
             }
             sleep_n_sec(5)
 
@@ -394,11 +411,13 @@ def run(self):
             pvc_name = f"new-node-pvc-{_rand_seq(4)}-{i}"
             job_name = f"fio-{pvc_name}"
             cm_name = f"fio-cfg-{pvc_name}"
+            sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME])
+            fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4"
 
             self.k8s_utils.create_pvc(
                 name=pvc_name,
                 size=self.pvc_size,
-                storage_class=self.STORAGE_CLASS_NAME,
+                storage_class=sc_name,
             )
             self.k8s_utils.wait_pvc_bound(pvc_name, timeout=300)
 
@@ -416,6 +435,8 @@ def run(self):
             new_pvc_details[pvc_name] = {
                 "job_name": job_name,
                 "configmap_name": cm_name,
+                "storage_class": sc_name,
+                "fs_type": fs_type,
             }
             sleep_n_sec(5)
 
diff --git a/e2e/e2e_tests/k8s_native_node_migration.py b/e2e/e2e_tests/k8s_native_node_migration.py
index d41a93fc2..7037fee0c 100755
--- a/e2e/e2e_tests/k8s_native_node_migration.py
+++ b/e2e/e2e_tests/k8s_native_node_migration.py
@@ -56,6 +56,7 @@ def __init__(self, **kwargs):
 
         # K8s resource naming
         self.STORAGE_CLASS_NAME = "simplyblock-csi-sc"
+        self.XFS_STORAGE_CLASS_NAME = "simplyblock-csi-sc-xfs"
         self.SNAPSHOT_CLASS_NAME = "simplyblock-csi-snapshotclass"
         self.FIO_IMAGE = "dockerpinata/fio:2.1"
 
@@ -212,6 +213,14 @@ def run(self):
             ndcs=self.ndcs,
             npcs=self.npcs,
         )
+        self.k8s_utils.create_storage_class(
+            name=self.XFS_STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+            fs_type="xfs",
+        )
         self.k8s_utils.create_volume_snapshot_class(name=self.SNAPSHOT_CLASS_NAME)
 
         # Record nodes
@@ -226,11 +235,13 @@ def run(self):
             pvc_name = f"mig-pvc-{_rand_seq(4)}-{i}"
             job_name = f"fio-{pvc_name}"
             cm_name = f"fio-cfg-{pvc_name}"
+            sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME])
+            fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4"
 
             self.k8s_utils.create_pvc(
                 name=pvc_name,
                 size=self.pvc_size,
-                storage_class=self.STORAGE_CLASS_NAME,
+                storage_class=sc_name,
             )
             self.k8s_utils.wait_pvc_bound(pvc_name, timeout=300)
 
@@ -249,6 +260,8 @@ def run(self):
                 "job_name": job_name,
                 "configmap_name": cm_name,
                 "snapshots": [],
+                "storage_class": sc_name,
+                "fs_type": fs_type,
             }
             sleep_n_sec(5)
 
@@ -273,10 +286,12 @@ def run(self):
             detail["snapshots"].append(snap_name)
             self.snapshot_details[snap_name] = {"pvc_name": pvc_name}
 
+            clone_sc = detail.get("storage_class", self.STORAGE_CLASS_NAME)
+            clone_fs_type = detail.get("fs_type", "ext4")
             self.k8s_utils.create_clone_pvc(
                 name=clone_name,
                 size=self.pvc_size,
-                storage_class=self.STORAGE_CLASS_NAME,
+                storage_class=clone_sc,
                 snapshot_name=snap_name,
             )
             self.k8s_utils.wait_pvc_bound(clone_name, timeout=300)
@@ -296,6 +311,8 @@ def run(self):
                 "snap_name": snap_name,
                 "job_name": clone_job,
                 "configmap_name": clone_cm,
+                "storage_class": clone_sc,
+                "fs_type": clone_fs_type,
             }
             sleep_n_sec(5)
 
diff --git a/e2e/e2e_tests/test_multi_node_outage.py b/e2e/e2e_tests/test_multi_node_outage.py
new file mode 100755
index 000000000..65a913c97
--- /dev/null
+++ b/e2e/e2e_tests/test_multi_node_outage.py
@@ -0,0 +1,815 @@
+"""E2E Multi-Node Outage Test with Data Integrity Verification.
+
+Tests cluster resilience when 3 out of 4 storage nodes experience
+simultaneous outage (random mix of SPDK crash and network disconnect).
+
+Flow:
+  1. Create 3 lvols per storage node; run a short write FIO on 1 lvol per node.
+  2. Wait for the short FIOs to complete, then compute md5sum on those lvols.
+  3. Take pre-outage snapshots+clones, then start long FIO on the other 2 lvols per node.
+  4. Trigger simultaneous outage on 3 random nodes for ~3 minutes.
+  5. Wait for recovery: all nodes online, cluster Active.
+  6. Verify md5sum on completed lvols (data integrity).
+  7. Create 1 new lvol per node + run FIO (basic functionality).
+  8. Take post-outage snapshots+clones (snapshot/clone functionality).
+
+Two variants:
+  - TestMultiNodeOutageDocker: SSH-based (k8s_run=False)
+  - TestMultiNodeOutageK8s: K8s sbcli via kubectl (k8s_run=True)
+"""
+
+import os
+import random
+import threading
+import time
+
+from e2e_tests.cluster_test_base import TestClusterBase, generate_random_sequence
+from logger_config import setup_logger
+from utils.common_utils import sleep_n_sec
+
+
+class _TestMultiNodeOutageBase(TestClusterBase):
+    """Shared logic for Docker and K8s multi-node outage tests."""
+
+    def __init__(self, k8s_run=False, **kwargs):
+        """Forward all arguments to TestClusterBase, then set test knobs and empty state."""
+        super().__init__(k8s_run=k8s_run, **kwargs)
+        self.logger = setup_logger(__name__)
+        # Test parameters
+        self.lvol_size = "5G"           # size passed to add_lvol for every lvol
+        self.fio_size = "1G"            # size passed to every run_fio_test call
+        self.short_fio_runtime = 120    # seconds — short FIO should complete well within this
+        self.long_fio_runtime = 1000    # seconds — long FIO runs during outage
+        self.outage_duration = 180      # seconds (3 minutes)
+        self.num_lvols_per_node = 3     # first lvol gets the short FIO, the rest the long FIO
+        self.num_outage_nodes = 3       # nodes sampled for the simultaneous outage
+
+        # Internal state
+        self._node_info = {}        # node_uuid -> {ip, rpc_port, data_nics, if_names}
+        self._lvol_info = {}        # lvol_name -> {node_uuid, device, mount_path, fio_name}
+        self._completed_lvols = []  # lvol names that ran the short FIO (added when it starts)
+        self._running_lvols = []    # lvol names on which a long FIO was started
+        self._pre_checksums = {}    # lvol_name -> {filepath: md5}
+        self._outage_plan = {}      # node_uuid -> "spdk_crash" | "network_outage"
+        self._outage_threads = []   # threads started by _execute_outage_and_recovery
+
+    # ── Snapshot/clone helpers (branched by k8s_test) ────────────────
+
+    def _create_snapshot(self, lvol_id, snap_name):
+        """Snapshot ``lvol_id`` as ``snap_name`` via sbcli_utils (K8s) or SSH (otherwise)."""
+        if not self.k8s_test:
+            mgmt = self.mgmt_nodes[0]  # SSH path always targets the first management node
+            self.ssh_obj.add_snapshot(node=mgmt, lvol_id=lvol_id, snapshot_name=snap_name)
+            return
+        self.sbcli_utils.add_snapshot(lvol_id=lvol_id, snapshot_name=snap_name)
+
+    def _get_snapshot_id(self, snap_name):
+        """Return the ID reported for snapshot ``snap_name`` (sbcli_utils on K8s, else SSH)."""
+        if not self.k8s_test:
+            mgmt = self.mgmt_nodes[0]  # SSH lookups go through the first management node
+            return self.ssh_obj.get_snapshot_id(node=mgmt, snapshot_name=snap_name)
+        # Note the keyword differs between helpers: snap_name here, snapshot_name over SSH.
+        return self.sbcli_utils.get_snapshot_id(snap_name=snap_name)
+
+    def _create_clone(self, snap_id, clone_name):
+        """Clone snapshot ``snap_id`` into ``clone_name`` via sbcli_utils (K8s) or SSH."""
+        if not self.k8s_test:
+            mgmt = self.mgmt_nodes[0]  # SSH path always targets the first management node
+            self.ssh_obj.add_clone(node=mgmt, snapshot_id=snap_id, clone_name=clone_name)
+            return
+        self.sbcli_utils.add_clone(snapshot_id=snap_id, clone_name=clone_name)
+
+    # ── SPDK crash helper (branched by k8s_test) ────────────────────
+
+    def _trigger_spdk_crash(self, node_uuid, node_ip, rpc_port):
+        """Crash SPDK on one node: stop its pod on K8s runs, else stop the process via SSH.
+
+        ``node_uuid`` is not used in this implementation; ``rpc_port`` only on the SSH path.
+        """
+        k8s = getattr(self.sbcli_utils, "k8s", None) if self.k8s_test else None
+        if k8s:
+            k8s.stop_spdk_pod(node_ip)
+            return
+        if self.k8s_test:  # K8s run, but sbcli_utils exposes no usable k8s helper
+            self.logger.warning("k8s_utils not available — falling back to SSH spdk_process_kill")
+        self.ssh_obj.stop_spdk_process(node_ip, rpc_port, self.cluster_id)
+
+    # ── NVMe connect/reconnect helpers ──────────────────────────────
+
+    def _connect_lvol(self, client, lvol_name):
+        """Run each connect command for ``lvol_name`` on ``client``; RuntimeError if none."""
+        commands = self.sbcli_utils.get_lvol_connect_str(lvol_name=lvol_name)
+        if not commands:
+            raise RuntimeError(f"No connect strings for lvol {lvol_name}")
+        for cmd in commands:
+            self.ssh_obj.exec_command(node=client, command=cmd)
+
+    def _detect_new_device(self, client, initial_devices):
+        """Return ``/dev/<name>`` for the first device on ``client`` missing from
+        ``initial_devices``, or None when nothing new appeared."""
+        current = self.ssh_obj.get_devices(node=client)
+        added = [name for name in current if name not in initial_devices]
+        # Only the first newcomer matters; strip stray whitespace from its name.
+        return f"/dev/{added[0].strip()}" if added else None
+
+    def _reconnect_lvol(self, client, lvol_name, mount_path):
+        """Reconnect NVMe for ``lvol_name`` on ``client``, mount it unformatted at
+        ``mount_path`` and return the device path; RuntimeError if no device is found."""
+        # Unmount if still mounted (may fail — that's ok; `|| true` hides the failure)
+        self.ssh_obj.exec_command(
+            node=client, command=f"sudo umount {mount_path} 2>/dev/null || true"
+        )
+
+        # Disconnect existing NVMe paths for this lvol (subsystems filtered by its lvol id)
+        lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=lvol_name)
+        if lvol_id:
+            subsystems = self.ssh_obj.get_nvme_subsystems(node=client, nqn_filter=lvol_id)
+            for subsys in subsystems:
+                self.ssh_obj.disconnect_nvme(node=client, nqn_grep=subsys)
+        sleep_n_sec(3)
+
+        # Re-connect NVMe; the device list taken first is the baseline for spotting the new one
+        initial_devices = self.ssh_obj.get_devices(node=client)
+        self._connect_lvol(client, lvol_name)
+        sleep_n_sec(5)
+        device = self._detect_new_device(client, initial_devices)
+        if not device:
+            # Device might have reconnected with same name — try the old device
+            old_device = self._lvol_info.get(lvol_name, {}).get("device")
+            if old_device:
+                self.logger.info(
+                    f"No new device detected for {lvol_name}, trying old device {old_device}"
+                )
+                device = old_device
+            else:
+                raise RuntimeError(f"Could not detect device for {lvol_name} after reconnect")
+
+        # Mount (no format — data must be preserved)
+        self.ssh_obj.exec_command(
+            node=client, command=f"sudo mkdir -p {mount_path}"
+        )
+        self.ssh_obj.mount_path(node=client, device=device, mount_path=mount_path)
+        return device
+
+    # ── FIO wait helper ─────────────────────────────────────────────
+
+    def _wait_fio_complete(self, client, fio_name, timeout=300):
+        """Wait until tmux session ``fio_<fio_name>`` no longer exists on ``client``.
+        Returns True once it is gone, False if ``timeout`` seconds pass first."""
+        session = f"fio_{fio_name}"
+        # has-session fails when the session is gone, so the shell prints DONE instead.
+        probe = f"sudo tmux has-session -t {session} 2>&1 && echo RUNNING || echo DONE"
+        give_up_at = time.time() + timeout
+        while time.time() < give_up_at:
+            out, _ = self.ssh_obj.exec_command(node=client, command=probe, max_retries=1)
+            if "DONE" not in out:
+                sleep_n_sec(10)
+                continue
+            self.logger.info(f"FIO session '{session}' completed on {client}")
+            return True
+        self.logger.warning(f"FIO session '{session}' did not complete within {timeout}s")
+        return False
+
+    def _kill_fio_session(self, client, fio_name):
+        """Best-effort kill of tmux session ``fio_<fio_name>`` on ``client``."""
+        session = f"fio_{fio_name}"
+        # `|| true` keeps the shell command successful when the session is already gone.
+        kill_cmd = f"sudo tmux kill-session -t {session} 2>/dev/null || true"
+        self.ssh_obj.exec_command(
+            node=client, command=kill_cmd, max_retries=1
+        )
+
+    # ── Outage + recovery (overridable by subclasses) ──────────────
+
+    def _execute_outage_and_recovery(self, node_uuids, client):
+        """Steps 9-11: plan outage, execute, wait for recovery.
+
+        Samples ``num_outage_nodes`` of ``node_uuids`` (``client`` is unused here). Subclasses
+        can override this, e.g. VM reboot instead of SPDK crash / network disconnect.
+        """
+        # ── Step 9: Plan and execute multi-node outage ──────────────
+        self.logger.info("[step-9] Planning multi-node outage")
+        outage_nodes = random.sample(node_uuids, self.num_outage_nodes)
+        self._outage_plan = {  # rebuilt from scratch so it always matches outage_nodes
+            uuid: random.choice(["spdk_crash", "network_outage"]) for uuid in outage_nodes
+        }
+
+        self.logger.info("[step-9] Outage plan:")
+        for node_uuid, otype in self._outage_plan.items():
+            ip = self._node_info[node_uuid]["ip"]
+            self.logger.info(f"  Node {node_uuid[:8]} ({ip}): {otype}")
+
+        # Collect pre-outage diagnostics (best effort — a failure here must not stop the test)
+        self.logger.info("[step-9] Collecting pre-outage diagnostics")
+        try:
+            self.collect_management_details(suffix="_pre_outage")
+        except Exception as e:
+            self.logger.warning(f"Pre-outage diagnostics failed: {e}")
+
+        # Execute outages simultaneously (one daemon thread per planned node)
+        self.logger.info(f"[step-9] TRIGGERING OUTAGES ON {len(self._outage_plan)} NODES")
+        self._outage_threads = []
+        for node_uuid, outage_type in self._outage_plan.items():
+            ninfo = self._node_info[node_uuid]
+            node_ip = ninfo["ip"]
+
+            if outage_type == "spdk_crash":
+                t = threading.Thread(
+                    target=self._trigger_spdk_crash,
+                    args=(node_uuid, node_ip, ninfo["rpc_port"]),
+                    daemon=True,
+                )
+            else:  # network_outage
+                if_names = ninfo["if_names"]
+                if not if_names:
+                    self.logger.warning(
+                        f"No interface names for {node_uuid} — "
+                        f"falling back to get_active_interfaces"
+                    )
+                    if_names = self.ssh_obj.get_active_interfaces(node_ip)
+                t = threading.Thread(
+                    target=self.ssh_obj.disconnect_all_active_interfaces,
+                    args=(node_ip, if_names, self.outage_duration),
+                    daemon=True,
+                )
+
+            self._outage_threads.append(t)
+            t.start()
+            self.logger.info(
+                f"  Outage thread started for {node_uuid[:8]} ({outage_type})"
+            )
+
+        # ── Step 10: Wait for outage to pass ────────────────────────
+        self.logger.info("[step-10] Waiting for cluster to become Suspended or Degraded")
+        try:
+            self.sbcli_utils.wait_for_cluster_status(
+                status=["suspended", "degraded"], timeout=600
+            )
+            self.logger.info("[step-10] Cluster is Suspended/Degraded (outage confirmed)")
+        except TimeoutError:
+            cluster_status = self.sbcli_utils.get_cluster_status()
+            self.logger.warning(
+                f"Cluster did not reach Suspended/Degraded — "
+                f"current status: {cluster_status}"
+            )
+
+        wait_secs = self.outage_duration + 60  # extra buffer
+        self.logger.info(f"[step-10] Waiting {wait_secs}s for outage period to pass")
+        sleep_n_sec(wait_secs)
+
+        # Join outage threads (network disconnect threads block for duration)
+        for t in self._outage_threads:
+            t.join(timeout=120)
+
+        # ── Step 11: Wait for recovery ──────────────────────────────
+        self.logger.info("[step-11] Waiting for all nodes to come back online")
+        for node_uuid in outage_nodes:
+            try:
+                self.sbcli_utils.wait_for_storage_node_status(
+                    node_uuid, status=["online"], timeout=600
+                )
+                self.logger.info(f"  Node {node_uuid[:8]} is online")
+            except TimeoutError:
+                self.logger.error(
+                    f"  Node {node_uuid[:8]} did NOT come back online within 600s"
+                )
+                raise
+
+        self.logger.info("[step-11] Waiting for cluster to become Active")
+        try:
+            self.sbcli_utils.wait_for_cluster_status(
+                status=["active"], timeout=600
+            )
+            self.logger.info("[step-11] Cluster is Active")
+        except TimeoutError:
+            self.logger.warning("Cluster did not reach Active")
+            cluster_status = self.sbcli_utils.get_cluster_status()
+            self.logger.info(f"Current cluster status: {cluster_status}")
+            raise
+
+        # Collect post-recovery diagnostics (best effort, same as the pre-outage pass)
+        try:
+            self.collect_management_details(suffix="_post_recovery")
+        except Exception as e:
+            self.logger.warning(f"Post-recovery diagnostics failed: {e}")
+
+        sleep_n_sec(30)  # settle time after recovery
+
+    # ── Main test flow ──────────────────────────────────────────────
+
+    def run(self):
+        self.logger.info("=" * 70)
+        self.logger.info("Starting Multi-Node Outage E2E Test")
+        self.logger.info("=" * 70)
+
+        client = self.fio_node[0]
+
+        # K8s mode: establish SSH to storage nodes (needed for network outage)
+        if self.k8s_test:
+            for node in self.storage_nodes:
+                self.logger.info(f"[setup] SSH-connecting to storage node {node}")
+                self.ssh_obj.connect(
+                    address=node, bastion_server_address=self.bastion_server
+                )
+                sleep_n_sec(1)
+
+        # ── Step 1: Discover storage nodes ──────────────────────────
+        self.logger.info("[step-1] Discovering storage nodes")
+        storage_nodes_data = self.sbcli_utils.get_storage_nodes()
+        node_uuids = []
+        for result in storage_nodes_data["results"]:
+            if not result.get("is_secondary_node", False):
+                uuid = result["uuid"]
+                node_uuids.append(uuid)
+                self._node_info[uuid] = {
+                    "ip": result["mgmt_ip"],
+                    "rpc_port": result.get("rpc_port", ""),
+                    "data_nics": result.get("data_nics", []),
+                    "if_names": [
+                        nic["if_name"]
+                        for nic in result.get("data_nics", [])
+                        if nic.get("if_name")
+                    ],
+                }
+
+        num_nodes = len(node_uuids)
+        self.logger.info(f"[step-1] Found {num_nodes} primary storage nodes: {node_uuids}")
+        assert num_nodes >= 4, (
+            f"Need at least 4 storage nodes for this test, found {num_nodes}"
+        )
+
+        # ── Step 2: Create pool ─────────────────────────────────────
+        self.logger.info("[step-2] Creating storage pool")
+        self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        pools = self.sbcli_utils.list_storage_pools()
+        assert self.pool_name in pools, f"Pool {self.pool_name} not created"
+        sleep_n_sec(5)
+
+        # ── Step 3: Create 3 lvols per node ─────────────────────────
+        self.logger.info("[step-3] Creating lvols")
+        node_lvol_names = {}  # uuid -> [lvol_name, ...]
+        for node_uuid in node_uuids:
+            short_id = node_uuid[:6]
+            node_lvol_names[node_uuid] = []
+            for i in range(self.num_lvols_per_node):
+                lvol_name = f"mno-{short_id}-{i}"
+                self.logger.info(
+                    f"  Creating lvol {lvol_name} on node {node_uuid} ({self._node_info[node_uuid]['ip']})"
+                )
+                self.sbcli_utils.add_lvol(
+                    lvol_name=lvol_name,
+                    pool_name=self.pool_name,
+                    size=self.lvol_size,
+                    host_id=node_uuid,
+                    distr_ndcs=self.ndcs,
+                    distr_npcs=self.npcs,
+                    distr_bs=self.bs,
+                    distr_chunk_bs=self.chunk_bs,
+                )
+                node_lvol_names[node_uuid].append(lvol_name)
+                self._lvol_info[lvol_name] = {
+                    "node_uuid": node_uuid,
+                    "device": None,
+                    "mount_path": f"/mnt/mno_{lvol_name}",
+                    "fio_name": None,
+                }
+
+        total_lvols = sum(len(v) for v in node_lvol_names.values())
+        self.logger.info(f"[step-3] Created {total_lvols} lvols across {num_nodes} nodes")
+
+        # ── Step 4: Connect, format, mount all lvols ────────────────
+        self.logger.info("[step-4] Connecting, formatting, and mounting all lvols")
+        for lvol_name, info in self._lvol_info.items():
+            initial_devices = self.ssh_obj.get_devices(node=client)
+            self._connect_lvol(client, lvol_name)
+            sleep_n_sec(3)
+
+            device = self._detect_new_device(client, initial_devices)
+            if not device:
+                raise RuntimeError(f"No new device detected after connecting {lvol_name}")
+
+            info["device"] = device
+            mount_path = info["mount_path"]
+
+            self.ssh_obj.unmount_path(node=client, device=device)
+            self.ssh_obj.format_disk(node=client, device=device, fs_type="ext4")
+            self.ssh_obj.mount_path(node=client, device=device, mount_path=mount_path)
+            self.logger.info(f"  {lvol_name}: {device} → {mount_path}")
+
+        # ── Step 5: Run short FIO (1 per node) and wait ─────────────
+        self.logger.info("[step-5] Running short FIO on 1 lvol per node (write 1G)")
+        for node_uuid in node_uuids:
+            lvol_name = node_lvol_names[node_uuid][0]  # first lvol per node
+            info = self._lvol_info[lvol_name]
+            fio_name = f"short_{lvol_name}"
+            info["fio_name"] = fio_name
+
+            self.ssh_obj.run_fio_test(
+                node=client,
+                directory=info["mount_path"],
+                log_file=os.path.join(self.log_path, f"{fio_name}.log"),
+                name=fio_name,
+                rw="write",
+                bs="1M",
+                size=self.fio_size,
+                numjobs=1,
+                nrfiles=4,
+                runtime=self.short_fio_runtime,
+                time_based=False,
+                use_latency=False,
+            )
+            self._completed_lvols.append(lvol_name)
+
+        # Wait for all short FIOs to complete
+        self.logger.info("[step-5] Waiting for short FIOs to complete")
+        for lvol_name in self._completed_lvols:
+            fio_name = self._lvol_info[lvol_name]["fio_name"]
+            ok = self._wait_fio_complete(client, fio_name, timeout=self.short_fio_runtime + 120)
+            if not ok:
+                self.logger.warning(f"Short FIO {fio_name} may not have completed cleanly")
+
+        sleep_n_sec(5)
+
+        # ── Step 6: Compute pre-outage md5sum on completed lvols ────
+        self.logger.info("[step-6] Computing pre-outage md5sum checksums")
+        for lvol_name in self._completed_lvols:
+            mount_path = self._lvol_info[lvol_name]["mount_path"]
+            files = self.ssh_obj.find_files(client, directory=mount_path)
+            if not files or files == [""]:
+                self.logger.warning(f"No files found in {mount_path} for {lvol_name}")
+                continue
+            checksums = self.ssh_obj.generate_checksums(client, files)
+            self._pre_checksums[lvol_name] = checksums
+            self.logger.info(
+                f"  {lvol_name}: {len(checksums)} files checksummed"
+            )
+
+        assert self._pre_checksums, "No pre-outage checksums computed — aborting"
+
+        # ── Step 7: Pre-outage snapshots + clones ───────────────────
+        self.logger.info("[step-7] Creating pre-outage snapshots and clones")
+        for lvol_name in self._completed_lvols:
+            lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=lvol_name)
+            if not lvol_id:
+                self.logger.warning(f"Cannot find lvol_id for {lvol_name} — skipping snapshot")
+                continue
+
+            snap_name = f"{lvol_name}_snap_pre"
+            clone_name = f"{lvol_name}_clone_pre"
+            self.logger.info(f"  Snapshot: {snap_name}, Clone: {clone_name}")
+
+            self._create_snapshot(lvol_id, snap_name)
+            snap_id = self._get_snapshot_id(snap_name)
+            if snap_id:
+                self._create_clone(snap_id, clone_name)
+            else:
+                self.logger.warning(f"Could not get snapshot ID for {snap_name}")
+
+        # ── Step 8: Start long FIO on remaining 2 lvols per node ────
+        self.logger.info("[step-8] Starting long FIO on remaining lvols")
+        for node_uuid in node_uuids:
+            for lvol_name in node_lvol_names[node_uuid][1:]:  # lvols 1 and 2
+                info = self._lvol_info[lvol_name]
+                fio_name = f"long_{lvol_name}"
+                info["fio_name"] = fio_name
+
+                self.ssh_obj.run_fio_test(
+                    node=client,
+                    directory=info["mount_path"],
+                    log_file=os.path.join(self.log_path, f"{fio_name}.log"),
+                    name=fio_name,
+                    rw="randrw",
+                    bs="4K",
+                    size=self.fio_size,
+                    numjobs=4,
+                    iodepth=16,
+                    runtime=self.long_fio_runtime,
+                    time_based=True,
+                    rwmixread=70,
+                )
+                self._running_lvols.append(lvol_name)
+
+        self.logger.info(f"[step-8] {len(self._running_lvols)} long FIOs started")
+        sleep_n_sec(30)  # let FIOs establish
+
+        # ── Steps 9-11: Outage + recovery (overridable) ──────────
+        self._execute_outage_and_recovery(node_uuids, client)
+
+        # ── Step 12: Kill remaining long FIOs (they may have errored) ─
+        self.logger.info("[step-12] Killing remaining long FIO sessions")
+        for lvol_name in self._running_lvols:
+            fio_name = self._lvol_info[lvol_name].get("fio_name")
+            if fio_name:
+                self._kill_fio_session(client, fio_name)
+
+        sleep_n_sec(10)
+
+        # ── Step 13: Verify md5sum on completed lvols ───────────────
+        self.logger.info("[step-13] Verifying data integrity (md5sum) on completed lvols")
+        checksum_failures = []
+        for lvol_name in self._completed_lvols:
+            if lvol_name not in self._pre_checksums:
+                self.logger.warning(f"No pre-outage checksum for {lvol_name} — skipping")
+                continue
+
+            mount_path = self._lvol_info[lvol_name]["mount_path"]
+            self.logger.info(f"  Reconnecting {lvol_name}")
+
+            try:
+                device = self._reconnect_lvol(client, lvol_name, mount_path)
+                self._lvol_info[lvol_name]["device"] = device
+            except Exception as e:
+                self.logger.error(f"  Failed to reconnect {lvol_name}: {e}")
+                checksum_failures.append(lvol_name)
+                continue
+
+            files = self.ssh_obj.find_files(client, directory=mount_path)
+            if not files or files == [""]:
+                self.logger.error(f"  No files found in {mount_path} after recovery")
+                checksum_failures.append(lvol_name)
+                continue
+
+            post_checksums = self.ssh_obj.generate_checksums(client, files)
+            pre_set = set(self._pre_checksums[lvol_name].values())
+            post_set = set(post_checksums.values())
+
+            if pre_set == post_set:
+                self.logger.info(
+                    f"  {lvol_name}: CHECKSUM OK ({len(post_checksums)} files verified)"
+                )
+            else:
+                self.logger.error(
+                    f"  {lvol_name}: CHECKSUM MISMATCH!\n"
+                    f"    Pre:  {self._pre_checksums[lvol_name]}\n"
+                    f"    Post: {post_checksums}"
+                )
+                checksum_failures.append(lvol_name)
+
+        if checksum_failures:
+            raise AssertionError(
+                f"Data integrity check failed on {len(checksum_failures)} lvols: {checksum_failures}"
+            )
+        self.logger.info("[step-13] All checksum verifications passed")
+
+        # ── Step 14: Create 1 new lvol per node + run FIO ───────────
+        self.logger.info("[step-14] Creating new lvols post-recovery and running FIO")
+        new_lvol_names = []
+        for node_uuid in node_uuids:
+            short_id = node_uuid[:6]
+            new_name = f"mno-new-{short_id}"
+            self.logger.info(
+                f"  Creating {new_name} on node {node_uuid[:8]} ({self._node_info[node_uuid]['ip']})"
+            )
+            self.sbcli_utils.add_lvol(
+                lvol_name=new_name,
+                pool_name=self.pool_name,
+                size=self.lvol_size,
+                host_id=node_uuid,
+                distr_ndcs=self.ndcs,
+                distr_npcs=self.npcs,
+                distr_bs=self.bs,
+                distr_chunk_bs=self.chunk_bs,
+            )
+
+            # Connect, format, mount
+            initial_devices = self.ssh_obj.get_devices(node=client)
+            self._connect_lvol(client, new_name)
+            sleep_n_sec(3)
+            device = self._detect_new_device(client, initial_devices)
+            if not device:
+                raise RuntimeError(f"No new device for post-recovery lvol {new_name}")
+
+            new_mount = f"/mnt/mno_{new_name}"
+            self.ssh_obj.unmount_path(node=client, device=device)
+            self.ssh_obj.format_disk(node=client, device=device, fs_type="ext4")
+            self.ssh_obj.mount_path(node=client, device=device, mount_path=new_mount)
+
+            # Run short FIO
+            fio_name = f"post_{new_name}"
+            self.ssh_obj.run_fio_test(
+                node=client,
+                directory=new_mount,
+                log_file=os.path.join(self.log_path, f"{fio_name}.log"),
+                name=fio_name,
+                rw="write",
+                bs="1M",
+                size=self.fio_size,
+                numjobs=1,
+                nrfiles=4,
+                runtime=self.short_fio_runtime,
+                time_based=False,
+                use_latency=False,
+            )
+            new_lvol_names.append(new_name)
+            self._lvol_info[new_name] = {
+                "node_uuid": node_uuid,
+                "device": device,
+                "mount_path": new_mount,
+                "fio_name": fio_name,
+            }
+
+        # Wait for new FIOs to complete
+        self.logger.info("[step-14] Waiting for post-recovery FIOs to complete")
+        for new_name in new_lvol_names:
+            fio_name = self._lvol_info[new_name]["fio_name"]
+            ok = self._wait_fio_complete(client, fio_name, timeout=self.short_fio_runtime + 120)
+            assert ok, f"Post-recovery FIO {fio_name} did not complete"
+
+        self.logger.info("[step-14] All post-recovery FIOs completed successfully")
+
+        # ── Step 15: Post-outage snapshots + clones ─────────────────
+        self.logger.info("[step-15] Creating post-outage snapshots and clones")
+        for lvol_name in self._completed_lvols:
+            lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=lvol_name)
+            if not lvol_id:
+                self.logger.warning(f"Cannot find lvol_id for {lvol_name} — skipping")
+                continue
+
+            snap_name = f"{lvol_name}_snap_post"
+            clone_name = f"{lvol_name}_clone_post"
+            self.logger.info(f"  Snapshot: {snap_name}, Clone: {clone_name}")
+
+            self._create_snapshot(lvol_id, snap_name)
+            snap_id = self._get_snapshot_id(snap_name)
+            if snap_id:
+                self._create_clone(snap_id, clone_name)
+            else:
+                self.logger.warning(f"Could not get snapshot ID for {snap_name}")
+
+        self.logger.info("=" * 70)
+        self.logger.info("Multi-Node Outage E2E Test PASSED")
+        self.logger.info("=" * 70)
+
+
+class _TestMultiNodeVMRebootBase(_TestMultiNodeOutageBase):
+    """VM reboot variant — reboots the sampled nodes instead of SPDK crash / network outage."""
+
+    def _execute_outage_and_recovery(self, node_uuids, client):
+        """Override: reboot VMs, verify offline + degraded/suspended, wait for recovery.
+
+        Reboots ``self.num_outage_nodes`` nodes sampled from ``node_uuids``; ``client`` is
+        unused here. Re-raises TimeoutError if a node or the cluster does not recover.
+        """
+        # ── Step 9: Select and reboot nodes ───────────────────────────
+        self.logger.info("[step-9] Planning VM reboot outage")
+        outage_nodes = random.sample(node_uuids, self.num_outage_nodes)
+        for node_uuid in outage_nodes:
+            self._outage_plan[node_uuid] = "vm_reboot"
+            ip = self._node_info[node_uuid]["ip"]
+            self.logger.info(f"  Node {node_uuid[:8]} ({ip}): vm_reboot")
+
+        # Collect pre-outage diagnostics
+        self.logger.info("[step-9] Collecting pre-outage diagnostics")
+        try:
+            self.collect_management_details(suffix="_pre_outage")
+        except Exception as e:
+            self.logger.warning(f"Pre-outage diagnostics failed: {e}")
+
+        # Trigger reboots — just send `sudo reboot` and close SSH,
+        # do NOT wait for reconnect yet (we need to verify offline first).
+        self.logger.info(f"[step-9] TRIGGERING VM REBOOTS ON {len(outage_nodes)} NODES")
+        for node_uuid in outage_nodes:
+            node_ip = self._node_info[node_uuid]["ip"]
+            try:
+                self.ssh_obj.exec_command(
+                    node=node_ip, command="sudo reboot", max_retries=1
+                )
+            except Exception as exc:  # Expected — connection drops during reboot
+                self.logger.info(f"  Reboot command on {node_ip} raised: {exc}")
+            # Close SSH connection so subsequent checks don't reuse stale socket
+            if node_ip in self.ssh_obj.ssh_connections:
+                try:
+                    self.ssh_obj.ssh_connections[node_ip].close()
+                except Exception:
+                    pass
+                del self.ssh_obj.ssh_connections[node_ip]
+            self.logger.info(f"  Reboot triggered for {node_uuid[:8]} ({node_ip})")
+
+        sleep_n_sec(15)  # Give nodes time to go down
+
+        # ── Step 10a: Verify nodes are NOT online ─────────────────────
+        self.logger.info("[step-10] Verifying nodes are offline/unreachable")
+        for node_uuid in outage_nodes:
+            try:
+                self.sbcli_utils.wait_for_storage_node_status(
+                    node_uuid, status=["offline", "unreachable"], timeout=120
+                )
+                self.logger.info(f"  Node {node_uuid[:8]} is offline/unreachable (good)")
+            except TimeoutError:
+                try:
+                    details = self.sbcli_utils.get_storage_node_details(
+                        storage_node_id=node_uuid
+                    )
+                    node_status = details[0]["status"] if details else "unknown"
+                except Exception:
+                    node_status = "unknown"
+                self.logger.warning(
+                    f"  Node {node_uuid[:8]} did not go offline within 120s "
+                    f"(current: {node_status})"
+                )
+
+        # ── Step 10b: Verify cluster is degraded or suspended ─────────
+        self.logger.info("[step-10] Waiting for cluster to become Suspended or Degraded")
+        try:
+            self.sbcli_utils.wait_for_cluster_status(
+                status=["suspended", "degraded"], timeout=600
+            )
+            self.logger.info("[step-10] Cluster is Suspended/Degraded (outage confirmed)")
+        except TimeoutError:
+            cluster_status = self.sbcli_utils.get_cluster_status()
+            self.logger.warning(
+                f"Cluster did not reach Suspended/Degraded — "
+                f"current status: {cluster_status}"
+            )
+
+        # ── Step 11: Wait for nodes to come back online ───────────────
+        self.logger.info("[step-11] Waiting for all nodes to come back online after reboot")
+        for node_uuid in outage_nodes:
+            node_ip = self._node_info[node_uuid]["ip"]
+            # Poll SSH until the node is reachable again
+            self.logger.info(f"  Waiting for SSH on {node_uuid[:8]} ({node_ip})")
+            start_time = time.time()
+            ssh_ok = False
+            while time.time() - start_time < 600:
+                try:
+                    self.ssh_obj.connect(
+                        address=node_ip,
+                        bastion_server_address=getattr(self, "bastion_server", None),
+                    )
+                    self.logger.info(f"  SSH reconnected to {node_uuid[:8]} ({node_ip})")
+                    ssh_ok = True
+                    break
+                except Exception:
+                    sleep_n_sec(10)
+            if not ssh_ok:
+                self.logger.error(
+                    f"  SSH reconnect failed for {node_uuid[:8]} ({node_ip}) "
+                    f"after 600s"
+                )
+
+        # Wait for storage node status to become online
+        for node_uuid in outage_nodes:
+            try:
+                self.sbcli_utils.wait_for_storage_node_status(
+                    node_uuid, status=["online"], timeout=600
+                )
+                self.logger.info(f"  Node {node_uuid[:8]} is online")
+            except TimeoutError:
+                self.logger.error(
+                    f"  Node {node_uuid[:8]} did NOT come back online within 600s"
+                )
+                raise
+
+        self.logger.info("[step-11] Waiting for cluster to become Active")
+        try:
+            self.sbcli_utils.wait_for_cluster_status(status=["active"], timeout=600)
+            self.logger.info("[step-11] Cluster is Active")
+        except TimeoutError:
+            self.logger.warning("Cluster did not reach Active")
+            cluster_status = self.sbcli_utils.get_cluster_status()
+            self.logger.info(f"Current cluster status: {cluster_status}")
+            raise
+
+        # Collect post-recovery diagnostics
+        try:
+            self.collect_management_details(suffix="_post_recovery")
+        except Exception as e:
+            self.logger.warning(f"Post-recovery diagnostics failed: {e}")
+
+        sleep_n_sec(30)  # settle time after recovery
+
+
+class TestMultiNodeVMRebootDocker(_TestMultiNodeVMRebootBase):
+    """Multi-node VM reboot test, Docker flavour (driven over SSH)."""
+
+    def __init__(self, **kwargs):
+        kwargs["k8s_run"] = False  # any caller-supplied value is overridden
+        super().__init__(**kwargs)
+        self.test_name = "multi_node_vm_reboot_docker"
+
+
+class TestMultiNodeVMRebootK8s(_TestMultiNodeVMRebootBase):
+    """Multi-node VM reboot test, K8s flavour."""
+
+    def __init__(self, **kwargs):
+        kwargs["k8s_run"] = True  # any caller-supplied value is overridden
+        super().__init__(**kwargs)
+        self.test_name = "multi_node_vm_reboot_k8s"
+
+
+class TestMultiNodeOutageDocker(_TestMultiNodeOutageBase):
+    """Multi-node outage test, Docker flavour (driven over SSH)."""
+
+    def __init__(self, **kwargs):
+        kwargs["k8s_run"] = False  # any caller-supplied value is overridden
+        super().__init__(**kwargs)
+        self.test_name = "multi_node_outage_docker"
+
+
+class TestMultiNodeOutageK8s(_TestMultiNodeOutageBase):
+    """Multi-node outage test, K8s flavour (sbcli via kubectl exec)."""
+
+    def __init__(self, **kwargs):
+        kwargs["k8s_run"] = True  # any caller-supplied value is overridden
+        super().__init__(**kwargs)
+        self.test_name = "multi_node_outage_k8s"
diff --git a/e2e/stress_test/continuous_bulk_lvol_delete.py b/e2e/stress_test/continuous_bulk_lvol_delete.py
index 0b8c6a0f3..5449ee782 100755
--- a/e2e/stress_test/continuous_bulk_lvol_delete.py
+++ b/e2e/stress_test/continuous_bulk_lvol_delete.py
@@ -21,6 +21,7 @@
 
 from __future__ import annotations
 
+import os
 import random
 import string
 import threading
@@ -175,6 +176,13 @@ def _wait_lvol_deleted(self, lvol_name, timeout=300):
         )
         return False
 
+    def _validate_fio_batch(self, iteration, names):
+        """Pre-deletion hook: validate FIO liveness and collect logs for ``names``.
+
+        Base no-op returning 0; Docker/K8s subclasses override it to return a failure count.
+        """
+        return 0
+
     def _run_bulk_iterations(self):
         results = []
         for iteration in range(1, self.NUM_ITERATIONS + 1):
@@ -189,14 +197,19 @@ def _run_bulk_iterations(self):
             )
             sleep_n_sec(self.WAIT_AFTER_CREATE)
 
+            # Validate FIO before deletion
+            fio_failures = self._validate_fio_batch(iteration, names)
+
             t_del = time.time()
             result = self._bulk_delete_sequential(iteration, names)
             result["delete_duration"] = time.time() - t_del
+            result["fio_validation_failures"] = fio_failures
             results.append(result)
             self.logger.info(
                 f"Iteration {iteration} done: "
                 f"created={result['created']} deleted={result['deleted']} "
                 f"failed={result['failed']} stale={result['stale']} "
+                f"fio_failures={fio_failures} "
                 f"delete_time={result['delete_duration']:.1f}s"
             )
 
@@ -209,6 +222,9 @@ def _run_bulk_iterations(self):
         total_core_dumps = sum(
             r.get("core_dumps_detected", 0) for r in results
         )
+        total_fio_failures = sum(
+            r.get("fio_validation_failures", 0) for r in results
+        )
 
         if total_core_dumps > 0:
             raise RuntimeError(
@@ -216,6 +232,12 @@ def _run_bulk_iterations(self):
                 f"on storage nodes across {self.NUM_ITERATIONS} iterations"
             )
 
+        if total_fio_failures > 0:
+            raise RuntimeError(
+                f"Bulk delete test detected {total_fio_failures} FIO "
+                f"validation failures across {self.NUM_ITERATIONS} iterations"
+            )
+
         if total_failed > 0:
             raise RuntimeError(
                 f"Bulk delete test had {total_failed} total failures across "
@@ -231,16 +253,21 @@ def _print_bulk_summary(self, results):
         self.logger.info("=== Bulk Lvol Delete Test Summary ===")
         self.logger.info(
             f"{'Iter':>4} | {'Created':>7} | {'Deleted':>7} | "
-            f"{'Failed':>6} | {'Stale':>5}"
+            f"{'Failed':>6} | {'Stale':>5} | {'FIO Err':>7}"
         )
         for r in results:
+            fio_f = r.get("fio_validation_failures", 0)
             self.logger.info(
                 f"{r['iteration']:>4} | {r['created']:>7} | {r['deleted']:>7} | "
-                f"{r['failed']:>6} | {r['stale']:>5}"
+                f"{r['failed']:>6} | {r['stale']:>5} | {fio_f:>7}"
             )
         total_f = sum(r["failed"] for r in results)
         total_s = sum(r["stale"] for r in results)
-        self.logger.info(f"Total failures: {total_f}  Total stale: {total_s}")
+        total_fio = sum(r.get("fio_validation_failures", 0) for r in results)
+        self.logger.info(
+            f"Total failures: {total_f}  Total stale: {total_s}  "
+            f"Total FIO errors: {total_fio}"
+        )
 
     def _write_monitoring_json(self, results):
         """Write standardised timing JSON for monitoring suite aggregation."""
@@ -259,16 +286,18 @@ def _write_monitoring_json(self, results):
                 avg_delete = round(
                     sum(t["delete_sec"] for t in per_lvol) / len(per_lvol), 3
                 )
+            fio_f = r.get("fio_validation_failures", 0)
             phases.append({
                 "name": f"iteration_{r['iteration']}",
                 "duration_sec": round(r.get("delete_duration", 0), 2),
-                "status": "ok" if r["failed"] + r["stale"] == 0 else "degraded",
+                "status": "ok" if r["failed"] + r["stale"] + fio_f == 0 else "degraded",
                 "details": {
                     "created": r["created"],
                     "deleted": r["deleted"],
                     "failed": r["failed"],
                     "stale": r["stale"],
                     "core_dumps_detected": cd,
+                    "fio_validation_failures": fio_f,
                     "avg_delete_sec": avg_delete,
                     "per_lvol_times": per_lvol,
                 },
@@ -466,7 +495,12 @@ def __init__(self, **kwargs):
         self._run_id = _rand_seq(8)
 
     def run(self):
-        self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        actual_pool = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        if actual_pool and actual_pool != self.pool_name:
+            self.logger.info(
+                f"[run] Pool name changed: {self.pool_name} -> {actual_pool}"
+            )
+            self.pool_name = actual_pool
 
         storage_nodes = self.sbcli_utils.get_storage_nodes()
         for result in storage_nodes["results"]:
@@ -615,6 +649,112 @@ def _bulk_create(self, iteration):
 
         return names
 
+    # ── FIO validation ────────────────────────────────────────────────────
+
+    def _validate_fio_batch(self, iteration, names):
+        """Check FIO thread liveness, then collect and validate FIO logs.
+
+        Returns dead FIO threads plus the number of logs that fail validation.
+        """
+        self.logger.info(
+            f"[validate {iteration}] Checking FIO status for "
+            f"{len(names)} lvols"
+        )
+        failures = 0
+
+        # 1. Check thread liveness
+        # NOTE(review): every thread in self.fio_threads is counted, not just
+        # this batch's — confirm the list holds no threads of earlier batches.
+        alive = sum(1 for t in self.fio_threads if t.is_alive())
+        dead = len(self.fio_threads) - alive
+        self.logger.info(
+            f"[validate {iteration}] FIO threads: {alive} alive, {dead} dead"
+        )
+        if dead > 0:
+            failures += dead
+            self.logger.error(
+                f"[validate {iteration}] {dead} FIO threads died during wait"
+            )
+
+        # 2. Collect FIO logs from remote clients + validate
+        log_dir = os.path.join("logs", "ClientLogs")
+        os.makedirs(log_dir, exist_ok=True)
+        saved = 0
+        for lvol_name in names:
+            details = self.lvol_mount_details.get(lvol_name, {})
+            log_file = details.get("Log")
+            client = details.get("Client")
+            if not log_file or not client:
+                continue
+            # Save FIO stdout log locally
+            try:
+                file_data = self.ssh_obj.read_file(client, log_file)
+                if file_data:
+                    local_path = os.path.join(log_dir, f"{lvol_name}_fio.log")
+                    with open(local_path, "w") as f:
+                        f.write(file_data)
+                    saved += 1
+            except Exception as e:
+                self.logger.warning(
+                    f"[validate {iteration}] Failed to save FIO log for "
+                    f"{lvol_name} on {client} (remote: {log_file}): {e}"
+                )
+            # Validate log contents for error keywords
+            try:
+                self.common_utils.validate_fio_test(client, log_file)
+            except RuntimeError as e:
+                failures += 1
+                self.logger.error(
+                    f"[validate {iteration}] FIO error in "
+                    f"{lvol_name} on {client}: {e}"
+                )
+            except Exception as e:
+                self.logger.warning(
+                    f"[validate {iteration}] Could not validate FIO log for {lvol_name}: {e}"
+                )
+
+        # 3. Collect FIO perf logs (iolog, bw, lat, iops files)
+        for lvol_name in names:
+            details = self.lvol_mount_details.get(lvol_name, {})
+            client = details.get("Client")
+            iolog_base = details.get("iolog_base_path")
+            if not client or not iolog_base:
+                continue
+            perf_dir = os.path.join(log_dir, f"{lvol_name}_perf")
+            try:
+                out, _ = self.ssh_obj.exec_command(
+                    node=client,
+                    command=f"bash -lc 'ls {iolog_base}* 2>/dev/null || true'",
+                )
+                listed = (ln.strip() for ln in (out or "").splitlines())
+                perf_files = [ln for ln in listed if ln]
+                if perf_files:
+                    os.makedirs(perf_dir, exist_ok=True)
+                    for src in perf_files:
+                        fname = os.path.basename(src)
+                        dest = os.path.join(perf_dir, fname)
+                        try:
+                            data = self.ssh_obj.read_file(client, src)
+                            if data:
+                                with open(dest, "w") as f:
+                                    f.write(data)
+                        except Exception as e:
+                            self.logger.warning(
+                                f"[validate {iteration}] Failed to collect "
+                                f"perf file for {lvol_name} on {client}: "
+                                f"{src} -> {dest}: {e}"
+                            )
+            except Exception as e:
+                self.logger.warning(
+                    f"[validate {iteration}] Could not collect FIO perf logs for {lvol_name}: {e}"
+                )
+
+        self.logger.info(
+            f"[validate {iteration}] Collected {saved} FIO logs, "
+            f"{failures} failures"
+        )
+        return failures
+
     # ── Delete (sequential, one-by-one) ──────────────────────────────────
 
     def _bulk_delete_sequential(self, iteration, names):
@@ -820,6 +960,14 @@ def run(self):
             ndcs=self.ndcs,
             npcs=self.npcs,
         )
+        self.k8s_utils.create_storage_class(
+            name=self.XFS_STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=self.pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+            fs_type="xfs",
+        )
 
         self._run_bulk_iterations()
 
@@ -836,13 +984,16 @@ def _bulk_create(self, iteration):
                 f"({i+1}/{self.NUM_LVOLS})"
             )
 
+            sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME])
+            pvc_fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4"
+
             # Snapshot lvol IDs before PVC creation (for client mode mapping)
             if self.use_client_fio:
                 old_lvol_ids = self._snapshot_lvol_ids()
 
             try:
                 self.k8s_utils.create_pvc(
-                    pvc_name, self.PVC_SIZE, self.STORAGE_CLASS_NAME,
+                    pvc_name, self.PVC_SIZE, sc_name,
                 )
                 self.k8s_utils.wait_pvc_bound(pvc_name, timeout=300)
             except Exception as exc:
@@ -920,7 +1071,7 @@ def _bulk_create(self, iteration):
                     "client": client,
                     "log_file": log_file,
                     "fs_type": fs_type,
-                    "storage_class": self.STORAGE_CLASS_NAME,
+                    "storage_class": sc_name,
                 }
                 self.lvol_mount_details[lvol_name] = {
                     "ID": lvol_id,
@@ -968,7 +1119,8 @@ def _bulk_create(self, iteration):
                     "configmap_name": cm_name,
                     "snapshots": [],
                     "node_id": node_id,
-                    "storage_class": self.STORAGE_CLASS_NAME,
+                    "storage_class": sc_name,
+                    "fs_type": pvc_fs_type,
                 }
 
                 self.logger.info(
@@ -983,6 +1135,125 @@ def _bulk_create(self, iteration):
 
         return names
 
+    # ── FIO validation ────────────────────────────────────────────────────
+
+    def _validate_fio_batch(self, iteration, names):
+        """Check FIO liveness, then collect and validate FIO logs for ``names``.
+
+        Returns the number of PVCs whose FIO log, pod phase or pod log shows a failure.
+        """
+        self.logger.info(
+            f"[validate {iteration}] Checking FIO status for {len(names)} PVCs"
+        )
+        failures = 0
+        log_dir = os.path.join("logs", "ClientLogs")
+        os.makedirs(log_dir, exist_ok=True)
+        saved = 0
+
+        if self.use_client_fio:
+            # ── Client SSH FIO path ──
+            for pvc_name in names:
+                pvc_info = self.pvc_details.get(pvc_name, {})
+                log_file = pvc_info.get("log_file")
+                client = pvc_info.get("client")
+                if not log_file or not client:
+                    continue
+                # Save FIO stdout log locally
+                try:
+                    file_data = self.ssh_obj.read_file(client, log_file)
+                    if file_data:
+                        local_path = os.path.join(log_dir, f"{pvc_name}_fio.log")
+                        with open(local_path, "w") as f:
+                            f.write(file_data)
+                        saved += 1
+                except Exception as e:
+                    self.logger.warning(
+                        f"[validate {iteration}] Unable to save FIO log for "
+                        f"{pvc_name} on {client} ({log_file}): {e}"
+                    )
+                # Validate log contents
+                try:
+                    self.common_utils.validate_fio_test(client, log_file)
+                except RuntimeError as e:
+                    failures += 1
+                    self.logger.error(
+                        f"[validate {iteration}] FIO error in "
+                        f"{pvc_name} on {client}: {e}"
+                    )
+                except Exception as e:
+                    self.logger.warning(
+                        f"[validate {iteration}] Could not validate FIO log for {pvc_name}: {e}"
+                    )
+        else:
+            # ── K8s Job FIO path ──
+            fail_words = ["error", "fail", "interrupt", "terminate"]
+            for pvc_name in names:
+                pvc_info = self.pvc_details.get(pvc_name, {})
+                job_name = pvc_info.get("job_name")
+                if not job_name:
+                    continue
+                try:
+                    # Save pod logs
+                    pod_name = self.k8s_utils.get_job_pod_name(job_name)
+                    if not pod_name:
+                        continue
+                    logs = self.k8s_utils.get_pod_logs(pod_name, tail=2000)
+                    if logs:
+                        local_path = os.path.join(log_dir, f"{pvc_name}_fio.log")
+                        with open(local_path, "w") as f:
+                            f.write(logs)
+                        saved += 1
+
+                    # Copy FIO perf logs from pod
+                    # NOTE(review): pvc_name is passed positionally and by keyword — verify.
+                    try:
+                        self._save_fio_pod_logs(
+                            job_name, pvc_name, pvc_name=pvc_name
+                        )
+                    except Exception as save_exc:
+                        self.logger.warning(
+                            f"[validate {iteration}] FIO perf log copy failed "
+                            f"for {pvc_name}: {save_exc}"
+                        )
+
+                    # Check pod status — Failed/Error means FIO crashed
+                    status_out, _ = self.k8s_utils._exec_kubectl(
+                        f"get pod {pod_name} "
+                        f"-o jsonpath='{{.status.phase}}'",
+                        supress_logs=True,
+                    )
+                    pod_phase = (status_out or "").strip()
+                    if pod_phase in ("Failed", "Error"):
+                        failures += 1
+                        self.logger.error(
+                            f"[validate {iteration}] FIO pod {pod_name} "
+                            f"phase={pod_phase} for {pvc_name}"
+                        )
+                        continue
+
+                    # Check pod logs for error keywords
+                    if logs:
+                        logs_lower = logs.lower()
+                        for word in fail_words:
+                            if word in logs_lower:
+                                failures += 1
+                                self.logger.error(
+                                    f"[validate {iteration}] FIO pod logs "
+                                    f"for {pvc_name} contain '{word}'"
+                                )
+                                break
+                except Exception as exc:
+                    self.logger.warning(
+                        f"[validate {iteration}] Could not check "
+                        f"FIO for {pvc_name}: {exc}"
+                    )
+
+        self.logger.info(
+            f"[validate {iteration}] Collected {saved} FIO logs, "
+            f"{failures} failures"
+        )
+        return failures
+
     # ── Delete (sequential, one-by-one) ──────────────────────────────────
 
     def _bulk_delete_sequential(self, iteration, names):
diff --git a/e2e/stress_test/continuous_failover_ha_multi_outage_all_nodes.py b/e2e/stress_test/continuous_failover_ha_multi_outage_all_nodes.py
old mode 100644
new mode 100755
index 168b890fc..14945b5a5
--- a/e2e/stress_test/continuous_failover_ha_multi_outage_all_nodes.py
+++ b/e2e/stress_test/continuous_failover_ha_multi_outage_all_nodes.py
@@ -1,3 +1,4 @@
+import os
 import random
 import threading
 import time
@@ -147,6 +148,39 @@ def run(self):
             )
 
         self.logger.info(
-            f"max_fault_tolerance={max_fault_tolerance} — proceeding with all-nodes outage test."
+            f"max_fault_tolerance={max_fault_tolerance} — proceeding "
+            f"with all-nodes outage test."
         )
-        super().run()
+
+        # Start full pcap capture on all nodes for network diagnostics
+        all_node_ips = set(
+            self.storage_nodes + self.mgmt_nodes + self.fio_node
+        )
+        self.logger.info(
+            f"Starting full pcap capture on {len(all_node_ips)} nodes"
+        )
+        for node_ip in all_node_ips:
+            try:
+                node_log_dir = os.path.join(
+                    self.docker_logs_path, node_ip,
+                )
+                self.ssh_obj.make_directory(
+                    node=node_ip, dir_name=node_log_dir,
+                )
+                self.ssh_obj.start_full_pcap_capture(
+                    node_ip, node_log_dir,
+                )
+            except Exception as exc:
+                self.logger.warning(
+                    f"Failed to start pcap on {node_ip}: {exc}"
+                )
+
+        try:
+            super().run()
+        finally:
+            # Stop pcap capture on all nodes
+            for node_ip in all_node_ips:
+                try:
+                    self.ssh_obj.stop_full_pcap_capture(node_ip)
+                except Exception:
+                    pass
diff --git a/e2e/stress_test/continuous_k8s_native_failover.py b/e2e/stress_test/continuous_k8s_native_failover.py
index ab5ccfe77..035c62590 100755
--- a/e2e/stress_test/continuous_k8s_native_failover.py
+++ b/e2e/stress_test/continuous_k8s_native_failover.py
@@ -69,6 +69,7 @@ def __init__(self, **kwargs):
 
         # K8s resource naming
         self.STORAGE_CLASS_NAME = "simplyblock-csi-sc"
+        self.XFS_STORAGE_CLASS_NAME = "simplyblock-csi-sc-xfs"
         self.CRYPTO_STORAGE_CLASS_NAME = "simplyblock-csi-sc-crypto"
         self.CRYPTO_POOL_NAME = "encryption-pool"
         self.SNAPSHOT_CLASS_NAME = "simplyblock-csi-snapshotclass"
@@ -1192,16 +1193,17 @@ def create_pvcs_with_fio(self, count: int, node_ids: list[str] = None,
             pvc_name = f"pvc-{_rand_seq(12)}"
             target_node = node_ids[i] if node_ids and i < len(node_ids) else None
 
-            # Determine StorageClass: explicit > 50/50 alternation > regular
+            # Determine StorageClass: explicit > TLS alternation > random ext4/xfs
             if storage_class:
                 sc_name = storage_class
             elif self.tls_enabled and (existing_count + i) % 2 == 1:
                 sc_name = self.CRYPTO_STORAGE_CLASS_NAME
             else:
-                sc_name = self.STORAGE_CLASS_NAME
+                sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME])
+            fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4"
 
             self.logger.info(
-                f"[create_pvc] Creating PVC {pvc_name} ({i+1}/{count}) SC={sc_name}"
+                f"[create_pvc] Creating PVC {pvc_name} ({i+1}/{count}) SC={sc_name} fs={fs_type}"
                 + (f" pinned to node {target_node}" if target_node else "")
             )
 
@@ -1358,10 +1360,11 @@ def create_pvcs_with_fio(self, count: int, node_ids: list[str] = None,
                     "snapshots": [],
                     "node_id": node_id,
                     "storage_class": sc_name,
+                    "fs_type": fs_type,
                 }
 
                 self.logger.info(
-                    f"[create_pvc] PVC {pvc_name} on node {node_id} with FIO Job {job_name} SC={sc_name}"
+                    f"[create_pvc] PVC {pvc_name} on node {node_id} with FIO Job {job_name} SC={sc_name} fs={fs_type}"
                 )
 
             if node_id:
@@ -1431,8 +1434,9 @@ def create_snapshots_and_clones(self):
             # Snapshot lvol IDs before clone PVC (for client mode mapping)
             old_lvol_ids = self._snapshot_lvol_ids() if self.use_client_fio else set()
 
-            # Create clone PVC — use same StorageClass as source PVC
+            # Create clone PVC — use same StorageClass/fs_type as source PVC
             clone_sc = self.pvc_details.get(pvc_name, {}).get("storage_class", self.STORAGE_CLASS_NAME)
+            clone_fs_type = self.pvc_details.get(pvc_name, {}).get("fs_type", "ext4")
             sleep_n_sec(10)
             try:
                 self.k8s_utils.create_clone_pvc(
@@ -1487,6 +1491,7 @@ def create_snapshots_and_clones(self):
                         "client": client,
                         "log_file": None,
                         "storage_class": clone_sc,
+                        "fs_type": clone_fs_type,
                     }
                     continue
 
@@ -1512,6 +1517,7 @@ def create_snapshots_and_clones(self):
                     "client": client,
                     "log_file": log_file,
                     "storage_class": clone_sc,
+                    "fs_type": clone_fs_type,
                 }
                 self.clone_mount_details[clone_lvol_name] = {
                     "ID": clone_lvol_id,
@@ -1551,6 +1557,7 @@ def create_snapshots_and_clones(self):
                     "job_name": clone_job,
                     "configmap_name": clone_cm,
                     "storage_class": clone_sc,
+                    "fs_type": clone_fs_type,
                 }
 
             # Resize source PVC and clone PVC
@@ -2754,6 +2761,14 @@ def run(self):
             ndcs=self.ndcs,
             npcs=self.npcs,
         )
+        self.k8s_utils.create_storage_class(
+            name=self.XFS_STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=self.pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+            fs_type="xfs",
+        )
         if self.tls_enabled:
             self.logger.info("TLS enabled — ensuring encryption pool exists")
             self.sbcli_utils.ensure_pool_exists(
@@ -2960,8 +2975,9 @@ def create_snapshots_and_clones_with_cleanup(self, count: int = None):
             # Snapshot lvol IDs before clone PVC (for client mode mapping)
             old_lvol_ids = self._snapshot_lvol_ids() if self.use_client_fio else set()
 
-            # Create clone PVC — use same StorageClass as source PVC
+            # Create clone PVC — use same StorageClass/fs_type as source PVC
             clone_sc = self.pvc_details.get(pvc_name, {}).get("storage_class", self.STORAGE_CLASS_NAME)
+            clone_fs_type = self.pvc_details.get(pvc_name, {}).get("fs_type", "ext4")
             sleep_n_sec(10)
             try:
                 self.k8s_utils.create_clone_pvc(
@@ -3060,6 +3076,7 @@ def create_snapshots_and_clones_with_cleanup(self, count: int = None):
                     "job_name": clone_job,
                     "configmap_name": clone_cm,
                     "storage_class": clone_sc,
+                    "fs_type": clone_fs_type,
                 }
 
             # Resize source PVC and clone PVC
@@ -3134,6 +3151,14 @@ def run(self):
             ndcs=self.ndcs,
             npcs=self.npcs,
         )
+        self.k8s_utils.create_storage_class(
+            name=self.XFS_STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=self.pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+            fs_type="xfs",
+        )
         self.k8s_utils.delete_volume_snapshot_class(self.SNAPSHOT_CLASS_NAME)
         self.k8s_utils.create_volume_snapshot_class(self.SNAPSHOT_CLASS_NAME)
         sleep_n_sec(5)
@@ -3321,13 +3346,14 @@ def _create_pvcs_deferred(self, count: int):
         self._ensure_k8s_utils()
         for i in range(count):
             pvc_name = f"pvc-{_rand_seq(12)}"
+            sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME])
             self.logger.info(
                 f"[deferred_create] Creating PVC {pvc_name} "
-                f"({i+1}/{count}) — will bind after recovery"
+                f"({i+1}/{count}) SC={sc_name} — will bind after recovery"
             )
             try:
                 self.k8s_utils.create_pvc(
-                    pvc_name, self.pvc_size, self.STORAGE_CLASS_NAME,
+                    pvc_name, self.pvc_size, sc_name,
                 )
             except Exception as exc:
                 self.logger.warning(
@@ -3579,10 +3605,11 @@ def _create_permanent_snapshots_and_clones(self):
                 self._snapshot_lvol_ids() if self.use_client_fio else set()
             )
 
-            # Create clone PVC — use same StorageClass as source PVC
+            # Create clone PVC — use same StorageClass/fs_type as source PVC
             clone_sc = self.pvc_details.get(pvc_name, {}).get(
                 "storage_class", self.STORAGE_CLASS_NAME
             )
+            clone_fs_type = self.pvc_details.get(pvc_name, {}).get("fs_type", "ext4")
             sleep_n_sec(10)
             try:
                 self.k8s_utils.create_clone_pvc(
@@ -3659,6 +3686,7 @@ def _create_permanent_snapshots_and_clones(self):
                     "client": client,
                     "log_file": log_file,
                     "storage_class": clone_sc,
+                    "fs_type": clone_fs_type,
                 }
                 self.clone_mount_details[clone_lvol_name] = {
                     "ID": clone_lvol_id,
@@ -3702,6 +3730,7 @@ def _create_permanent_snapshots_and_clones(self):
                     "job_name": clone_job,
                     "configmap_name": clone_cm,
                     "storage_class": clone_sc,
+                    "fs_type": clone_fs_type,
                 }
 
             self.logger.info(
@@ -4120,6 +4149,14 @@ def run(self):
             ndcs=self.ndcs,
             npcs=self.npcs,
         )
+        self.k8s_utils.create_storage_class(
+            name=self.XFS_STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=self.pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+            fs_type="xfs",
+        )
         if self.tls_enabled:
             self.logger.info("TLS enabled — ensuring encryption pool exists")
             self.sbcli_utils.ensure_pool_exists(
diff --git a/e2e/stress_test/continuous_parallel_lvol_snapshot_clone.py b/e2e/stress_test/continuous_parallel_lvol_snapshot_clone.py
old mode 100644
new mode 100755
index 7285b2354..96d6a7689
--- a/e2e/stress_test/continuous_parallel_lvol_snapshot_clone.py
+++ b/e2e/stress_test/continuous_parallel_lvol_snapshot_clone.py
@@ -1,8 +1,11 @@
+import json as _json
 import os
 import time
 import threading
 from collections import deque
 from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime, timezone
+from pathlib import Path
 
 from e2e_tests.cluster_test_base import TestClusterBase, generate_random_sequence
 from utils.common_utils import sleep_n_sec
@@ -94,6 +97,11 @@ def __init__(self, **kwargs):
         # clone_registry[clone_name] = { id, client, mount_path, snap_name, delete_state }
         self._clone_registry = {}
 
+        # Per-operation timing: list of (wall_ts, op_type, duration_sec, ok)
+        self._op_events: list[tuple] = []
+        # Inventory timeline: list of (wall_ts, lvols, snapshots, clones)
+        self._inventory_timeline: list[tuple] = []
+
         # Metrics
         self._metrics = {
             "start_ts": None,
@@ -158,6 +166,33 @@ def _inc(self, bucket: str, key: str, n: int = 1):
         with self._lock:
             self._metrics[bucket][key] += n
 
+    def _record_op(self, op: str, duration: float, ok: bool):
+        """Append one (wall-clock timestamp, op, duration, ok) tuple to self._op_events."""
+        with self._lock:  # same lock _inc() takes, so worker threads can call this concurrently
+            self._op_events.append((time.time(), op, duration, ok))  # timestamp is taken when this method runs
+
+    def _snapshot_inventory(self):
+        """Add a (wall_ts, lvols, snapshots, clones) sample to the inventory timeline."""
+        with self._lock:
+            # The timestamp and all three registry sizes are read while the
+            # lock is held, then stored as one flat tuple.
+            now = time.time()
+            registries = (self._lvol_registry, self._snap_registry, self._clone_registry)
+            sample = (now, *(len(registry) for registry in registries))
+            self._inventory_timeline.append(sample)
+
+    def _timed(self, op: str, fn, *args, **kwargs):
+        """Run fn(*args, **kwargs), report (op, duration, ok) to _record_op, and pass the result or exception through."""
+        started = time.perf_counter()  # monotonic clock: the duration cannot be skewed by wall-clock (NTP) steps
+        ok = False  # flipped only after fn returns, so BaseException exits are not logged as successes
+        try:
+            result = fn(*args, **kwargs)
+            ok = True
+            return result
+        finally:
+            # Runs on success and on any exception; an exception keeps propagating to the caller.
+            self._record_op(op, time.perf_counter() - started, ok)
+
     def _set_failure(self, op: str, exc: Exception, details: str = "", ctx: dict = None, api_err: dict = None):
         with self._lock:
             if self._metrics["failure_info"] is None:
@@ -1028,7 +1063,7 @@ def _submit_creates(self, ex, create_f: dict, idx_counter: dict):
             idx = idx_counter["idx"]
             idx_counter["idx"] += 1
             lvol_name = f"lvl{generate_random_sequence(15)}_{idx}_{int(time.time())}"
-            f = ex.submit(lambda i=idx, n=lvol_name: self._task_create_lvol(i, n))
+            f = ex.submit(lambda i=idx, n=lvol_name: self._timed("create_lvol", self._task_create_lvol, i, n))
             create_f[f] = time.time()
 
     def _submit_snapshots(self, ex, snap_f: dict):
@@ -1053,7 +1088,7 @@ def _submit_snapshots(self, ex, snap_f: dict):
 
             lvol_name, lvol_id = candidate
             snap_name = f"snap{generate_random_sequence(15)}_{int(time.time())}"
-            f = ex.submit(lambda ln=lvol_name, lid=lvol_id, sn=snap_name: self._task_create_snapshot(ln, lid, sn))
+            f = ex.submit(lambda ln=lvol_name, lid=lvol_id, sn=snap_name: self._timed("create_snapshot", self._task_create_snapshot, ln, lid, sn))
             snap_f[f] = time.time()
 
     def _submit_clones(self, ex, clone_f: dict):
@@ -1079,7 +1114,7 @@ def _submit_clones(self, ex, clone_f: dict):
             snap_name, snap_id = candidate
             idx = int(time.time())
             clone_name = f"cln{generate_random_sequence(15)}_{idx}_{int(time.time())}"
-            f = ex.submit(lambda s=snap_name, sid=snap_id, i=idx, cn=clone_name: self._task_create_clone(s, sid, i, cn))
+            f = ex.submit(lambda s=snap_name, sid=snap_id, i=idx, cn=clone_name: self._timed("create_clone", self._task_create_clone, s, sid, i, cn))
             clone_f[f] = time.time()
 
     def _submit_snapshot_delete_trees(self, ex, snap_del_f: dict):
@@ -1088,7 +1123,7 @@ def _submit_snapshot_delete_trees(self, ex, snap_del_f: dict):
                 if not self._snapshot_delete_tree_q:
                     return
                 sn = self._snapshot_delete_tree_q.popleft()
-            f = ex.submit(lambda sn=sn: self._task_delete_snapshot_tree(sn))
+            f = ex.submit(lambda sn=sn: self._timed("delete_snapshot_tree", self._task_delete_snapshot_tree, sn))
             snap_del_f[f] = time.time()
 
     def _submit_lvol_delete_trees(self, ex, lvol_del_f: dict):
@@ -1097,7 +1132,7 @@ def _submit_lvol_delete_trees(self, ex, lvol_del_f: dict):
                 if not self._lvol_delete_tree_q:
                     return
                 ln = self._lvol_delete_tree_q.popleft()
-            f = ex.submit(lambda ln=ln: self._task_delete_lvol_tree(ln))
+            f = ex.submit(lambda ln=ln: self._timed("delete_lvol_tree", self._task_delete_lvol_tree, ln))
             lvol_del_f[f] = time.time()
 
     def _update_peaks(self, create_f, snap_f, clone_f, snap_del_f, lvol_del_f):
@@ -1194,6 +1229,269 @@ def _print_summary(self):
 
             self.logger.info("===========================================================")
 
+    # ----------------------------
+    # Monitoring JSON + Charts
+    # ----------------------------
+    def _write_monitoring_json(self):
+        """Write run metrics, latency summaries, per-minute throughput, raw op events and the inventory timeline to a JSON file under logs/."""
+        out_dir = Path("logs")  # relative path: resolved against the current working directory
+        out_dir.mkdir(parents=True, exist_ok=True)
+
+        with self._lock:  # the whole report is assembled under the lock
+            start_ts = self._metrics["start_ts"] or time.time()  # "now" stands in when the metric is unset
+            end_ts = self._metrics["end_ts"] or time.time()
+            dur = end_ts - start_ts
+
+            # Build per-operation latency summaries
+            op_latencies: dict[str, list[float]] = {}
+            for _, op, duration, ok in self._op_events:
+                if ok:  # failed operations are left out of the latency summary
+                    op_latencies.setdefault(op, []).append(duration)
+
+            op_summary = {}
+            for op, lats in op_latencies.items():
+                lats_sorted = sorted(lats)
+                n = len(lats_sorted)  # always >= 1: a list is only created together with its first append
+                op_summary[op] = {
+                    "count": n,
+                    "min": round(lats_sorted[0], 2) if n else 0,
+                    "max": round(lats_sorted[-1], 2) if n else 0,
+                    "avg": round(sum(lats_sorted) / n, 2) if n else 0,
+                    "p50": round(lats_sorted[n // 2], 2) if n else 0,  # index pick, no interpolation
+                    "p90": round(lats_sorted[int(n * 0.9)], 2) if n else 0,
+                    "p99": round(lats_sorted[int(n * 0.99)], 2) if n else 0,
+                }
+
+            # Throughput: ops/min buckets
+            if self._op_events:
+                bucket_size = 60  # 1-minute buckets
+                throughput_buckets: dict[int, dict[str, int]] = {}
+                for ts, op, _, ok in self._op_events:
+                    if ok:  # only successful operations count towards throughput
+                        bucket = int((ts - start_ts) // bucket_size)  # minute index relative to start_ts
+                        throughput_buckets.setdefault(bucket, {})
+                        throughput_buckets[bucket][op] = throughput_buckets[bucket].get(op, 0) + 1
+                throughput_timeline = [
+                    {"minute": b, **counts}  # one entry per minute that had ops; op names become keys
+                    for b, counts in sorted(throughput_buckets.items())
+                ]
+            else:
+                throughput_timeline = []
+
+            report = {
+                "test_class": self.__class__.__name__,
+                "timestamp": datetime.now(timezone.utc).isoformat(),
+                "status": "passed" if not self._metrics["failure_info"] else "failed",
+                "duration_sec": round(dur, 2),
+                "geometry": {"ndcs": self.ndcs, "npcs": self.npcs},
+                "config": {
+                    "create_inflight": self.CREATE_INFLIGHT,
+                    "snapshot_inflight": self.SNAPSHOT_INFLIGHT,
+                    "clone_inflight": self.CLONE_INFLIGHT,
+                    "total_inventory_max": self.TOTAL_INVENTORY_MAX,
+                    "total_delete_threshold": self.TOTAL_DELETE_THRESHOLD,
+                    "lvol_size": self.LVOL_SIZE,
+                },
+                "counts": dict(self._metrics["counts"]),  # dict(...) copies, so the report does not alias live metrics
+                "attempts": dict(self._metrics["attempts"]),
+                "success": dict(self._metrics["success"]),
+                "failures": dict(self._metrics["failures"]),
+                "peak_inflight": dict(self._metrics["peak_inflight"]),
+                "op_latency_summary": op_summary,
+                "throughput_per_minute": throughput_timeline,
+                "op_events": [  # every event, including failed ones (ok=False)
+                    {"ts": round(ts - start_ts, 2), "op": op,
+                     "duration": round(d, 2), "ok": ok}
+                    for ts, op, d, ok in self._op_events
+                ],
+                "inventory_timeline": [  # ts is seconds since start_ts, like op_events
+                    {"ts": round(ts - start_ts, 2), "lvols": lv,
+                     "snapshots": sn, "clones": cl}
+                    for ts, lv, sn, cl in self._inventory_timeline
+                ],
+            }
+
+        out_path = out_dir / "parallel_lvol_snapshot_clone_timing.json"  # file I/O happens after the lock is released
+        with open(out_path, "w") as f:
+            _json.dump(report, f, indent=2)
+        self.logger.info(f"Monitoring JSON written to {out_path}")
+
+    def _generate_charts(self):
+        """Render up to five PNG charts into logs/ from the collected timing data; a chart that fails is logged and skipped."""
+        out_dir = Path("logs")
+        out_dir.mkdir(parents=True, exist_ok=True)
+
+        try:
+            import matplotlib
+            matplotlib.use("Agg")  # non-interactive backend, so no display is needed
+            import matplotlib.pyplot as plt
+        except ImportError:
+            self.logger.warning("matplotlib not available — skipping charts")
+            return
+
+        with self._lock:
+            op_events = list(self._op_events)
+            inv_timeline = list(self._inventory_timeline)
+            counts = dict(self._metrics["counts"])
+            # Never fall back to epoch 0: Chart 3 would then build one bucket per minute since 1970.
+            start_ts = self._metrics["start_ts"] or min((r[0] for r in op_events + inv_timeline), default=time.time())
+        class_name = self.__class__.__name__
+
+        # --- Chart 1: Operation latency scatter ---
+        try:
+            if op_events:
+                fig, ax = plt.subplots(figsize=(14, 6))
+                op_colors = {
+                    "create_lvol": "#3498db",
+                    "create_snapshot": "#2ecc71",
+                    "create_clone": "#f39c12",
+                    "delete_snapshot_tree": "#e74c3c",
+                    "delete_lvol_tree": "#9b59b6",
+                }
+                for op, color in op_colors.items():  # ops missing from op_colors are not plotted
+                    pts = [(ts - start_ts, d) for ts, o, d, ok in op_events if o == op and ok]
+                    if pts:
+                        xs, ys = zip(*pts)
+                        ax.scatter(xs, ys, c=color, alpha=0.5, s=12, label=op)
+                ax.set_xlabel("Time (seconds since start)")
+                ax.set_ylabel("Duration (seconds)")
+                ax.set_title(f"{class_name} — Operation Latency Over Time")
+                ax.legend(fontsize=8, loc="upper right")
+                ax.grid(True, alpha=0.3)
+                plt.tight_layout()
+                fig.savefig(str(out_dir / "op_latency_scatter.png"), dpi=150)
+                plt.close(fig)
+                self.logger.info("Chart saved: op_latency_scatter.png")
+        except Exception as exc:
+            self.logger.warning(f"Latency scatter chart failed: {exc}")
+
+        # --- Chart 2: Inventory timeline (stacked area) ---
+        try:
+            if inv_timeline:
+                ts_vals = [t - start_ts for t, _, _, _ in inv_timeline]
+                lvols = [lv for _, lv, _, _ in inv_timeline]
+                snaps = [sn for _, _, sn, _ in inv_timeline]
+                clones = [cl for _, _, _, cl in inv_timeline]
+
+                fig, ax = plt.subplots(figsize=(14, 5))
+                ax.stackplot(ts_vals, lvols, snaps, clones,
+                             labels=["LVols", "Snapshots", "Clones"],
+                             colors=["#3498db", "#2ecc71", "#f39c12"], alpha=0.7)
+                ax.axhline(y=self.TOTAL_INVENTORY_MAX, color="red",
+                           linestyle="--", alpha=0.6, label=f"Max ({self.TOTAL_INVENTORY_MAX})")
+                ax.axhline(y=self.TOTAL_DELETE_THRESHOLD, color="orange",
+                           linestyle="--", alpha=0.6, label=f"Delete threshold ({self.TOTAL_DELETE_THRESHOLD})")
+                ax.set_xlabel("Time (seconds since start)")
+                ax.set_ylabel("Count")
+                ax.set_title(f"{class_name} — Inventory Over Time")
+                ax.legend(fontsize=8, loc="upper left")
+                ax.grid(True, alpha=0.3)
+                plt.tight_layout()
+                fig.savefig(str(out_dir / "inventory_timeline.png"), dpi=150)
+                plt.close(fig)
+                self.logger.info("Chart saved: inventory_timeline.png")
+        except Exception as exc:
+            self.logger.warning(f"Inventory timeline chart failed: {exc}")
+
+        # --- Chart 3: Throughput (ops/min bar chart) ---
+        try:
+            if op_events:
+                bucket_size = 60
+                buckets: dict[int, dict[str, int]] = {}
+                for ts, op, _, ok in op_events:
+                    if ok:  # only successful operations are counted
+                        b = int((ts - start_ts) // bucket_size)
+                        buckets.setdefault(b, {})
+                        buckets[b][op] = buckets[b].get(op, 0) + 1
+
+                if buckets:
+                    max_bucket = max(buckets.keys())
+                    minutes = list(range(max_bucket + 1))  # also covers minutes with no completed ops (zero-height bars)
+                    op_types = sorted({op for c in buckets.values() for op in c})
+                    op_colors_list = ["#3498db", "#2ecc71", "#f39c12", "#e74c3c", "#9b59b6"]
+
+                    fig, ax = plt.subplots(figsize=(14, 5))
+                    bottom = [0] * len(minutes)
+                    for i, op in enumerate(op_types):
+                        vals = [buckets.get(m, {}).get(op, 0) for m in minutes]
+                        color = op_colors_list[i % len(op_colors_list)]
+                        ax.bar(minutes, vals, bottom=bottom, label=op,
+                               color=color, alpha=0.8, width=0.8)
+                        bottom = [b + v for b, v in zip(bottom, vals)]  # stack the next op on top of this one
+                    ax.set_xlabel("Minute")
+                    ax.set_ylabel("Completed Operations")
+                    ax.set_title(f"{class_name} — Throughput (ops/min)")
+                    ax.legend(fontsize=8, loc="upper right")
+                    ax.grid(True, axis="y", alpha=0.3)
+                    plt.tight_layout()
+                    fig.savefig(str(out_dir / "throughput_per_minute.png"), dpi=150)
+                    plt.close(fig)
+                    self.logger.info("Chart saved: throughput_per_minute.png")
+        except Exception as exc:
+            self.logger.warning(f"Throughput chart failed: {exc}")
+
+        # --- Chart 4: Operations summary (total counts bar) ---
+        try:
+            creates = [
+                ("LVols created", counts.get("lvols_created", 0)),
+                ("Snapshots created", counts.get("snapshots_created", 0)),
+                ("Clones created", counts.get("clones_created", 0)),
+            ]
+            deletes = [
+                ("LVols deleted", counts.get("lvols_deleted", 0)),
+                ("Snapshots deleted", counts.get("snapshots_deleted", 0)),
+                ("Clones deleted", counts.get("clones_deleted", 0)),
+            ]
+            labels = [c[0] for c in creates] + [d[0] for d in deletes]
+            values = [c[1] for c in creates] + [d[1] for d in deletes]
+            colors = ["#3498db", "#2ecc71", "#f39c12", "#e74c3c", "#c0392b", "#d35400"]
+
+            fig, ax = plt.subplots(figsize=(10, 5))
+            bars = ax.bar(range(len(labels)), values, color=colors, alpha=0.8)
+            ax.set_xticks(range(len(labels)))
+            ax.set_xticklabels(labels, rotation=30, ha="right", fontsize=9)
+            ax.set_ylabel("Count")
+            ax.set_title(f"{class_name} — Total Operations")
+            for b, v in zip(bars, values):
+                if v > 0:  # zero-height bars get no value label
+                    ax.text(b.get_x() + b.get_width() / 2,
+                            b.get_height() + max(values) * 0.02,
+                            str(v), ha="center", va="bottom", fontsize=9)
+            ax.grid(True, axis="y", alpha=0.3)
+            plt.tight_layout()
+            fig.savefig(str(out_dir / "operations_summary.png"), dpi=150)
+            plt.close(fig)
+            self.logger.info("Chart saved: operations_summary.png")
+        except Exception as exc:
+            self.logger.warning(f"Operations summary chart failed: {exc}")
+
+        # --- Chart 5: Latency box plot per operation ---
+        try:
+            op_latencies: dict[str, list[float]] = {}
+            for _, op, d, ok in op_events:
+                if ok:
+                    op_latencies.setdefault(op, []).append(d)
+
+            if op_latencies:
+                fig, ax = plt.subplots(figsize=(10, 5))
+                ops = sorted(op_latencies.keys())
+                data = [op_latencies[op] for op in ops]
+                bp = ax.boxplot(data, tick_labels=ops, patch_artist=True)  # NOTE(review): tick_labels needs matplotlib >= 3.9 (older releases use labels=) — confirm CI version
+                box_colors = ["#3498db", "#2ecc71", "#f39c12", "#e74c3c", "#9b59b6"]
+                for i, patch in enumerate(bp["boxes"]):
+                    patch.set_facecolor(box_colors[i % len(box_colors)])
+                    patch.set_alpha(0.7)
+                ax.set_ylabel("Duration (seconds)")
+                ax.set_title(f"{class_name} — Latency Distribution Per Operation")
+                ax.tick_params(axis="x", rotation=30)
+                ax.grid(True, axis="y", alpha=0.3)
+                plt.tight_layout()
+                fig.savefig(str(out_dir / "latency_boxplot.png"), dpi=150)
+                plt.close(fig)
+                self.logger.info("Chart saved: latency_boxplot.png")
+        except Exception as exc:
+            self.logger.warning(f"Latency box plot failed: {exc}")
+
     # ----------------------------
     # Main
     # ----------------------------
@@ -1248,6 +1546,9 @@ def run(self):
                         self._submit_snapshot_delete_trees(ex, snap_del_f)
                         self._submit_lvol_delete_trees(ex, lvol_del_f)
 
+                    # Record inventory snapshot every loop iteration
+                    self._snapshot_inventory()
+
                     # Update peaks and harvest
                     self._update_peaks(create_f, snap_f, clone_f, snap_del_f, lvol_del_f)
                     self._harvest_fail_fast(create_f)
@@ -1270,6 +1571,8 @@ def run(self):
 
         finally:
             self._print_summary()
+            self._write_monitoring_json()
+            self._generate_charts()
 
         with self._lock:
             failure_info = self._metrics["failure_info"]
diff --git a/e2e/stress_test/continuous_parallel_namespace_lvol.py b/e2e/stress_test/continuous_parallel_namespace_lvol.py
index cef2a8f8d..a6c3f8b50 100755
--- a/e2e/stress_test/continuous_parallel_namespace_lvol.py
+++ b/e2e/stress_test/continuous_parallel_namespace_lvol.py
@@ -1,10 +1,14 @@
 """
 Parallel Namespace LVol Stress Test (Docker + K8s)
 
-Creates 300 parent lvols each with 6 namespace partitions (1800 total),
-takes 2 snapshots per lvol (3600 total), clones 1 picked snapshot 1500 times,
-then deletes everything in parallel — with verified deletion.  Repeats for
-NUM_ITERATIONS cycles to measure latency degradation over time.
+Creates NUM_PARENTS parent lvols, each with CHILDREN_PER_PARENT namespace
+children (defaults: 20 × 25 = 500 children, 520 lvols in total), writes 10 MB of
+data to each parent, takes 2 snapshots per parent (+ 1 random child), clones 1
+picked snapshot 1500 times, verifies everything, then deletes in parallel — with
+verified deletion.  Repeats for NUM_ITERATIONS cycles to measure latency degradation over time.
+
+**Sequential per-parent flow**: for each parent, all CHILDREN_PER_PARENT children
+are created and verified before moving to the next parent.  Any failure aborts the test.
 
 Two variants:
   - TestParallelNamespaceLvolDocker: sbcli API (add_lvol with namespace=)
@@ -23,8 +27,10 @@
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
+from datetime import datetime, timezone
 from e2e_tests.cluster_test_base import TestClusterBase
 from utils.common_utils import sleep_n_sec
+from utils.ssh_utils import RunnerK8sLog
 
 try:
     import requests
@@ -47,12 +53,12 @@ def __init__(self, **kwargs):
         super().__init__(**kwargs)
 
         # ── Scale ──────────────────────────────────────────────────────────
-        self.NUM_PARENTS = 300
-        self.NAMESPACES_PER_PARENT = 100     # max_namespace_per_subsys
-        self.CHILDREN_PER_PARENT = 5         # 300 × 5 = 1500 children
-        self.SNAPSHOTS_PER_LVOL = 2          # per parent + 1 random child
+        self.NUM_PARENTS = 20
+        self.NAMESPACES_PER_PARENT = 26      # max_namespace_per_subsys (parent + 25 children)
+        self.CHILDREN_PER_PARENT = 25        # 20 × 25 = 500 children
+        self.SNAPSHOTS_PER_LVOL = 2          # per parent + 1 random child → ~42 total
         self.NUM_CLONES = 1500               # from 1 picked snapshot
-        self.NUM_ITERATIONS = 20
+        self.NUM_ITERATIONS = 1
 
         # ── Sizing ─────────────────────────────────────────────────────────
         self.LVOL_SIZE = "1G"
@@ -63,14 +69,18 @@ def __init__(self, **kwargs):
         self.MAX_WORKERS_DELETE = 30
         self.BATCH_SIZE = 50
         self.TASK_TIMEOUT = 300
+        self.PARALLEL_PARENTS = 10           # concurrent parents during child creation
+        self.CLONE_BATCH_SIZE = 250          # clone creation batch size for stats
+        self.CLONE_BIND_TIMEOUT = 3600       # 1 hour — large clone batches queue in CSI
 
         # ── Retry ─────────────────────────────────────────────────────────
         self.RETRY_MAX = 10
-        self.RETRY_INTERVAL = 5
+        self.RETRY_INTERVAL = 30
 
         # ── Thread-safe state ─────────────────────────────────────────────
         self._lock = threading.Lock()
         self._stop_event = threading.Event()
+        self._clones_binding = 0             # how many clones waiting for Bound right now
 
         # parent_name -> {id, children: [child_name], snapshots: [snap_name]}
         self._parent_registry = {}
@@ -83,8 +93,10 @@ def __init__(self, **kwargs):
 
         # ── Timing samples ────────────────────────────────────────────────
         self._timing_samples = []   # list of dicts
+        self._batch_timings = []    # batch-level summaries for graphs
         self._iteration_timings = []  # per-iteration phase durations
         self._current_iteration = 0
+        self._snapshot_child = None  # pre-selected child for snapshot (set in write_data)
 
         # ── Metrics ───────────────────────────────────────────────────────
         self._metrics = {
@@ -133,15 +145,55 @@ def _snapshot_inventory(self) -> dict:
                 "clones": clones, "total": lvols + snaps + clones,
             }
 
-    def _record_timing(self, op: str, name: str, elapsed: float, inventory: dict):
+    def _record_timing(self, op: str, name: str, elapsed: float,
+                       inventory: dict, api_elapsed: float = None):  # api_elapsed is optional; None leaves the extra key out
         with self._lock:
-            self._timing_samples.append({
+            sample = {  # one record per operation, tagged with the current iteration
                 "iteration": self._current_iteration,
                 "op": op,
                 "name": name,
                 "elapsed_sec": round(elapsed, 4),
                 "inventory": inventory,
                 "timestamp": time.time(),
+            }
+            if api_elapsed is not None:  # "api_elapsed_sec" exists only when a value was supplied
+                sample["api_elapsed_sec"] = round(api_elapsed, 4)
+            self._timing_samples.append(sample)  # still inside the lock
+
+    def _log_op_stats(self, op: str, batch_label: str = "",
+                      batch_elapsed: float = 0, count: int = 0):
+        """Log avg/p50/p95/min/max of this op's elapsed_sec samples for the current iteration and append the same summary to self._batch_timings."""
+        with self._lock:
+            samples = [  # NOTE(review): every sample of this op in the iteration, not only the batch named by batch_label
+                s["elapsed_sec"] for s in self._timing_samples
+                if s["iteration"] == self._current_iteration and s["op"] == op
+            ]
+        if not samples:  # nothing recorded: no log line and no summary row
+            return
+        samples_sorted = sorted(samples)
+        n = len(samples_sorted)
+        avg = sum(samples_sorted) / n
+        p50 = samples_sorted[n // 2]  # index pick, no interpolation
+        p95 = samples_sorted[min(int(n * 0.95), n - 1)]  # index clamped to the last element
+        mn, mx = samples_sorted[0], samples_sorted[-1]
+        tag = f" ({batch_label})" if batch_label else ""
+        self.logger.info(
+            f"[{op}]{tag}: {count or n} ops in {batch_elapsed:.1f}s — "
+            f"avg={avg:.2f}s p50={p50:.2f}s p95={p95:.2f}s "
+            f"min={mn:.2f}s max={mx:.2f}s"
+        )
+        with self._lock:  # lock is re-acquired; it was released while computing and logging
+            self._batch_timings.append({
+                "iteration": self._current_iteration,
+                "op": op,
+                "batch_label": batch_label,
+                "batch_elapsed": round(batch_elapsed, 2),
+                "count": count or n,  # a caller-supplied count wins; 0 falls back to the sample count
+                "avg": round(avg, 4),
+                "p50": round(p50, 4),
+                "p95": round(p95, 4),
+                "min": round(mn, 4),
+                "max": round(mx, 4),
             })
 
     # ── API error helpers (reused from existing parallel test) ────────────
@@ -184,6 +236,14 @@ def _is_sync_deletion_error(self, api_err: dict) -> bool:
         msg = (api_err.get("msg") or "").lower()
         return "lvol sync deletion found" in text or "lvol sync deletion found" in msg
 
+    def _is_already_exists_error(self, api_err: dict) -> bool:
+        """True when the API error reports a name clash ('must be unique' or
+        'already exists') in its ``text`` or ``msg`` field, any letter case."""
+        fields = [(api_err.get(key) or "").lower() for key in ("text", "msg")]
+        markers = ("must be unique", "already exists")
+        # Callers read a hit as "an earlier attempt already created it".
+        return any(marker in field for marker in markers for field in fields)
+
     def _api_retry(self, op: str, fn, ctx: dict = None):
         """Call fn() with retry.  Returns fn() result on success."""
         ctx = ctx or {}
@@ -196,6 +256,14 @@ def _api_retry(self, op: str, fn, ctx: dict = None):
                     self._inc("failures", op)
                     self.logger.warning(f"[max_lvols] op={op} ctx={ctx}")
                     raise
+                # "Name must be unique" means a prior attempt actually
+                # succeeded — treat as success, not failure
+                if self._is_already_exists_error(api_err):
+                    self.logger.info(
+                        f"[retry] op={op} resource already exists "
+                        f"(prior attempt succeeded): ctx={ctx}"
+                    )
+                    return None  # treat as success
                 if attempt < self.RETRY_MAX:
                     self.logger.warning(
                         f"[retry] op={op} attempt {attempt}/{self.RETRY_MAX} "
@@ -251,6 +319,230 @@ def _wait_snapshot_gone(self, snap_name: str, timeout: int = 120) -> float:
         self.logger.warning(f"snapshot {snap_name} still exists after {timeout}s")
         return time.time() - start
 
+    # ── Verification helpers ──────────────────────────────────────────────
+
+    def _verify_all_lvols_exist(self):
+        """Check that every registered parent and child is in the lvol list.
+
+        While more than 50% are missing, re-polls every 30s for up to 30
+        minutes.  Missing lvols are logged as a warning; RuntimeError is
+        raised only if more than 50% are still missing at the end.
+        """
+        poll_interval = 30
+        max_wait = 1800  # 30 minutes
+        waited = 0
+
+        with self._lock:
+            total = len(self._parent_registry) + len(self._child_registry)
+
+        while True:
+            all_lvols = self.sbcli_utils.list_lvols()
+            # Scan both registries under the lock, parents first.
+            with self._lock:
+                missing = [
+                    ("parent", name) for name in self._parent_registry
+                    if name not in all_lvols
+                ] + [
+                    ("child", name) for name in self._child_registry
+                    if name not in all_lvols
+                ]
+
+            miss_pct = len(missing) * 100 / max(total, 1)
+            # Done when within tolerance or the wait budget is spent.
+            if miss_pct <= 50 or waited >= max_wait:
+                break
+            self.logger.info(
+                f"[verify_lvols] {len(missing)}/{total} ({miss_pct:.1f}%) "
+                f"lvols missing, waiting {poll_interval}s... "
+                f"(waited {waited}s/{max_wait}s)"
+            )
+            sleep_n_sec(poll_interval)
+            waited += poll_interval
+
+        n_missing = len(missing)
+        if n_missing:
+            preview = f"{missing[:10]}{'...' if n_missing > 10 else ''}"
+            self.logger.warning(
+                f"[verify_lvols] {n_missing}/{total} ({miss_pct:.1f}%) "
+                f"lvols missing from API after {waited}s wait: "
+                f"{preview}"
+            )
+        if miss_pct > 50:
+            raise RuntimeError(
+                f"[verify_lvols] {miss_pct:.1f}% lvols missing exceeds "
+                f"50% threshold — {n_missing}/{total}"
+            )
+        self.logger.info(
+            f"[verify_lvols] {total - n_missing}/{total} lvols "
+            f"confirmed in API"
+        )
+
+    def _verify_all_snapshots_exist(self):
+        """Check that every registered snapshot is in the snapshot list.
+
+        While more than 50% are missing, re-polls every 30s for up to 30
+        minutes.  Missing snapshots are logged as a warning; RuntimeError
+        is raised only if more than 50% are still missing at the end.
+        """
+        poll_interval = 30
+        max_wait = 1800  # 30 minutes
+        waited = 0
+
+        with self._lock:
+            total = len(self._snap_registry)
+
+        while True:
+            all_snaps = self.sbcli_utils.list_snapshots()
+            # Scan the registry under the lock.
+            with self._lock:
+                missing = [
+                    name for name in self._snap_registry
+                    if name not in all_snaps
+                ]
+
+            miss_pct = len(missing) * 100 / max(total, 1)
+            # Done when within tolerance or the wait budget is spent.
+            if miss_pct <= 50 or waited >= max_wait:
+                break
+            self.logger.info(
+                f"[verify_snapshots] {len(missing)}/{total} ({miss_pct:.1f}%) "
+                f"snapshots missing, waiting {poll_interval}s... "
+                f"(waited {waited}s/{max_wait}s)"
+            )
+            sleep_n_sec(poll_interval)
+            waited += poll_interval
+
+        n_missing = len(missing)
+        if n_missing:
+            preview = f"{missing[:10]}{'...' if n_missing > 10 else ''}"
+            self.logger.warning(
+                f"[verify_snapshots] {n_missing}/{total} ({miss_pct:.1f}%) "
+                f"snapshots missing after {waited}s wait: "
+                f"{preview}"
+            )
+        if miss_pct > 50:
+            raise RuntimeError(
+                f"[verify_snapshots] {miss_pct:.1f}% snapshots missing "
+                f"exceeds 50% threshold — {n_missing}/{total}"
+            )
+        self.logger.info(
+            f"[verify_snapshots] {total - n_missing}/{total} snapshots "
+            f"confirmed in API"
+        )
+
+    def _verify_all_clones_exist(self):
+        """Check that every registered clone is in the lvol list.
+
+        While more than 50% are missing, re-polls every 30s for up to 30
+        minutes.  Missing clones are logged as a warning; RuntimeError is
+        raised only if more than 50% are still missing at the end.
+        """
+        poll_interval = 30
+        max_wait = 1800  # 30 minutes
+        waited = 0
+
+        with self._lock:
+            total = len(self._clone_registry)
+
+        while True:
+            all_lvols = self.sbcli_utils.list_lvols()
+            # Scan the registry under the lock.
+            with self._lock:
+                missing = [
+                    name for name in self._clone_registry
+                    if name not in all_lvols
+                ]
+
+            miss_pct = len(missing) * 100 / max(total, 1)
+            # Done when within tolerance or the wait budget is spent.
+            if miss_pct <= 50 or waited >= max_wait:
+                break
+            self.logger.info(
+                f"[verify_clones] {len(missing)}/{total} ({miss_pct:.1f}%) "
+                f"clones missing, waiting {poll_interval}s... "
+                f"(waited {waited}s/{max_wait}s)"
+            )
+            sleep_n_sec(poll_interval)
+            waited += poll_interval
+
+        n_missing = len(missing)
+        if n_missing:
+            preview = f"{missing[:10]}{'...' if n_missing > 10 else ''}"
+            self.logger.warning(
+                f"[verify_clones] {n_missing}/{total} ({miss_pct:.1f}%) "
+                f"clones missing from API after {waited}s wait: "
+                f"{preview}"
+            )
+        if miss_pct > 50:
+            raise RuntimeError(
+                f"[verify_clones] {miss_pct:.1f}% clones missing exceeds "
+                f"50% threshold — {n_missing}/{total}"
+            )
+        self.logger.info(
+            f"[verify_clones] {total - n_missing}/{total} clones "
+            f"confirmed in API"
+        )
+
+    def _phase_mount_verify_clones(self):
+        """Spot-check a random sample of clones via mount + FIO read.
+
+        Samples up to 20 names from the clone registry and runs
+        ``_mount_verify_single_clone`` on each through ``_batch_parallel``.
+        Raises RuntimeError when at least one verification fails; only
+        logs and returns when the registry is empty.
+        """
+        with self._lock:
+            candidates = list(self._clone_registry.keys())
+        if not candidates:
+            self.logger.info("[mount_verify] No clones to verify, skipping")
+            return
+        sample_size = min(20, len(candidates))
+        work_items = [
+            {"clone_name": picked}
+            for picked in random.sample(candidates, sample_size)
+        ]
+        self.logger.info(
+            f"[mount_verify] Verifying {sample_size} clones with FIO read"
+        )
+        workers = min(sample_size, self.MAX_WORKERS_CREATE)
+        ok, fail = self._batch_parallel(
+            work_items, self._mount_verify_single_clone, workers, "mount_verify",
+        )
+        self.logger.info(f"[mount_verify] {ok}/{sample_size} OK, {fail} failed")
+        if fail > 0:
+            raise RuntimeError(
+                f"[mount_verify] {fail}/{sample_size} clone mount+FIO "
+                f"verifications failed. Check logs for FIO err= or "
+                f"connect failures."
+            )
+
+    def _mount_verify_single_clone(self, item):
+        """Subclass hook: mount item["clone_name"], FIO-read it, verify."""
+        raise NotImplementedError
+
+    def _verify_nodes_healthy(self):
+        """Raise RuntimeError unless every storage node reported by the API
+        has status "online" and a ``health_check`` that is exactly True."""
+        nodes = self.sbcli_utils.get_storage_nodes().get("results", [])
+        def describe(node):
+            return (
+                f"{node.get('hostname', '?')}(id={node.get('id', '?')}, "
+                f"status={node.get('status', 'unknown')}, "
+                f"health={node.get('health_check', None)})"
+            )
+        unhealthy = [
+            describe(node) for node in nodes
+            if node.get("status", "unknown") != "online"
+            or node.get("health_check", None) is not True
+        ]
+        if unhealthy:
+            raise RuntimeError(
+                f"[verify_nodes] Unhealthy nodes: {', '.join(unhealthy)}"
+            )
+        self.logger.info(
+            f"[verify_nodes] All {len(nodes)} storage nodes online and healthy"
+        )
+
     # ── Batch parallel execution ──────────────────────────────────────────
 
     def _batch_parallel(self, items, task_fn, max_workers: int, op_name: str):
@@ -306,17 +598,76 @@ def _run_phase(self, name: str, fn):
         except Exception as e:
             self.logger.error(f"[{name}] Phase failed: {e}")
             self._set_failure(name, e, f"Phase {name} failed")
+            self._stop_event.set()
         finally:
             dur = time.time() - start
             self.logger.info(f"=== Phase {name} done in {dur:.1f}s ===")
+            # Flush timing data after every phase so data survives cancellation
+            try:
+                self._flush_timing_data()
+            except Exception:
+                pass
             return dur  # used for iteration timing
 
+    def _flush_timing_data(self):
+        """Best-effort atomic dump of the timing JSON collected so far.
+
+        Runs after every phase.  Writes a temp file then os.replace()s it so a
+        kill mid-write cannot truncate the last good file; I/O errors are logged.
+        """
+        try:
+            out_dir = self._get_log_dir()
+        except Exception:
+            return  # no output directory available, nothing to flush to
+        config_keys = (
+            "NUM_PARENTS", "NAMESPACES_PER_PARENT", "CHILDREN_PER_PARENT",
+            "SNAPSHOTS_PER_LVOL", "NUM_CLONES", "NUM_ITERATIONS",
+            "BATCH_SIZE", "MAX_WORKERS_CREATE", "CLONE_BATCH_SIZE",
+        )
+        with self._lock:  # copy so concurrent appends cannot race the dump
+            samples = list(self._timing_samples)
+            batches = list(self._batch_timings)
+        report = {
+            "config": {key: getattr(self, key) for key in config_keys},
+            "iterations": self._iteration_timings,
+            "samples": samples,
+            "batch_timings": batches,
+            "metrics": self._metrics,
+            "mappings": self._get_registry_mappings(),
+        }
+        path = os.path.join(out_dir, "namespace_stress_timings.json")
+        try:
+            with open(f"{path}.tmp", "w") as f:
+                json.dump(report, f, indent=2, default=str)
+            os.replace(f"{path}.tmp", path)
+        except Exception as exc:
+            self.logger.warning(f"[flush_timing] could not write {path}: {exc}")
+
+    def _get_registry_mappings(self) -> dict:
+        """Return child→parent and clone→snapshot name maps plus the list
+        of parent names, all copied while holding the registry lock."""
+        mappings = {}
+        with self._lock:
+            # Entries without the expected key map to "unknown".
+            mappings["child_to_parent"] = {
+                child: info.get("parent_name", "unknown")
+                for child, info in self._child_registry.items()
+            }
+            mappings["clone_to_snap"] = {
+                clone: info.get("snap_name", "unknown")
+                for clone, info in self._clone_registry.items()
+            }
+            mappings["parent_list"] = list(self._parent_registry)
+        # Only fresh dict/list objects leave the locked section.
+        return mappings
+
     def _clear_registries(self):
         with self._lock:
             self._parent_registry.clear()
             self._child_registry.clear()
             self._snap_registry.clear()
             self._clone_registry.clear()
+            self._snapshot_child = None  # drop the child pre-selected for snapshots
 
     # ── Abstract-like methods (subclasses override) ───────────────────────
 
@@ -326,10 +677,12 @@ def _phase_setup(self):
     def _phase_cleanup(self):
         raise NotImplementedError
 
-    def _create_parent_impl(self, params: dict):
+    def _phase_create_subsystems(self):
+        """Sequential per-parent: create parent + children + verify."""
         raise NotImplementedError
 
-    def _create_child_impl(self, params: dict):
+    def _phase_write_data(self):
+        """Write 10 MB to each parent lvol before snapshotting."""
         raise NotImplementedError
 
     def _create_snapshot_impl(self, params: dict):
@@ -350,6 +703,38 @@ def _delete_child_impl(self, child_name: str):
     def _delete_parent_impl(self, parent_name: str):
         raise NotImplementedError
 
+    def _phase_verify_cleanup(self):
+        """Verify all test resources are gone before next iteration.
+
+        If lvols are still listed, retry bulk deletion (clones, snapshots,
+        lvols; each step best-effort and logged on failure), wait 10s, and
+        raise RuntimeError if lvols are still listed afterwards.
+        """
+        all_lvols = self.sbcli_utils.list_lvols()
+        if all_lvols:
+            self.logger.warning(
+                f"[verify_cleanup] {len(all_lvols)} lvols still present "
+                f"— retrying cleanup"
+            )
+            # Same order as _phase_delete_all: clones, snapshots, then lvols.
+            for step in ("delete_all_clones", "delete_all_snapshots",
+                         "delete_all_lvols"):
+                try:
+                    getattr(self.sbcli_utils, step)()
+                except Exception as e:
+                    self.logger.warning(
+                        "[verify_cleanup] %s failed during retry: %s",
+                        step, e,
+                    )
+            sleep_n_sec(10)
+            remaining = self.sbcli_utils.list_lvols()
+            if remaining:
+                raise RuntimeError(
+                    f"Cleanup verification failed: "
+                    f"{len(remaining)} lvols still exist"
+                )
+        self.logger.info("[verify_cleanup] All resources confirmed deleted")
+
     # ── Timed wrappers (called by _batch_parallel) ───────────────────────
 
     def _timed_create_parent(self, params: dict):
@@ -376,9 +761,12 @@ def _timed_create_snapshot(self, params: dict):
     def _timed_create_clone(self, params: dict):
         inv = self._snapshot_inventory()
         t0 = time.time()
-        self._create_clone_impl(params)
+        api_elapsed = self._create_clone_impl(params)  # impl's return value, forwarded below
         elapsed = time.time() - t0
-        self._record_timing("create_clone", params["name"], elapsed, inv)
+        self._record_timing(
+            "create_clone", params["name"], elapsed, inv,
+            api_elapsed=api_elapsed,
+        )
 
     def _timed_delete_clone(self, clone_name: str):
         inv = self._snapshot_inventory()
@@ -410,35 +798,6 @@ def _timed_delete_parent(self, parent_name: str):
 
     # ── Phase implementations ─────────────────────────────────────────────
 
-    def _phase_create_parents(self):
-        items = []
-        for i in range(self.NUM_PARENTS):
-            name = f"ns-par-{_rand_seq(6)}-{i:04d}"
-            items.append({"name": name, "idx": i})
-        self._batch_parallel(
-            items, self._timed_create_parent,
-            self.MAX_WORKERS_CREATE, "create_parents",
-        )
-
-    def _phase_create_children(self):
-        """Create CHILDREN_PER_PARENT child namespace lvols per parent."""
-        items = []
-        with self._lock:
-            parents = list(self._parent_registry.items())
-        for parent_name, pinfo in parents:
-            parent_id = pinfo["id"]
-            for c in range(self.CHILDREN_PER_PARENT):
-                child_name = f"ns-ch-{_rand_seq(6)}-{parent_name[-4:]}-{c}"
-                items.append({
-                    "name": child_name,
-                    "parent_name": parent_name,
-                    "parent_id": parent_id,
-                })
-        self._batch_parallel(
-            items, self._timed_create_child,
-            self.MAX_WORKERS_CREATE, "create_children",
-        )
-
     def _phase_create_snapshots(self):
         """Create SNAPSHOTS_PER_LVOL snapshots for each parent + 1 random child."""
         items = []
@@ -447,10 +806,12 @@ def _phase_create_snapshots(self):
             snap_lvols = []
             for pname, pinfo in self._parent_registry.items():
                 snap_lvols.append((pname, pinfo["id"]))
-            # Pick 1 random child (if any)
+            # Use pre-selected child (from write_data) or pick a random one
+            chosen_child = getattr(self, "_snapshot_child", None)
             child_names = list(self._child_registry.keys())
-            if child_names:
+            if not chosen_child and child_names:
                 chosen_child = random.choice(child_names)
+            if chosen_child and chosen_child in self._child_registry:
                 cinfo = self._child_registry[chosen_child]
                 snap_lvols.append((chosen_child, cinfo["id"]))
                 self.logger.info(
@@ -469,13 +830,30 @@ def _phase_create_snapshots(self):
             f"[create_snapshots] Creating {len(items)} snapshots "
             f"({len(snap_lvols)} lvols × {self.SNAPSHOTS_PER_LVOL})"
         )
-        self._batch_parallel(
+        snap_t0 = time.time()
+        _ok, fail = self._batch_parallel(
             items, self._timed_create_snapshot,
             self.MAX_WORKERS_CREATE, "create_snapshots",
         )
+        snap_elapsed = time.time() - snap_t0
+        self._log_op_stats(
+            "create_snapshot", batch_label="all snapshots",
+            batch_elapsed=snap_elapsed,
+        )
+        snap_fail_pct = fail * 100 / max(len(items), 1)
+        if fail > 0:
+            self.logger.warning(
+                f"[create_snapshots] {fail}/{len(items)} "
+                f"({snap_fail_pct:.1f}%) snapshots failed"
+            )
+        if snap_fail_pct > 50:
+            raise RuntimeError(
+                f"[create_snapshots] {snap_fail_pct:.1f}% snapshot failures "
+                f"exceeds 50% threshold — {fail}/{len(items)}"
+            )
 
     def _phase_create_clones(self):
-        """Pick 1 random snapshot and create NUM_CLONES clones from it."""
+        """Pick 1 random snapshot and create NUM_CLONES clones in batches."""
         with self._lock:
             snap_names = list(self._snap_registry.keys())
         if not snap_names:
@@ -484,66 +862,273 @@ def _phase_create_clones(self):
         chosen_snap = random.choice(snap_names)
         with self._lock:
             snap_id = self._snap_registry[chosen_snap]["snap_id"]
+            snap_parent = self._snap_registry[chosen_snap].get("lvol_name", "")
+            clone_sc = self._parent_registry.get(snap_parent, {}).get(
+                "storage_class", self.STORAGE_CLASS_NAME
+            )
         self.logger.info(
             f"[create_clones] Chosen snapshot: {chosen_snap} (id={snap_id})"
         )
-        items = []
+        all_items = []
         for i in range(self.NUM_CLONES):
             clone_name = f"cln-{_rand_seq(6)}-{i:04d}"
-            items.append({
+            all_items.append({
                 "name": clone_name,
                 "snap_name": chosen_snap,
                 "snap_id": snap_id,
+                "sc_name": clone_sc,
             })
-        self._batch_parallel(
-            items, self._timed_create_clone,
-            self.MAX_WORKERS_CREATE, "create_clones",
+
+        total_batches = (
+            (len(all_items) + self.CLONE_BATCH_SIZE - 1)
+            // self.CLONE_BATCH_SIZE
+        )
+        overall_t0 = time.time()
+        total_clone_fail = 0
+
+        for batch_idx in range(0, len(all_items), self.CLONE_BATCH_SIZE):
+            batch = all_items[batch_idx:batch_idx + self.CLONE_BATCH_SIZE]
+            batch_num = batch_idx // self.CLONE_BATCH_SIZE + 1
+            self.logger.info(
+                f"[create_clones] Batch {batch_num}/{total_batches}: "
+                f"{len(batch)} clones"
+            )
+            batch_t0 = time.time()
+            _ok, batch_fail = self._batch_parallel(
+                batch, self._timed_create_clone,
+                self.MAX_WORKERS_CREATE,
+                f"create_clones_b{batch_num}",
+            )
+            batch_elapsed = time.time() - batch_t0
+            total_clone_fail += batch_fail
+            with self._lock:
+                still_binding = self._clones_binding
+            if batch_fail > 0:
+                self.logger.warning(
+                    f"[create_clones] Batch {batch_num}: "
+                    f"{batch_fail}/{len(batch)} clones failed "
+                    f"(still_binding={still_binding})"
+                )
+            # Per-batch stats (only for clones created in this batch)
+            with self._lock:
+                batch_samples = [
+                    s["elapsed_sec"] for s in self._timing_samples
+                    if (s["iteration"] == self._current_iteration
+                        and s["op"] == "create_clone"
+                        and s["timestamp"] >= batch_t0)
+                ]
+            if batch_samples:
+                bs = sorted(batch_samples)
+                n = len(bs)
+                throughput = n / batch_elapsed if batch_elapsed > 0 else 0
+                effective_per_clone = batch_elapsed / n if n > 0 else 0
+                self.logger.info(
+                    f"[create_clones] Batch {batch_num} stats: "
+                    f"{n} ops in {batch_elapsed:.1f}s — "
+                    f"avg_wall={sum(bs)/n:.2f}s "
+                    f"p50={bs[n//2]:.2f}s "
+                    f"p95={bs[min(int(n*0.95), n-1)]:.2f}s "
+                    f"min={bs[0]:.2f}s max={bs[-1]:.2f}s | "
+                    f"throughput={throughput:.2f} clones/s "
+                    f"effective_per_clone={effective_per_clone:.2f}s"
+                )
+                with self._lock:
+                    self._batch_timings.append({
+                        "iteration": self._current_iteration,
+                        "op": "create_clone",
+                        "batch_label": f"batch {batch_num}/{total_batches}",
+                        "batch_elapsed": round(batch_elapsed, 2),
+                        "count": n,
+                        "avg_wall": round(sum(bs) / n, 4),
+                        "p50": round(bs[n // 2], 4),
+                        "p95": round(bs[min(int(n * 0.95), n - 1)], 4),
+                        "min": round(bs[0], 4),
+                        "max": round(bs[-1], 4),
+                        "throughput_per_sec": round(throughput, 4),
+                        "effective_per_clone": round(effective_per_clone, 4),
+                    })
+
+        overall_elapsed = time.time() - overall_t0
+        self._log_op_stats(
+            "create_clone", batch_label="all clones",
+            batch_elapsed=overall_elapsed,
         )
 
+        # Overall clone failure check
+        clone_fail_pct = total_clone_fail * 100 / max(len(all_items), 1)
+        if total_clone_fail > 0:
+            self.logger.warning(
+                f"[create_clones] Total: {total_clone_fail}/{len(all_items)} "
+                f"({clone_fail_pct:.1f}%) clones failed across all batches"
+            )
+        if clone_fail_pct > 50:
+            raise RuntimeError(
+                f"[create_clones] {clone_fail_pct:.1f}% clone failures "
+                f"exceeds 50% threshold — "
+                f"{total_clone_fail}/{len(all_items)}"
+            )
+
     def _phase_delete_all(self):
         """Delete: clones → snapshots → children → parents (ordered)."""
+        total_failures = 0  # failed deletions summed over all four steps
+
         # Step 1: clones
         with self._lock:
             clone_names = list(self._clone_registry.keys())
         if clone_names:
             self.logger.info(f"[delete_all] Deleting {len(clone_names)} clones")
-            self._batch_parallel(
+            t0 = time.time()  # wall-clock start of this step's batch
+            _ok, fail = self._batch_parallel(  # unpacked as (ok, fail) counts
                 clone_names, self._timed_delete_clone,
                 self.MAX_WORKERS_DELETE, "delete_clones",
             )
+            self._log_op_stats(
+                "delete_clone", batch_label="all clones",
+                batch_elapsed=time.time() - t0, count=len(clone_names),
+            )
+            if fail > 0:
+                self.logger.warning(
+                    f"[delete_all] {fail}/{len(clone_names)} clone "
+                    f"deletions failed"
+                )
+                total_failures += fail
 
         # Step 2: snapshots
         with self._lock:
             snap_names = list(self._snap_registry.keys())
         if snap_names:
             self.logger.info(f"[delete_all] Deleting {len(snap_names)} snapshots")
-            self._batch_parallel(
+            t0 = time.time()
+            _ok, fail = self._batch_parallel(
                 snap_names, self._timed_delete_snapshot,
                 self.MAX_WORKERS_DELETE, "delete_snapshots",
             )
+            self._log_op_stats(
+                "delete_snapshot", batch_label="all snapshots",
+                batch_elapsed=time.time() - t0, count=len(snap_names),
+            )
+            if fail > 0:
+                self.logger.warning(
+                    f"[delete_all] {fail}/{len(snap_names)} snapshot "
+                    f"deletions failed"
+                )
+                total_failures += fail
 
         # Step 3: children
         with self._lock:
             child_names = list(self._child_registry.keys())
         if child_names:
             self.logger.info(f"[delete_all] Deleting {len(child_names)} children")
-            self._batch_parallel(
+            t0 = time.time()
+            _ok, fail = self._batch_parallel(
                 child_names, self._timed_delete_child,
                 self.MAX_WORKERS_DELETE, "delete_children",
             )
+            self._log_op_stats(
+                "delete_child", batch_label="all children",
+                batch_elapsed=time.time() - t0, count=len(child_names),
+            )
+            if fail > 0:
+                self.logger.warning(
+                    f"[delete_all] {fail}/{len(child_names)} child "
+                    f"deletions failed"
+                )
+                total_failures += fail
 
         # Step 4: parents
         with self._lock:
             parent_names = list(self._parent_registry.keys())
         if parent_names:
             self.logger.info(f"[delete_all] Deleting {len(parent_names)} parents")
-            self._batch_parallel(
+            t0 = time.time()
+            _ok, fail = self._batch_parallel(
                 parent_names, self._timed_delete_parent,
                 self.MAX_WORKERS_DELETE, "delete_parents",
             )
+            self._log_op_stats(
+                "delete_parent", batch_label="all parents",
+                batch_elapsed=time.time() - t0, count=len(parent_names),
+            )
+            if fail > 0:
+                self.logger.warning(
+                    f"[delete_all] {fail}/{len(parent_names)} parent "
+                    f"deletions failed"
+                )
+                total_failures += fail
+
+        if total_failures > 0:  # not fatal here: failures are only logged
+            self.logger.warning(
+                f"[delete_all] Total: {total_failures} deletion failures — "
+                f"verify_cleanup phase will retry"
+            )
 
     # ── Reporting ─────────────────────────────────────────────────────────
 
+    def _compute_per_iteration_summary(self):
+        """Compute per-iteration avg/min/max/p50/p95 for create operations.
+
+        Uses api_elapsed_sec when available (Docker — API-only time),
+        otherwise falls back to elapsed_sec (K8s — time to PVC Bound).
+
+        Returns:
+            dict: ``{str(iteration): {op: stats}}`` for create_parent,
+            create_child and create_clone; empty when nothing was timed.
+            Clone stats also get throughput_per_sec/effective_per_clone,
+            derived from the recorded clone batch timings.
+        """
+        with self._lock:
+            all_samples = list(self._timing_samples)
+            all_batches = list(self._batch_timings)
+        summary = {}
+        if not all_samples:
+            return summary
+        create_ops = ("create_parent", "create_child", "create_clone")
+        for it in sorted({s["iteration"] for s in all_samples}):
+            it_summary = summary[str(it)] = {}
+            for op in create_ops:
+                times_sorted = sorted(
+                    s.get("api_elapsed_sec", s["elapsed_sec"])
+                    for s in all_samples
+                    if s["iteration"] == it and s["op"] == op
+                )
+                if not times_sorted:
+                    continue
+                n = len(times_sorted)
+                op_summary = {
+                    "count": n,
+                    "avg_wall": round(sum(times_sorted) / n, 4),
+                    "min": round(times_sorted[0], 4),
+                    "max": round(times_sorted[-1], 4),
+                    "p50": round(times_sorted[n // 2], 4),
+                    "p95": round(
+                        times_sorted[min(int(n * 0.95), n - 1)], 4
+                    ),
+                }
+                if op == "create_clone":
+                    it_batches = [
+                        b for b in all_batches
+                        if b["iteration"] == it and b["op"] == op
+                    ]
+                    # _log_op_stats also records an "all clones" aggregate
+                    # for the same op; adding it to the per-batch rows
+                    # would count every clone and its wall time twice.
+                    # Fall back to the aggregate only if it is all we have.
+                    per_batch = [
+                        b for b in it_batches
+                        if b.get("batch_label") != "all clones"
+                    ] or it_batches
+                    total_elapsed = sum(b["batch_elapsed"] for b in per_batch)
+                    total_count = sum(b["count"] for b in per_batch)
+                    if total_elapsed > 0 and total_count > 0:
+                        op_summary["throughput_per_sec"] = round(
+                            total_count / total_elapsed, 4
+                        )
+                        op_summary["effective_per_clone"] = round(
+                            total_elapsed / total_count, 4
+                        )
+                it_summary[op] = op_summary
+        return summary
+
     def _get_log_dir(self) -> str:
         """Return the directory for timing/graph output."""
         d = getattr(self, "docker_logs_path", None)
@@ -562,10 +1147,16 @@ def _write_timing_report(self):
                 "SNAPSHOTS_PER_LVOL": self.SNAPSHOTS_PER_LVOL,
                 "NUM_CLONES": self.NUM_CLONES,
                 "NUM_ITERATIONS": self.NUM_ITERATIONS,
+                "BATCH_SIZE": self.BATCH_SIZE,
+                "MAX_WORKERS_CREATE": self.MAX_WORKERS_CREATE,
+                "CLONE_BATCH_SIZE": self.CLONE_BATCH_SIZE,
             },
             "iterations": self._iteration_timings,
+            "per_iteration_summary": self._compute_per_iteration_summary(),
             "samples": self._timing_samples,
+            "batch_timings": self._batch_timings,
             "metrics": self._metrics,
+            "mappings": self._get_registry_mappings(),
         }
         path = os.path.join(out_dir, "namespace_stress_timings.json")
         try:
@@ -615,19 +1206,22 @@ def _generate_graphs(self):
         except Exception as exc:
             self.logger.warning(f"Graph 1 failed: {exc}")
 
-        # ── 2. Latency per iteration (box plot) ──────────────────────────
+        # ── 2. Latency per iteration (box plot with legend) ──────────────
         try:
+            from matplotlib.patches import Patch
             create_ops = [
                 "create_parent", "create_child",
                 "create_snapshot", "create_clone",
             ]
+            op_labels = ["parent", "child", "snapshot", "clone"]
             iterations = sorted(set(s["iteration"] for s in samples))
             fig, ax = plt.subplots(figsize=(14, 8))
             positions = []
             labels = []
             data_groups = []
+            op_indices = []  # track which op each box belongs to
             for it in iterations:
-                for op in create_ops:
+                for oi, op in enumerate(create_ops):
                     vals = [
                         s["elapsed_sec"] for s in samples
                         if s["iteration"] == it and s["op"] == op
@@ -639,11 +1233,12 @@ def _generate_graphs(self):
                             + create_ops.index(op)
                         )
                         labels.append(f"i{it}_{op.split('_')[-1]}")
+                        op_indices.append(oi)
             if data_groups:
                 bp = ax.boxplot(data_groups, positions=positions, widths=0.6,
                                 patch_artist=True, showfliers=False)
                 for j, patch in enumerate(bp["boxes"]):
-                    c_idx = j % len(create_ops)
+                    c_idx = op_indices[j] if j < len(op_indices) else j
                     patch.set_facecolor(colors[c_idx % len(colors)])
                 ax.set_xlabel("Iteration / Operation")
                 ax.set_ylabel("Latency (sec)")
@@ -653,6 +1248,12 @@ def _generate_graphs(self):
                     [f"iter {it}" for it in iterations],
                     rotation=45, fontsize=7,
                 )
+                # Add explicit legend mapping colors to operations
+                legend_patches = [
+                    Patch(facecolor=colors[i % len(colors)], label=op_labels[i])
+                    for i in range(len(create_ops))
+                ]
+                ax.legend(handles=legend_patches, fontsize=8, loc="upper left")
             fig.tight_layout()
             fig.savefig(os.path.join(out_dir, "latency_per_iteration.png"),
                         dpi=150)
@@ -664,8 +1265,9 @@ def _generate_graphs(self):
         # ── 3. Phase duration per iteration (stacked bar) ────────────────
         try:
             phase_names = [
-                "create_parents", "create_children",
+                "create_subsystems", "write_data",
                 "create_snapshots", "create_clones", "delete_all",
+                "verify_cleanup",
             ]
             fig, ax = plt.subplots(figsize=(12, 6))
             x_pos = list(range(len(self._iteration_timings)))
@@ -694,7 +1296,7 @@ def _generate_graphs(self):
         except Exception as exc:
             self.logger.warning(f"Graph 3 failed: {exc}")
 
-        # ── 4. Clone latency vs clone index (per iteration) ──────────────
+        # ── 4. Clone latency vs clone index with batch boundaries ────────
         try:
             fig, ax = plt.subplots(figsize=(14, 8))
             for it in iterations:
@@ -709,9 +1311,27 @@ def _generate_graphs(self):
                         [s["elapsed_sec"] for s in clone_samples],
                         label=f"iter {it}", alpha=0.7, linewidth=0.8,
                     )
+                    # Mark batch boundaries (CLONE_BATCH_SIZE)
+                    cbs = self.CLONE_BATCH_SIZE
+                    for bi in range(cbs, len(clone_samples), cbs):
+                        ax.axvline(
+                            x=bi, color="gray", linestyle="--",
+                            alpha=0.4, linewidth=0.6,
+                        )
+                    # Mark _batch_parallel BATCH_SIZE boundaries too
+                    bs = self.BATCH_SIZE
+                    for bi in range(bs, len(clone_samples), bs):
+                        ax.axvline(
+                            x=bi, color="red", linestyle=":",
+                            alpha=0.3, linewidth=0.5,
+                        )
             ax.set_xlabel("Clone index (creation order)")
             ax.set_ylabel("Latency (sec)")
-            ax.set_title("Clone Creation Latency vs Clone Count")
+            ax.set_title(
+                f"Clone Creation Latency vs Clone Count "
+                f"(gray=clone batch/{self.CLONE_BATCH_SIZE}, "
+                f"red=submit batch/{self.BATCH_SIZE})"
+            )
             ax.legend(fontsize=7)
             fig.tight_layout()
             fig.savefig(
@@ -751,6 +1371,333 @@ def _generate_graphs(self):
         except Exception as exc:
             self.logger.warning(f"Graph 5 failed: {exc}")
 
+        # ── 6. Batch timing stats (bar chart) ────────────────────────────
+        try:
+            bt = self._batch_timings
+            if bt:
+                clone_batches = [
+                    b for b in bt
+                    if b["op"] == "create_clone"
+                    and b["batch_label"].startswith("batch ")
+                ]
+                if clone_batches:
+                    fig, ax = plt.subplots(figsize=(14, 8))
+                    labels = [b["batch_label"] for b in clone_batches]
+                    avgs = [b["avg_wall"] for b in clone_batches]
+                    p50s = [b["p50"] for b in clone_batches]
+                    p95s = [b["p95"] for b in clone_batches]
+                    effs = [
+                        b.get("effective_per_clone", 0)
+                        for b in clone_batches
+                    ]
+                    x = range(len(labels))
+                    width = 0.2
+                    ax.bar(
+                        [i - 1.5 * width for i in x], avgs, width,
+                        label="avg wall", color=colors[0],
+                    )
+                    ax.bar(
+                        [i - 0.5 * width for i in x], p50s, width,
+                        label="p50", color=colors[1],
+                    )
+                    ax.bar(
+                        [i + 0.5 * width for i in x], p95s, width,
+                        label="p95", color=colors[2],
+                    )
+                    ax.bar(
+                        [i + 1.5 * width for i in x], effs, width,
+                        label="effective/clone", color=colors[3 % len(colors)],
+                    )
+                    # Annotate throughput on each batch
+                    for idx, b in enumerate(clone_batches):
+                        tp = b.get("throughput_per_sec", 0)
+                        if tp > 0:
+                            ax.text(
+                                idx, max(avgs[idx], p95s[idx]) + 0.5,
+                                f"{tp:.2f}/s",
+                                ha="center", fontsize=6, color="black",
+                            )
+                    ax.set_xlabel("Clone Batch")
+                    ax.set_ylabel("Latency (sec)")
+                    ax.set_title(
+                        "Clone Creation — Per-Batch Latency "
+                        "(wall vs effective vs throughput)"
+                    )
+                    ax.set_xticks(list(x))
+                    ax.set_xticklabels(labels, rotation=45, fontsize=7)
+                    ax.legend(fontsize=7)
+                    fig.tight_layout()
+                    fig.savefig(
+                        os.path.join(
+                            out_dir, "clone_batch_latency_stats.png"
+                        ),
+                        dpi=150,
+                    )
+                    plt.close(fig)
+                    self.logger.info(
+                        "Generated clone_batch_latency_stats.png"
+                    )
+        except Exception as exc:
+            self.logger.warning(f"Graph 6 failed: {exc}")
+
+        # ── 7. Creation timeline — latency over wall-clock time ───────
+        try:
+            create_ops_ordered = [
+                "create_parent", "create_child",
+                "create_snapshot", "create_clone",
+            ]
+            fig, ax = plt.subplots(figsize=(16, 8))
+            t0_global = min(s["timestamp"] for s in samples)
+            for i, op in enumerate(create_ops_ordered):
+                pts = sorted(
+                    [s for s in samples if s["op"] == op],
+                    key=lambda s: s["timestamp"],
+                )
+                if pts:
+                    x = [(p["timestamp"] - t0_global) / 60.0 for p in pts]
+                    y = [p["elapsed_sec"] for p in pts]
+                    ax.plot(x, y, label=op, alpha=0.7, linewidth=0.8,
+                            color=colors[i % len(colors)])
+            ax.set_xlabel("Time since test start (minutes)")
+            ax.set_ylabel("Latency (sec)")
+            ax.set_title("Creation Latency Over Time")
+            ax.legend(fontsize=7)
+            fig.tight_layout()
+            fig.savefig(
+                os.path.join(out_dir, "creation_latency_timeline.png"),
+                dpi=150,
+            )
+            plt.close(fig)
+            self.logger.info("Generated creation_latency_timeline.png")
+        except Exception as exc:
+            self.logger.warning(f"Graph 7 failed: {exc}")
+
+        # ── 8. Per-parent child creation duration (bar chart) ─────────
+        try:
+            child_samples = [
+                s for s in samples if s["op"] == "create_child"
+            ]
+            if child_samples:
+                # Build child→parent mapping from registry or saved JSON
+                with self._lock:
+                    child_to_parent = {
+                        cn: ci.get("parent_name", "unknown")
+                        for cn, ci in self._child_registry.items()
+                    }
+                # Fall back to saved mappings if registry was cleared
+                if not child_to_parent:
+                    try:
+                        rpath = os.path.join(
+                            out_dir, "namespace_stress_timings.json"
+                        )
+                        with open(rpath) as rf:
+                            saved = json.load(rf)
+                        child_to_parent = saved.get(
+                            "mappings", {}
+                        ).get("child_to_parent", {})
+                    except Exception:
+                        pass
+
+                parent_durations = {}
+                for s in child_samples:
+                    pname = child_to_parent.get(s["name"], "unknown")
+                    parent_durations.setdefault(pname, []).append(
+                        s["elapsed_sec"]
+                    )
+                parents_sorted = sorted(parent_durations.keys())
+                fig, ax = plt.subplots(figsize=(14, 6))
+                x = range(len(parents_sorted))
+                totals = [
+                    sum(parent_durations[p]) for p in parents_sorted
+                ]
+                avgs = [
+                    sum(parent_durations[p]) / len(parent_durations[p])
+                    for p in parents_sorted
+                ]
+                counts = [
+                    len(parent_durations[p]) for p in parents_sorted
+                ]
+                ax.bar(x, totals, color=colors[0], alpha=0.7,
+                       label="total (sec)")
+                ax2 = ax.twinx()
+                ax2.plot(list(x), avgs, "ro-", markersize=4,
+                         label="avg per child (sec)")
+                ax.set_xlabel("Parent subsystem")
+                ax.set_ylabel("Total creation time (sec)")
+                ax2.set_ylabel("Avg per child (sec)")
+                ax.set_title(
+                    f"Child Creation Duration per Parent "
+                    f"({len(parents_sorted)} parents, "
+                    f"{len(child_samples)} children)"
+                )
+                ax.set_xticks(list(x))
+                ax.set_xticklabels(
+                    [f"{p[-8:]}({counts[i]})" for i, p in enumerate(parents_sorted)],
+                    rotation=45, fontsize=7,
+                )
+                ax.legend(loc="upper left", fontsize=7)
+                ax2.legend(loc="upper right", fontsize=7)
+                fig.tight_layout()
+                fig.savefig(
+                    os.path.join(
+                        out_dir, "child_creation_per_parent.png"
+                    ),
+                    dpi=150,
+                )
+                plt.close(fig)
+                self.logger.info(
+                    "Generated child_creation_per_parent.png"
+                )
+        except Exception as exc:
+            self.logger.warning(f"Graph 8 failed: {exc}")
+
+        # ── 9-12. Individual per-op latency over time (one graph each) ──
+        individual_ops = [
+            ("create_parent", "Parent LVol Creation Latency Over Time"),
+            ("create_child", "Child LVol Creation Latency Over Time"),
+            ("create_snapshot", "Snapshot Creation Latency Over Time"),
+            ("create_clone", "Clone Creation Latency Over Time"),
+        ]
+        for op_name, title in individual_ops:
+            try:
+                op_samples = sorted(
+                    [s for s in samples if s["op"] == op_name],
+                    key=lambda s: s["timestamp"],
+                )
+                if not op_samples:
+                    continue
+                fig, ax = plt.subplots(figsize=(14, 8))
+                t0_global = min(s["timestamp"] for s in samples)
+                x = [(s["timestamp"] - t0_global) / 60.0
+                     for s in op_samples]
+                y = [s["elapsed_sec"] for s in op_samples]
+
+                ax.scatter(x, y, alpha=0.5, s=12,
+                           color=colors[0], label="latency")
+                # Rolling average (window=20)
+                if len(y) >= 20:
+                    window = 20
+                    rolling = [
+                        sum(y[max(0, i - window):i]) / min(i, window)
+                        for i in range(1, len(y) + 1)
+                    ]
+                    ax.plot(x, rolling, color="red", linewidth=1.5,
+                            alpha=0.8, label=f"rolling avg (w={window})")
+
+                # Mark batch boundaries
+                bs = self.BATCH_SIZE
+                for bi in range(bs, len(op_samples), bs):
+                    ax.axvline(
+                        x=x[bi] if bi < len(x) else x[-1],
+                        color="gray", linestyle="--",
+                        alpha=0.3, linewidth=0.5,
+                    )
+
+                ax.set_xlabel("Time since test start (minutes)")
+                ax.set_ylabel("Latency (sec)")
+                ax.set_title(
+                    f"{title} ({len(op_samples)} ops, "
+                    f"batch_size={bs}, workers={self.MAX_WORKERS_CREATE})"
+                )
+                ax.legend(fontsize=8)
+                fig.tight_layout()
+                fname = f"{op_name}_latency_over_time.png"
+                fig.savefig(os.path.join(out_dir, fname), dpi=150)
+                plt.close(fig)
+                self.logger.info(f"Generated {fname}")
+            except Exception as exc:
+                self.logger.warning(
+                    f"Graph {op_name}_latency_over_time failed: {exc}"
+                )
+
+        # ── 13. Per-iteration average create time (grouped bar) ────────
+        try:
+            per_it = self._compute_per_iteration_summary()
+            if per_it:
+                create_ops_bar = [
+                    "create_parent", "create_child", "create_clone",
+                ]
+                op_labels_bar = ["parent", "child", "clone"]
+                it_keys = sorted(per_it.keys(), key=int)
+                fig, ax = plt.subplots(figsize=(14, 8))
+                n_its = len(it_keys)
+                n_ops = len(create_ops_bar)
+                width = 0.8 / max(n_ops, 1)
+                has_data = False
+
+                for oi, (op, label) in enumerate(
+                    zip(create_ops_bar, op_labels_bar)
+                ):
+                    avgs = []
+                    mins = []
+                    maxs = []
+                    x_pos = []
+                    eff_times = []  # effective per-clone (throughput-based)
+                    for xi, it_key in enumerate(it_keys):
+                        stats = per_it[it_key].get(op)
+                        if stats:
+                            avgs.append(stats["avg_wall"])
+                            mins.append(stats["min"])
+                            maxs.append(stats["max"])
+                            eff_times.append(
+                                stats.get("effective_per_clone")
+                            )
+                            x_pos.append(xi)
+                    if avgs:
+                        has_data = True
+                        offsets = [
+                            x + (oi - n_ops / 2 + 0.5) * width
+                            for x in x_pos
+                        ]
+                        err_lo = [a - m for a, m in zip(avgs, mins)]
+                        err_hi = [m - a for a, m in zip(avgs, maxs)]
+                        ax.bar(
+                            offsets, avgs, width,
+                            label=f"{label} (avg wall)",
+                            color=colors[oi % len(colors)],
+                            alpha=0.8,
+                            yerr=[err_lo, err_hi],
+                            capsize=3,
+                            error_kw={"linewidth": 0.8},
+                        )
+                        # Annotate counts + effective time
+                        for j, xi in enumerate(x_pos):
+                            cnt = per_it[it_keys[xi]][op]["count"]
+                            ann = f"n={cnt}"
+                            if eff_times[j] is not None:
+                                ann += f"\neff={eff_times[j]:.1f}s"
+                            ax.text(
+                                offsets[j], avgs[j] + err_hi[j] + 0.3,
+                                ann, ha="center", fontsize=6,
+                            )
+
+                if has_data:
+                    ax.set_xlabel("Iteration")
+                    ax.set_ylabel("Create time (sec)")
+                    ax.set_title(
+                        "Per-Iteration Average Create Time "
+                        "(API time for Docker, PVC Bound for K8s)"
+                    )
+                    ax.set_xticks(range(n_its))
+                    ax.set_xticklabels(
+                        [f"iter {k}" for k in it_keys], fontsize=8,
+                    )
+                    ax.legend(fontsize=8)
+                    fig.tight_layout()
+                    fig.savefig(
+                        os.path.join(
+                            out_dir,
+                            "per_iteration_avg_create_time.png",
+                        ),
+                        dpi=150,
+                    )
+                    self.logger.info(
+                        "Generated per_iteration_avg_create_time.png"
+                    )
+                plt.close(fig)
+        except Exception as exc:
+            self.logger.warning(f"Graph 13 failed: {exc}")
+
     def _print_summary(self):
         self.logger.info("=" * 60)
         self.logger.info("  PARALLEL NAMESPACE LVOL STRESS — SUMMARY")
@@ -797,11 +1744,18 @@ def run(self):
 
                 phase_durations = {}
                 for phase_name, phase_fn in [
-                    ("create_parents", self._phase_create_parents),
-                    ("create_children", self._phase_create_children),
+                    ("create_subsystems", self._phase_create_subsystems),
+                    ("verify_lvols", self._verify_all_lvols_exist),
+                    ("verify_nodes_healthy", self._verify_nodes_healthy),
+                    ("write_data", self._phase_write_data),
                     ("create_snapshots", self._phase_create_snapshots),
+                    ("verify_snapshots", self._verify_all_snapshots_exist),
                     ("create_clones", self._phase_create_clones),
+                    ("verify_clones", self._verify_all_clones_exist),
+                    ("mount_verify_clones", self._phase_mount_verify_clones),
+                    ("verify_nodes_final", self._verify_nodes_healthy),
                     ("delete_all", self._phase_delete_all),
+                    ("verify_cleanup", self._phase_verify_cleanup),
                 ]:
                     dur = self._run_phase(phase_name, phase_fn)
                     phase_durations[phase_name] = round(dur or 0, 2)
@@ -810,7 +1764,10 @@ def run(self):
                     "iteration": iteration,
                     "phase_durations_sec": phase_durations,
                 })
-                self._clear_registries()
+                # Only clear registries if iteration succeeded — graphs
+                # need the mappings and they run in the finally block
+                if not self._stop_event.is_set():
+                    self._clear_registries()
 
         finally:
             self._metrics["end_ts"] = time.time()
@@ -842,11 +1799,17 @@ def __init__(self, **kwargs):
     # ── Setup / Cleanup ───────────────────────────────────────────────────
 
     def _phase_setup(self):
-        self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        actual_pool = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        if actual_pool and actual_pool != self.pool_name:
+            self.logger.info(
+                f"[setup] Pool name changed: {self.pool_name} -> {actual_pool}"
+            )
+            self.pool_name = actual_pool
         sleep_n_sec(2)
 
     def _phase_cleanup(self):
-        self.logger.info("[cleanup] Bulk delete safety net")
+        self.logger.info("[cleanup] Bulk delete safety net (ns-* only)")
+        # Delete only test resources by prefix, not all lvols
         try:
             self.sbcli_utils.delete_all_clones()
         except Exception:
@@ -856,7 +1819,23 @@ def _phase_cleanup(self):
         except Exception:
             pass
         try:
-            self.sbcli_utils.delete_all_lvols()
+            all_lvols = self.sbcli_utils.list_lvols()
+            test_lvols = [
+                name for name in all_lvols
+                if name.startswith("ns-") or name.startswith("cln-")
+                or name.startswith("snap-")
+            ]
+            self.logger.info(
+                f"[cleanup] Deleting {len(test_lvols)}/{len(all_lvols)} "
+                f"test lvols"
+            )
+            for lv_name in test_lvols:
+                try:
+                    self.sbcli_utils.delete_lvol(
+                        lvol_name=lv_name, skip_error=True
+                    )
+                except Exception:
+                    pass
         except Exception:
             pass
         try:
@@ -864,14 +1843,184 @@ def _phase_cleanup(self):
         except Exception:
             pass
 
-    # ── Create implementations ────────────────────────────────────────────
+    # ── Two-phase subsystem creation: parents then parallel children ────
 
-    def _create_parent_impl(self, params: dict):
-        name = params["name"]
-        self._inc("attempts", "create_parent")
-        self._api_retry("create_parent", lambda: self.sbcli_utils.add_lvol(
-            lvol_name=name,
-            pool_name=self.pool_name,
+    def _phase_create_subsystems(self):
+        """Create all parent lvols, then all child lvols, in parallel.
+
+        Children are only submitted for parents present in the parent
+        registry. Raises RuntimeError when more than 50% of the attempted
+        lvols failed (children of failed parents are never attempted, so
+        they do not count towards that percentage)."""
+        pvcs_per_subsys = 1 + self.CHILDREN_PER_PARENT
+        total_expected = self.NUM_PARENTS * pvcs_per_subsys
+        self.logger.info(
+            f"[create_subsystems] {self.NUM_PARENTS} parents × "
+            f"{pvcs_per_subsys} lvols = {total_expected} total "
+            f"(parallel, workers={self.MAX_WORKERS_CREATE})"
+        )
+
+        # ── Sub-phase 1: Create all parents (parallel) ─────────────
+        parent_items = [
+            {"name": f"ns-par-{_rand_seq(6)}-{i:04d}", "idx": i}
+            for i in range(self.NUM_PARENTS)
+        ]
+        self.logger.info(
+            f"[create_subsystems][sub1] Creating {self.NUM_PARENTS} parents "
+            f"(parallel, workers={self.MAX_WORKERS_CREATE})"
+        )
+        parents_t0 = time.time()
+        _ok, _fail = self._batch_parallel(
+            parent_items, self._create_single_parent_docker,
+            self.MAX_WORKERS_CREATE, "create_parents",
+        )
+        parents_elapsed = time.time() - parents_t0
+        self._log_op_stats(
+            "create_parent", batch_label="all parents",
+            batch_elapsed=parents_elapsed,
+        )
+
+        # The registry, not the batch's failure count, decides which parents
+        # exist: an unregistered parent is always treated as failed, so the
+        # lookups in sub-phase 2 cannot KeyError. Read it under self._lock.
+        with self._lock:
+            parent_info = {
+                p["name"]: self._parent_registry[p["name"]]
+                for p in parent_items
+                if p["name"] in self._parent_registry
+            }
+        parent_names = list(parent_info)
+        failed_parents = [
+            p["name"] for p in parent_items if p["name"] not in parent_info
+        ]
+
+        self.logger.info(
+            f"[create_subsystems][sub1] {len(parent_names)} parents "
+            f"created in {parents_elapsed:.1f}s"
+            f"{f', {len(failed_parents)} FAILED: {failed_parents}' if failed_parents else ''}"
+        )
+
+        # ── Sub-phase 2: Create ALL children in parallel ───────────
+        total_children = len(parent_names) * self.CHILDREN_PER_PARENT
+        self.logger.info(
+            f"[create_subsystems][sub2] Creating {total_children} children "
+            f"in parallel (workers={self.MAX_WORKERS_CREATE})"
+        )
+        child_items = []
+        for pname in parent_names:
+            pinfo = parent_info[pname]
+            for c in range(self.CHILDREN_PER_PARENT):
+                child_items.append({
+                    "name": f"ns-ch-{_rand_seq(6)}-{pname[-4:]}-{c:02d}",
+                    "parent_name": pname,
+                    "parent_id": pinfo["id"],
+                    "parent_node_id": pinfo.get("node_id"),
+                })
+        children_t0 = time.time()
+        _ok, _fail = self._batch_parallel(
+            child_items, self._create_single_child_docker,
+            self.MAX_WORKERS_CREATE, "create_children",
+        )
+        children_elapsed = time.time() - children_t0
+        self._log_op_stats(
+            "create_child", batch_label="all children",
+            batch_elapsed=children_elapsed,
+        )
+
+        # Identify failed children from the registry, same rule as parents
+        with self._lock:
+            created_children = set(self._child_registry)
+        failed_children = [
+            f"{item['name']} (parent={item['parent_name']})"
+            for item in child_items
+            if item["name"] not in created_children
+        ]
+
+        # ── Failure summary ──────────────────────────────────────────
+        total_attempted = self.NUM_PARENTS + total_children
+        total_failed = len(failed_parents) + len(failed_children)
+        fail_pct = (total_failed * 100 / max(total_attempted, 1))
+
+        if total_failed > 0:
+            self.logger.warning(
+                f"[create_subsystems] FAILED lvols: {total_failed}/"
+                f"{total_attempted} ({fail_pct:.1f}%)"
+            )
+            if failed_parents:
+                self.logger.warning(
+                    f"  Failed PARENTS ({len(failed_parents)}): "
+                    f"{failed_parents}"
+                )
+            if failed_children:
+                self.logger.warning(
+                    f"  Failed CHILDREN ({len(failed_children)}): "
+                    f"{failed_children[:20]}"
+                    f"{'...' if len(failed_children) > 20 else ''}"
+                )
+
+        if fail_pct > 50:
+            raise RuntimeError(
+                f"[create_subsystems] {fail_pct:.1f}% failure rate "
+                f"exceeds 50% threshold — {total_failed}/{total_attempted} "
+                f"(parents={len(failed_parents)}, "
+                f"children={len(failed_children)})"
+            )
+
+        # ── Bulk verify ──────────────────────────────────────────────
+        all_lvols = self.sbcli_utils.list_lvols()
+        expected_created = total_attempted - total_failed
+        if len(all_lvols) < expected_created:
+            self.logger.warning(
+                f"[create_subsystems] lvol count {len(all_lvols)} < "
+                f"expected {expected_created}"
+            )
+
+        self.logger.info(
+            f"[create_subsystems] Done: {len(self._parent_registry)} parents, "
+            f"{len(self._child_registry)} children"
+            f"{f' ({total_failed} failures tolerated)' if total_failed else ''}"
+        )
+
+    def _create_single_parent_docker(self, item):
+        """_batch_parallel worker: create the parent item["name"], time it."""
+        parent_name = item["name"]
+        started = time.time()
+        api_secs = self._create_parent(parent_name)
+        wall_secs = time.time() - started
+        self._record_timing(
+            "create_parent", parent_name, wall_secs,
+            self._snapshot_inventory(), api_elapsed=api_secs,
+        )
+
+    def _create_single_child_docker(self, item):
+        """_batch_parallel worker: create one child lvol and time it.
+
+        ``item`` supplies "name", "parent_name", "parent_id" and
+        "parent_node_id", all of which are forwarded to _create_child.
+        """
+        name = item["name"]
+        create_args = (
+            name, item["parent_name"], item["parent_id"],
+            item["parent_node_id"],
+        )
+        started = time.time()
+        api_secs = self._create_child(*create_args)
+        wall_secs = time.time() - started
+        self._record_timing(
+            "create_child", name, wall_secs,
+            self._snapshot_inventory(), api_elapsed=api_secs,
+        )
+
+    def _create_parent(self, name: str):
+        """Create a single parent lvol + register. Raises on failure.
+
+        Returns the API-only elapsed time (seconds) for timing reports.
+        """
+        self._inc("attempts", "create_parent")
+        api_t0 = time.time()
+        self._api_retry("create_parent", lambda: self.sbcli_utils.add_lvol(
+            lvol_name=name,
+            pool_name=self.pool_name,
             size=self.LVOL_SIZE,
             distr_ndcs=self.ndcs,
             distr_npcs=self.npcs,
@@ -880,20 +2029,36 @@ def _create_parent_impl(self, params: dict):
             max_namespace_per_subsys=self.NAMESPACES_PER_PARENT,
             retry=1,
         ), ctx={"name": name})
+        api_elapsed = time.time() - api_t0
         lvol_id = self._wait_lvol_id(name)
+        node_id = None
+        try:
+            details = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)
+            if details:
+                node_id = details[0].get("node_id")
+        except Exception as ex:
+            self.logger.warning(
+                f"[create_parent] {name}: could not get node_id: {ex}"
+            )
         with self._lock:
             self._parent_registry[name] = {
-                "id": lvol_id, "children": [], "snapshots": [],
+                "id": lvol_id, "node_id": node_id,
+                "children": [], "snapshots": [],
             }
             self._metrics["counts"]["parents_created"] += 1
-        self._inc("attempts", "create_parent", 0)  # already counted
-        self.logger.info(f"[create_parent] {name} -> {lvol_id}")
+        self.logger.info(
+            f"[create_parent] {name} -> {lvol_id} (node={node_id})"
+        )
+        return api_elapsed
 
-    def _create_child_impl(self, params: dict):
-        name = params["name"]
-        parent_name = params["parent_name"]
-        parent_id = params["parent_id"]
+    def _create_child(self, name: str, parent_name: str,
+                      parent_id: str, parent_node_id: str):
+        """Create a single child namespace lvol. Raises on failure.
+
+        Returns the API-only elapsed time (seconds) for timing reports.
+        """
         self._inc("attempts", "create_child")
+        api_t0 = time.time()
         self._api_retry("create_child", lambda: self.sbcli_utils.add_lvol(
             lvol_name=name,
             pool_name=self.pool_name,
@@ -902,18 +2067,199 @@ def _create_child_impl(self, params: dict):
             distr_npcs=self.npcs,
             distr_bs=self.bs,
             distr_chunk_bs=self.chunk_bs,
+            host_id=parent_node_id,
             namespace=parent_id,
             retry=1,
         ), ctx={"name": name, "parent": parent_name})
+        api_elapsed = time.time() - api_t0
         child_id = self._wait_lvol_id(name)
         with self._lock:
             self._child_registry[name] = {
                 "id": child_id, "parent_name": parent_name,
             }
-            if parent_name in self._parent_registry:
-                self._parent_registry[parent_name]["children"].append(name)
+            self._parent_registry[parent_name]["children"].append(name)
             self._metrics["counts"]["children_created"] += 1
-        self.logger.info(f"[create_child] {name} -> {child_id} (parent={parent_name})")
+        self.logger.info(
+            f"[create_child] {name} -> {child_id} (parent={parent_name})"
+        )
+        return api_elapsed
+
+    # ── Write data (parallel FIO per parent group) ─────────────────────
+
+    def _phase_write_data(self):
+        """Write 100 MB to every lvol, one worker thread per parent group.
+
+        A random child is picked first and stored in ``_snapshot_child``
+        (``None`` when no children exist) so _phase_create_snapshots can
+        reuse it.  Each group (parent + registered children) is then run
+        through ``_fio_parent_group_docker`` via ``_batch_parallel``.
+        """
+        # Choose the child that the snapshot phase will reuse.
+        with self._lock:
+            candidates = list(self._child_registry.keys())
+        if not candidates:
+            self._snapshot_child = None
+        else:
+            self._snapshot_child = random.choice(candidates)
+            self.logger.info(
+                f"[write_data] Pre-selected child for snapshot: "
+                f"{self._snapshot_child}"
+            )
+
+        # One work item per parent: the parent itself plus each child that
+        # is still present in the child registry.
+        groups = []
+        with self._lock:
+            for pname, pinfo in self._parent_registry.items():
+                members = [(pname, pinfo["id"])]
+                members.extend(
+                    (cname, self._child_registry[cname]["id"])
+                    for cname in pinfo.get("children", [])
+                    if self._child_registry.get(cname)
+                )
+                groups.append({"parent_name": pname, "lvols": members})
+
+        lvol_count = sum(len(group["lvols"]) for group in groups)
+        self.logger.info(
+            f"[write_data] Running parallel FIO (100 MB) on {lvol_count} "
+            f"lvols across {len(groups)} parent groups "
+            f"(workers={self.MAX_WORKERS_CREATE})"
+        )
+
+        # Fan the groups out and report the aggregate outcome.
+        started = time.time()
+        ok_count, fail_count = self._batch_parallel(
+            groups, self._fio_parent_group_docker,
+            self.MAX_WORKERS_CREATE, "write_data",
+        )
+        duration = time.time() - started
+        self.logger.info(
+            f"[write_data] Done: {ok_count}/{len(groups)} groups OK, "
+            f"{fail_count} failed in {duration:.1f}s"
+        )
+        if fail_count > 0:
+            self.logger.warning(
+                f"[write_data] {fail_count}/{len(groups)} FIO groups failed"
+            )
+
+    def _extract_nqn(self, connect_strs):
+        """Return the NQN given as ``--nqn=X``, ``--nqn X`` or ``-n X``; else None."""
+        tokens = [t for cs in connect_strs for t in cs.split() + [""]]  # "" = end
+        for pos, tok in enumerate(tokens):
+            if tok.startswith("--nqn="):
+                return tok.split("=", 1)[1]
+            if tok in ("-n", "--nqn") and tokens[pos + 1]:
+                return tokens[pos + 1]
+        return None
+
+    def _find_device_by_nqn(self, client, nqn):
+        """Return ``/dev/<path Name>`` for *nqn* from ``nvme list-subsys``.
+        Best effort: scans all host entries; None if nothing matches."""
+        import json as _json
+        if not nqn:  # None would equal a subsystem entry lacking "NQN"
+            return None
+        out, _ = self.ssh_obj.exec_command(
+            client, "sudo nvme list-subsys -o json 2>/dev/null || echo '[]'",
+            supress_logs=True,
+        )
+        try:
+            parsed = _json.loads(out)
+            for host in parsed if isinstance(parsed, list) else [parsed]:
+                for ss in host.get("Subsystems", []):
+                    if ss.get("NQN") == nqn:
+                        for path in ss.get("Paths", []):
+                            if path.get("Name"):
+                                return f"/dev/{path['Name']}"
+        except Exception as exc:
+            self.logger.debug(f"[find_device] list-subsys parse failed: {exc}")
+        return None
+
+    def _fio_parent_group_docker(self, item):
+        """Connect every lvol of one parent group, run FIO, then disconnect.
+
+        ``item`` holds ``parent_name`` and ``lvols`` (``(name, id)`` pairs).
+        Per-lvol failures are logged and skipped.  NQNs are recorded before
+        connecting so ``finally`` also cleans up partially failed connects.
+        """
+        client = self.fio_node[0]
+        parent_name = item["parent_name"]
+        lvols = item["lvols"]  # [(name, id), ...]
+        connected_nqns = []  # unique NQNs to disconnect in ``finally``
+        t0_group = time.time()
+
+        try:
+            # ── Step 1: NVMe-connect all lvols in this group ─────────
+            nqn_map = {}  # lvol_name -> nqn
+            for lvol_name, lvol_id in lvols:
+                try:
+                    connect_strs = self.sbcli_utils.get_lvol_connect_str(
+                        lvol_name
+                    )
+                    if not connect_strs:
+                        self.logger.warning(
+                            f"[write_data] No connect strings for {lvol_name}"
+                        )
+                        continue
+                    nqn = self._extract_nqn(connect_strs)
+                    if nqn and nqn not in connected_nqns:
+                        connected_nqns.append(nqn)
+                    for cs in connect_strs:
+                        self.ssh_obj.exec_command(client, cs)
+                    if nqn:
+                        nqn_map[lvol_name] = nqn
+                except Exception as exc:
+                    self.logger.warning(
+                        f"[write_data] Connect failed for {lvol_name}: {exc}"
+                    )
+
+            sleep_n_sec(3)
+
+            # ── Step 2: Discover devices and run FIO on each ─────────
+            fio_ok = 0
+            for lvol_name, nqn in nqn_map.items():
+                try:
+                    device = self._find_device_by_nqn(client, nqn)
+                    if not device:
+                        self.logger.warning(
+                            f"[write_data] No device found for "
+                            f"{lvol_name} (nqn={nqn})"
+                        )
+                        continue
+                    t0 = time.time()
+                    self.ssh_obj.exec_command(
+                        client,
+                        f"sudo fio --name=write-{lvol_name[:20]} "
+                        f"--filename={device} --size=100M --bs=1M "
+                        f"--rw=write --direct=1 --ioengine=libaio "
+                        f"--iodepth=1 --numjobs=1",
+                    )
+                    self._record_timing(
+                        "write_data", lvol_name, time.time() - t0,
+                        self._snapshot_inventory(),
+                    )
+                    fio_ok += 1
+                except Exception as exc:
+                    self.logger.warning(
+                        f"[write_data] FIO failed for {lvol_name}: {exc}"
+                    )
+
+            self.logger.info(
+                f"[write_data] Group {parent_name}: "
+                f"{fio_ok}/{len(lvols)} lvols written "
+                f"in {time.time() - t0_group:.1f}s"
+            )
+
+        finally:
+            # ── Step 3: NVMe-disconnect all ──────────────────────────
+            for nqn in connected_nqns:
+                try:
+                    self.ssh_obj.exec_command(
+                        client, f"sudo nvme disconnect -n {nqn}",
+                    )
+                except Exception:
+                    pass
+
+    # ── Create implementations ────────────────────────────────────────────
 
     def _create_snapshot_impl(self, params: dict):
         snap_name = params["name"]
@@ -943,11 +2289,13 @@ def _create_clone_impl(self, params: dict):
         snap_name = params["snap_name"]
         snap_id = params["snap_id"]
         self._inc("attempts", "create_clone")
+        api_t0 = time.time()
         self._api_retry("create_clone", lambda: self.sbcli_utils.add_clone(
             snapshot_id=snap_id,
             clone_name=clone_name,
             retry=1,
         ), ctx={"clone": clone_name, "snap": snap_name})
+        api_elapsed = time.time() - api_t0
         clone_id = self._wait_lvol_id(clone_name)
         with self._lock:
             self._clone_registry[clone_name] = {
@@ -957,6 +2305,134 @@ def _create_clone_impl(self, params: dict):
                 self._snap_registry[snap_name]["clones"].append(clone_name)
             self._metrics["counts"]["clones_created"] += 1
         self.logger.info(f"[create_clone] {clone_name} -> {clone_id}")
+        return api_elapsed
+
+    # ── Clone mount verification ─────────────────────────────────────────
+
+    def _mount_verify_single_clone(self, item):
+        """Connect a clone via NVMe, run a short FIO read, check for errors.
+
+        Raises RuntimeError on missing connect strings, missing device or a
+        non-zero FIO ``err=``; a parsed NQN is disconnected in ``finally``."""
+        clone_name = item["clone_name"]
+        client = self.fio_node[0]
+        nqn = None
+        t0 = time.time()
+
+        try:
+            # 1. Fetch the clone's connect strings and parse the NQN from them
+            connect_strs = self.sbcli_utils.get_lvol_connect_str(clone_name)
+            if not connect_strs:
+                raise RuntimeError(
+                    f"No connect strings returned for clone {clone_name}"
+                )
+            nqn = self._extract_nqn(connect_strs)
+
+            # 2. Record the device list so new devices can be diffed out later
+            initial_devices = set(self.ssh_obj.get_devices(node=client))
+
+            # 3. NVMe connect (every connect string is executed)
+            for cs in connect_strs:
+                self.ssh_obj.exec_command(client, cs)
+            sleep_n_sec(3)
+
+            # 4. Detect new device (namespace lvols may add namespace to
+            #    existing controller rather than creating a new one)
+            final_devices = set(self.ssh_obj.get_devices(node=client))
+            new_devices = list(final_devices - initial_devices)
+            device = None
+            if new_devices:
+                device = f"/dev/{new_devices[0]}"
+            else:
+                # Namespace lvol: ns-rescan every listed controller, re-diff
+                out, _ = self.ssh_obj.exec_command(
+                    client,
+                    "ls /dev/nvme[0-9]* 2>/dev/null | grep -oP 'nvme\\d+$' "
+                    "| sort -u",
+                    supress_logs=True,
+                )
+                for ctrl in (out or "").strip().splitlines():
+                    ctrl = ctrl.strip()
+                    if ctrl:
+                        self.ssh_obj.exec_command(
+                            client,
+                            f"sudo nvme ns-rescan /dev/{ctrl}",
+                            supress_logs=True,
+                        )
+                sleep_n_sec(2)
+                rescan_devices = set(self.ssh_obj.get_devices(node=client))
+                new_after_rescan = list(rescan_devices - initial_devices)
+                if new_after_rescan:
+                    device = f"/dev/{new_after_rescan[0]}"
+
+            if not device:
+                # Fall back: look the NQN up via _find_device_by_nqn
+                device = self._find_device_by_nqn(client, nqn)
+            if not device:
+                raise RuntimeError(
+                    f"Could not find block device for clone {clone_name} "
+                    f"after NVMe connect (NQN={nqn})"
+                )
+
+            self.logger.info(
+                f"[mount_verify] Clone {clone_name} -> device {device}"
+            )
+
+            # 5. Run short FIO read; FIO writes its report to fio_log
+            fio_log = f"/tmp/fio_verify_{clone_name}.log"
+            fio_cmd = (
+                f"sudo fio --name=verify-{clone_name[:20]} "
+                f"--filename={device} --size=4M --bs=4K "
+                f"--rw=read --direct=1 --ioengine=libaio "
+                f"--iodepth=1 --numjobs=1 "
+                f"--output={fio_log}"
+            )
+            self.ssh_obj.exec_command(client, fio_cmd)
+
+            # 6. Read the FIO log back and check it for errors
+            fio_output, _ = self.ssh_obj.exec_command(
+                client, f"cat {fio_log}", supress_logs=True,
+            )
+            fio_output = fio_output or ""
+            # The first line with a non-zero err= value marks the read failed
+            err_found = False
+            for line in fio_output.splitlines():
+                if "err=" in line:
+                    # Extract err value: "err= 5" or "err=5"
+                    import re
+                    m = re.search(r"err=\s*(\d+)", line)
+                    if m and int(m.group(1)) != 0:
+                        err_found = True
+                        break
+
+            if err_found:
+                self.logger.error(
+                    f"[mount_verify] FIO reported error on clone "
+                    f"{clone_name}:\n{fio_output}"
+                )
+                raise RuntimeError(
+                    f"FIO read error on clone {clone_name}: {fio_output[:200]}"
+                )
+
+            elapsed = time.time() - t0
+            self.logger.info(
+                f"[mount_verify] Clone {clone_name} verified OK "
+                f"({elapsed:.1f}s)"
+            )
+            self._record_timing(
+                "mount_verify", clone_name, elapsed,
+                self._snapshot_inventory(),
+            )
+
+        finally:
+            # Disconnect whenever an NQN was parsed; failures are ignored
+            if nqn:
+                try:
+                    self.ssh_obj.exec_command(
+                        client, f"sudo nvme disconnect -n {nqn}",
+                    )
+                except Exception:
+                    pass
 
     # ── Delete implementations (with verification) ────────────────────────
 
@@ -1031,9 +2507,67 @@ def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.test_name = "parallel_namespace_lvol_k8s"
         self.STORAGE_CLASS_NAME = "simplyblock-ns-stress-sc"
+        self.XFS_STORAGE_CLASS_NAME = "simplyblock-ns-stress-sc-xfs"
         self.SNAPSHOT_CLASS_NAME = "simplyblock-csi-snapshotclass"
         self.k8s_utils = None
 
+    def setup(self):
+        """K8s-native setup; FIO runs as K8s Jobs, so no SSH clients are used.
+
+        Retries the base API calls up to 30 times (sleep_n_sec(10) between
+        attempts), then prepares k8s_utils, log directories and log runner."""
+        self.logger.info("Inside TestParallelNamespaceLvolK8s.setup()")
+
+        max_attempts = 30
+        for attempt in range(1, max_attempts + 1):
+            try:
+                self.logger.info("Getting all storage nodes")
+                self.mgmt_nodes, self.storage_nodes = self.sbcli_utils.get_all_nodes_ip()
+                self.sbcli_utils.list_lvols()
+                self.sbcli_utils.list_storage_pools()
+                break
+            except Exception as e:
+                self.logger.debug(f"API call failed with error: {e}")
+                if attempt == max_attempts:
+                    self.logger.info(f"Retry attempt exhausted. API failed with: {e}. Exiting")
+                    raise e
+                self.logger.info(f"Retrying Base APIs before starting tests. Attempt: {attempt + 1}")
+                sleep_n_sec(10)
+
+        # FIO runs as K8s Jobs, so there are no client machines to track.
+        self.client_machines, self.fio_node = [], []
+
+        # UTC start time, used for the Graylog log export at teardown.
+        self.test_start_time_utc = datetime.now(timezone.utc)
+
+        # Init k8s_utils now so it exists even if _phase_setup fails later.
+        self._init_k8s_utils()
+
+        # Log directories: prefer the NFS base, fall back to ~/e2e-logs.
+        stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+        base_dir = self.nfs_log_base
+        try:
+            os.makedirs(base_dir, exist_ok=True)
+        except OSError:
+            base_dir = os.path.join(os.path.expanduser("~"), "e2e-logs")
+            os.makedirs(base_dir, exist_ok=True)
+        self.docker_logs_path = os.path.join(base_dir, f"{self.test_name}-{stamp}")
+        self.log_path = os.path.join(self.docker_logs_path, "ClientLogs")
+        for directory in (self.log_path, self.docker_logs_path):
+            os.makedirs(directory, exist_ok=True)
+
+        run_dir_file = os.getenv("RUN_DIR_FILE", None)
+        if run_dir_file:
+            with open(run_dir_file, "w") as handle:
+                handle.write(self.docker_logs_path)
+
+        # Start K8s log monitor
+        self.runner_k8s_log = RunnerK8sLog(
+            log_dir=self.docker_logs_path, test_name=self.test_name,
+        )
+        self.runner_k8s_log.start_logging()
+        self.runner_k8s_log.monitor_pod_logs()
+
     # ── K8s helpers ───────────────────────────────────────────────────────
 
     def _init_k8s_utils(self):
@@ -1078,10 +2612,15 @@ def _wait_snapshot_k8s_gone(self, snap_name: str, timeout: int = 120) -> float:
     def _phase_setup(self):
         self._init_k8s_utils()
         # Create pool via sbcli
-        self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        actual_pool = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        if actual_pool and actual_pool != self.pool_name:
+            self.logger.info(
+                f"[setup] Pool name changed: {self.pool_name} -> {actual_pool}"
+            )
+            self.pool_name = actual_pool
         sleep_n_sec(2)
 
-        # Create StorageClass with namespace support
+        # Create StorageClasses with namespace support (ext4 + xfs)
         cluster_id = self.cluster_id or os.environ.get("CLUSTER_ID", "")
         self.k8s_utils.create_storage_class(
             name=self.STORAGE_CLASS_NAME,
@@ -1091,6 +2630,15 @@ def _phase_setup(self):
             npcs=self.npcs,
             max_namespace_per_subsys=self.NAMESPACES_PER_PARENT,
         )
+        self.k8s_utils.create_storage_class(
+            name=self.XFS_STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=self.pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+            fs_type="xfs",
+            max_namespace_per_subsys=self.NAMESPACES_PER_PARENT,
+        )
         self.k8s_utils.create_volume_snapshot_class(
             name=self.SNAPSHOT_CLASS_NAME,
         )
@@ -1099,31 +2647,40 @@ def _phase_cleanup(self):
         self.logger.info("[cleanup] K8s bulk cleanup")
         ns = self.k8s_utils.namespace if self.k8s_utils else "default"
         if self.k8s_utils:
-            # Delete all PVCs with our label
+            # Delete FIO/write-data jobs with our label
             try:
                 self.k8s_utils._exec_kubectl(
-                    f"kubectl delete pvc -l test=ns-stress -n {ns} "
+                    f"kubectl delete job -l test=ns-stress -n {ns} "
                     f"--wait=false --ignore-not-found 2>/dev/null || true"
                 )
             except Exception:
                 pass
-            # Delete all volume snapshots
+            # Delete all PVCs with our label
             try:
                 self.k8s_utils._exec_kubectl(
-                    f"kubectl delete volumesnapshot -l test=ns-stress -n {ns} "
+                    f"kubectl delete pvc -l test=ns-stress -n {ns} "
                     f"--wait=false --ignore-not-found 2>/dev/null || true"
                 )
             except Exception:
                 pass
-            # Delete StorageClass
+            # Delete all volume snapshots
             try:
                 self.k8s_utils._exec_kubectl(
-                    f"kubectl delete storageclass {self.STORAGE_CLASS_NAME} "
-                    f"--ignore-not-found 2>/dev/null || true"
+                    f"kubectl delete volumesnapshot -l test=ns-stress -n {ns} "
+                    f"--wait=false --ignore-not-found 2>/dev/null || true"
                 )
             except Exception:
                 pass
-        # Bulk sbcli cleanup
+            # Delete StorageClasses
+            for sc in [self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME]:
+                try:
+                    self.k8s_utils._exec_kubectl(
+                        f"kubectl delete storageclass {sc} "
+                        f"--ignore-not-found 2>/dev/null || true"
+                    )
+                except Exception:
+                    pass
+        # Targeted sbcli cleanup — only test resources
         try:
             self.sbcli_utils.delete_all_clones()
         except Exception:
@@ -1133,7 +2690,23 @@ def _phase_cleanup(self):
         except Exception:
             pass
         try:
-            self.sbcli_utils.delete_all_lvols()
+            all_lvols = self.sbcli_utils.list_lvols()
+            test_lvols = [
+                name for name in all_lvols
+                if name.startswith("ns-") or name.startswith("cln-")
+                or name.startswith("snap-")
+            ]
+            self.logger.info(
+                f"[cleanup] Deleting {len(test_lvols)}/{len(all_lvols)} "
+                f"test lvols"
+            )
+            for lv_name in test_lvols:
+                try:
+                    self.sbcli_utils.delete_lvol(
+                        lvol_name=lv_name, skip_error=True
+                    )
+                except Exception:
+                    pass
         except Exception:
             pass
         try:
@@ -1141,35 +2714,512 @@ def _phase_cleanup(self):
         except Exception:
             pass
 
-    # ── Phase overrides ───────────────────────────────────────────────────
+    def _phase_verify_cleanup(self):
+        """K8s override: force-delete leftover test PVCs, then run base checks.
+
+        Best effort: kubectl problems are logged, never raised."""
+        ns = self.k8s_utils.namespace if self.k8s_utils else "default"
+        if self.k8s_utils:
+            try:
+                result = self.k8s_utils._exec_kubectl(
+                    f"kubectl get pvc -l test=ns-stress -n {ns} "
+                    f"--no-headers 2>/dev/null || true"
+                )
+                # Sibling methods unpack ``(stdout, stderr)``; accept both.
+                output = result[0] if isinstance(result, tuple) else result
+                lines = [ln for ln in (output or "").split("\n") if ln.strip()]
+                if lines:
+                    self.logger.warning(
+                        f"[verify_cleanup] {len(lines)} test PVCs still "
+                        f"present — force deleting"
+                    )
+                    self.k8s_utils._exec_kubectl(
+                        f"kubectl delete pvc -l test=ns-stress -n {ns} "
+                        f"--wait=false --ignore-not-found 2>/dev/null || true"
+                    )
+                    sleep_n_sec(10)
+            except Exception as exc:
+                self.logger.warning(f"[verify_cleanup] PVC check failed: {exc}")
+        # Delegate to base for sbcli-level verification
+        super()._phase_verify_cleanup()
 
-    def _phase_create_parents(self):
-        """In K8s, create ALL PVCs (NUM_PARENTS × NAMESPACES_PER_PARENT).
-        CSI driver groups into subsystems automatically."""
-        total = self.NUM_PARENTS * self.NAMESPACES_PER_PARENT
-        items = []
-        for i in range(total):
-            pvc_name = f"ns-pvc-{_rand_seq(6)}-{i:04d}"
-            items.append({"name": pvc_name, "idx": i})
-        self._batch_parallel(
-            items, self._timed_create_parent,
-            self.MAX_WORKERS_CREATE, "create_pvcs",
+    # ── K8s verification overrides ────────────────────────────────────────
+    # PVC names != API lvol names (CSI driver uses its own naming), so
+    # verify via K8s PVC status + API lvol count instead of name matching.
+
+    def _verify_all_lvols_exist(self):
+        """K8s override: verify PVCs are Bound and PV names exist in API.
+
+        PVC names (ns-pvc-xxx) don't match API lvol names; the PV name
+        (``spec.volumeName``) is what is looked up in ``list_lvols()``.
+        Polls every 30 s for up to 30 minutes while more than 50% of the
+        registered PVCs are unbound or missing, then raises RuntimeError if
+        still above 50%.  PVs absent from the API are only logged.
+        """
+        ns = self.k8s_utils.namespace
+        with self._lock:
+            all_pvc_names = set(
+                list(self._parent_registry.keys())
+                + list(self._child_registry.keys())
+            )
+        expected = len(all_pvc_names)
+
+        # Retry loop: wait for PVCs to settle (some may still be binding)
+        max_wait = 1800  # 30 minutes
+        poll_interval = 30
+        waited = 0
+        not_bound = []
+        pv_names = []
+        found_pvcs = set()
+
+        while waited <= max_wait:
+            not_bound = []
+            pv_names = []
+            found_pvcs = set()
+
+            # Bulk fetch all test PVCs via -o json (avoids jsonpath quoting issues)
+            out, _ = self.k8s_utils._exec_kubectl(
+                f"kubectl get pvc -l test=ns-stress -n {ns} "
+                f"-o json 2>/dev/null || echo '{{\"items\":[]}}'",
+                supress_logs=True,
+            )
+
+            try:
+                data = json.loads(out or '{"items":[]}')
+                for item in data.get("items", []):
+                    pvc_name = item.get("metadata", {}).get("name", "")
+                    phase = item.get("status", {}).get("phase", "")
+                    pv_name = item.get("spec", {}).get("volumeName", "")
+                    if pvc_name not in all_pvc_names:
+                        continue
+                    found_pvcs.add(pvc_name)
+                    if phase != "Bound":
+                        not_bound.append((pvc_name, phase))
+                    elif pv_name:
+                        pv_names.append((pvc_name, pv_name))
+            except (json.JSONDecodeError, TypeError):
+                self.logger.warning(
+                    f"[verify_lvols] Failed to parse kubectl JSON output "
+                    f"(len={len(out or '')})"
+                )
+
+            # Count EVERY missing PVC: capping here would skew the threshold
+            missing_pvcs = all_pvc_names - found_pvcs
+            if missing_pvcs:
+                not_bound.extend(
+                    (name, "not-found") for name in sorted(missing_pvcs)
+                )
+
+            not_bound_pct = len(not_bound) * 100 / max(expected, 1)
+            if not not_bound or not_bound_pct <= 50:
+                break  # All Bound or within 50% tolerance
+
+            if waited < max_wait:
+                self.logger.info(
+                    f"[verify_lvols] {len(not_bound)}/{expected} PVCs "
+                    f"({not_bound_pct:.1f}%) not yet Bound, waiting "
+                    f"{poll_interval}s... (waited {waited}s/{max_wait}s)"
+                )
+                sleep_n_sec(poll_interval)
+                waited += poll_interval
+            else:
+                break  # Exhausted wait time
+
+        # Final assessment after wait (log output is truncated to 10 entries)
+        not_bound_pct = len(not_bound) * 100 / max(expected, 1)
+        if not_bound:
+            self.logger.warning(
+                f"[verify_lvols] {len(not_bound)}/{expected} PVCs "
+                f"({not_bound_pct:.1f}%) not Bound/found after "
+                f"{waited}s wait: "
+                f"{not_bound[:10]}{'...' if len(not_bound) > 10 else ''}"
+            )
+        if not_bound_pct > 50:
+            raise RuntimeError(
+                f"[verify_lvols] {not_bound_pct:.1f}% PVCs not Bound "
+                f"exceeds 50% threshold — {len(not_bound)}/{expected}"
+            )
+
+        # Cross-check: PV names (VOLUME column) should exist in API lvol list
+        all_lvols = self.sbcli_utils.list_lvols()
+        lvol_names = set(all_lvols.keys()) if isinstance(all_lvols, dict) else set(all_lvols)
+        missing_in_api = []
+        for pvc_name, pv_name in pv_names:
+            if pv_name not in lvol_names:
+                missing_in_api.append((pvc_name, pv_name))
+
+        if missing_in_api:
+            self.logger.warning(
+                f"[verify_lvols] {len(missing_in_api)}/{expected} PVCs Bound "
+                f"but PV not in API: "
+                f"{missing_in_api[:10]}{'...' if len(missing_in_api) > 10 else ''}"
+            )
+
+        bound_count = expected - len(not_bound)  # not_bound includes missing
+        self.logger.info(
+            f"[verify_lvols] {bound_count}/{expected} PVCs Bound, "
+            f"{len(pv_names) - len(missing_in_api)} PVs found in API "
+            f"(lvol count={len(all_lvols)})"
         )
 
-    def _phase_create_children(self):
-        """No-op in K8s — CSI groups namespaces automatically."""
+    def _verify_all_snapshots_exist(self):
+        """K8s override: verify VolumeSnapshots are readyToUse.
+
+        Uses ``-o json`` instead of jsonpath to avoid shell-quoting issues
+        when _exec_kubectl runs through bash -c or SSH layers.
+
+        Retries up to 30 minutes to allow snapshots to become ready.
+        Warns for not-ready, only fails if >50% not ready.
+        """
+        ns = self.k8s_utils.namespace
+        with self._lock:
+            snap_names = list(self._snap_registry.keys())
+        if not snap_names:
+            self.logger.info("[verify_snapshots] No snapshots to verify")
+            return
+
+        total = len(snap_names)
+        max_wait = 1800  # 30 minutes
+        poll_interval = 30
+        waited = 0
+        not_ready = []
+
+        while waited <= max_wait:
+            not_ready = []
+            # Use -o json for reliable parsing (jsonpath has shell-quoting issues)
+            out, _ = self.k8s_utils._exec_kubectl(
+                f"kubectl get volumesnapshot -l test=ns-stress -n {ns} "
+                f"-o json 2>/dev/null || echo '{{\"items\":[]}}'",
+                supress_logs=True,
+            )
+            found_snaps = {}
+            try:
+                data = json.loads(out or '{"items":[]}')
+                for item in data.get("items", []):
+                    name = item.get("metadata", {}).get("name", "")
+                    ready = item.get("status", {}).get("readyToUse", False)
+                    found_snaps[name] = str(ready).lower()
+            except (json.JSONDecodeError, TypeError):
+                self.logger.warning(
+                    f"[verify_snapshots] Failed to parse kubectl JSON output "
+                    f"(len={len(out or '')})"
+                )
+
+            for snap_name in snap_names:
+                ready = found_snaps.get(snap_name, "not-found")
+                if ready != "true":
+                    not_ready.append((snap_name, ready))
+
+            not_ready_pct = len(not_ready) * 100 / max(total, 1)
+            if not not_ready or not_ready_pct <= 50:
+                break  # All ready or within 50% tolerance
+
+            if waited < max_wait:
+                self.logger.info(
+                    f"[verify_snapshots] {len(not_ready)}/{total} "
+                    f"({not_ready_pct:.1f}%) snapshots not ready, "
+                    f"waiting {poll_interval}s... "
+                    f"(waited {waited}s/{max_wait}s)"
+                )
+                sleep_n_sec(poll_interval)
+                waited += poll_interval
+            else:
+                break  # Exhausted wait time
+
+        not_ready_pct = len(not_ready) * 100 / max(total, 1)
+        if not_ready:
+            self.logger.warning(
+                f"[verify_snapshots] {len(not_ready)}/{total} "
+                f"({not_ready_pct:.1f}%) snapshots not ready after "
+                f"{waited}s wait: "
+                f"{not_ready[:10]}{'...' if len(not_ready) > 10 else ''}"
+            )
+        if not_ready_pct > 50:
+            raise RuntimeError(
+                f"[verify_snapshots] {not_ready_pct:.1f}% snapshots not "
+                f"ready exceeds 50% threshold — "
+                f"{len(not_ready)}/{total}"
+            )
         self.logger.info(
-            "[K8s] Children phase is no-op; CSI driver groups "
-            "PVCs into subsystems automatically"
+            f"[verify_snapshots] {total - len(not_ready)}/{total} "
+            f"snapshots confirmed readyToUse"
         )
 
-    # ── Create implementations ────────────────────────────────────────────
+    def _verify_all_clones_exist(self):
+        """K8s override: verify clone PVCs are Bound.
 
-    def _create_parent_impl(self, params: dict):
-        name = params["name"]
-        self._inc("attempts", "create_parent")
+        Uses ``-o json`` instead of jsonpath to avoid shell-quoting issues.
+
+        Polls every 30s for up to 30 minutes while >50% of clone PVCs are not Bound.
+        Warns about not-Bound clones; fails only if >50% are still not Bound.
+        """
+        ns = self.k8s_utils.namespace
+        with self._lock:
+            clone_names = list(self._clone_registry.keys())
+        if not clone_names:
+            self.logger.info("[verify_clones] No clones to verify")
+            return
+
+        total = len(clone_names)
+        max_wait = 1800  # 30 minutes
+        poll_interval = 30
+        waited = 0
+        not_bound = []
+
+        while waited <= max_wait:
+            not_bound = []
+            # Use -o json for reliable parsing
+            out, _ = self.k8s_utils._exec_kubectl(
+                f"kubectl get pvc -l test=ns-stress -n {ns} "
+                f"-o json 2>/dev/null || echo '{{\"items\":[]}}'",
+                supress_logs=True,
+            )
+            found_pvcs = {}
+            try:
+                data = json.loads(out or '{"items":[]}')
+                for item in data.get("items", []):
+                    name = item.get("metadata", {}).get("name", "")
+                    phase = item.get("status", {}).get("phase", "")
+                    found_pvcs[name] = phase
+            except (json.JSONDecodeError, TypeError):
+                self.logger.warning(
+                    f"[verify_clones] Failed to parse kubectl JSON output "
+                    f"(len={len(out or '')})"
+                )
+
+            for clone_name in clone_names:
+                phase = found_pvcs.get(clone_name, "not-found")
+                if phase != "Bound":
+                    not_bound.append((clone_name, phase))
+
+            not_bound_pct = len(not_bound) * 100 / max(total, 1)
+            if not not_bound or not_bound_pct <= 50:
+                break  # All Bound or within 50% tolerance
+
+            if waited < max_wait:
+                self.logger.info(
+                    f"[verify_clones] {len(not_bound)}/{total} "
+                    f"({not_bound_pct:.1f}%) clone PVCs not Bound, "
+                    f"waiting {poll_interval}s... "
+                    f"(waited {waited}s/{max_wait}s)"
+                )
+                sleep_n_sec(poll_interval)
+                waited += poll_interval
+            else:
+                break  # Exhausted wait time
+
+        not_bound_pct = len(not_bound) * 100 / max(total, 1)
+        if not_bound:
+            self.logger.warning(
+                f"[verify_clones] {len(not_bound)}/{total} "
+                f"({not_bound_pct:.1f}%) clone PVCs not Bound after "
+                f"{waited}s wait: "
+                f"{not_bound[:10]}{'...' if len(not_bound) > 10 else ''}"
+            )
+        if not_bound_pct > 50:
+            raise RuntimeError(
+                f"[verify_clones] {not_bound_pct:.1f}% clone PVCs not "
+                f"Bound exceeds 50% threshold — "
+                f"{len(not_bound)}/{total}"
+            )
+        self.logger.info(
+            f"[verify_clones] {total - len(not_bound)}/{total} clone "
+            f"PVCs confirmed Bound"
+        )
+
+    # ── Two-phase subsystem creation: parents then parallel children ────
+
+    def _phase_create_subsystems(self):
+        """Sub-phase 1: create all parent PVCs in parallel.
+        Sub-phase 2: create all child PVCs in parallel; both sub-phases
+        use MAX_WORKERS_CREATE workers."""
+        pvcs_per_subsys = 1 + self.CHILDREN_PER_PARENT
+        total = self.NUM_PARENTS * pvcs_per_subsys
+        self.logger.info(
+            f"[create_subsystems] {self.NUM_PARENTS} subsystems × "
+            f"{pvcs_per_subsys} PVCs = {total} total "
+            f"(parallel={self.PARALLEL_PARENTS})"
+        )
+
+        # ── Sub-phase 1: Create all parent PVCs (parallel) ─────────
+        parent_items = []
+        parent_names = []
+        for i in range(self.NUM_PARENTS):
+            pname = f"ns-pvc-{_rand_seq(6)}-{i:04d}"
+            sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME])
+            fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4"
+            parent_items.append({"name": pname, "idx": i, "sc_name": sc_name})
+            parent_names.append(pname)
+            # Pre-register so children can reference parents
+            self._parent_registry[pname] = {
+                "id": pname,
+                "children": [],
+                "snapshots": [],
+                "start_child_idx": i * pvcs_per_subsys + 1,
+                "storage_class": sc_name,
+                "fs_type": fs_type,
+            }
+        self.logger.info(
+            f"[create_subsystems][sub1] Creating {self.NUM_PARENTS} parent "
+            f"PVCs (parallel, workers={self.MAX_WORKERS_CREATE})"
+        )
+        parents_t0 = time.time()
+        _ok, parent_fail = self._batch_parallel(
+            parent_items,
+            self._create_single_parent_k8s,
+            self.MAX_WORKERS_CREATE,
+            "create_parents",
+        )
+        parents_elapsed = time.time() - parents_t0
+        self._log_op_stats(
+            "create_parent", batch_label="all parents",
+            batch_elapsed=parents_elapsed,
+        )
+
+        # Remove failed parents from registry (they were pre-registered)
+        failed_parents = []
+        if parent_fail > 0:
+            created_parents = {
+                s["name"] for s in self._timing_samples
+                if s["op"] == "create_parent"
+            }
+            for pname in list(parent_names):
+                if pname not in created_parents:
+                    failed_parents.append(pname)
+                    parent_names.remove(pname)
+                    self._parent_registry.pop(pname, None)
+
+        self.logger.info(
+            f"[create_subsystems][sub1] {len(parent_names)} parents "
+            f"created in {parents_elapsed:.1f}s"
+            f"{f', {len(failed_parents)} FAILED: {failed_parents}' if failed_parents else ''}"
+        )
+
+        # ── Sub-phase 2: Create ALL child PVCs in parallel ─────────
+        total_children = len(parent_names) * self.CHILDREN_PER_PARENT
+        self.logger.info(
+            f"[create_subsystems][sub2] Creating {total_children} child "
+            f"PVCs in parallel (workers={self.MAX_WORKERS_CREATE})"
+        )
+        # Build flat list of all children with parent assignment
+        child_items = []
+        for pi, pname in enumerate(parent_names):
+            for c in range(self.CHILDREN_PER_PARENT):
+                child_idx = pi * pvcs_per_subsys + 1 + c
+                child_items.append({
+                    "name": f"ns-pvc-{_rand_seq(6)}-{child_idx:04d}",
+                    "parent_name": pname,
+                })
+        children_t0 = time.time()
+        _ok, child_fail = self._batch_parallel(
+            child_items,
+            self._create_single_child_k8s,
+            self.MAX_WORKERS_CREATE,
+            "create_children",
+        )
+        children_elapsed = time.time() - children_t0
+        self._log_op_stats(
+            "create_child", batch_label="all children",
+            batch_elapsed=children_elapsed,
+        )
+
+        # Identify failed children
+        failed_children = []
+        if child_fail > 0:
+            created_children = set(self._child_registry.keys())
+            for item in child_items:
+                if item["name"] not in created_children:
+                    failed_children.append(
+                        f"{item['name']} (parent={item['parent_name']})"
+                    )
+
+        # ── Failure summary ──────────────────────────────────────────
+        total_attempted = self.NUM_PARENTS + total_children
+        total_failed = len(failed_parents) + len(failed_children)
+        fail_pct = (total_failed * 100 / max(total_attempted, 1))
+
+        if total_failed > 0:
+            self.logger.warning(
+                f"[create_subsystems] FAILED PVCs: {total_failed}/"
+                f"{total_attempted} ({fail_pct:.1f}%)"
+            )
+            if failed_parents:
+                self.logger.warning(
+                    f"  Failed PARENTS ({len(failed_parents)}): "
+                    f"{failed_parents}"
+                )
+            if failed_children:
+                self.logger.warning(
+                    f"  Failed CHILDREN ({len(failed_children)}): "
+                    f"{failed_children}"
+                )
+
+        if fail_pct > 50:
+            raise RuntimeError(
+                f"[create_subsystems] {fail_pct:.1f}% failure rate "
+                f"exceeds 50% threshold — {total_failed}/{total_attempted} "
+                f"PVCs failed (parents={len(failed_parents)}, "
+                f"children={len(failed_children)})"
+            )
+
+        # ── Bulk verify ──────────────────────────────────────────────
+        all_lvols = self.sbcli_utils.list_lvols()
+        expected_created = total_attempted - total_failed
+        if len(all_lvols) < expected_created:
+            self.logger.warning(
+                f"[create_subsystems] lvol count {len(all_lvols)} < "
+                f"expected {expected_created}"
+            )
+
+        self.logger.info(
+            f"[create_subsystems] Done: {len(self._parent_registry)} "
+            f"parents, {len(self._child_registry)} children"
+            f"{f' ({total_failed} failures tolerated)' if total_failed else ''}"
+        )
+
+    def _create_single_parent_k8s(self, item):
+        """Create a single parent PVC. Called from _batch_parallel."""
+        name = item["name"]
+        sc_name = item.get("sc_name", self.STORAGE_CLASS_NAME)
+        t0 = time.time()
+        self._create_pvc(name, sc_name=sc_name)
+        self._record_timing(
+            "create_parent", name,
+            time.time() - t0, self._snapshot_inventory(),
+        )
+        self._inc("counts", "parents_created")
+
+    def _create_single_child_k8s(self, item):
+        """Create a single child PVC and register it under its parent.
+
+        Called from _batch_parallel with MAX_WORKERS_CREATE concurrency —
+        all children for all parents run in parallel."""
+        child_name = item["name"]
+        parent_name = item["parent_name"]
+        # Children inherit StorageClass (and thus fs_type) from parent
+        sc_name = self._parent_registry.get(parent_name, {}).get(
+            "storage_class", self.STORAGE_CLASS_NAME
+        )
+        t0 = time.time()
+        self._create_pvc(child_name, sc_name=sc_name)
+        elapsed = time.time() - t0
+        self._record_timing(
+            "create_child", child_name,
+            elapsed, self._snapshot_inventory(),
+        )
+        with self._lock:
+            self._child_registry[child_name] = {
+                "id": child_name, "parent_name": parent_name,
+            }
+            self._parent_registry[parent_name]["children"].append(
+                child_name
+            )
+        self._inc("counts", "children_created")
+
+    def _create_pvc(self, name: str, sc_name: str = None):
+        """Create a single PVC with label and wait for Bound."""
+        sc = sc_name or self.STORAGE_CLASS_NAME
         ns = self.k8s_utils.namespace
-        # Create PVC with label for easy cleanup
         yaml_content = (
             f"apiVersion: v1\n"
             f"kind: PersistentVolumeClaim\n"
@@ -1180,7 +3230,7 @@ def _create_parent_impl(self, params: dict):
             f"spec:\n"
             f"  accessModes:\n"
             f"    - ReadWriteOnce\n"
-            f"  storageClassName: {self.STORAGE_CLASS_NAME}\n"
+            f"  storageClassName: {sc}\n"
             f"  resources:\n"
             f"    requests:\n"
             f"      storage: {self.PVC_SIZE}\n"
@@ -1188,16 +3238,127 @@ def _create_parent_impl(self, params: dict):
         self.k8s_utils.apply_yaml(yaml_content, namespace=ns)
         if not self.k8s_utils.wait_pvc_bound(name, timeout=300, namespace=ns):
             raise TimeoutError(f"PVC {name} not Bound within 300s")
+
+    # ── Write data (parallel FIO) to snapshot-target PVCs ──────────────
+
+    def _phase_write_data(self):
+        """Run parallel FIO (100 MB write) on all PVCs that will be snapshotted.
+
+        Snapshot targets = all parents + 1 random child.  The chosen child is
+        stored in self._snapshot_child so _phase_create_snapshots reuses it.
+        """
+        parents = list(self._parent_registry.keys())
+
+        # Pick the random child now so we FIO it and snapshot it later
         with self._lock:
-            self._parent_registry[name] = {
-                "id": name, "children": [], "snapshots": [],
-            }
-            self._metrics["counts"]["parents_created"] += 1
-        self.logger.info(f"[create_pvc] {name} Bound")
+            child_names = list(self._child_registry.keys())
+        if child_names:
+            self._snapshot_child = random.choice(child_names)
+            self.logger.info(
+                f"[write_data] Pre-selected child for snapshot: "
+                f"{self._snapshot_child}"
+            )
+        else:
+            self._snapshot_child = None
+
+        targets = list(parents)
+        if self._snapshot_child:
+            targets.append(self._snapshot_child)
 
-    def _create_child_impl(self, params: dict):
-        """No-op in K8s."""
-        pass
+        child_label = " + 1 child" if self._snapshot_child else ""
+        self.logger.info(
+            f"[write_data] Running parallel FIO (100 MB) on "
+            f"{len(targets)} PVCs ({len(parents)} parents"
+            f"{child_label}) via K8s Jobs"
+        )
+
+        fio_items = [{"pvc_name": pvc} for pvc in targets]
+        write_t0 = time.time()
+        _ok, fail = self._batch_parallel(
+            fio_items, self._run_fio_job_k8s,
+            self.MAX_WORKERS_CREATE, "write_data",
+        )
+        write_elapsed = time.time() - write_t0
+        self.logger.info(
+            f"[write_data] Done: {_ok}/{len(targets)} OK, "
+            f"{fail} failed in {write_elapsed:.1f}s"
+        )
+        if fail > 0:
+            self.logger.warning(
+                f"[write_data] {fail}/{len(targets)} FIO jobs failed"
+            )
+
+    def _run_fio_job_k8s(self, item):
+        """Create a K8s Job running FIO 100 MB sequential write on a PVC."""
+        pvc_name = item["pvc_name"]
+        ns = self.k8s_utils.namespace
+        job_name = f"fio-{pvc_name[:40]}-{_rand_seq(4)}"
+        t0 = time.time()
+
+        yaml_content = (
+            f"apiVersion: batch/v1\n"
+            f"kind: Job\n"
+            f"metadata:\n"
+            f"  name: {job_name}\n"
+            f"  labels:\n"
+            f"    test: ns-stress\n"
+            f"    purpose: write-data\n"
+            f"spec:\n"
+            f"  backoffLimit: 0\n"
+            f"  template:\n"
+            f"    spec:\n"
+            f"      restartPolicy: Never\n"
+            f"      containers:\n"
+            f"      - name: fio\n"
+            f"        image: dockerpinata/fio:2.1\n"
+            f"        command:\n"
+            f"        - fio\n"
+            f"        args:\n"
+            f"        - --name=write-{pvc_name[:20]}\n"
+            f"        - --filename=/data/testfile\n"
+            f"        - --size=100M\n"
+            f"        - --bs=1M\n"
+            f"        - --rw=write\n"
+            f"        - --direct=1\n"
+            f"        - --ioengine=libaio\n"
+            f"        - --iodepth=1\n"
+            f"        - --numjobs=1\n"
+            f"        volumeMounts:\n"
+            f"        - name: vol\n"
+            f"          mountPath: /data\n"
+            f"      volumes:\n"
+            f"      - name: vol\n"
+            f"        persistentVolumeClaim:\n"
+            f"          claimName: {pvc_name}\n"
+        )
+        self.k8s_utils.apply_yaml(yaml_content, namespace=ns)
+        result = self.k8s_utils.wait_job_complete(
+            job_name, timeout=300, namespace=ns,
+        )
+        elapsed = time.time() - t0
+        if result != "succeeded":
+            self.logger.error(
+                f"[write_data] FIO job {job_name} for PVC {pvc_name} "
+                f"ended with: {result} ({elapsed:.1f}s)"
+            )
+            raise RuntimeError(
+                f"FIO job {job_name} for PVC {pvc_name} "
+                f"ended with: {result}"
+            )
+        # Clean up the completed job
+        try:
+            self.k8s_utils.delete_resource("job", job_name, namespace=ns)
+        except Exception:
+            pass
+        self._record_timing(
+            "write_data", pvc_name, elapsed,
+            self._snapshot_inventory(),
+        )
+        self.logger.info(
+            f"[write_data] {pvc_name} OK ({elapsed:.1f}s)"
+        )
+
+    # ── Create implementations ────────────────────────────────────────────
 
     def _create_snapshot_impl(self, params: dict):
         snap_name = params["name"]
@@ -1236,6 +3397,7 @@ def _create_snapshot_impl(self, params: dict):
     def _create_clone_impl(self, params: dict):
         clone_name = params["name"]
         snap_name = params["snap_name"]
+        sc_name = params.get("sc_name", self.STORAGE_CLASS_NAME)
         self._inc("attempts", "create_clone")
         ns = self.k8s_utils.namespace
         # Clone PVC from VolumeSnapshot with label
@@ -1249,7 +3411,7 @@ def _create_clone_impl(self, params: dict):
             f"spec:\n"
             f"  accessModes:\n"
             f"    - ReadWriteOnce\n"
-            f"  storageClassName: {self.STORAGE_CLASS_NAME}\n"
+            f"  storageClassName: {sc_name}\n"
             f"  resources:\n"
             f"    requests:\n"
             f"      storage: {self.PVC_SIZE}\n"
@@ -1259,8 +3421,26 @@ def _create_clone_impl(self, params: dict):
             f"    apiGroup: snapshot.storage.k8s.io\n"
         )
         self.k8s_utils.apply_yaml(yaml_content, namespace=ns)
-        if not self.k8s_utils.wait_pvc_bound(clone_name, timeout=300, namespace=ns):
-            raise TimeoutError(f"Clone PVC {clone_name} not Bound within 300s")
+        with self._lock:
+            self._clones_binding += 1
+            concurrent = self._clones_binding
+        self.logger.info(
+            f"[create_clone] {clone_name} waiting for Bound "
+            f"(concurrent_binding={concurrent})"
+        )
+        bind_t0 = time.time()
+        try:
+            if not self.k8s_utils.wait_pvc_bound(
+                clone_name, timeout=self.CLONE_BIND_TIMEOUT, namespace=ns
+            ):
+                raise TimeoutError(
+                    f"Clone PVC {clone_name} not Bound "
+                    f"within {self.CLONE_BIND_TIMEOUT}s"
+                )
+        finally:
+            with self._lock:
+                self._clones_binding -= 1
+        bind_elapsed = time.time() - bind_t0
         with self._lock:
             self._clone_registry[clone_name] = {
                 "id": clone_name, "snap_name": snap_name,
@@ -1268,7 +3448,128 @@ def _create_clone_impl(self, params: dict):
             if snap_name in self._snap_registry:
                 self._snap_registry[snap_name]["clones"].append(clone_name)
             self._metrics["counts"]["clones_created"] += 1
-        self.logger.info(f"[create_clone] {clone_name} Bound (snap={snap_name})")
+        self.logger.info(
+            f"[create_clone] {clone_name} Bound in {bind_elapsed:.1f}s "
+            f"(snap={snap_name})"
+        )
+
+    # ── Clone mount verification ─────────────────────────────────────────
+
+    def _mount_verify_single_clone(self, item):
+        """Create a K8s FIO Job mounting the clone PVC, run read, check errors."""
+        clone_name = item["clone_name"]
+        ns = self.k8s_utils.namespace
+        job_name = f"verify-{clone_name[:40]}-{_rand_seq(4)}"
+        t0 = time.time()
+
+        try:
+            # 1. Create FIO Job that mounts the clone PVC and reads 4 MB
+            yaml_content = (
+                f"apiVersion: batch/v1\n"
+                f"kind: Job\n"
+                f"metadata:\n"
+                f"  name: {job_name}\n"
+                f"  namespace: {ns}\n"
+                f"  labels:\n"
+                f"    test: ns-stress\n"
+                f"    purpose: mount-verify\n"
+                f"spec:\n"
+                f"  backoffLimit: 0\n"
+                f"  template:\n"
+                f"    spec:\n"
+                f"      restartPolicy: Never\n"
+                f"      containers:\n"
+                f"      - name: fio\n"
+                f"        image: dockerpinata/fio:2.1\n"
+                f"        command:\n"
+                f"        - fio\n"
+                f"        args:\n"
+                f"        - --name=verify-{clone_name[:20]}\n"
+                f"        - --filename=/data/testfile\n"
+                f"        - --size=4M\n"
+                f"        - --bs=4K\n"
+                f"        - --rw=read\n"
+                f"        - --direct=1\n"
+                f"        - --ioengine=libaio\n"
+                f"        - --iodepth=1\n"
+                f"        - --numjobs=1\n"
+                f"        volumeMounts:\n"
+                f"        - name: vol\n"
+                f"          mountPath: /data\n"
+                f"      volumes:\n"
+                f"      - name: vol\n"
+                f"        persistentVolumeClaim:\n"
+                f"          claimName: {clone_name}\n"
+            )
+            self.k8s_utils.apply_yaml(yaml_content, namespace=ns)
+
+            # 2. Wait for job completion
+            result = self.k8s_utils.wait_job_complete(
+                job_name, timeout=300, namespace=ns,
+            )
+            elapsed = time.time() - t0
+
+            # 3. Fetch pod logs for FIO output
+            fio_output = ""
+            try:
+                # Find the pod created by this job
+                pod_out, _ = self.k8s_utils._exec_kubectl(
+                    f"kubectl get pods -n {ns} -l job-name={job_name} "
+                    f"-o jsonpath='{{.items[0].metadata.name}}' 2>/dev/null",
+                    supress_logs=True,
+                )
+                pod_name = (pod_out or "").strip()
+                if pod_name:
+                    fio_output = self.k8s_utils.get_pod_logs(
+                        pod_name, namespace=ns, tail=100,
+                    )
+            except Exception:
+                pass
+
+            # 4. Check for errors
+            if result != "succeeded":
+                self.logger.error(
+                    f"[mount_verify] FIO job {job_name} for clone "
+                    f"{clone_name} ended with: {result} ({elapsed:.1f}s)"
+                    f"\nFIO output:\n{fio_output}"
+                )
+                raise RuntimeError(
+                    f"FIO verify job for clone {clone_name} failed: "
+                    f"{result}"
+                )
+
+            # 5. Parse FIO output for err=
+            import re
+            for line in (fio_output or "").splitlines():
+                if "err=" in line:
+                    m = re.search(r"err=\s*(\d+)", line)
+                    if m and int(m.group(1)) != 0:
+                        self.logger.error(
+                            f"[mount_verify] FIO reported error on clone "
+                            f"{clone_name}:\n{fio_output}"
+                        )
+                        raise RuntimeError(
+                            f"FIO read error on clone {clone_name}: "
+                            f"{line.strip()}"
+                        )
+
+            self.logger.info(
+                f"[mount_verify] Clone {clone_name} verified OK "
+                f"({elapsed:.1f}s)"
+            )
+            self._record_timing(
+                "mount_verify", clone_name, elapsed,
+                self._snapshot_inventory(),
+            )
+
+        finally:
+            # Always clean up the job
+            try:
+                self.k8s_utils.delete_resource(
+                    "job", job_name, namespace=ns,
+                )
+            except Exception:
+                pass
 
     # ── Delete implementations (with verification) ────────────────────────
 
@@ -1297,8 +3598,17 @@ def _delete_snapshot_impl(self, snap_name: str):
             self._metrics["counts"]["snapshots_deleted"] += 1
 
     def _delete_child_impl(self, child_name: str):
-        """No-op in K8s — no separate children."""
-        pass
+        """Delete child PVC in K8s."""
+        self._inc("attempts", "delete_child")
+        ns = self.k8s_utils.namespace
+        self.k8s_utils._exec_kubectl(
+            f"kubectl delete pvc {child_name} -n {ns} "
+            f"--ignore-not-found --wait=false 2>/dev/null || true"
+        )
+        self._wait_pvc_gone(child_name)
+        with self._lock:
+            self._child_registry.pop(child_name, None)
+            self._metrics["counts"]["children_deleted"] += 1
 
     def _delete_parent_impl(self, parent_name: str):
         self._inc("attempts", "delete_parent")
diff --git a/e2e/stress_test/device_failure_migration.py b/e2e/stress_test/device_failure_migration.py
index ab43efe8d..93a76aba3 100755
--- a/e2e/stress_test/device_failure_migration.py
+++ b/e2e/stress_test/device_failure_migration.py
@@ -2,21 +2,43 @@
 Device Failure Migration Stress Test
 
 Measures the time it takes to complete failure migration on a single device.
-Two variants:
 
-  - DeviceFailureMigrationNoLoad:
-        Fill device to 65 %, fail it, measure migration time (no IO load).
-  - DeviceFailureMigrationUnderLoad:
-        Fill device to 65 %, start IO on every cluster node, fail device,
-        measure migration time while IO is running.
+Variants:
 
-Both tests are Docker-mode only (sbcli + SSH FIO).  They work with any
-cluster geometry (ndcs/npcs) and require at least one client node
-(CLIENT_IP env var or mgmt node fallback).
+  Docker (sbcli + SSH FIO):
+  - DeviceFailureMigrationNoLoadDocker          — API removal, no IO load
+  - DeviceFailureMigrationUnderLoadDocker       — API removal, IO load running
+  - DeviceFailureMigrationPCIeNoLoadDocker      — PCIe sysfs removal, no IO load
+  - DeviceFailureMigrationPCIeUnderLoadDocker   — PCIe sysfs removal, IO load running
+
+  K8s-native (PVC + FIO K8s Jobs):
+  - DeviceFailureMigrationNoLoadK8s       — API removal, no IO load
+  - DeviceFailureMigrationUnderLoadK8s    — API removal, IO load running
+  - DeviceFailureMigrationPCIeNoLoadK8s   — PCIe sysfs removal, no IO load
+  - DeviceFailureMigrationPCIeUnderLoadK8s — PCIe sysfs removal, IO load running
+
+Failure modes:
+  - "api"  : Logical removal via REST API + set-failed-device CLI
+  - "pcie" : Physical removal via /sys/bus/pci/devices/<addr>/remove
+
+All tests work with any cluster geometry (ndcs/npcs) and require at least
+one storage node with a device.
+
+Invocation:
+  # Docker
+  python3 stress.py --testname DeviceFailureMigrationNoLoadDocker --ndcs 2 --npcs 2
+  python3 stress.py --testname DeviceFailureMigrationPCIeNoLoadDocker --ndcs 2 --npcs 2
+
+  # K8s
+  python3 stress.py --testname DeviceFailureMigrationNoLoadK8s --ndcs 2 --npcs 2 --run_k8s True
+  python3 stress.py --testname DeviceFailureMigrationPCIeUnderLoadK8s --ndcs 2 --npcs 2 --run_k8s True
 """
 
 import json
 import math
+import os
+import random
+import string
 import threading
 import time
 from datetime import datetime, timezone
@@ -28,8 +50,14 @@
 from utils.common_utils import sleep_n_sec
 
 
+def _rand_seq(length: int = 8) -> str:
+    first = random.choice(string.ascii_lowercase)
+    rest = "".join(random.choices(string.ascii_lowercase + string.digits, k=length - 1))
+    return first + rest
+
+
 # ═══════════════════════════════════════════════════════════════════════════════
-#  Mixin — shared orchestration for both variants
+#  Mixin — shared orchestration for all variants
 # ═══════════════════════════════════════════════════════════════════════════════
 
 class _DeviceFailureMigrationBase:
@@ -59,28 +87,50 @@ def _init_migration_state(self):
         self._load_fio_threads = []
         self._sn_nodes = []
         self._with_io_load = False
+        self._failure_mode = "api"
+        self._pre_migration_checksums = {}  # {lvol_name: {filepath: md5}}
 
     # ── Main flow ────────────────────────────────────────────────────────────
 
-    def _run_migration_test(self, with_io_load=False):
-        """Main flow: setup → fill → [start IO] → fail → migrate → cleanup."""
+    def _run_migration_test(self, with_io_load=False, failure_mode="api"):
+        """Main flow: setup -> fill -> [checksum] -> [start IO] -> fail -> migrate -> validate -> recover -> cleanup.
+
+        NoLoad:  fill → md5sum → fail device → migrate → verify md5 + FIO fill logs → recover device → cleanup
+        UnderLoad: fill → start FIO (verify=md5) → fail device → migrate → check FIO OK → wait FIO complete → recover → cleanup
+        """
         self._with_io_load = with_io_load
+        self._failure_mode = failure_mode
+        self._test_passed = False
         t0 = time.time()
         try:
             self._phase_setup_pool_and_lvols()
             self._phase_fill_devices()
+            if not with_io_load:
+                self._phase_compute_checksums()
             if with_io_load:
                 self._phase_start_io_load()
-            self._phase_fail_and_migrate()
+            if failure_mode == "pcie":
+                self._phase_fail_and_migrate_pcie()
+            else:
+                self._phase_fail_and_migrate()
+            self._phase_validate()
+            if with_io_load:
+                # Wait for FIO to finish naturally — do NOT kill it
+                self._phase_wait_fio_completion()
+                self._phase_validate_fio()
+            self._test_passed = True
         finally:
             if with_io_load:
-                self._phase_stop_io_load()
+                self._phase_stop_io_load()  # kill FIO only if still running (failure path)
+            self._phase_recover_device()
             self._phase_cleanup()
             self._timing["total_duration"] = time.time() - t0
             self._print_migration_summary()
             self._write_timing_json()
             self._generate_charts()
 
+        self.logger.info("TEST CASE PASSED !!!")
+
     # ── Phase 1: create pool, lvols, connect, format, mount ──────────────────
 
     def _phase_setup_pool_and_lvols(self):
@@ -102,11 +152,20 @@ def _phase_setup_pool_and_lvols(self):
             raise RuntimeError(
                 f"No devices found on target node {self._target_node_id}"
             )
-        self._target_device_info = devices[0]
-        self._target_device_id = devices[0]["id"]
+        # Filter for online devices only — old failed_and_migrated devices
+        # remain in the list after recovery and must be skipped
+        online_devices = [d for d in devices if d.get("status") == "online"]
+        if not online_devices:
+            raise RuntimeError(
+                f"No online devices found on target node {self._target_node_id}. "
+                f"Device statuses: {[d.get('status') for d in devices]}"
+            )
+        self._target_device_info = online_devices[0]
+        self._target_device_id = online_devices[0]["id"]
         self.logger.info(
             f"Target node: {self._target_node_id}, "
-            f"Target device: {self._target_device_id}"
+            f"Target device: {self._target_device_id} "
+            f"(selected from {len(online_devices)} online / {len(devices)} total devices)"
         )
 
         # Get node capacity to calculate how many lvols to create
@@ -227,9 +286,16 @@ def _phase_fill_devices(self):
             t.start()
             threads.append(t)
 
-        # Wait for all fills to complete
+        # Wait for FIO launch threads to return (they return after verifying
+        # FIO is running in tmux, but FIO itself is still writing)
         for t in threads:
-            t.join(timeout=3600)
+            t.join(timeout=60)
+
+        # Wait for actual FIO processes to finish on the remote node
+        self.logger.info("Waiting for FIO fill processes to complete on remote node ...")
+        self.common_utils.manage_fio_threads(
+            node=client, threads=[], timeout=3600
+        )
 
         # Verify fill level
         sleep_n_sec(5)
@@ -244,6 +310,153 @@ def _phase_fill_devices(self):
             f"Fill complete ({self._timing['fill_duration']:.1f}s)"
         )
 
+    # ── Phase 2b: compute pre-migration checksums (no-load variant) ─────────
+
+    def _phase_compute_checksums(self):
+        """Compute MD5 checksums of all files on target lvols before migration."""
+        self.logger.info("=== Phase: Compute pre-migration checksums ===")
+        client = self.fio_node[0]
+        self._pre_migration_checksums = {}
+
+        for name in self._lvols_on_target:
+            info = self.lvol_mount_details.get(name)
+            if not info:
+                continue
+            mount = info["Mount"]
+            try:
+                files = self.ssh_obj.find_files(client, mount)
+                if files:
+                    checksums = self.ssh_obj.generate_checksums(client, files)
+                    self._pre_migration_checksums[name] = checksums
+                    self.logger.info(
+                        f"Captured {len(checksums)} file checksums for {name}"
+                    )
+                else:
+                    self.logger.warning(f"No files found on {mount} for checksum")
+            except Exception as exc:
+                self.logger.warning(f"Checksum capture failed for {name}: {exc}")
+
+        self.logger.info(
+            f"Pre-migration checksums captured for "
+            f"{len(self._pre_migration_checksums)} lvols"
+        )
+
+    def _phase_verify_checksums(self):
+        """Verify MD5 checksums of target lvols match pre-migration values."""
+        self.logger.info("=== Verifying post-migration data integrity ===")
+        client = self.fio_node[0]
+        mismatches = 0
+
+        for name, expected_checksums in self._pre_migration_checksums.items():
+            info = self.lvol_mount_details.get(name)
+            if not info:
+                continue
+            mount = info["Mount"]
+            try:
+                files = self.ssh_obj.find_files(client, mount)
+                self.ssh_obj.verify_checksums(
+                    client, files, expected_checksums,
+                    message=(
+                        f"Data integrity check failed for lvol {name} "
+                        f"after device migration"
+                    ),
+                )
+                self.logger.info(f"Checksums verified for {name}: OK")
+            except ValueError as exc:
+                self.logger.error(f"Checksum MISMATCH for {name}: {exc}")
+                mismatches += 1
+            except Exception as exc:
+                self.logger.error(
+                    f"Checksum verification error for {name}: {exc}"
+                )
+                mismatches += 1
+
+        assert mismatches == 0, (
+            f"Data integrity check failed: {mismatches} lvol(s) had "
+            f"checksum mismatches after migration"
+        )
+        self.logger.info(
+            "All post-migration checksums verified — data integrity OK"
+        )
+
+    def _phase_validate_fio(self):
+        """Check FIO logs for errors after migration (under-load variant).
+
+        IO errors on lvols hosted on the failed device are expected and
+        logged as warnings.  IO errors on lvols hosted on OTHER devices
+        are logged as errors.
+        """
+        self.logger.info("=== Verifying FIO logs for errors ===")
+        client = self.fio_node[0]
+        fail_words = ["error", "fail", "interrupt", "terminate"]
+        target_errors = []
+        other_errors = []
+
+        all_names = self._lvols_on_target + self._lvols_on_others
+        for name in all_names:
+            info = self.lvol_mount_details.get(name)
+            if not info or not info.get("Log"):
+                continue
+            try:
+                log_data = self.ssh_obj.exec_command(
+                    client, f"cat {info['Log']} 2>/dev/null || true"
+                )
+                if not log_data:
+                    self.logger.warning(f"Empty or missing FIO log for {name}")
+                    continue
+                log_lower = log_data.lower() if isinstance(log_data, str) else str(log_data).lower()
+                found = [w for w in fail_words if w in log_lower]
+                if found:
+                    msg = f"{name}: FIO log contains {found}"
+                    if name in self._lvols_on_target:
+                        target_errors.append(msg)
+                        self.logger.warning(
+                            f"[expected] FIO error on failed-device lvol {name}: {found}"
+                        )
+                    else:
+                        other_errors.append(msg)
+                        self.logger.error(
+                            f"FIO error on non-target lvol {name}: {found}"
+                        )
+                else:
+                    self.logger.info(f"FIO log for {name}: no errors")
+            except Exception as exc:
+                self.logger.warning(f"Could not read FIO log for {name}: {exc}")
+
+        if target_errors:
+            self.logger.warning(
+                f"{len(target_errors)} FIO error(s) on target-device lvols "
+                f"(expected during device migration)"
+            )
+        if other_errors:
+            self.logger.error(
+                f"{len(other_errors)} FIO error(s) on non-target lvols: "
+                f"{other_errors}"
+            )
+
+    # ── Phase: wait for FIO to complete naturally ──────────────────────────
+
+    def _phase_wait_fio_completion(self):
+        """Wait for FIO processes to finish naturally (do NOT kill them).
+
+        Uses ``common_utils.manage_fio_threads`` to poll for active FIO
+        processes on the client node until none remain.
+        """
+        self.logger.info("=== Phase: Waiting for FIO to complete naturally ===")
+        client = self.fio_node[0]
+        t0 = time.time()
+        timeout = self.FIO_LOAD_RUNTIME + 300  # runtime + buffer
+
+        self.common_utils.manage_fio_threads(
+            node=client, threads=[], timeout=timeout
+        )
+
+        self._timing["fio_completion_duration"] = time.time() - t0
+        self.logger.info(
+            f"All FIO processes completed "
+            f"({self._timing['fio_completion_duration']:.1f}s)"
+        )
+
     # ── Phase 3: start random IO on all nodes (under-load variant) ───────────
 
     def _phase_start_io_load(self):
@@ -277,19 +490,20 @@ def _phase_start_io_load(self):
             f"IO load started: {len(self._load_fio_threads)} FIO threads"
         )
 
-    # ── Phase 4: remove device → set-failed → wait migration ────────────────
+    # ── Phase 4a: API removal -> set-failed -> wait migration ────────────────
 
     def _phase_fail_and_migrate(self):
         self.logger.info(
-            f"=== Phase: Fail device {self._target_device_id} and migrate ==="
+            f"=== Phase: Fail device {self._target_device_id} via API and migrate ==="
         )
         t0 = time.time()
 
-        # Step 1: remove device (ONLINE → REMOVED)
-        self.logger.info(f"Removing device {self._target_device_id} …")
+        # Step 1: remove device (ONLINE -> REMOVED)
+        self.logger.info(f"Removing device {self._target_device_id} ...")
         self.sbcli_utils.remove_device(self._target_device_id)
         self.sbcli_utils.wait_for_device_status(
-            self._target_node_id, "removed", timeout=120
+            self._target_node_id, "removed", timeout=120,
+            device_id=self._target_device_id,
         )
         self._timing["remove_duration"] = time.time() - t0
         self.logger.info(
@@ -306,14 +520,102 @@ def _phase_fail_and_migrate(self):
         sleep_n_sec(5)
 
         # Step 3: wait for migration to complete
-        self.logger.info("Waiting for failure migration tasks to complete …")
-        migration_elapsed = self.sbcli_utils.wait_migration_tasks_complete(
-            timeout=self.MIGRATION_TIMEOUT
+        self._wait_migration_and_verify(t1)
+
+    # ── Phase 4b: PCIe sysfs removal -> set-failed -> wait migration ─────────
+
+    def _phase_fail_and_migrate_pcie(self):
+        """Hot-unplug the target device via sysfs, remove it, mark it failed and wait for
+        migration; the PCI bus is rescanned afterwards even when one of the steps raises."""
+        self.logger.info(
+            f"=== Phase: Fail device {self._target_device_id} via PCIe and migrate ==="
         )
-        self._timing["migration_duration"] = time.time() - t1
+        t0 = time.time()
+
+        # Step 1: Get node IP and PCIe address
+        node_details = self.sbcli_utils.get_storage_node_details(self._target_node_id)
+        node_ip = node_details[0]["mgmt_ip"]
+        pcie_addr = self._target_device_info.get("pcie_address", "")
+        if not pcie_addr:
+            raise RuntimeError(
+                f"No pcie_address found for device {self._target_device_id}"
+            )
+        self.logger.info(
+            f"PCIe hot-unplug: device {self._target_device_id} "
+            f"at {pcie_addr} on {node_ip}"
+        )
+
+        # Step 2: PCIe hot-unplug via sysfs
+        try:
+            self.ssh_obj.exec_command(
+                node=node_ip,
+                command=f"echo 1 | sudo tee /sys/bus/pci/devices/{pcie_addr}/remove"
+            )
+            self.logger.info("PCIe device removed via sysfs")
+            sleep_n_sec(10)
+
+            # Step 3: Wait for control plane to detect device loss
+            self.sbcli_utils.wait_for_device_status(
+                self._target_node_id, "unavailable", timeout=120,
+                device_id=self._target_device_id,
+            )
+            self._timing["remove_duration"] = time.time() - t0
+            self.logger.info(
+                f"Device detected as unavailable ({self._timing['remove_duration']:.1f}s)"
+            )
+
+            # Step 4: Logical remove + set-failed to trigger migration
+            t1 = time.time()
+            self.sbcli_utils.remove_device(self._target_device_id)
+            self.sbcli_utils.wait_for_device_status(
+                self._target_node_id, "removed", timeout=120,
+                device_id=self._target_device_id,
+            )
+
+            mgmt_ip = self.mgmt_nodes[0]
+            cmd = f"{self.base_cmd} sn set-failed-device {self._target_device_id}"
+            self.logger.info(f"Setting device failed via CLI: {cmd}")
+            result = self.ssh_obj.exec_command(mgmt_ip, cmd)
+            self.logger.info(f"set-failed-device result: {result}")
+            sleep_n_sec(5)
+
+            # Step 5: wait for migration to complete
+            self._wait_migration_and_verify(t1)
+        finally:
+            # Step 6: Rescan PCI bus to bring device back (for future tests), even on failure
+            self.logger.info("Rescanning PCI bus to restore device ...")
+            self.ssh_obj.exec_command(
+                node=node_ip, command="echo 1 | sudo tee /sys/bus/pci/rescan"
+            )
+            sleep_n_sec(10)
+            self.logger.info("PCI bus rescan complete")
+
+    # ── Shared migration wait + verify ───────────────────────────────────────
+
+    def _wait_migration_and_verify(self, t_start):
+        """Wait for migration tasks and verify final device status.
+
+        Tries the REST-based ``wait_migration_tasks_complete`` first.
+        If the API is unavailable (404 etc.), falls back to polling
+        ``sbctl cluster list-tasks`` via CLI.
+        """
+        self.logger.info("Waiting for failure migration tasks to complete ...")
+        try:
+            migration_elapsed = self.sbcli_utils.wait_migration_tasks_complete(
+                timeout=self.MIGRATION_TIMEOUT
+            )
+        except TimeoutError:
+            raise
+        except Exception as exc:
+            self.logger.warning(
+                f"REST migration wait failed ({exc}), falling back to CLI"
+            )
+            migration_elapsed = self._wait_migration_cli_fallback()
+
+        self._timing["migration_duration"] = time.time() - t_start
         self._timing["migration_tasks_elapsed"] = migration_elapsed
 
-        # Step 4: verify device status
+        # Verify device status
         sleep_n_sec(5)
         devices = self.sbcli_utils.get_device_details(self._target_node_id)
         target_dev = None
@@ -330,14 +632,188 @@ def _phase_fail_and_migrate(self):
 
     # ── Phase 5: stop IO load ────────────────────────────────────────────────
 
+    def _phase_validate(self):
+        """Validate migration results: device migrated, nodes healthy, data intact.
+
+        Raises AssertionError on the first failed check. Leftover ``failed_and_migrated`` devices are not counted.
+        """
+        self.logger.info("=== Phase: Validate migration results ===")
+
+        # 1. Device should be in a migrated/failed state
+        final_status = self._timing.get("device_final_status", "unknown")
+        assert final_status in ("failed_and_migrated", "failed"), (
+            f"Device {self._target_device_id} has unexpected final status: "
+            f"{final_status} (expected failed_and_migrated or failed)"
+        )
+        self.logger.info(f"Device {self._target_device_id} status: {final_status}")
+
+        # 2. All storage nodes should still be online and healthy
+        storage_nodes = self.sbcli_utils.get_storage_nodes()
+        for node in storage_nodes["results"]:
+            assert node["status"] == "online", (
+                f"Node {node['id']} is not online (status={node['status']})"
+            )
+            assert node["health_check"], f"Node {node['id']} health check failed"
+        self.logger.info(
+            f"All {len(storage_nodes['results'])} storage nodes online and healthy"
+        )
+
+        # 3. Other devices on target node should still be online
+        devices = self.sbcli_utils.get_device_details(self._target_node_id)
+        for d in devices:
+            # Skip the target and stale devices failed by earlier runs (they stay listed)
+            if d["id"] == self._target_device_id or d["status"] == "failed_and_migrated":
+                continue
+            assert d["status"] == "online", (
+                f"Non-target device {d['id']} on target node has "
+                f"unexpected status: {d['status']}"
+            )
+        self.logger.info("All non-target devices remain online")
+
+        # 4. Data integrity checks (NoLoad only — UnderLoad is checked after FIO completes)
+        if not self._with_io_load:
+            self._phase_verify_checksums()
+
     def _phase_stop_io_load(self):
-        self.logger.info("=== Phase: Stop IO load ===")
+        """Kill remaining FIO processes (failure path only).
+
+        On the success path, FIO completes naturally via
+        ``_phase_wait_fio_completion``.  This method runs in the
+        ``finally`` block to ensure cleanup if the test failed early.
+        """
+        self.logger.info("=== Phase: Stop IO load (cleanup) ===")
         client = self.fio_node[0]
         self.ssh_obj.exec_command(client, "pkill -f fio || true")
         for t in self._load_fio_threads:
             t.join(timeout=30)
         self.logger.info("IO load stopped")
 
+    # ── Phase: recover failed device ─────────────────────────────────────────
+
+    def _phase_recover_device(self):
+        """Create a new device from the failed one and add it back.
+
+        Runs in the finally block so it executes even if the test fails.
+
+        Steps:
+          1. ``sbctl sn new-device-from-failed <failed_device_id>`` → new device ID
+          2. ``sbctl sn add-device <new_device_id>``
+          3. Wait for ``new_device_migration`` tasks to complete
+             (a timeout is logged as a warning, never reported as success)
+        """
+        if not self._target_device_id:
+            return
+        self.logger.info(f"=== Phase: Recover device {self._target_device_id} ===")
+        mgmt_ip = self.mgmt_nodes[0]
+
+        # Step 1: create new device from failed device
+        try:
+            cmd = (
+                f"{self.base_cmd} sn new-device-from-failed "
+                f"{self._target_device_id}"
+            )
+            self.logger.info(f"Creating new device from failed: {cmd}")
+            result = self.ssh_obj.exec_command(mgmt_ip, cmd)
+            result_str = result[0] if isinstance(result, tuple) else str(result)
+            result_str = result_str.strip()
+            self.logger.info(f"new-device-from-failed result: {result_str}")
+
+            # Check for "already added back" — device was recovered previously
+            if "already added back from failed" in result_str.lower():
+                self.logger.info(
+                    "Device was already recovered from a previous run, "
+                    "skipping add-device step"
+                )
+                return
+
+            # Check for other errors in output
+            if "error" in result_str.lower() and "new device id:" not in result_str.lower():
+                self.logger.error(
+                    f"new-device-from-failed returned error: {result_str}"
+                )
+                return
+
+            # The last line of successful output is the bare UUID
+            # e.g. "5ab70b74-c8c5-4e24-b76e-dd64bdcfa39d"
+            new_device_id = result_str.strip().split("\n")[-1].strip()
+            # Validate it looks like a UUID (8-4-4-4-12 hex)
+            import re
+            if not re.match(
+                r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$',
+                new_device_id
+            ):
+                self.logger.error(
+                    f"Could not parse valid device UUID from output. "
+                    f"Got: '{new_device_id}', full output: {result_str}"
+                )
+                return
+            self.logger.info(f"New device ID: {new_device_id}")
+        except Exception as exc:
+            self.logger.error(f"new-device-from-failed failed: {exc}")
+            return
+
+        # Step 2: add the new device
+        try:
+            cmd = f"{self.base_cmd} -d sn add-device {new_device_id}"
+            self.logger.info(f"Adding new device: {cmd}")
+            result = self.ssh_obj.exec_command(mgmt_ip, cmd)
+            self.logger.info(f"add-device result: {result}")
+            sleep_n_sec(5)
+        except Exception as exc:
+            self.logger.error(f"add-device failed: {exc}")
+            return
+
+        # Step 3: wait for new_device_migration tasks to complete
+        try:
+            elapsed = self._wait_new_device_migration(
+                new_device_id, timeout=self.MIGRATION_TIMEOUT
+            )
+            # The helper reports a timeout by returning None instead of raising
+            if elapsed is None:
+                raise TimeoutError(f"timed out after {self.MIGRATION_TIMEOUT}s")
+            self.logger.info(
+                f"Device recovery complete — new device {new_device_id} online"
+            )
+        except Exception as exc:
+            self.logger.warning(f"new_device_migration did not complete: {exc}")
+
+    def _wait_new_device_migration(self, new_device_id, timeout=3600):
+        """Wait for all new_device_migration tasks for *new_device_id* to finish."""
+        self.logger.info(
+            f"Waiting for new_device_migration tasks for {new_device_id} ..."
+        )
+        start = time.time()
+        while time.time() - start < timeout:
+            try:
+                tasks = self.sbcli_utils.list_migration_tasks(
+                    self.sbcli_utils.cluster_id
+                )
+                active = [
+                    t for t in tasks.get("results", [])
+                    if t.get("function_name") == "new_device_migration"
+                    and new_device_id in str(t.get("target_id", ""))
+                    and t.get("status") not in ("done", "cancelled", "error")
+                ]
+                if not active:
+                    elapsed = time.time() - start
+                    self.logger.info(
+                        f"All new_device_migration tasks complete "
+                        f"in {elapsed:.1f}s"
+                    )
+                    return elapsed
+                self.logger.info(
+                    f"Waiting for {len(active)} new_device_migration "
+                    f"task(s) ..."
+                )
+            except Exception as exc:
+                self.logger.warning(
+                    f"Error checking migration tasks: {exc}"
+                )
+            sleep_n_sec(10)
+        self.logger.warning(
+            f"new_device_migration not complete after {timeout}s"
+        )
+
     # ── Cleanup ──────────────────────────────────────────────────────────────
 
     def _phase_cleanup(self):
@@ -380,12 +856,14 @@ def _print_migration_summary(self):
         self.logger.info("  DEVICE FAILURE MIGRATION SUMMARY")
         self.logger.info("=" * 70)
         self.logger.info(f"  Test class:       {self.__class__.__name__}")
+        self.logger.info(f"  Failure mode:     {self._failure_mode}")
         self.logger.info(f"  IO load:          {'YES' if self._with_io_load else 'NO'}")
         self.logger.info(f"  Target node:      {self._target_node_id}")
         self.logger.info(f"  Target device:    {self._target_device_id}")
         self.logger.info(f"  Fill target:      {self.FILL_PERCENT}%")
         self.logger.info(f"  Lvols on target:  {len(self._lvols_on_target)}")
         self.logger.info(f"  Lvols on others:  {len(self._lvols_on_others)}")
+        self.logger.info(f"  Result:           {'PASSED' if self._test_passed else 'FAILED'}")
         self.logger.info("-" * 70)
         for key, val in self._timing.items():
             if isinstance(val, float):
@@ -409,12 +887,13 @@ def _write_timing_json(self):
         report = {
             "test_class": self.__class__.__name__,
             "timestamp": datetime.now(timezone.utc).isoformat(),
-            "status": "passed",
+            "status": "passed" if self._test_passed else "failed",
             "geometry": {"ndcs": self.ndcs, "npcs": self.npcs},
             "config": {
                 "fill_percent": self.FILL_PERCENT,
                 "lvol_size": self.LVOL_SIZE,
                 "with_io_load": self._with_io_load,
+                "failure_mode": self._failure_mode,
                 "target_node": self._target_node_id,
                 "target_device": self._target_device_id,
                 "lvols_on_target": len(self._lvols_on_target),
@@ -491,6 +970,7 @@ def _generate_charts(self):
                 plt.suptitle(
                     f"{class_name}\n"
                     f"IO load: {'YES' if self._with_io_load else 'NO'}  |  "
+                    f"Failure: {self._failure_mode}  |  "
                     f"Fill: {self.FILL_PERCENT}%  |  "
                     f"Lvols: {len(self._lvols_on_target)} target + "
                     f"{len(self._lvols_on_others)} other",
@@ -547,11 +1027,11 @@ def _parse_size(size_str):
 
 
 # ═══════════════════════════════════════════════════════════════════════════════
-#  Concrete test classes
+#  Docker concrete test classes (sbcli + SSH FIO)
 # ═══════════════════════════════════════════════════════════════════════════════
 
-class DeviceFailureMigrationNoLoad(_DeviceFailureMigrationBase, TestLvolHACluster):
-    """Fill device to 65 %, fail it, run migration WITHOUT IO load.
+class DeviceFailureMigrationNoLoadDocker(_DeviceFailureMigrationBase, TestLvolHACluster):
+    """Fill device to 65 %, fail it via API, run migration WITHOUT IO load.
 
     Measures: setup time, fill time, device remove time, migration time.
     """
@@ -567,8 +1047,8 @@ def run(self):
         self._run_migration_test(with_io_load=False)
 
 
-class DeviceFailureMigrationUnderLoad(_DeviceFailureMigrationBase, TestLvolHACluster):
-    """Fill device to 65 %, start IO on all nodes, fail device, migrate UNDER LOAD.
+class DeviceFailureMigrationUnderLoadDocker(_DeviceFailureMigrationBase, TestLvolHACluster):
+    """Fill device to 65 %, start IO on all nodes, fail device via API, migrate UNDER LOAD.
 
     Measures: setup time, fill time, device remove time, migration time.
     IO errors during migration are logged but do not fail the test.
@@ -583,3 +1063,622 @@ def __init__(self, **kwargs):
     def run(self):
         self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
         self._run_migration_test(with_io_load=True)
+
+
+class DeviceFailureMigrationPCIeNoLoadDocker(_DeviceFailureMigrationBase, TestLvolHACluster):
+    """Fill device to 65 %, remove via PCIe sysfs, run migration WITHOUT IO load.
+
+    Uses physical PCIe hot-unplug (/sys/bus/pci/devices/<addr>/remove) instead
+    of the control-plane API.  After migration, rescans PCI bus to restore device.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.logger = setup_logger(__name__)
+        self._init_migration_state()  # set up the per-test migration bookkeeping attributes
+        self.test_name = "device_failure_migration_pcie_no_load"
+
+    def run(self):
+        self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)  # pool is created before the migration phases start
+        self._run_migration_test(with_io_load=False, failure_mode="pcie")  # "pcie" = sysfs hot-unplug variant, no background IO
+
+
+class DeviceFailureMigrationPCIeUnderLoadDocker(_DeviceFailureMigrationBase, TestLvolHACluster):
+    """Fill device to 65 %, start IO, remove via PCIe sysfs, migrate UNDER LOAD.
+
+    Uses physical PCIe hot-unplug (/sys/bus/pci/devices/<addr>/remove) instead
+    of the control-plane API.  After migration, rescans PCI bus to restore device.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.logger = setup_logger(__name__)
+        self._init_migration_state()  # set up the per-test migration bookkeeping attributes
+        self.test_name = "device_failure_migration_pcie_under_load"
+
+    def run(self):
+        self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)  # pool is created before the migration phases start
+        self._run_migration_test(with_io_load=True, failure_mode="pcie")  # "pcie" = sysfs hot-unplug variant, with background IO
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+#  K8s-native concrete test classes (PVC + FIO K8s Jobs)
+# ═══════════════════════════════════════════════════════════════════════════════
+
+from stress_test.continuous_k8s_native_failover import K8sNativeFailoverTest  # noqa: E402
+
+
+class _DeviceFailureMigrationK8s(_DeviceFailureMigrationBase):
+    """K8s-native overrides for setup, fill, IO load, and cleanup phases.
+
+    Uses PVCs for storage provisioning and K8s FIO Jobs for workload
+    generation instead of sbcli + SSH.
+
+    The device failure and migration phases are identical to Docker
+    (they operate at the control-plane / sysfs level, not the data path).
+    """
+
+    # K8s-specific sizing (string values are passed through verbatim to create_pvc / the FIO job file)
+    K8S_PVC_SIZE = "50Gi"  # size requested for every PVC created by _create_pvc
+    K8S_FIO_FILL_SIZE = "45G"  # fio "size=" used by the fill jobs in _phase_fill_devices
+    K8S_FIO_LOAD_SIZE = "1G"  # fio "size=" used by the load jobs in _phase_start_io_load
+
+    def _init_migration_state(self):
+        super()._init_migration_state()  # base-class state first, then the K8s-only additions below
+        self._pvc_details = {}     # pvc_name -> {job_name, configmap_name, node_id}; filled by _create_pvc
+        self._fill_jobs = []       # list of (job_name, configmap_name) tuples for fill FIO jobs
+        self._load_jobs = []       # list of (job_name, configmap_name) tuples for load FIO jobs
+
+    # ── Phase 1 override: PVC-based setup ────────────────────────────────────
+
+    def _phase_setup_pool_and_lvols(self):
+        self.logger.info("=== Phase: Setup pool and PVCs (K8s) ===")
+        t0 = time.time()  # start of the interval reported as _timing["setup_duration"]
+
+        # Get storage nodes
+        storage_nodes = self.sbcli_utils.get_storage_nodes()
+        for r in storage_nodes["results"]:
+            self._sn_nodes.append(r["uuid"])
+            self.node_vs_pvc[r["uuid"]] = []  # (re)start every node with an empty PVC list
+
+        if len(self._sn_nodes) < 1:
+            raise RuntimeError("No storage nodes found")
+
+        # Pick target node and device
+        self._target_node_id = self._sn_nodes[0]  # the first listed node is always the target
+        devices = self.sbcli_utils.get_device_details(self._target_node_id)
+        if not devices:
+            raise RuntimeError(
+                f"No devices found on target node {self._target_node_id}"
+            )
+        # Filter for online devices only — old failed_and_migrated devices
+        # remain in the list after recovery and must be skipped
+        online_devices = [d for d in devices if d.get("status") == "online"]
+        if not online_devices:
+            raise RuntimeError(
+                f"No online devices found on target node {self._target_node_id}. "
+                f"Device statuses: {[d.get('status') for d in devices]}"
+            )
+        self._target_device_info = online_devices[0]
+        self._target_device_id = online_devices[0]["id"]
+        self.logger.info(
+            f"Target node: {self._target_node_id}, "
+            f"Target device: {self._target_device_id} "
+            f"(selected from {len(online_devices)} online / {len(devices)} total devices)"
+        )
+
+        # Get node capacity to calculate how many PVCs to create
+        capacity = self.sbcli_utils.get_node_capacity(self._target_node_id)
+        if isinstance(capacity, list):
+            capacity = capacity[0] if capacity else {}  # list responses: use the first entry, or {} when empty
+        size_total_bytes = capacity.get("size_total", 0)
+        if isinstance(size_total_bytes, str):
+            size_total_bytes = self._parse_size(size_total_bytes)
+        target_bytes = int(size_total_bytes * self.FILL_PERCENT / 100)
+        lvol_bytes = self._parse_size(self.LVOL_SIZE)  # NOTE(review): count is based on LVOL_SIZE, but PVCs are created with K8S_PVC_SIZE — confirm they agree
+        num_lvols = max(1, math.ceil(target_bytes / lvol_bytes))  # always at least one PVC, even when capacity reads as 0
+        self.logger.info(
+            f"Node capacity: {size_total_bytes} bytes, "
+            f"target fill: {target_bytes} bytes, "
+            f"creating {num_lvols} PVCs of {self.K8S_PVC_SIZE}"
+        )
+
+        # Create PVCs pinned to target node
+        for i in range(num_lvols):
+            pvc_name = f"mig-target-{_rand_seq(4)}-{i}"
+            self._create_pvc(pvc_name, self._target_node_id)
+            self._lvols_on_target.append(pvc_name)  # in the K8s variant these lists hold PVC names
+
+        # Create 1 PVC per OTHER node (for IO load variant)
+        other_nodes = [n for n in self._sn_nodes if n != self._target_node_id]
+        for idx, node_id in enumerate(other_nodes):
+            pvc_name = f"mig-other-{_rand_seq(4)}-{idx}"
+            self._create_pvc(pvc_name, node_id)
+            self._lvols_on_others.append(pvc_name)
+
+        self._timing["setup_duration"] = time.time() - t0
+        self.logger.info(
+            f"Setup complete: {len(self._lvols_on_target)} target PVCs, "
+            f"{len(self._lvols_on_others)} other PVCs "
+            f"({self._timing['setup_duration']:.1f}s)"
+        )
+
+    def _create_pvc(self, pvc_name, node_id):
+        """Create a PVC requesting storage node ``node_id``, wait up to 300 s for it to bind, and record it in _pvc_details / node_vs_pvc."""
+        self.k8s_utils.create_pvc(
+            pvc_name, self.K8S_PVC_SIZE, self.STORAGE_CLASS_NAME,
+            node_id=node_id,
+        )
+        self.k8s_utils.wait_pvc_bound(pvc_name, timeout=300)
+        sleep_n_sec(2)  # short pause before looking up the node the PVC landed on
+
+        node_id_actual = self._get_pvc_node_id(pvc_name) or node_id  # fall back to the requested node when the lookup returns nothing
+        self._pvc_details[pvc_name] = {
+            "job_name": None,  # filled in later by _phase_start_io_load
+            "configmap_name": None,  # filled in later by _phase_start_io_load
+            "node_id": node_id_actual,
+        }
+        self.node_vs_pvc.setdefault(node_id_actual, []).append(pvc_name)
+        self.logger.info(f"PVC {pvc_name} created and bound (node={node_id_actual})")
+
+    # ── Phase 2 override: fill via K8s FIO Jobs ──────────────────────────────
+
+    def _phase_fill_devices(self):
+        self.logger.info(
+            f"=== Phase: Fill target device to {self.FILL_PERCENT}% (K8s FIO Jobs) ==="
+        )
+        t0 = time.time()  # start of the interval reported as _timing["fill_duration"]
+
+        # Create one sequential-write fill FIO job per target PVC
+        for pvc_name in self._lvols_on_target:
+            job_name = f"fio-fill-{pvc_name}"
+            cm_name = f"fiocfg-fill-{pvc_name}"
+            run_id = _rand_seq(6)  # random suffix for the data file name inside the PVC
+
+            fio_config = (
+                f"[global]\n"
+                f"name=fill-{pvc_name}\n"
+                f"filename_format=/spdkvol/fio-fill-{run_id}.$jobnum\n"
+                f"rw=write\n"
+                f"bs={self.FIO_FILL_BS}\n"
+                f"iodepth=1\n"
+                f"direct=1\n"
+                f"ioengine=libaio\n"
+                f"size={self.K8S_FIO_FILL_SIZE}\n"
+                f"numjobs=1\n"
+                f"group_reporting\n"
+                f"\n"
+                f"[job1]\n"
+            )
+
+            try:
+                self.k8s_utils.create_fio_job(
+                    job_name, pvc_name, cm_name, fio_config,
+                    image=self.FIO_IMAGE,
+                )
+                self._fill_jobs.append((job_name, cm_name))  # only successfully created jobs are tracked
+                self.logger.info(f"Fill FIO job {job_name} created for {pvc_name}")
+            except Exception as exc:
+                self.logger.error(f"Fill FIO job failed for {pvc_name}: {exc}")  # logged only; the phase continues
+
+        # Wait for fill jobs to complete
+        self.logger.info(f"Waiting for {len(self._fill_jobs)} fill jobs to complete ...")
+        for job_name, _ in self._fill_jobs:
+            try:
+                self.k8s_utils.wait_job_complete(job_name, timeout=3600)
+                self.logger.info(f"Fill job {job_name} completed")
+            except Exception as exc:
+                self.logger.warning(f"Fill job {job_name} did not complete: {exc}")
+
+        # Verify fill level (informational: the value is logged, not asserted)
+        sleep_n_sec(5)
+        capacity = self.sbcli_utils.get_node_capacity(self._target_node_id)
+        if isinstance(capacity, list):
+            capacity = capacity[0] if capacity else {}
+        util = capacity.get("size_util", 0)
+        self.logger.info(f"Post-fill device utilisation: {util}%")
+
+        # Cleanup fill jobs: best effort, each resource deleted independently so one failure cannot skip the other
+        for job_name, cm_name in self._fill_jobs:
+            for kind, res_name in (("job", job_name), ("configmap", cm_name)):
+                try:
+                    self.k8s_utils.delete_resource(kind, res_name)
+                except Exception as exc:
+                    self.logger.warning(f"Could not delete {kind} {res_name}: {exc}")
+
+        self._timing["fill_duration"] = time.time() - t0
+        self.logger.info(
+            f"Fill complete ({self._timing['fill_duration']:.1f}s)"
+        )
+
+    # ── Phase 3 override: IO load via K8s FIO Jobs ───────────────────────────
+
+    def _phase_start_io_load(self):
+        self.logger.info("=== Phase: Start IO load on all nodes (K8s FIO Jobs) ===")
+        all_pvc_names = self._lvols_on_target + self._lvols_on_others  # load runs on target and non-target PVCs alike
+
+        for pvc_name in all_pvc_names:
+            job_name = f"fio-load-{pvc_name}"  # _phase_validate_fio strips this prefix to recover the PVC name
+            cm_name = f"fiocfg-load-{pvc_name}"
+            run_id = _rand_seq(6)  # random suffix for the data file name inside the PVC
+
+            fio_config = (
+                f"[global]\n"
+                f"name=load-{pvc_name}\n"
+                f"filename_format=/spdkvol/fio-load-{run_id}.$jobnum\n"
+                f"rw=randrw\n"
+                f"rwmixread=50\n"
+                f"bs={self.FIO_LOAD_BS}\n"
+                f"iodepth={self.FIO_LOAD_IODEPTH}\n"
+                f"direct=1\n"
+                f"ioengine=libaio\n"
+                f"size={self.K8S_FIO_LOAD_SIZE}\n"
+                f"numjobs={self.FIO_LOAD_NUMJOBS}\n"
+                f"time_based\n"
+                f"runtime={self.FIO_LOAD_RUNTIME}\n"
+                f"verify=md5\n"
+                f"verify_dump=1\n"
+                f"verify_fatal=1\n"
+                f"verify_backlog=4096\n"
+                f"group_reporting\n"
+                f"\n"
+                f"[job1]\n"
+            )
+
+            try:
+                node_id = self._pvc_details.get(pvc_name, {}).get("node_id")
+                avoid = (  # presumably the K8s node hosting this storage node, so the FIO pod lands elsewhere — confirm in create_fio_job
+                    self._get_k8s_node_for_storage_node(node_id)
+                    if node_id else None
+                )
+                self.k8s_utils.create_fio_job(
+                    job_name, pvc_name, cm_name, fio_config,
+                    image=self.FIO_IMAGE,
+                    avoid_node=avoid,
+                )
+                self._load_jobs.append((job_name, cm_name))  # tracked for later wait / validate / delete phases
+                self._pvc_details[pvc_name]["job_name"] = job_name  # direct index: a KeyError here is reported below as a job failure although the job exists
+                self._pvc_details[pvc_name]["configmap_name"] = cm_name
+                self.logger.info(f"Load FIO job {job_name} created for {pvc_name}")
+            except Exception as exc:
+                self.logger.error(f"Load FIO job failed for {pvc_name}: {exc}")  # logged only; remaining PVCs are still processed
+
+        sleep_n_sec(15)  # let IO ramp up
+        self.logger.info(
+            f"IO load started: {len(self._load_jobs)} FIO jobs"
+        )
+
+    # ── Phase 2b override: checksums via K8s utility pods ───────────────────
+
+    def _phase_compute_checksums(self):
+        """Capture per-file checksums for each target PVC through a temporary utility pod; failures are logged and that PVC is skipped."""
+        self.logger.info("=== Phase: Compute pre-migration checksums (K8s) ===")
+        self._pre_migration_checksums = {}  # pvc_name -> checksums returned by generate_checksums_in_pvc
+        self._checksum_utility_pods = []  # NOTE(review): only appended to in this method; no reader is visible in this class
+
+        for pvc_name in self._lvols_on_target:
+            pod_name = f"cksum-pre-{pvc_name}"
+            try:
+                self.k8s_utils.create_utility_pod(pod_name, pvc_name)
+                self._checksum_utility_pods.append(pod_name)
+                self.k8s_utils.wait_pod_running(pod_name)
+                files = self.k8s_utils.find_files_in_pvc(pod_name)
+                if files:
+                    checksums = self.k8s_utils.generate_checksums_in_pvc(
+                        pod_name, files
+                    )
+                    self._pre_migration_checksums[pvc_name] = checksums
+                    self.logger.info(
+                        f"Captured {len(checksums)} file checksums for {pvc_name}"
+                    )
+                else:
+                    self.logger.warning(
+                        f"No files found in PVC {pvc_name} for checksum"
+                    )
+            except Exception as exc:
+                self.logger.warning(
+                    f"Checksum capture failed for {pvc_name}: {exc}"
+                )
+            finally:
+                try:
+                    self.k8s_utils.delete_pod(pod_name)  # attempted even when pod creation itself failed
+                except Exception:
+                    pass  # best-effort pod removal
+
+        self.logger.info(
+            f"Pre-migration checksums captured for "
+            f"{len(self._pre_migration_checksums)} PVCs"
+        )
+
+    def _phase_verify_checksums(self):
+        """Recompute checksums for every PVC captured pre-migration and raise AssertionError if any file is missing or differs."""
+        self.logger.info("=== Verifying post-migration data integrity (K8s) ===")
+        mismatches = 0  # counts missing files, differing checksums, and per-PVC verification errors
+
+        for pvc_name, expected in self._pre_migration_checksums.items():  # NOTE(review): empty dict => the check passes without verifying anything
+            pod_name = f"cksum-post-{pvc_name}"
+            try:
+                self.k8s_utils.create_utility_pod(pod_name, pvc_name)
+                self.k8s_utils.wait_pod_running(pod_name)
+                actual = self.k8s_utils.generate_checksums_in_pvc(
+                    pod_name,
+                    self.k8s_utils.find_files_in_pvc(pod_name),
+                )
+                # Compare by filename (basename); paths sharing a basename collapse into one entry
+                expected_by_name = {
+                    os.path.basename(k): v for k, v in expected.items()
+                }
+                actual_by_name = {
+                    os.path.basename(k): v for k, v in actual.items()
+                }
+                for fname, cksum in expected_by_name.items():  # files that exist only post-migration are ignored
+                    if fname not in actual_by_name:
+                        self.logger.error(
+                            f"File {fname} missing in PVC {pvc_name} after migration"
+                        )
+                        mismatches += 1
+                    elif actual_by_name[fname] != cksum:
+                        self.logger.error(
+                            f"Checksum MISMATCH for {fname} in {pvc_name}: "
+                            f"expected {cksum}, got {actual_by_name[fname]}"
+                        )
+                        mismatches += 1
+                    else:
+                        self.logger.info(f"Checksum OK: {fname} in {pvc_name}")
+            except Exception as exc:
+                self.logger.error(
+                    f"Checksum verification error for {pvc_name}: {exc}"
+                )
+                mismatches += 1  # an unverifiable PVC counts as a failure
+            finally:
+                try:
+                    self.k8s_utils.delete_pod(pod_name)
+                except Exception:
+                    pass  # best-effort pod removal
+
+        assert mismatches == 0, (
+            f"Data integrity check failed: {mismatches} file(s) had "
+            f"checksum mismatches after migration"
+        )
+        self.logger.info(
+            "All post-migration checksums verified — data integrity OK"
+        )
+
+    def _phase_validate_fio(self):
+        """Scan the last 500 log lines of each load-job pod for failure keywords; findings are logged only, never raised."""
+        self.logger.info("=== Verifying FIO jobs for errors (K8s) ===")
+        target_errors = []  # findings on PVCs backed by the failed device (treated as expected)
+        other_errors = []  # findings on PVCs of the other nodes
+
+        for job_name, _ in self._load_jobs:
+            # Determine if this job is on a target or other PVC
+            pvc_name = job_name.replace("fio-load-", "", 1)  # inverse of the naming used in _phase_start_io_load
+            is_target = pvc_name in self._lvols_on_target
+            try:
+                pod_name = self.k8s_utils.get_job_pod_name(job_name)
+                if not pod_name:
+                    self.logger.warning(
+                        f"Could not find pod for FIO job {job_name}"
+                    )
+                    continue
+                logs = self.k8s_utils.get_pod_logs(pod_name, tail=500)
+                fail_words = ["error", "fail", "interrupt", "terminate"]  # case-insensitive substring match against the log text
+                logs_lower = logs.lower() if logs else ""
+                found = [w for w in fail_words if w in logs_lower]
+                if found:
+                    msg = f"{job_name} ({pvc_name}): pod logs contain {found}"
+                    if is_target:
+                        target_errors.append(msg)
+                        self.logger.warning(
+                            f"[expected] FIO error on failed-device PVC "
+                            f"{pvc_name}: {found}"
+                        )
+                    else:
+                        other_errors.append(msg)
+                        self.logger.error(
+                            f"FIO error on non-target PVC {pvc_name}: {found}"
+                        )
+                else:
+                    self.logger.info(f"FIO job {job_name}: no errors")
+            except Exception as exc:
+                self.logger.warning(
+                    f"Could not check FIO job {job_name}: {exc}"
+                )
+
+        if target_errors:
+            self.logger.warning(
+                f"{len(target_errors)} FIO error(s) on target-device PVCs "
+                f"(expected during device migration)"
+            )
+        if other_errors:  # reported at error level, but this method does not fail the test itself
+            self.logger.error(
+                f"{len(other_errors)} FIO error(s) on non-target PVCs: "
+                f"{other_errors}"
+            )
+
+    # ── Phase: wait for FIO to complete naturally (K8s) ─────────────────────
+
+    def _phase_wait_fio_completion(self):
+        """Wait for each load FIO Job in turn and record the total wait in _timing["fio_completion_duration"]."""
+        self.logger.info(
+            "=== Phase: Waiting for FIO K8s Jobs to complete naturally ==="
+        )
+        t0 = time.time()
+        fio_timeout = self.FIO_LOAD_RUNTIME + 300  # configured FIO runtime plus 300 s of slack
+
+        for job_name, _ in self._load_jobs:  # NOTE(review): jobs are waited on one by one, each with the full timeout — presumably worst case is len(jobs) * fio_timeout
+            try:
+                status = self.k8s_utils.wait_job_complete(
+                    job_name, timeout=fio_timeout
+                )
+                self.logger.info(
+                    f"FIO job {job_name} completed: {status}"
+                )
+            except Exception as exc:
+                self.logger.warning(
+                    f"FIO job {job_name} did not complete: {exc}"
+                )
+
+        elapsed = time.time() - t0
+        self._timing["fio_completion_duration"] = elapsed
+        self.logger.info(
+            f"All FIO jobs finished ({elapsed:.1f}s)"
+        )
+
+    # ── Phase 5 override: stop IO load (K8s) ─────────────────────────────────
+
+    def _phase_stop_io_load(self):
+        """Delete remaining FIO jobs and their configmaps (failure path only); best effort, failures are logged."""
+        self.logger.info("=== Phase: Stop IO load (K8s cleanup) ===")
+        for job_name, cm_name in self._load_jobs:
+            for kind, res_name in (("job", job_name), ("configmap", cm_name)):  # independent deletes: one failure cannot skip the other
+                try:
+                    self.k8s_utils.delete_resource(kind, res_name)
+                except Exception as exc:
+                    self.logger.warning(f"Could not delete {kind} {res_name}: {exc}")
+        self.logger.info("IO load stopped (K8s jobs deleted)")
+
+    # ── Cleanup override (K8s) ───────────────────────────────────────────────
+
+    def _phase_cleanup(self):
+        self.logger.info("=== Phase: Cleanup (K8s) ===")
+        try:
+            # Delete all FIO jobs and configmaps (each resource independently; some may already be gone)
+            for job_name, cm_name in self._fill_jobs + self._load_jobs:
+                for kind, res_name in (("job", job_name), ("configmap", cm_name)):
+                    try:
+                        self.k8s_utils.delete_resource(kind, res_name)
+                    except Exception as exc:
+                        self.logger.info(f"Could not delete {kind} {res_name} (may already be deleted): {exc}")
+
+            # Delete PVCs (best effort; failures are logged so leaked volumes are visible)
+            all_pvcs = self._lvols_on_target + self._lvols_on_others
+            for pvc_name in all_pvcs:
+                try:
+                    self.k8s_utils.delete_pvc(pvc_name)
+                except Exception as exc:
+                    self.logger.warning(f"Could not delete PVC {pvc_name}: {exc}")
+            sleep_n_sec(10)  # pause between PVC deletion and pool deletion
+
+            # Delete storage pool
+            self.sbcli_utils.delete_all_storage_pools()
+        except Exception as e:
+            self.logger.error(f"Cleanup error: {e}")  # cleanup never raises
+
+
+# ── K8s concrete classes ─────────────────────────────────────────────────────
+
+class DeviceFailureMigrationNoLoadK8s(_DeviceFailureMigrationK8s, K8sNativeFailoverTest):
+    """K8s-native: fill device to 65 %, fail via API, run migration WITHOUT IO load."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.logger = setup_logger(__name__)
+        self._init_migration_state()  # base state plus the K8s-only _pvc_details / _fill_jobs / _load_jobs
+        self.test_name = "device_failure_migration_no_load_k8s"
+
+    def run(self):
+        storage_nodes = self.sbcli_utils.get_storage_nodes()
+        for result in storage_nodes["results"]:
+            self.sn_nodes.append(result["uuid"])
+            self.node_vs_pvc[result["uuid"]] = []  # NOTE(review): _phase_setup_pool_and_lvols resets these entries again
+
+        pool_test = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        self.pool_name = self.pool_name if pool_test == self.pool_name else pool_test  # either branch yields a value equal to pool_test
+
+        cluster_id = self.cluster_id or ""  # empty string when no cluster id is set
+        self.k8s_utils.create_storage_class(
+            name=self.STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=self.pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+        )
+        self._run_migration_test(with_io_load=False, failure_mode="api")  # API-driven failure, no background IO
+
+
+class DeviceFailureMigrationUnderLoadK8s(_DeviceFailureMigrationK8s, K8sNativeFailoverTest):
+    """K8s-native: fill device to 65 %, start IO, fail via API, migrate UNDER LOAD."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.logger = setup_logger(__name__)
+        self._init_migration_state()  # base state plus the K8s-only _pvc_details / _fill_jobs / _load_jobs
+        self.test_name = "device_failure_migration_under_load_k8s"
+
+    def run(self):
+        storage_nodes = self.sbcli_utils.get_storage_nodes()
+        for result in storage_nodes["results"]:
+            self.sn_nodes.append(result["uuid"])
+            self.node_vs_pvc[result["uuid"]] = []  # NOTE(review): _phase_setup_pool_and_lvols resets these entries again
+
+        pool_test = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        self.pool_name = self.pool_name if pool_test == self.pool_name else pool_test  # either branch yields a value equal to pool_test
+
+        cluster_id = self.cluster_id or ""  # empty string when no cluster id is set
+        self.k8s_utils.create_storage_class(
+            name=self.STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=self.pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+        )
+        self._run_migration_test(with_io_load=True, failure_mode="api")  # API-driven failure, with background IO
+
+
+class DeviceFailureMigrationPCIeNoLoadK8s(_DeviceFailureMigrationK8s, K8sNativeFailoverTest):
+    """K8s-native: fill device to 65 %, remove via PCIe sysfs, migrate WITHOUT IO load."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.logger = setup_logger(__name__)
+        self._init_migration_state()  # base state plus the K8s-only _pvc_details / _fill_jobs / _load_jobs
+        self.test_name = "device_failure_migration_pcie_no_load_k8s"
+
+    def run(self):
+        storage_nodes = self.sbcli_utils.get_storage_nodes()
+        for result in storage_nodes["results"]:
+            self.sn_nodes.append(result["uuid"])
+            self.node_vs_pvc[result["uuid"]] = []  # NOTE(review): _phase_setup_pool_and_lvols resets these entries again
+
+        pool_test = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        self.pool_name = self.pool_name if pool_test == self.pool_name else pool_test  # either branch yields a value equal to pool_test
+
+        cluster_id = self.cluster_id or ""  # empty string when no cluster id is set
+        self.k8s_utils.create_storage_class(
+            name=self.STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=self.pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+        )
+        self._run_migration_test(with_io_load=False, failure_mode="pcie")  # PCIe sysfs removal, no background IO
+
+
+class DeviceFailureMigrationPCIeUnderLoadK8s(_DeviceFailureMigrationK8s, K8sNativeFailoverTest):
+    """K8s-native: fill device to 65 %, start IO, remove via PCIe sysfs, migrate UNDER LOAD."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.logger = setup_logger(__name__)
+        self._init_migration_state()  # base state plus the K8s-only _pvc_details / _fill_jobs / _load_jobs
+        self.test_name = "device_failure_migration_pcie_under_load_k8s"
+
+    def run(self):
+        storage_nodes = self.sbcli_utils.get_storage_nodes()
+        for result in storage_nodes["results"]:
+            self.sn_nodes.append(result["uuid"])
+            self.node_vs_pvc[result["uuid"]] = []  # NOTE(review): _phase_setup_pool_and_lvols resets these entries again
+
+        pool_test = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        self.pool_name = self.pool_name if pool_test == self.pool_name else pool_test  # either branch yields a value equal to pool_test
+
+        cluster_id = self.cluster_id or ""  # empty string when no cluster id is set
+        self.k8s_utils.create_storage_class(
+            name=self.STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=self.pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+        )
+        self._run_migration_test(with_io_load=True, failure_mode="pcie")  # PCIe sysfs removal, with background IO
diff --git a/e2e/stress_test/large_scale_lvol_stress.py b/e2e/stress_test/large_scale_lvol_stress.py
index 8d959eef0..be646c1d3 100755
--- a/e2e/stress_test/large_scale_lvol_stress.py
+++ b/e2e/stress_test/large_scale_lvol_stress.py
@@ -22,6 +22,7 @@
 
 from __future__ import annotations
 
+import json as _json
 import os
 import random
 import re
@@ -69,6 +70,7 @@ class _LargeScaleMixin:
     # ── Parallelism ──────────────────────────────────────────────────────────
     MAX_WORKERS = 20
     BATCH_SIZE = 50
+    PARALLEL_PARENTS = 5             # concurrent parents/subsystems during creation
 
     # ── Internal state ───────────────────────────────────────────────────────
     _phase_durations: dict
@@ -86,6 +88,7 @@ def _init_mixin_state(self):
     def _run_large_scale_test(self):
         total = self.NUM_SUBSYSTEMS * self.NAMESPACES_PER_SUBSYSTEM
         self._init_mixin_state()
+        self._creation_partial = False
         self.logger.info(
             f"=== Starting {self.__class__.__name__}: "
             f"{self.NUM_SUBSYSTEMS} subsystems × "
@@ -93,8 +96,30 @@ def _run_large_scale_test(self):
         )
         try:
             t0 = time.time()
-            self._phase_create_subsystems()
-            self._phase_durations["create"] = round(time.time() - t0, 1)
+            try:
+                self._phase_create_subsystems()
+            except Exception as create_err:
+                self._creation_partial = True
+                self._phase_durations["create"] = round(time.time() - t0, 1)
+                created = self._count_created_resources()
+                self.logger.error(
+                    f"[create] CREATION FAILED after {created} resources: "
+                    f"{create_err}"
+                )
+                self.logger.info(
+                    f"[create] *** Max resources created: {created} / "
+                    f"{total} ({created * 100 // max(total, 1)}%) ***"
+                )
+                if created == 0:
+                    raise RuntimeError(
+                        f"No resources created — cannot proceed: {create_err}"
+                    )
+                self.logger.info(
+                    f"[create] Proceeding with FIO on {created} existing "
+                    f"resources"
+                )
+            else:
+                self._phase_durations["create"] = round(time.time() - t0, 1)
 
             t0 = time.time()
             self._phase_start_fio()
@@ -120,6 +145,10 @@ def _run_large_scale_test(self):
                 f"Large-scale test had {self._fio_failures} FIO failures"
             )
 
+    def _count_created_resources(self):
+        """Return the number of resources available for FIO (default: self._total_created); override in subclass."""
+        return self._total_created  # consulted by _run_large_scale_test when the create phase raises
+
     # ── Steady state (shared) ────────────────────────────────────────────────
 
     def _phase_steady_state(self):
@@ -147,6 +176,164 @@ def _phase_validate(self):
         """Override in subclass for mode-specific validation."""
         self.logger.info("=== Validation phase ===")
 
+    # ── FIO log collection helpers (shared) ──────────────────────────────────
+
+    def _save_fio_pod_logs(self, job_name: str, resource_name: str,
+                           pvc_name: str = None):
+        """Persist a FIO job's pod logs and perf files under ``self.log_path``.
+
+        Fetches the pod logs with ``tail=2000``, writes them to
+        ``<resource_name>_fio.log`` and then delegates perf-file copying to
+        ``_copy_fio_perf_logs``. Does nothing when the job has no pod; any
+        exception is logged as a warning instead of being raised.
+        """
+        try:
+            fio_pod = self.k8s_utils.get_job_pod_name(job_name)
+            if fio_pod:
+                pod_output = self.k8s_utils.get_pod_logs(fio_pod, tail=2000)
+                if pod_output:
+                    target = os.path.join(
+                        self.log_path, f"{resource_name}_fio.log")
+                    with open(target, "w") as handle:
+                        handle.write(pod_output)
+                    self.logger.info(
+                        f"[save_fio] Saved logs for {resource_name}")
+                self._copy_fio_perf_logs(
+                    fio_pod, resource_name, pvc_name=pvc_name)
+        except Exception as err:
+            self.logger.warning(
+                f"[save_fio] Could not save logs for {resource_name}: {err}")
+
+    def _list_fio_perf_files(self, pod_name: str, ns: str,
+                              container: str = None) -> list:
+        """Return FIO perf/log file paths found at the top of /spdkvol/.
+
+        Runs ``find -maxdepth 1`` via ``kubectl exec`` (in ``container`` when
+        given) and returns the non-blank output lines; any error gives ``[]``.
+        """
+        target = f"-c {container} {pod_name}" if container else pod_name
+        globs = ("*fio*.log", "*-iolog.log", "*_lat.*", "*_bw.*",
+                 "*_iops.*", "*_clat.*", "*_slat.*")
+        name_tests = " -o ".join(f"-name '{g}'" for g in globs)
+        # "|| true" makes the shell command succeed even if find/exec fails.
+        find_cmd = (
+            f"kubectl exec {target} -n {ns} -- find /spdkvol/ -maxdepth 1 "
+            f"\\( {name_tests} \\) 2>/dev/null || true"
+        )
+        try:
+            raw, _ = self.k8s_utils._exec_kubectl(find_cmd, supress_logs=True)
+            stripped = (ln.strip() for ln in raw.strip().splitlines())
+            return [s for s in stripped if s]
+        except Exception:
+            return []
+
+    def _create_copier_pod(self, copier_name: str, pvc_name: str,
+                            node_name: str, ns: str):
+        """Apply a busybox pod mounting ``pvc_name`` at /spdkvol and await Ready.
+
+        The pod is pinned to ``node_name``, tolerates all taints and sleeps
+        300s; ``kubectl wait`` is given a 120s timeout.
+        """
+        manifest = "\n".join([
+            "apiVersion: v1",
+            "kind: Pod",
+            "metadata:",
+            f"  name: {copier_name}",
+            f"  namespace: {ns}",
+            "  labels:",
+            "    app: fio-copier",
+            "spec:",
+            f"  nodeName: {node_name}",
+            "  tolerations:",
+            "  - operator: Exists",
+            "  containers:",
+            "  - name: copier",
+            "    image: busybox",
+            "    command: ['sleep', '300']",
+            "    volumeMounts:",
+            "    - mountPath: /spdkvol",
+            "      name: vol",
+            "  volumes:",
+            "  - name: vol",
+            "    persistentVolumeClaim:",
+            f"      claimName: {pvc_name}",
+            "  restartPolicy: Never",
+        ]) + "\n"
+        run = self.k8s_utils._exec_kubectl
+        run(f"cat <<'COPIER_EOF' | kubectl apply -f -\n{manifest}COPIER_EOF")
+        run(f"kubectl wait pod/{copier_name} -n {ns} "
+            f"--for=condition=Ready --timeout=120s")
+
+    def _copy_fio_perf_logs(self, pod_name: str, resource_name: str,
+                             pvc_name: str = None):
+        """Copy FIO perf files from /spdkvol/ to ``<resource_name>_perf``.
+
+        Files are listed in ``pod_name`` first. When nothing is found and a
+        ``pvc_name`` is given, a temporary copier pod is created on the same
+        node and used as the copy source; it is force-deleted in ``finally``.
+        Failures while listing or copying are logged as warnings.
+        """
+        ns = self.k8s_utils.namespace
+        perf_dir = os.path.join(self.log_path, f"{resource_name}_perf")
+        # Set only once a copier pod name is chosen, so cleanup knows to run.
+        helper_pod = None
+        source_pod, source_container = pod_name, None
+
+        try:
+            perf_files = self._list_fio_perf_files(pod_name, ns)
+            node = None
+            if not perf_files and pvc_name:
+                node = self.k8s_utils.get_pod_node_name(pod_name)
+            # Fallback: read the volume through a helper pod on the same node.
+            if node:
+                helper_pod = f"fio-cp-{_rand_seq(8)}"
+                self.logger.info(
+                    f"[perf_copy] Creating copier pod {helper_pod} "
+                    f"on {node} for PVC {pvc_name}"
+                )
+                try:
+                    self._create_copier_pod(helper_pod, pvc_name, node, ns)
+                    perf_files = self._list_fio_perf_files(
+                        helper_pod, ns, container="copier"
+                    )
+                    source_pod, source_container = helper_pod, "copier"
+                except Exception as pod_err:
+                    self.logger.warning(
+                        f"[perf_copy] Copier pod failed for "
+                        f"{resource_name}: {pod_err}"
+                    )
+                    perf_files = []
+
+            if perf_files:
+                os.makedirs(perf_dir, exist_ok=True)
+                suffix = f" -c {source_container}" if source_container else ""
+                for remote in perf_files:
+                    local = os.path.join(perf_dir, os.path.basename(remote))
+                    self.k8s_utils._exec_kubectl(
+                        f"kubectl cp {ns}/{source_pod}:{remote} {local}"
+                        f"{suffix} 2>/dev/null || true",
+                        supress_logs=True,
+                    )
+                self.logger.info(
+                    f"[perf_copy] Copied {len(perf_files)} perf log(s) "
+                    f"for {resource_name}"
+                )
+        except Exception as copy_err:
+            self.logger.warning(
+                f"[perf_copy] Could not copy perf logs for "
+                f"{resource_name}: {copy_err}"
+            )
+        finally:
+            # Always remove the copier pod, even if its creation failed midway.
+            if helper_pod:
+                try:
+                    self.k8s_utils._exec_kubectl(
+                        f"kubectl delete pod {helper_pod} -n {ns} "
+                        f"--force --grace-period=0 2>/dev/null || true",
+                        supress_logs=True,
+                    )
+                except Exception:
+                    pass
+
     # ── Summary (shared) ─────────────────────────────────────────────────────
 
     def _print_large_scale_summary(self):
@@ -326,6 +513,7 @@ def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.test_name = "large_scale_lvol_docker"
         self.fio_threads: list[threading.Thread] = []
+        self.sn_nodes: list[str] = []
 
         # parent_name → {id, client, ctrl_dev, nqn, devices: [dev_path]}
         self._parent_registry: dict[str, dict] = {}
@@ -386,7 +574,12 @@ def _wait_until_namespace_device_gone(self, node: str, ctrl_dev: str,
     # ── run() ────────────────────────────────────────────────────────────────
 
     def run(self):
-        self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        actual_pool = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        if actual_pool and actual_pool != self.pool_name:
+            self.logger.info(
+                f"[run] Pool name changed: {self.pool_name} -> {actual_pool}"
+            )
+            self.pool_name = actual_pool
         storage_nodes = self.sbcli_utils.get_storage_nodes()
         for result in storage_nodes["results"]:
             self.sn_nodes.append(result["uuid"])
@@ -396,169 +589,217 @@ def run(self):
 
     def _phase_create_subsystems(self):
         self.logger.info("=== Phase: Create Subsystems (Docker) ===")
-
-        # Sub-phase 1: Create 100 parent lvols in parallel
+        total_expected = self.NUM_SUBSYSTEMS * self.NAMESPACES_PER_SUBSYSTEM
         self.logger.info(
-            f"[create] Sub-phase 1: Creating {self.NUM_SUBSYSTEMS} parents"
+            f"[create] {self.NUM_SUBSYSTEMS} parents × "
+            f"{self.NAMESPACES_PER_SUBSYSTEM} ns = {total_expected} lvols "
+            f"(parallel={self.PARALLEL_PARENTS})"
         )
-        parent_items = []
-        for i in range(self.NUM_SUBSYSTEMS):
-            name = f"lss-par-{_rand_seq(6)}-{i:03d}"
-            parent_items.append({"name": name, "idx": i})
 
-        self._batch_exec(parent_items, self._create_parent, "create_parents")
-
-        parent_count = len(self._parent_registry)
-        self.logger.info(f"[create] {parent_count} parents created")
-        if parent_count == 0:
-            raise RuntimeError("No parents created — cannot continue")
-
-        # Sub-phase 2: NVMe connect all parents + format/mount parent device
+        # ── Sub-phase 1: Create all parent lvols in parallel ────────────
+        parent_names = [
+            f"lss-par-{_rand_seq(6)}-{i:03d}"
+            for i in range(self.NUM_SUBSYSTEMS)
+        ]
         self.logger.info(
-            f"[create] Sub-phase 2: NVMe connecting {parent_count} parents"
+            f"[create][sub1] Creating {len(parent_names)} parent lvols "
+            f"(parallel, workers={self.MAX_WORKERS})"
         )
-        parent_names = list(self._parent_registry.keys())
-        self._batch_exec(
-            parent_names, self._connect_parent, "connect_parents"
+        ok, fail = self._batch_exec(
+            [{"name": n} for n in parent_names],
+            self._create_parent,
+            "create_parents",
+        )
+        if fail > 0:
+            self._total_created = len(self._device_registry)
+            raise RuntimeError(
+                f"[create][sub1] {fail} parent creations failed"
+            )
+        # Verify all parents are registered
+        for pn in parent_names:
+            if pn not in self._parent_registry:
+                raise RuntimeError(
+                    f"[create][sub1] Parent {pn} not in registry after create"
+                )
+        self.logger.info(
+            f"[create][sub1] All {ok} parents created successfully"
         )
 
-        connected = sum(
-            1 for p in self._parent_registry.values() if p.get("ctrl_dev")
+        # ── Sub-phase 2: NVMe-connect all parents (sequential) ─────────
+        # Sequential to avoid device-detection races on same client.
+        self.logger.info(
+            f"[create][sub2] Connecting {len(parent_names)} parents "
+            f"(sequential)"
+        )
+        for idx, pn in enumerate(parent_names):
+            # Pre-assign client round-robin
+            self._parent_registry[pn]["client"] = (
+                self.fio_node[idx % len(self.fio_node)]
+            )
+            self._connect_parent(pn)
+            pinfo = self._parent_registry[pn]
+            if not pinfo.get("ctrl_dev"):
+                raise RuntimeError(
+                    f"[create][sub2] Parent {pn} NVMe connect failed"
+                )
+            if (idx + 1) % 10 == 0 or idx == len(parent_names) - 1:
+                self.logger.info(
+                    f"[create][sub2] Connected {idx+1}/"
+                    f"{len(parent_names)}"
+                )
+        self.logger.info(
+            f"[create][sub2] All {len(parent_names)} parents connected"
         )
-        self.logger.info(f"[create] {connected} parents connected")
 
-        # Sub-phase 3: Create namespace children per parent
-        # (sequential within a parent, parallel across parents)
-        total_children = (self.NAMESPACES_PER_SUBSYSTEM - 1) * connected
+        # ── Sub-phase 3: Create children (PARALLEL_PARENTS concurrent) ──
         self.logger.info(
-            f"[create] Sub-phase 3: Creating {total_children} namespace "
-            f"children ({self.NAMESPACES_PER_SUBSYSTEM - 1} per parent)"
+            f"[create][sub3] Creating children for {len(parent_names)} "
+            f"parents (parallel, workers={self.PARALLEL_PARENTS})"
         )
-        connected_parents = [
-            pname for pname, pinfo in self._parent_registry.items()
-            if pinfo.get("ctrl_dev")
-        ]
-        # Each parent creates 31 children sequentially (~130s each worst case)
-        self._batch_exec(
-            connected_parents,
+        child_timeout = self.NAMESPACES_PER_SUBSYSTEM * 180
+        ok, fail = self._batch_exec(
+            parent_names,
             self._create_children_for_parent,
             "create_children",
-            per_item_timeout=5400,  # 90 min per parent
+            per_item_timeout=child_timeout,
+            max_workers=self.PARALLEL_PARENTS,
         )
+        if fail > 0:
+            self._total_created = len(self._device_registry)
+            raise RuntimeError(
+                f"[create][sub3] {fail} parent child-creation batches failed"
+            )
+
+        # Verify child counts
+        for pn in parent_names:
+            children_done = sum(
+                1 for c in self._child_registry.values()
+                if c["parent_name"] == pn
+            )
+            expected = self.NAMESPACES_PER_SUBSYSTEM - 1
+            if children_done < expected:
+                raise RuntimeError(
+                    f"Parent {pn}: only {children_done}/{expected} "
+                    f"children created — aborting"
+                )
 
-        child_count = len(self._child_registry)
         self._total_created = len(self._device_registry)
         self.logger.info(
-            f"[create] {child_count} children created, "
-            f"{self._total_created} total devices formatted + mounted"
+            f"[create] All done: {len(self._parent_registry)} parents, "
+            f"{len(self._child_registry)} children, "
+            f"{self._total_created} total devices mounted"
         )
 
+    def _count_created_resources(self):
+        """Return how many devices are in ``self._device_registry`` (available for FIO)."""
+        return len(self._device_registry)
+
     def _create_parent(self, params: dict):
         name = params["name"]
+        self.sbcli_utils.add_lvol(
+            lvol_name=name,
+            pool_name=self.pool_name,
+            size=self.LVOL_SIZE,
+            distr_ndcs=self.ndcs,
+            distr_npcs=self.npcs,
+            distr_bs=self.bs,
+            distr_chunk_bs=self.chunk_bs,
+            max_namespace_per_subsys=self.NAMESPACES_PER_SUBSYSTEM,
+            retry=3,
+        )
+        sleep_n_sec(2)
+        lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=name)
+        if not lvol_id:
+            raise RuntimeError(f"[create_parent] {name}: ID not found")
+        # Get the node_id so children can target the same node via host_id
+        node_id = None
         try:
-            self.sbcli_utils.add_lvol(
-                lvol_name=name,
-                pool_name=self.pool_name,
-                size=self.LVOL_SIZE,
-                distr_ndcs=self.ndcs,
-                distr_npcs=self.npcs,
-                distr_bs=self.bs,
-                distr_chunk_bs=self.chunk_bs,
-                max_namespace_per_subsys=self.NAMESPACES_PER_SUBSYSTEM,
-                retry=3,
-            )
-            sleep_n_sec(2)
-            lvol_id = self.sbcli_utils.get_lvol_id(lvol_name=name)
-            if not lvol_id:
-                self.logger.error(f"[create_parent] {name}: ID not found")
-                return
-            self._parent_registry[name] = {
-                "id": lvol_id,
-                "client": None,
-                "ctrl_dev": None,
-                "nqn": None,
-                "devices": [],
-            }
-            self.logger.info(f"[create_parent] {name} -> {lvol_id}")
-        except Exception as e:
-            self.logger.error(f"[create_parent] {name} failed: {e}")
+            details = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)
+            if details:
+                node_id = details[0].get("node_id")
+        except Exception as ex:
+            self.logger.warning(f"[create_parent] {name}: could not get node_id: {ex}")
+        self._parent_registry[name] = {
+            "id": lvol_id,
+            "node_id": node_id,
+            "client": None,
+            "ctrl_dev": None,
+            "nqn": None,
+            "devices": [],
+        }
+        self.logger.info(f"[create_parent] {name} -> {lvol_id} (node={node_id})")
 
     def _connect_parent(self, parent_name: str):
         """NVMe-connect parent, detect device, format + mount the parent
-        namespace (nsid=1)."""
+        namespace (nsid=1).  Raises on any failure."""
         pinfo = self._parent_registry.get(parent_name)
         if not pinfo:
-            return
-        try:
-            connect_ls = self.sbcli_utils.get_lvol_connect_str(
-                lvol_name=parent_name
+            raise RuntimeError(f"{parent_name}: not in registry")
+
+        connect_ls = self.sbcli_utils.get_lvol_connect_str(
+            lvol_name=parent_name
+        )
+        if not connect_ls:
+            raise RuntimeError(
+                f"[connect] {parent_name}: no connect strings"
             )
-            if not connect_ls:
-                self.logger.error(
-                    f"[connect] {parent_name}: no connect strings"
-                )
-                return
 
-            # Round-robin across client nodes
-            client = self.fio_node[
-                list(self._parent_registry.keys()).index(parent_name)
-                % len(self.fio_node)
-            ]
-            pinfo["client"] = client
+        # Use pre-assigned client if set (sub-phase 2), otherwise fall back
+        if not pinfo.get("client"):
+            idx = list(self._parent_registry.keys()).index(parent_name)
+            pinfo["client"] = self.fio_node[idx % len(self.fio_node)]
+        client = pinfo["client"]
 
-            initial_devices = self.ssh_obj.get_devices(node=client)
+        initial_devices = self.ssh_obj.get_devices(node=client)
 
-            for cmd in connect_ls:
-                self.ssh_obj.exec_command(node=client, command=cmd)
-                # Extract NQN for later disconnect
-                nqn_match = re.search(r"-n\s+(nqn\S+)", cmd)
-                if nqn_match:
-                    pinfo["nqn"] = nqn_match.group(1)
+        for cmd in connect_ls:
+            self.ssh_obj.exec_command(node=client, command=cmd)
+            # Extract NQN for later disconnect
+            nqn_match = re.search(r"-n\s+(nqn\S+)", cmd)
+            if nqn_match:
+                pinfo["nqn"] = nqn_match.group(1)
 
-            sleep_n_sec(3)
-            final_devices = self.ssh_obj.get_devices(node=client)
+        sleep_n_sec(3)
+        final_devices = self.ssh_obj.get_devices(node=client)
 
-            parent_dev = None
-            for dev in final_devices:
-                if dev not in initial_devices:
-                    parent_dev = f"/dev/{dev.strip()}"
-                    break
+        parent_dev = None
+        for dev in final_devices:
+            if dev not in initial_devices:
+                parent_dev = f"/dev/{dev.strip()}"
+                break
 
-            if not parent_dev:
-                self.logger.error(
-                    f"[connect] {parent_name}: no new device after connect"
-                )
-                return
+        if not parent_dev:
+            raise RuntimeError(
+                f"[connect] {parent_name}: no new device after connect"
+            )
 
-            ctrl_dev = get_parent_device(parent_dev)
-            pinfo["ctrl_dev"] = ctrl_dev
-            pinfo["devices"] = [parent_dev]
+        ctrl_dev = get_parent_device(parent_dev)
+        pinfo["ctrl_dev"] = ctrl_dev
+        pinfo["devices"] = [parent_dev]
 
-            # Format + mount the parent device (nsid=1)
-            mount_name = f"lss-{parent_name[-3:]}-ns01"
-            mount_point = f"{self.mount_path}/{mount_name}"
-            log_file = f"{self.log_path}/{mount_name}.log"
-            self.ssh_obj.format_disk(
-                node=client, device=parent_dev, fs_type="ext4"
-            )
-            self.ssh_obj.mount_path(
-                node=client, device=parent_dev, mount_path=mount_point
-            )
-            self._device_registry[parent_dev] = {
-                "name": mount_name,
-                "client": client,
-                "mount": mount_point,
-                "log": log_file,
-                "parent_name": parent_name,
-                "ctrl_dev": ctrl_dev,
-                "ns_idx": 1,
-            }
-            self.logger.info(
-                f"[connect] {parent_name}: {parent_dev} ns01 "
-                f"(ctrl={ctrl_dev}) on {client} -> {mount_point}"
-            )
-        except Exception as e:
-            self.logger.error(f"[connect] {parent_name} failed: {e}")
+        # Format + mount the parent device (nsid=1)
+        mount_name = f"lss-{parent_name[-3:]}-ns01"
+        mount_point = f"{self.mount_path}/{mount_name}"
+        log_file = f"{self.log_path}/{mount_name}.log"
+        self.ssh_obj.format_disk(
+            node=client, device=parent_dev, fs_type="ext4"
+        )
+        self.ssh_obj.mount_path(
+            node=client, device=parent_dev, mount_path=mount_point
+        )
+        self._device_registry[parent_dev] = {
+            "name": mount_name,
+            "client": client,
+            "mount": mount_point,
+            "log": log_file,
+            "parent_name": parent_name,
+            "ctrl_dev": ctrl_dev,
+            "ns_idx": 1,
+        }
+        self.logger.info(
+            f"[connect] {parent_name}: {parent_dev} ns01 "
+            f"(ctrl={ctrl_dev}) on {client} -> {mount_point}"
+        )
 
     def _create_children_for_parent(self, parent_name: str):
         """Create all namespace children for one parent sequentially.
@@ -566,102 +807,89 @@ def _create_children_for_parent(self, parent_name: str):
         For each child:
           1. add_lvol(namespace=parent_id)
           2. Verify the new namespace device appears on the client
-             (rescan if it doesn't show up automatically)
           3. Format + mount the new device
+
+        Raises on any failure so the caller can abort immediately.
         """
         pinfo = self._parent_registry.get(parent_name)
         if not pinfo or not pinfo.get("ctrl_dev"):
-            return
+            raise RuntimeError(f"{parent_name}: not connected")
         parent_id = pinfo["id"]
         client = pinfo["client"]
         ctrl_dev = pinfo["ctrl_dev"]
 
         # Snapshot of current namespace devices before creating children
         before_set = set(self._list_nvme_ns_devices(client, ctrl_dev))
-        created = 0
 
         for ns_idx in range(2, self.NAMESPACES_PER_SUBSYSTEM + 1):
             cname = (
                 f"lss-ch-{parent_name[-3:]}-ns{ns_idx:02d}-{_rand_seq(4)}"
             )
-            try:
-                self.sbcli_utils.add_lvol(
-                    lvol_name=cname,
-                    pool_name=self.pool_name,
-                    size=self.LVOL_SIZE,
-                    distr_ndcs=self.ndcs,
-                    distr_npcs=self.npcs,
-                    distr_bs=self.bs,
-                    distr_chunk_bs=self.chunk_bs,
-                    namespace=parent_id,
-                    retry=3,
-                )
-                sleep_n_sec(2)
-                child_id = self.sbcli_utils.get_lvol_id(lvol_name=cname)
-                if not child_id:
-                    self.logger.error(
-                        f"[create_child] {cname}: ID not found"
-                    )
-                    continue
 
-                # Wait for the new namespace device to appear on client
-                new_dev, new_set = self._wait_for_new_namespace_device(
-                    node=client,
-                    ctrl_dev=ctrl_dev,
-                    before_set=before_set,
-                    timeout=120,
-                    interval=3,
+            self.sbcli_utils.add_lvol(
+                lvol_name=cname,
+                pool_name=self.pool_name,
+                size=self.LVOL_SIZE,
+                distr_ndcs=self.ndcs,
+                distr_npcs=self.npcs,
+                distr_bs=self.bs,
+                distr_chunk_bs=self.chunk_bs,
+                host_id=pinfo.get("node_id"),
+                namespace=parent_id,
+                retry=3,
+            )
+            sleep_n_sec(2)
+            child_id = self.sbcli_utils.get_lvol_id(lvol_name=cname)
+            if not child_id:
+                raise RuntimeError(
+                    f"[create_child] {cname}: lvol ID not found after create"
                 )
-                if not new_dev:
-                    self.logger.error(
-                        f"[create_child] {cname}: namespace device did not "
-                        f"appear on {client} (ctrl={ctrl_dev})"
-                    )
-                    continue
-                before_set = new_set
 
-                # Format + mount the new namespace device
-                mount_name = (
-                    f"lss-{parent_name[-3:]}-ns{ns_idx:02d}"
-                )
-                mount_point = f"{self.mount_path}/{mount_name}"
-                log_file = f"{self.log_path}/{mount_name}.log"
-                self.ssh_obj.format_disk(
-                    node=client, device=new_dev, fs_type="ext4"
-                )
-                self.ssh_obj.mount_path(
-                    node=client, device=new_dev, mount_path=mount_point
+            # Wait for the new namespace device to appear on client
+            new_dev, new_set = self._wait_for_new_namespace_device(
+                node=client,
+                ctrl_dev=ctrl_dev,
+                before_set=before_set,
+                timeout=120,
+                interval=3,
+            )
+            if not new_dev:
+                raise RuntimeError(
+                    f"[create_child] {cname}: namespace device did not "
+                    f"appear on {client} (ctrl={ctrl_dev})"
                 )
+            before_set = new_set
 
-                self._child_registry[cname] = {
-                    "id": child_id,
-                    "parent_name": parent_name,
-                    "device": new_dev,
-                    "ns_idx": ns_idx,
-                }
-                self._device_registry[new_dev] = {
-                    "name": mount_name,
-                    "client": client,
-                    "mount": mount_point,
-                    "log": log_file,
-                    "parent_name": parent_name,
-                    "ctrl_dev": ctrl_dev,
-                    "ns_idx": ns_idx,
-                }
-                created += 1
-                self.logger.info(
-                    f"[create_child] {cname} -> {child_id} "
-                    f"ns{ns_idx:02d} device={new_dev} on {client}"
-                )
-            except Exception as e:
-                self.logger.error(
-                    f"[create_child] {cname} failed: {e}"
-                )
+            # Format + mount the new namespace device
+            mount_name = f"lss-{parent_name[-3:]}-ns{ns_idx:02d}"
+            mount_point = f"{self.mount_path}/{mount_name}"
+            log_file = f"{self.log_path}/{mount_name}.log"
+            self.ssh_obj.format_disk(
+                node=client, device=new_dev, fs_type="ext4"
+            )
+            self.ssh_obj.mount_path(
+                node=client, device=new_dev, mount_path=mount_point
+            )
 
-        self.logger.info(
-            f"[create_children] {parent_name}: "
-            f"{created}/{self.NAMESPACES_PER_SUBSYSTEM - 1} children created"
-        )
+            self._child_registry[cname] = {
+                "id": child_id,
+                "parent_name": parent_name,
+                "device": new_dev,
+                "ns_idx": ns_idx,
+            }
+            self._device_registry[new_dev] = {
+                "name": mount_name,
+                "client": client,
+                "mount": mount_point,
+                "log": log_file,
+                "parent_name": parent_name,
+                "ctrl_dev": ctrl_dev,
+                "ns_idx": ns_idx,
+            }
+            self.logger.info(
+                f"[create_child] {cname} -> {child_id} "
+                f"ns{ns_idx:02d} device={new_dev} on {client}"
+            )
 
     # ── Phase 2: Start FIO ──────────────────────────────────────────────────
 
@@ -727,6 +955,11 @@ def _log_health_status(self, elapsed: int):
 
     def _phase_validate(self):
         self.logger.info("=== Phase: Validate FIO (Docker) ===")
+
+        # 1. Collect FIO logs from all clients
+        self._save_all_fio_logs_docker()
+
+        # 2. Check thread liveness
         alive = sum(1 for t in self.fio_threads if t.is_alive())
         dead = len(self.fio_threads) - alive
         self.logger.info(
@@ -738,6 +971,82 @@ def _phase_validate(self):
                 f"[validate] {dead} FIO threads died during test"
             )
 
+        # 3. Validate FIO log contents for errors
+        validated = 0
+        failed = 0
+        for device, dinfo in self._device_registry.items():
+            log_file = dinfo.get("log")
+            client = dinfo.get("client")
+            name = dinfo.get("name")
+            if not log_file or not client:
+                continue
+            try:
+                self.common_utils.validate_fio_test(client, log_file)
+                validated += 1
+            except RuntimeError as e:
+                failed += 1
+                self.logger.error(
+                    f"[validate] FIO error in {name} on {client}: {e}"
+                )
+        self.logger.info(
+            f"[validate] Log validation: {validated} passed, "
+            f"{failed} failed"
+        )
+        self._fio_failures = max(self._fio_failures, failed)
+
+    def _save_all_fio_logs_docker(self):
+        """Copy every device's FIO log and perf files from its client.
+
+        Best effort: per-device failures are logged as warnings, not raised.
+        """
+        saved = 0
+        for dinfo in self._device_registry.values():
+            log_file = dinfo.get("log")
+            client = dinfo.get("client")
+            name = dinfo.get("name")
+            if not log_file or not client:
+                continue
+            try:
+                file_data = self.ssh_obj.read_file(client, log_file)
+                if file_data:
+                    local_path = os.path.join(self.log_path, f"{name}_fio.log")
+                    with open(local_path, "w") as f:
+                        f.write(file_data)
+                    saved += 1
+            except Exception as exc:
+                self.logger.warning(f"[save_fio] {name}: FIO log not saved: {exc}")
+            # Also collect perf logs (_bw, _lat, _iops, _iolog)
+            fio_log_base = log_file.replace(".log", "_fio")
+            perf_dir = os.path.join(self.log_path, f"{name}_perf")
+            try:
+                out, _ = self.ssh_obj.exec_command(
+                    node=client,
+                    command=f"bash -lc 'ls {fio_log_base}* "
+                            f"{log_file.replace('.log', '_iolog.log')} "
+                            f"2>/dev/null || true'",
+                    supress_logs=True,
+                )
+                perf_files = [
+                    f.strip() for f in (out or "").splitlines()
+                    if f.strip()
+                ]
+                if perf_files:
+                    os.makedirs(perf_dir, exist_ok=True)
+                    for src in perf_files:
+                        dest = os.path.join(perf_dir, os.path.basename(src))
+                        try:
+                            data = self.ssh_obj.read_file(client, src)
+                            if data:
+                                with open(dest, "w") as f:
+                                    f.write(data)
+                        except Exception as exc:
+                            self.logger.warning(f"[save_fio] {src}: copy failed: {exc}")
+            except Exception as exc:
+                self.logger.warning(f"[save_fio] {name}: perf logs not collected: {exc}")
+        self.logger.info(
+            f"[save_fio] Collected {saved} FIO logs from clients"
+        )
+
     # ── Cleanup ──────────────────────────────────────────────────────────────
 
     def _phase_cleanup(self):
@@ -902,14 +1211,30 @@ def _delete_children_for_parent(self, parent_name: str,
     # ── Batch parallel helper ────────────────────────────────────────────────
 
     def _batch_exec(self, items, task_fn, op_name: str,
-                    per_item_timeout: int = 600):
-        """Execute task_fn(item) for each item using ThreadPoolExecutor."""
+                    per_item_timeout: int = 600,
+                    max_workers: int = None,
+                    max_failures: int = 10):
+        """Execute task_fn(item) for each item using ThreadPoolExecutor.
+
+        Stops submitting new batches once failures >= max_failures.
+        Returns (success_count, failure_count).
+        """
         total = len(items)
         success = 0
         failures = 0
+        workers = max_workers or self.MAX_WORKERS
+        stopped_early = False
 
-        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+        with ThreadPoolExecutor(max_workers=workers) as executor:
             for batch_start in range(0, total, self.BATCH_SIZE):
+                if failures >= max_failures:
+                    stopped_early = True
+                    self.logger.error(
+                        f"[{op_name}] Stopping: {failures} failures "
+                        f"reached max_failures={max_failures}"
+                    )
+                    break
+
                 batch = items[batch_start:batch_start + self.BATCH_SIZE]
                 futures = {}
                 for item in batch:
@@ -923,7 +1248,8 @@ def _batch_exec(self, items, task_fn, op_name: str,
                     except Exception as exc:
                         failures += 1
                         self.logger.error(
-                            f"[{op_name}] Failed: {exc}"
+                            f"[{op_name}] Failed ({failures}/"
+                            f"{max_failures} max): {exc}"
                         )
 
                 done = batch_start + len(batch)
@@ -932,6 +1258,12 @@ def _batch_exec(self, items, task_fn, op_name: str,
                     f"(ok={success} fail={failures})"
                 )
 
+        if stopped_early:
+            self.logger.info(
+                f"[{op_name}] Stopped early: {success} succeeded, "
+                f"{failures} failed, "
+                f"{total - success - failures} skipped"
+            )
         return success, failures
 
 
@@ -948,9 +1280,8 @@ class LargeScaleLvolK8s(_LargeScaleMixin, K8sNativeFailoverTest):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.test_name = "large_scale_lvol_k8s"
-        # Override base class FIO config for lightweight load
+        # Match Docker: lightweight FIO load (job count taken from FIO_NUMJOBS)
         self.fio_num_jobs = self.FIO_NUMJOBS
-        self.FIO_RUNTIME = 7200
 
     # ── run() ────────────────────────────────────────────────────────────────
 
@@ -960,7 +1291,12 @@ def run(self):
             self.sn_nodes.append(result["uuid"])
             self.node_vs_pvc[result["uuid"]] = []
 
-        self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        actual_pool = self.sbcli_utils.add_storage_pool(pool_name=self.pool_name)
+        if actual_pool and actual_pool != self.pool_name:
+            self.logger.info(
+                f"[run] Pool name changed: {self.pool_name} -> {actual_pool}"
+            )
+            self.pool_name = actual_pool
 
         cluster_id = self.cluster_id or os.environ.get("CLUSTER_ID", "")
         self.k8s_utils.create_storage_class(
@@ -971,184 +1307,245 @@ def run(self):
             npcs=self.npcs,
             max_namespace_per_subsys=self.NAMESPACES_PER_SUBSYSTEM,
         )
+        self.k8s_utils.create_storage_class(
+            name=self.XFS_STORAGE_CLASS_NAME,
+            cluster_id=cluster_id,
+            pool_name=self.pool_name,
+            ndcs=self.ndcs,
+            npcs=self.npcs,
+            fs_type="xfs",
+            max_namespace_per_subsys=self.NAMESPACES_PER_SUBSYSTEM,
+        )
 
         self._run_large_scale_test()
 
-    # ── Phase 1: Create subsystems ───────────────────────────────────────────
+    def _count_created_resources(self):
+        """Return how many PVCs are recorded in ``self.pvc_details``."""
+        return len(self.pvc_details)
+
+    # ── Phase 1: Create subsystems (parallel across subsystems) ─────────
 
     def _phase_create_subsystems(self):
+        """Create all PVCs, running up to PARALLEL_PARENTS subsystems at once.
+
+        Within a subsystem the PVCs are created one after another (to keep
+        device detection order); subsystems themselves run concurrently.
+        Raises RuntimeError if any subsystem fails; lvol shortfall only warns."""
         total_pvcs = self.NUM_SUBSYSTEMS * self.NAMESPACES_PER_SUBSYSTEM
         self.logger.info(
-            f"=== Phase: Create {total_pvcs} PVCs (K8s) ==="
+            f"=== Phase: Create {total_pvcs} PVCs (K8s) — "
+            f"{self.NUM_SUBSYSTEMS} subsystems × "
+            f"{self.NAMESPACES_PER_SUBSYSTEM} PVCs "
+            f"(parallel={self.PARALLEL_PARENTS}) ==="
         )
 
-        pvc_items = []
-        for i in range(total_pvcs):
-            pvc_name = f"lss-pvc-{_rand_seq(6)}-{i:04d}"
-            pvc_items.append({"name": pvc_name, "idx": i})
+        # One work item per subsystem, carrying the index of its first PVC
+        work_items = [
+            {
+                "subsys_idx": s,
+                "start_pvc_idx": s * self.NAMESPACES_PER_SUBSYSTEM,
+            }
+            for s in range(self.NUM_SUBSYSTEMS)
+        ]
 
-        if self.use_client_fio:
-            self._create_pvcs_client_mode(pvc_items)
-        else:
-            self._create_pvcs_job_mode(pvc_items)
+        subsys_timeout = self.NAMESPACES_PER_SUBSYSTEM * 60
+        ok, fail = self._batch_exec_k8s(
+            work_items,
+            self._create_subsystem_pvcs,
+            "create_subsystems",
+            per_item_timeout=subsys_timeout,
+            max_workers=self.PARALLEL_PARENTS,
+        )
+        if fail > 0:
+            self._total_created = len(self.pvc_details)
+            raise RuntimeError(
+                f"[create] {fail}/{self.NUM_SUBSYSTEMS} subsystems failed"
+            )
+
+        # Cross-check: the API should list at least total_pvcs lvols (warn only)
+        all_lvols = self.sbcli_utils.list_lvols()
+        if len(all_lvols) < total_pvcs:
+            self.logger.warning(
+                f"[create] lvol count {len(all_lvols)} < "
+                f"expected {total_pvcs}"
+            )
 
         self._total_created = len(self.pvc_details)
-        self.logger.info(f"[create] {self._total_created} PVCs created")
+        self.logger.info(
+            f"[create] {self._total_created} PVCs created, "
+            f"lvols in API: {len(all_lvols)}"
+        )
+
+    def _create_subsystem_pvcs(self, params: dict):
+        """Create one subsystem's PVCs in order (needed for device detection).
+
+        Run by _batch_exec_k8s with PARALLEL_PARENTS workers.  Raises
+        RuntimeError if a PVC is missing from pvc_details after its create."""
+        subsys_idx = params["subsys_idx"]
+        start_idx = params["start_pvc_idx"]
+
+        self.logger.info(
+            f"[create] === Subsystem {subsys_idx+1}/"
+            f"{self.NUM_SUBSYSTEMS} ==="
+        )
+        for ns in range(self.NAMESPACES_PER_SUBSYSTEM):
+            pvc_idx = start_idx + ns
+            pvc_name = f"lss-pvc-{_rand_seq(6)}-{pvc_idx:04d}"
+
-    def _create_pvcs_job_mode(self, items: list[dict]):
-        """Create PVCs in parallel (K8s Job FIO mode)."""
-        self._batch_exec_k8s(items, self._create_single_pvc, "create_pvcs")
+            if self.use_client_fio:
+                self._create_single_pvc_client(
+                    {"name": pvc_name, "idx": pvc_idx}
+                )
+            else:
+                self._create_single_pvc({"name": pvc_name})
+
+            if pvc_name not in self.pvc_details:
+                raise RuntimeError(
+                    f"PVC {pvc_name} creation failed — aborting "
+                    f"subsystem {subsys_idx+1}"
+                )
 
-    def _create_pvcs_client_mode(self, items: list[dict]):
-        """Create PVCs + NVMe connect on clients."""
-        self._batch_exec_k8s(
-            items, self._create_single_pvc_client, "create_pvcs_client"
+        self.logger.info(
+            f"[create] Subsystem {subsys_idx+1}/{self.NUM_SUBSYSTEMS} "
+            f"OK — {self.NAMESPACES_PER_SUBSYSTEM} PVCs created"
         )
 
     def _create_single_pvc(self, params: dict):
+        """Create one PVC on a random StorageClass; TimeoutError if never Bound."""
         name = params["name"]
-        try:
-            self.k8s_utils.create_pvc(
-                name=name,
-                size=self.PVC_SIZE,
-                storage_class=self.STORAGE_CLASS_NAME,
-            )
-            if not self.k8s_utils.wait_pvc_bound(name, timeout=300):
-                self.logger.error(f"[create_pvc] {name}: not Bound in 300s")
-                return
-            self.pvc_details[name] = {
-                "job_name": None,
-                "configmap_name": None,
-                "snapshots": [],
-            }
-            self.logger.info(f"[create_pvc] {name} Bound")
-        except Exception as e:
-            self.logger.error(f"[create_pvc] {name} failed: {e}")
+        sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME])
+        fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4"
+        self.k8s_utils.create_pvc(
+            name=name,
+            size=self.PVC_SIZE,
+            storage_class=sc_name,
+        )
+        if not self.k8s_utils.wait_pvc_bound(name, timeout=300):
+            raise TimeoutError(f"PVC {name} not Bound within 300s")
+        self.pvc_details[name] = {
+            "job_name": None,
+            "configmap_name": None,
+            "snapshots": [],
+            "storage_class": sc_name,
+            "fs_type": fs_type,
+        }
+        self.logger.info(f"[create_pvc] {name} Bound (fs={fs_type})")
 
     def _create_single_pvc_client(self, params: dict):
         """Create a single PVC, NVMe-connect on a client, and verify the
-        namespace device appears.  CSI auto-groups PVCs into subsystems
-        based on the StorageClass max_namespace_per_subsys setting.
+        namespace device appears, then format and mount it.  Raises on failure.
 
-        After NVMe connect, the device may appear as:
-        - A new controller + namespace (first PVC in a subsystem)
-        - A new namespace on an existing controller (shared subsystem)
-        Either way we verify a new block device is present.
+        CSI groups PVCs into subsystems (max_namespace_per_subsys), so the
+        device may be a new controller or a namespace on an existing one.
+        NOTE(review): the before/after device diff is not lock-protected;
+        parallel subsystems sharing a client could claim each other's device.
         """
         name = params["name"]
-        try:
-            self.k8s_utils.create_pvc(
-                name=name,
-                size=self.PVC_SIZE,
-                storage_class=self.STORAGE_CLASS_NAME,
-            )
-            if not self.k8s_utils.wait_pvc_bound(name, timeout=300):
-                self.logger.error(f"[create_pvc] {name}: not Bound in 300s")
-                return
+        sc_name = random.choice([self.STORAGE_CLASS_NAME, self.XFS_STORAGE_CLASS_NAME])
+        fs_type = "xfs" if sc_name == self.XFS_STORAGE_CLASS_NAME else "ext4"
+        self.k8s_utils.create_pvc(
+            name=name,
+            size=self.PVC_SIZE,
+            storage_class=sc_name,
+        )
+        if not self.k8s_utils.wait_pvc_bound(name, timeout=300):
+            raise TimeoutError(f"PVC {name} not Bound within 300s")
 
-            # Get lvol info for NVMe connect
-            lvol_id = self.k8s_utils.get_pvc_volume_handle(name)
-            if not lvol_id:
-                self.logger.error(
-                    f"[create_pvc] {name}: no volume handle"
-                )
-                return
+        # Get lvol info for NVMe connect
+        lvol_id = self.k8s_utils.get_pvc_volume_handle(name)
+        if not lvol_id:
+            raise RuntimeError(f"PVC {name}: no volume handle")
 
-            lvol_name = None
-            lvol_details = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)
-            if lvol_details:
-                lvol_name = lvol_details[0].get("lvol_name", name)
-            else:
-                lvol_name = name
+        lvol_details = self.sbcli_utils.get_lvol_details(lvol_id=lvol_id)
+        lvol_name = (
+            lvol_details[0].get("lvol_name", name) if lvol_details else name
+        )
 
-            connect_ls = self.sbcli_utils.get_lvol_connect_str(
-                lvol_name=lvol_name
-            )
+        connect_ls = self.sbcli_utils.get_lvol_connect_str(
+            lvol_name=lvol_name
+        )
+
+        client = self.fio_node[params["idx"] % len(self.fio_node)]
 
-            client = self.fio_node[params["idx"] % len(self.fio_node)]
+        # Snapshot devices before connect
+        initial_devices = set(self.ssh_obj.get_devices(node=client))
 
-            # Snapshot devices before connect
-            initial_devices = set(self.ssh_obj.get_devices(node=client))
+        # Extract NQN from connect strings for namespace tracking
+        nqn = None
+        for cmd in connect_ls:
+            self.ssh_obj.exec_command(node=client, command=cmd)
+            nqn_match = re.search(r"-n\s+(nqn\S+)", cmd)
+            if nqn_match:
+                nqn = nqn_match.group(1)
 
-            # Extract NQN from connect strings for namespace tracking
-            nqn = None
-            for cmd in connect_ls:
-                self.ssh_obj.exec_command(node=client, command=cmd)
-                nqn_match = re.search(r"-n\s+(nqn\S+)", cmd)
-                if nqn_match:
-                    nqn = nqn_match.group(1)
+        sleep_n_sec(3)
 
-            sleep_n_sec(3)
+        # Check for new device — could be new controller or new namespace
+        final_devices = set(self.ssh_obj.get_devices(node=client))
+        new_devs = sorted(final_devices - initial_devices)
 
-            # Check for new device — could be new controller or new namespace
+        new_dev = None
+        if new_devs:
+            new_dev = f"/dev/{new_devs[-1].strip()}"
+        else:
+            # Device didn't appear automatically — try NVMe rescan
+            self.logger.info(
+                f"[create_pvc] {name}: no new device, rescanning"
+            )
+            rescan_cmd = (
+                "bash -lc 'for c in /dev/nvme*; do "
+                "[ -c \"$c\" ] && nvme ns-rescan $c 2>/dev/null; "
+                "done || true'"
+            )
+            self.ssh_obj.exec_command(
+                node=client, command=rescan_cmd
+            )
+            sleep_n_sec(5)
             final_devices = set(self.ssh_obj.get_devices(node=client))
             new_devs = sorted(final_devices - initial_devices)
-
-            new_dev = None
             if new_devs:
                 new_dev = f"/dev/{new_devs[-1].strip()}"
-            else:
-                # Device didn't appear automatically — try NVMe rescan
-                # Find controller for this NQN and rescan namespaces
-                self.logger.info(
-                    f"[create_pvc] {name}: no new device, rescanning"
-                )
-                # Rescan all controllers on this client
-                rescan_cmd = (
-                    "bash -lc 'for c in /dev/nvme*; do "
-                    "[ -c \"$c\" ] && nvme ns-rescan $c 2>/dev/null; "
-                    "done || true'"
-                )
-                self.ssh_obj.exec_command(
-                    node=client, command=rescan_cmd
-                )
-                sleep_n_sec(5)
-                final_devices = set(self.ssh_obj.get_devices(node=client))
-                new_devs = sorted(final_devices - initial_devices)
-                if new_devs:
-                    new_dev = f"/dev/{new_devs[-1].strip()}"
 
-            if not new_dev:
-                self.logger.error(
-                    f"[create_pvc] {name}: no device after NVMe "
-                    f"connect + rescan on {client}"
-                )
-                return
+        if not new_dev:
+            raise RuntimeError(
+                f"PVC {name}: no device after NVMe connect + rescan "
+                f"on {client}"
+            )
 
-            ctrl_dev = get_parent_device(new_dev)
-            mount_point = f"{self.mount_path}/{name}"
-            log_file = f"{self.log_path}/{name}.log"
+        ctrl_dev = get_parent_device(new_dev)
+        mount_point = f"{self.mount_path}/{name}"
+        log_file = f"{self.log_path}/{name}.log"
 
-            self.ssh_obj.format_disk(
-                node=client, device=new_dev, fs_type="ext4"
-            )
-            self.ssh_obj.mount_path(
-                node=client, device=new_dev, mount_path=mount_point
-            )
+        self.ssh_obj.format_disk(
+            node=client, device=new_dev, fs_type=fs_type
+        )
+        self.ssh_obj.mount_path(
+            node=client, device=new_dev, mount_path=mount_point
+        )
 
-            self.pvc_details[name] = {
-                "job_name": None,
-                "configmap_name": None,
-                "snapshots": [],
-            }
-            self.lvol_mount_details[lvol_name] = {
-                "ID": lvol_id,
-                "Name": lvol_name,
-                "Mount": mount_point,
-                "Device": new_dev,
-                "FS": "ext4",
-                "Log": log_file,
-                "Client": client,
-                "pvc_name": name,
-                "ctrl_dev": ctrl_dev,
-                "nqn": nqn,
-            }
-            self.logger.info(
-                f"[create_pvc] {name} -> {new_dev} "
-                f"(ctrl={ctrl_dev}) on {client}"
-            )
-        except Exception as e:
-            self.logger.error(f"[create_pvc] {name} failed: {e}")
+        self.pvc_details[name] = {
+            "job_name": None,
+            "configmap_name": None,
+            "snapshots": [],
+            "storage_class": sc_name,
+            "fs_type": fs_type,
+        }
+        self.lvol_mount_details[lvol_name] = {
+            "ID": lvol_id,
+            "Name": lvol_name,
+            "Mount": mount_point,
+            "Device": new_dev,
+            "FS": fs_type,
+            "Log": log_file,
+            "Client": client,
+            "pvc_name": name,
+            "ctrl_dev": ctrl_dev,
+            "nqn": nqn,
+        }
+        self.logger.info(
+            f"[create_pvc] {name} -> {new_dev} "
+            f"(ctrl={ctrl_dev}) on {client}"
+        )
 
     # ── Phase 2: Start FIO ──────────────────────────────────────────────────
 
@@ -1304,7 +1701,13 @@ def _log_health_status(self, elapsed: int):
 
     def _phase_validate(self):
         self.logger.info("=== Phase: Validate FIO (K8s) ===")
+
+        # 1. Save all FIO logs first (regardless of pass/fail)
+        self._save_all_fio_logs_k8s()
+        self._save_fio_mapping_summary_k8s()
+
         if self.use_client_fio:
+            # 2a. Check thread liveness
             alive = sum(1 for t in self.fio_threads if t.is_alive())
             dead = len(self.fio_threads) - alive
             self.logger.info(
@@ -1315,27 +1718,123 @@ def _phase_validate(self):
                 self.logger.error(
                     f"[validate] {dead} FIO threads died during test"
                 )
+
+            # 2b. Validate client FIO log contents
+            validated = 0
+            failed = 0
+            for lvol_name, details in self.lvol_mount_details.items():
+                log_file = details.get("Log")
+                client = details.get("Client")
+                if not log_file or not client:
+                    continue
+                try:
+                    self.common_utils.validate_fio_test(client, log_file)
+                    validated += 1
+                except RuntimeError as e:
+                    failed += 1
+                    self.logger.error(
+                        f"[validate] FIO error in {lvol_name}: {e}"
+                    )
+            self.logger.info(
+                f"[validate] Log validation: {validated} passed, "
+                f"{failed} failed"
+            )
+            self._fio_failures = max(self._fio_failures, failed)
         else:
-            # Check K8s Job statuses
-            try:
-                ns = self.k8s_utils.namespace
-                out, _ = self.k8s_utils._exec_kubectl(
-                    f"kubectl get jobs -n {ns} "
-                    f"-l app=fio "
-                    f"-o jsonpath='{{.items[*].status.failed}}' "
-                    f"2>/dev/null || true",
-                    supress_logs=True,
-                )
-                failed_counts = [
-                    int(x) for x in (out or "").split() if x.strip()
-                ]
-                total_failed = sum(failed_counts)
-                self.logger.info(
-                    f"[validate] {total_failed} jobs have failures"
+            # 2c. Validate K8s Job statuses + pod logs
+            fio_timeout = self.FIO_RUNTIME + 300
+            validated = 0
+            failed = 0
+            for pvc_name, pvc_info in self.pvc_details.items():
+                job_name = pvc_info.get("job_name")
+                if not job_name:
+                    continue
+                try:
+                    self.k8s_utils.validate_fio_job(
+                        job_name, timeout=fio_timeout
+                    )
+                    validated += 1
+                except RuntimeError as e:
+                    failed += 1
+                    self.logger.error(
+                        f"[validate] FIO job {job_name} failed: {e}"
+                    )
+            self.logger.info(
+                f"[validate] Job validation: {validated} passed, "
+                f"{failed} failed"
+            )
+            self._fio_failures = failed
+
+    def _save_all_fio_logs_k8s(self):
+        """Save FIO logs: over SSH in client mode, else via _save_fio_pod_logs."""
+        if self.use_client_fio:
+            # Client mode: read each lvol's remote log and copy it locally
+            saved = 0
+            for lvol_name, details in self.lvol_mount_details.items():
+                log_file = details.get("Log")
+                client = details.get("Client")
+                if not log_file or not client:
+                    continue
+                try:
+                    file_data = self.ssh_obj.read_file(client, log_file)
+                    if file_data:
+                        local_path = os.path.join(
+                            self.log_path, f"{lvol_name}_fio.log"
+                        )
+                        with open(local_path, "w") as f:
+                            f.write(file_data)
+                        saved += 1
+                except Exception as exc:
+                    self.logger.warning(f"[save_fio] {lvol_name}: {exc}")
+            self.logger.info(
+                f"[save_fio] Collected {saved} FIO logs from clients"
+            )
+            return
+
+        # K8s Job mode: save logs for every PVC that has a FIO job recorded
+        saved = 0
+        for pvc_name, pvc_info in self.pvc_details.items():
+            job_name = pvc_info.get("job_name")
+            if job_name:
+                self._save_fio_pod_logs(
+                    job_name, pvc_name, pvc_name=pvc_name
                 )
-                self._fio_failures = total_failed
-            except Exception as e:
-                self.logger.warning(f"[validate] Job check failed: {e}")
+                saved += 1
+        self.logger.info(f"[save_fio] Saved FIO logs for {saved} PVCs")
+
+        # Best-effort bulk delete of leftover copier pods; failure is only logged
+        try:
+            self.k8s_utils._exec_kubectl(
+                f"kubectl delete pods -l app=fio-copier "
+                f"-n {self.k8s_utils.namespace} "
+                f"--force --grace-period=0 2>/dev/null || true",
+                supress_logs=True,
+            )
+        except Exception as exc:
+            self.logger.warning(f"[save_fio] copier pod cleanup failed: {exc}")
+
+    def _save_fio_mapping_summary_k8s(self):
+        """Dump the PVC -> lvol/worker/FIO-job mapping to a JSON file.
+
+        No-op in client-FIO mode or when the mapping comes back empty;
+        any error is logged as a warning rather than raised.
+        """
+        if self.use_client_fio:
+            return
+        try:
+            mapping = self.k8s_utils.log_fio_pvc_mapping(self.pvc_details)
+            if mapping:
+                out_file = os.path.join(
+                    self.docker_logs_path, "fio_mapping_summary.json"
+                )
+                with open(out_file, "w") as fh:
+                    _json.dump(mapping, fh, indent=2, default=str)
+                self.logger.info(
+                    f"[save_fio] Wrote FIO mapping summary to {out_file}"
+                )
+        except Exception as err:
+            msg = f"[save_fio] Could not write mapping summary: {err}"
+            self.logger.warning(msg)
 
     # ── Cleanup ──────────────────────────────────────────────────────────────
 
@@ -1506,14 +2005,31 @@ def _phase_cleanup(self):
 
     # ── Batch parallel helper ────────────────────────────────────────────────
 
-    def _batch_exec_k8s(self, items, task_fn, op_name: str):
-        """Execute task_fn(item) for each item using ThreadPoolExecutor."""
+    def _batch_exec_k8s(self, items, task_fn, op_name: str,
+                        per_item_timeout: int = 600,
+                        max_workers: int = None,
+                        max_failures: int = 10):
+        """Execute task_fn(item) for each item using ThreadPoolExecutor.
+
+        Stops submitting new batches once failures >= max_failures.
+        Returns (success_count, failure_count).
+        """
         total = len(items)
         success = 0
         failures = 0
+        workers = max_workers or self.MAX_WORKERS
+        stopped_early = False
 
-        with ThreadPoolExecutor(max_workers=self.MAX_WORKERS) as executor:
+        with ThreadPoolExecutor(max_workers=workers) as executor:
             for batch_start in range(0, total, self.BATCH_SIZE):
+                if failures >= max_failures:
+                    stopped_early = True
+                    self.logger.error(
+                        f"[{op_name}] Stopping: {failures} failures "
+                        f"reached max_failures={max_failures}"
+                    )
+                    break
+
                 batch = items[batch_start:batch_start + self.BATCH_SIZE]
                 futures = {}
                 for item in batch:
@@ -1522,11 +2038,14 @@ def _batch_exec_k8s(self, items, task_fn, op_name: str):
 
                 for f in as_completed(futures):
                     try:
-                        f.result(timeout=600)
+                        f.result(timeout=per_item_timeout)
                         success += 1
                     except Exception as exc:
                         failures += 1
-                        self.logger.error(f"[{op_name}] Failed: {exc}")
+                        self.logger.error(
+                            f"[{op_name}] Failed ({failures}/"
+                            f"{max_failures} max): {exc}"
+                        )
 
                 done = batch_start + len(batch)
                 self.logger.info(
@@ -1534,4 +2053,10 @@ def _batch_exec_k8s(self, items, task_fn, op_name: str):
                     f"(ok={success} fail={failures})"
                 )
 
+        if stopped_early:
+            self.logger.info(
+                f"[{op_name}] Stopped early: {success} succeeded, "
+                f"{failures} failed, "
+                f"{total - success - failures} skipped"
+            )
         return success, failures
diff --git a/e2e/utils/k8s_utils.py b/e2e/utils/k8s_utils.py
index 19b228d18..896fba523 100755
--- a/e2e/utils/k8s_utils.py
+++ b/e2e/utils/k8s_utils.py
@@ -810,6 +810,8 @@ def log_fio_pvc_mapping(self, pvc_details: dict, clone_details: dict = None,
                     except Exception:
                         pass
 
+                fs_type = info.get("fs_type", "N/A") or "N/A"
+
                 all_entries.append({
                     "type": label,
                     "name": name or "N/A",
@@ -817,6 +819,7 @@ def log_fio_pvc_mapping(self, pvc_details: dict, clone_details: dict = None,
                     "lvol_id": vol_handle or "N/A",
                     "storage_node": storage_node,
                     "storage_class": sc,
+                    "fs_type": fs_type,
                     "snap_name": snap,
                     "parent_pvc": parent_pvc,
                     "fio_k8s_node": fio_node,
@@ -825,22 +828,22 @@ def log_fio_pvc_mapping(self, pvc_details: dict, clone_details: dict = None,
         if not all_entries:
             return
 
-        self.logger.info("=" * 180)
+        self.logger.info("=" * 190)
         self.logger.info("FIO Job → PVC/Clone → Lvol → Worker Mapping")
-        self.logger.info("-" * 180)
+        self.logger.info("-" * 190)
         self.logger.info(
             f"{'FIO Job':<30} {'PVC/Clone':<25} {'Lvol ID':<40} "
             f"{'Storage Node':<40} {'FIO K8s Node':<20} {'SC':<28} "
-            f"{'Snapshot':<20} {'Parent PVC':<25} {'Type':<6}"
+            f"{'FS':<6} {'Snapshot':<20} {'Parent PVC':<25} {'Type':<6}"
         )
-        self.logger.info("-" * 180)
+        self.logger.info("-" * 190)
         for e in all_entries:
             self.logger.info(
                 f"{e['job']:<30} {e['name']:<25} {e['lvol_id']:<40} "
                 f"{e['storage_node']:<40} {e['fio_k8s_node']:<20} {e['storage_class']:<28} "
-                f"{e['snap_name']:<20} {e['parent_pvc']:<25} {e['type']:<6}"
+                f"{e['fs_type']:<6} {e['snap_name']:<20} {e['parent_pvc']:<25} {e['type']:<6}"
             )
-        self.logger.info("=" * 180)
+        self.logger.info("=" * 190)
         return all_entries
 
     # ── VolumeSnapshot operations ────────────────────────────────────────────
diff --git a/e2e/utils/sbcli_utils.py b/e2e/utils/sbcli_utils.py
index 32993378b..cbdbcd02f 100755
--- a/e2e/utils/sbcli_utils.py
+++ b/e2e/utils/sbcli_utils.py
@@ -480,8 +480,8 @@ def add_lvol(self, lvol_name, pool_name, size="256M", distr_ndcs=0, distr_npcs=0
             body["max_namespace_per_subsys"] = int(max_namespace_per_subsys)
 
         if namespace:
-            # parent lvol id
-            body["namespace"] = namespace
+            # flag for auto-grouping into existing parent subsystem
+            body["namespaced"] = True
         
         self.post_request(api_url="/lvol", body=body, retry=retry)
 
@@ -722,25 +722,50 @@ def all_expected_status(self, value_dict, expected_status):
         self.logger.info(f"Value: {value_match}")
         return all(value_match)
     
-    def wait_for_device_status(self, node_id, status, timeout=60):
+    def wait_for_device_status(self, node_id, status, timeout=60, device_id=None):
+        """Wait for device(s) to reach the expected status or raise TimeoutError.
+
+        Args:
+            node_id: Storage node UUID.
+            status: Expected status string or list of status strings.
+            timeout: Max seconds to wait.
+            device_id: If provided, only check this specific device.
+                       If None, check ALL devices on the node (legacy behaviour).
+        """
+        status = status if isinstance(status, list) else [status]
         device_ids = {}
         device_details = self.get_device_details(storage_node_id=node_id)
         total_devices = len(device_details)
         while timeout > 0:
             self.logger.info("Retrying Device Status check")
             device_details = self.get_device_details(storage_node_id=node_id)
-            for device in device_details:
-                device_ids[device['id']] = device['status']
-                status = status if isinstance(status, list) else [status]
+
+            if device_id:
+                # Single-device mode: only check the specified device
+                for device in device_details:
+                    if device['id'] == device_id:
+                        actual = device_ids[device_id] = device['status']  # kept for the timeout message
+                        self.logger.info(f"Device ID: {device_id} Expected Status: {status} / Actual Status: {actual}")
+                        if actual in status:
+                            return device_details
+                        break
+                else:
+                    self.logger.warning(f"Device {device_id} not found on node {node_id}")
+            else:
+                # All-devices mode (legacy): require every device to match
+                device_ids = {}
+                for device in device_details:
+                    device_ids[device['id']] = device['status']
                 self.logger.info(f"Device statuses: {device_ids}")
-                if device['status'] in status:
-                    if len(device_ids) == total_devices and self.all_expected_status(device_ids, status):
-                        return device_details
-                self.logger.info(f"Device ID: {device['id']} Expected Status: {status} / Actual Status: {device['status']}")
+                if len(device_ids) == total_devices and self.all_expected_status(device_ids, status):
+                    return device_details
+                for did, dstatus in device_ids.items():
+                    self.logger.info(f"Device ID: {did} Expected Status: {status} / Actual Status: {dstatus}")
+
             sleep_n_sec(1)
             timeout -= 1
-        raise TimeoutError(f"Timed out waiting for device status, Node id: {node_id}, Device id: {list(device_ids.keys())}"
-                            f"Expected status: {status}, Actual status: {list(device_ids.values())}")
+        raise TimeoutError(f"Timed out waiting for device status, Node id: {node_id}, Device id: {device_id or list(device_ids.keys())}, "
+                            f"Expected status: {status}, Actual status: {list(device_ids.values())}")
     
     def wait_for_health_status(self, node_id, status, timeout=60, device_id=None):
         actual_status = None
@@ -782,10 +807,10 @@ def wait_for_health_status(self, node_id, status, timeout=60, device_id=None):
 
     def list_migration_tasks(self, cluster_id):
         """List all migration tasks for a given cluster."""
-        return self.get_request(f"/cluster/list-tasks/{cluster_id}?limit=0")
+        return self.get_request(f"/cluster/get-tasks/{cluster_id}?limit=0")
 
     def wait_migration_tasks_complete(self, timeout=3600):
-        """Wait until all FN_FAILED_DEV_MIG tasks finish.
+        """Wait until all failed_device_migration tasks finish.
 
         Polls ``list_migration_tasks`` every 10 seconds until no active
         failure-migration tasks remain or *timeout* seconds elapse.
@@ -803,10 +828,15 @@ def wait_migration_tasks_complete(self, timeout=3600):
         start = _time.time()
         active = []
         while _time.time() - start < timeout:
-            tasks = self.list_migration_tasks(self.cluster_id)
+            try:
+                tasks = self.list_migration_tasks(self.cluster_id)
+            except Exception as exc:
+                self.logger.warning(f"list_migration_tasks API failed: {exc}")
+                sleep_n_sec(10)
+                continue
             active = [
                 t for t in tasks.get("results", [])
-                if t.get("function_name") == "FN_FAILED_DEV_MIG"
+                if t.get("function_name") == "failed_device_migration"
                 and t.get("status") not in ("done", "cancelled", "error")
             ]
             if not active:
diff --git a/e2e/utils/ssh_utils.py b/e2e/utils/ssh_utils.py
index 627ac6a61..276eee0b6 100755
--- a/e2e/utils/ssh_utils.py
+++ b/e2e/utils/ssh_utils.py
@@ -2939,6 +2939,43 @@ def stop_all_tshark(self, node_ip):
         self.exec_command(node_ip, stop_command)
         self.logger.info(f"Stopped all tshark processes on {node_ip}")
 
+    def start_full_pcap_capture(self, node_ip, log_dir, interface="any",
+                                max_size_mb=500, max_files=3):
+        """Start full packet capture in pcap format with file rotation.
+
+        Captures all packets on the given interface, replacing any capture
+        session left over from an earlier call.  Files rotate at *max_size_mb*
+        MB, keeping at most *max_files* files (max_size_mb * max_files per node).
+
+        Args:
+            node_ip: Target node IP.
+            log_dir: Directory to write pcap files into.
+            interface: Network interface (default ``any``).
+            max_size_mb: Rotate file after this many MB.
+            max_files: Maximum number of rotated files to keep.
+        """
+        self.check_and_install_tcpdump(node_ip)
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        pcap_file = f"{log_dir}/full_capture_{node_ip}_{timestamp}.pcap"
+        # Kill a stale session first (tmux rejects duplicate names); "-Z root"
+        # keeps tcpdump from dropping privileges before opening the savefiles.
+        cmd = (
+            "sudo tmux kill-session -t full_pcap_session 2>/dev/null; "
+            f"sudo tmux new-session -d -s full_pcap_session \"tcpdump -Z root "
+            f"-i {interface} -w {pcap_file} -C {max_size_mb} -W {max_files} 2>&1\""
+        )
+        self.exec_command(node_ip, cmd)
+        self.logger.info(f"Started full pcap capture on {node_ip} -> {pcap_file} "
+                         f"(rotate={max_size_mb}MB x{max_files})")
+
+    def stop_full_pcap_capture(self, node_ip):
+        """Tear down the tmux session running the full pcap capture on a node."""
+        # Redirecting stderr and appending "|| true" keeps the command
+        # successful when no capture session exists on the node.
+        kill_cmd = "sudo tmux kill-session -t full_pcap_session 2>/dev/null || true"
+        self.exec_command(node_ip, kill_cmd)
+        self.logger.info(f"Stopped full pcap capture on {node_ip}")
+
     def get_dmesg_logs_within_iso_window(self, node_ip, start_iso, end_iso):
         """
         Fetch dmesg logs with ISO timestamps on a remote node within a time window.