#!/bin/bash
# Captures GC pause statistics during a reindex run and compares them to a saved baseline.
#
# Usage:
#   ./gc-reindex-report.sh --token TOKEN [--server URL] [--gclog PATH] [--save-baseline]
#
# What it does:
#   1. Reads the server's GC log (or polls /api/v1/system/metrics for JVM stats if no GC log)
#   2. Triggers a full reindex
#   3. Polls until the reindex finishes
#   4. Summarises: total STW pause time, max pause, p99 pause, probe failures, overall duration
#   5. Optionally saves the result as a baseline for future comparison
#
# GC log note:
#   Start the server with: -Xlog:gc*:file=/tmp/om-gc.log:time,uptime,tags:filecount=5,filesize=20m
#   Then pass --gclog /tmp/om-gc.log to this script.
#
# Without --gclog the script uses the Prometheus /metrics endpoint to estimate GC pressure.
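#
# Example:
#   ./gc-reindex-report.sh --token "$OM_JWT" --gclog /tmp/om-gc.log --save-baseline
#   (OM_JWT is a placeholder for wherever you keep the bot token)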

set -euo pipefail

SERVER_URL="http://localhost:8585"
TOKEN=""
GC_LOG=""
SAVE_BASELINE=false
BASELINE_FILE="$(dirname "$0")/../gc-baseline.json"
POLL_INTERVAL=10   # seconds between status polls

while [[ $# -gt 0 ]]; do
  case $1 in
    --server) SERVER_URL="$2"; shift 2 ;;
    --token) TOKEN="$2"; shift 2 ;;
    --gclog) GC_LOG="$2"; shift 2 ;;
    --save-baseline) SAVE_BASELINE=true; shift ;;
    --baseline) BASELINE_FILE="$2"; shift 2 ;;
    -h|--help)
      # Print the header comment block: line 2 up to the first blank line,
      # so the range stays correct if the header grows
      sed -n '2,/^$/p' "$0" | sed 's/^# //;s/^#//'
      exit 0
      ;;
    *) echo "Unknown arg: $1"; exit 1 ;;
  esac
done

if [[ -z "$TOKEN" ]]; then
  echo "ERROR: --token is required"; exit 1
fi

AUTH_HEADER="Authorization: Bearer $TOKEN"
TIMESTAMP=$(date +%Y%m%d-%H%M%S)

# ── helpers ───────────────────────────────────────────────────────────────────

require_deps() {
  # curl and python3 are what this script actually invokes
  for cmd in curl python3; do
    if ! command -v "$cmd" &>/dev/null; then
      echo "ERROR: $cmd is required but not found on PATH"; exit 1
    fi
  done
}

curl_json() {
  curl -sf -H "$AUTH_HEADER" -H "Content-Type: application/json" "$@"
}
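# Example use: curl_json "${SERVER_URL}/api/v1/system/version"
# (illustrative endpoint — any authenticated JSON request works through it)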

# ── capture GC snapshot via /metrics ─────────────────────────────────────────

gc_metrics_snapshot() {
  local label="$1"
  local result
  # Try the metrics endpoint first (if enabled)
  if result=$(curl -sf -H "$AUTH_HEADER" "${SERVER_URL}/api/v1/system/metrics" 2>/dev/null); then
    local gc_count gc_time
    # Single awk per metric: a grep|awk pipeline would trip `set -o pipefail`
    # when no lines match; this sums across all GC cause/action label sets
    gc_count=$(echo "$result" | awk '/^jvm_gc_pause_seconds_count/ {s+=$NF} END{print int(s)}')
    gc_time=$(echo "$result" | awk '/^jvm_gc_pause_seconds_sum/ {s+=$NF} END{printf "%.3f", s}')
    echo "${label}:gc_count=${gc_count},gc_time_s=${gc_time}"
  else
    echo "${label}:gc_count=N/A,gc_time_s=N/A"
  fi
}
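# Illustrative /metrics lines the snapshot sums over (Micrometer-style
# Prometheus output assumed; actual label sets vary by deployment):
#   jvm_gc_pause_seconds_count{action="end of minor GC",cause="G1 Evacuation Pause"} 42.0
#   jvm_gc_pause_seconds_sum{action="end of minor GC",cause="G1 Evacuation Pause"} 1.234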

# ── parse GC log (G1GC unified logging format) ────────────────────────────────

parse_gc_log() {
  local logfile="$1"
  if [[ ! -f "$logfile" ]]; then
    echo "GC_LOG_NOT_FOUND"; return
  fi

  python3 - "$logfile" <<'PYEOF'
import re, sys, statistics

log_file = sys.argv[1]
pauses = []

# Match G1GC pause lines: [0.123s][info][gc] GC(0) Pause ... 12.345ms
pause_re = re.compile(r'Pause (?:Young|Full|Initial Mark|Remark|Cleanup)[^0-9]*(\d+\.\d+)ms', re.IGNORECASE)
# Fallback for any other pause line: GC(N) Pause ... N.NNNms
generic_re = re.compile(r'\bPause\b.*?(\d+\.\d+)ms')

with open(log_file) as f:
    for line in f:
        m = pause_re.search(line) or generic_re.search(line)
        if m:
            pauses.append(float(m.group(1)))

if not pauses:
    print("no_pauses_found")
    sys.exit(0)

pauses.sort()
# Nearest-rank p99; approximate for small sample counts
p99_idx = max(0, int(len(pauses) * 0.99) - 1)
print(f"count={len(pauses)}")
print(f"total_ms={sum(pauses):.1f}")
print(f"max_ms={max(pauses):.1f}")
print(f"p99_ms={pauses[p99_idx]:.1f}")
print(f"mean_ms={statistics.mean(pauses):.1f}")
PYEOF
}
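# Illustrative unified-logging lines the regexes above are aimed at (exact
# wording varies by JDK version and collector):
#   [12.345s][info][gc] GC(7) Pause Young (Normal) (G1 Evacuation Pause) 123M->45M(512M) 12.345ms
#   [98.765s][info][gc] GC(9) Pause Full (G1 Compaction Pause) 400M->200M(512M) 456.789ms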

# ── probe health once ─────────────────────────────────────────────────────────

probe_health() {
  local http_code
  # No --fail here: we want the real HTTP code for 4xx/5xx responses; the
  # fallback assignment only covers connection errors and timeouts
  http_code=$(curl -o /dev/null -s -w "%{http_code}" \
    --max-time 1 \
    -H "$AUTH_HEADER" \
    "${SERVER_URL}/api/v1/system/health" 2>/dev/null) || http_code="000"
  echo "$http_code"
}
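# The 1s cap mirrors a typical k8s liveness probe (timeoutSeconds: 1, values
# assumed — check your chart): a response slower than that is what a kubelet
# would score as a failed probe.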

# ── trigger reindex ───────────────────────────────────────────────────────────

trigger_reindex() {
  local payload='{"recreateIndex":false}'
  curl_json -X POST \
    "${SERVER_URL}/api/v1/apps/trigger/SearchIndexingApplication" \
    -d "$payload" > /dev/null 2>&1 || true
  echo "Reindex triggered"
}
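# NOTE: recreateIndex=false reindexes into the existing indices; true would
# drop and rebuild them first (a heavier run — semantics inferred from the
# field name, verify against your OpenMetadata version).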

# ── poll reindex status ───────────────────────────────────────────────────────

STATS_FILE="/tmp/om-reindex-stats-$$.txt"

poll_until_done() {
  local start_ts
  start_ts=$(date +%s)
  local probe_failures=0
  local probe_total=0
  local max_latency_ms=0
  local probes_over_1000=0
  local consecutive_slow=0
  local max_consecutive_slow=0
  local last_probe_ts=0

  echo ""
  echo "Polling reindex status every ${POLL_INTERVAL}s ..."
  echo " (probing /system/health every 10s — k8s periodSeconds=10, failureThreshold=3)"
  echo ""

  while true; do
    sleep "$POLL_INTERVAL"
    local now
    now=$(date +%s)

    # Probe health every 10s (k8s default periodSeconds)
    if (( now - last_probe_ts >= 10 )); then
      local t0 code latency
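      # Millisecond timestamps via python3: interpreter startup (tens of ms)
      # is included in the measured latency, which is acceptable noise when
      # the threshold of interest is 1000ms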
      t0=$(python3 -c "import time; print(int(time.time()*1000))")
      code=$(probe_health)
      latency=$(( $(python3 -c "import time; print(int(time.time()*1000))") - t0 ))
      probe_total=$(( probe_total + 1 ))
      last_probe_ts=$now

      if (( latency > max_latency_ms )); then max_latency_ms=$latency; fi
      if (( latency > 1000 )); then
        probes_over_1000=$(( probes_over_1000 + 1 ))
        consecutive_slow=$(( consecutive_slow + 1 ))
        if (( consecutive_slow > max_consecutive_slow )); then
          max_consecutive_slow=$consecutive_slow
        fi
      else
        consecutive_slow=0
      fi

      if [[ "$code" != "200" ]]; then
        probe_failures=$(( probe_failures + 1 ))
        echo " [$(date +%H:%M:%S)] PROBE FAIL code=$code latency=${latency}ms consecutive=${consecutive_slow}" >&2
      elif (( latency > 1000 )); then
        echo " [$(date +%H:%M:%S)] SLOW PROBE latency=${latency}ms *** >1000ms (k8s timeout risk) consecutive=${consecutive_slow}" >&2
      else
        echo " [$(date +%H:%M:%S)] probe ok latency=${latency}ms" >&2
      fi

      if (( max_consecutive_slow >= 3 )); then
        echo " *** WARNING: ${max_consecutive_slow} consecutive slow probes — k8s WOULD RESTART POD ***" >&2
      fi
    fi

    # Check app status
    local status_json
    status_json=$(curl_json "${SERVER_URL}/api/v1/apps/name/SearchIndexingApplication/status?offset=0&limit=1" 2>/dev/null || echo '{}')
    local app_status
    app_status=$(echo "$status_json" | python3 -c "
import json, sys
try:
    d = json.load(sys.stdin)
    runs = d.get('data', [])
    print(runs[0].get('status', 'unknown') if runs else 'pending')
except Exception:
    print('unknown')
" 2>/dev/null || echo "unknown")

    if [[ "$app_status" == "success" || "$app_status" == "failed" ]]; then
      local elapsed=$(( now - start_ts ))
      echo "" >&2
      echo " Reindex finished: status=$app_status elapsed=${elapsed}s" >&2
      # Write structured results to the stats file (stdout/stderr carry live logs)
      {
        echo "PROBE_FAILURES=$probe_failures"
        echo "PROBE_TOTAL=$probe_total"
        echo "MAX_LATENCY_MS=$max_latency_ms"
        echo "PROBES_OVER_1000=$probes_over_1000"
        echo "MAX_CONSECUTIVE_SLOW=$max_consecutive_slow"
        echo "ELAPSED_S=$elapsed"
      } > "$STATS_FILE"
      return
    fi
  done
}

# ── main ──────────────────────────────────────────────────────────────────────

require_deps

echo "═══════════════════════════════════════════════════════════════"
echo " GC / Reindex Report — $(date)"
echo " Server: $SERVER_URL"
echo "═══════════════════════════════════════════════════════════════"

# Snapshot before
METRICS_BEFORE=$(gc_metrics_snapshot "before")
GC_LOG_LINES_BEFORE=0
if [[ -n "$GC_LOG" && -f "$GC_LOG" ]]; then
  GC_LOG_LINES_BEFORE=$(wc -l < "$GC_LOG")
  echo "GC log: $GC_LOG (current lines: $GC_LOG_LINES_BEFORE)"
fi

trigger_reindex

# Poll — probe lines print live to stderr, structured results written to STATS_FILE
poll_until_done

PROBE_FAILURES=$(grep '^PROBE_FAILURES=' "$STATS_FILE" | cut -d= -f2)
PROBE_TOTAL=$(grep '^PROBE_TOTAL=' "$STATS_FILE" | cut -d= -f2)
MAX_LATENCY=$(grep '^MAX_LATENCY_MS=' "$STATS_FILE" | cut -d= -f2)
PROBES_OVER_1000=$(grep '^PROBES_OVER_1000=' "$STATS_FILE" | cut -d= -f2)
MAX_CONSECUTIVE=$(grep '^MAX_CONSECUTIVE_SLOW=' "$STATS_FILE" | cut -d= -f2)
ELAPSED_S=$(grep '^ELAPSED_S=' "$STATS_FILE" | cut -d= -f2)
rm -f "$STATS_FILE"

# Snapshot after
METRICS_AFTER=$(gc_metrics_snapshot "after")

# GC log delta
GC_LOG_STATS=""
if [[ -n "$GC_LOG" && -f "$GC_LOG" ]]; then
  # Extract only the lines appended since the reindex started
  # (tail -n +N starts AT line N, hence the +1; log rotation mid-run
  # would make this window incomplete)
  TMPLOG="/tmp/om-gc-run-${TIMESTAMP}.log"
  tail -n "+$((GC_LOG_LINES_BEFORE + 1))" "$GC_LOG" > "$TMPLOG"
  GC_LOG_STATS=$(parse_gc_log "$TMPLOG")
  rm -f "$TMPLOG"
fi

# Parse metrics delta (snapshot format: "label:gc_count=X,gc_time_s=Y")
GC_COUNT_BEFORE=$(echo "$METRICS_BEFORE" | sed 's/.*gc_count=\([^,]*\).*/\1/')
GC_TIME_BEFORE=$(echo "$METRICS_BEFORE" | sed 's/.*gc_time_s=\([^,]*\).*/\1/')
GC_COUNT_AFTER=$(echo "$METRICS_AFTER" | sed 's/.*gc_count=\([^,]*\).*/\1/')
GC_TIME_AFTER=$(echo "$METRICS_AFTER" | sed 's/.*gc_time_s=\([^,]*\).*/\1/')

# Compute deltas if numeric
GC_COUNT_DELTA="N/A"
GC_TIME_DELTA="N/A"
if [[ "$GC_COUNT_BEFORE" =~ ^[0-9]+$ && "$GC_COUNT_AFTER" =~ ^[0-9]+$ ]]; then
  GC_COUNT_DELTA=$(( GC_COUNT_AFTER - GC_COUNT_BEFORE ))
fi
if [[ "$GC_TIME_BEFORE" =~ ^[0-9.]+$ && "$GC_TIME_AFTER" =~ ^[0-9.]+$ ]]; then
  # Both values are validated numeric literals, so substitute them directly
  # rather than nesting quotes inside the f-string
  GC_TIME_DELTA=$(python3 -c "print(f'{${GC_TIME_AFTER} - ${GC_TIME_BEFORE}:.3f}')")
fi

echo ""
echo "══════════════════ RESULTS ════════════════════════════════════"
echo ""
printf " %-30s %s\n" "Reindex duration" "${ELAPSED_S}s"
printf " %-30s %s\n" "Health probe total" "${PROBE_TOTAL}"
printf " %-30s %s\n" "Probe failures (non-200)" "${PROBE_FAILURES:-0} $([ "${PROBE_FAILURES:-0}" -eq 0 ] && echo '✓ none' || echo '⚠ k8s would have restarted pod')"
printf " %-30s %s\n" "Probes >1000ms" "${PROBES_OVER_1000:-0} $([ "${PROBES_OVER_1000:-0}" -eq 0 ] && echo '✓ none' || echo '⚠ GC pressure detected')"
printf " %-30s %s\n" "Max consecutive >1000ms" "${MAX_CONSECUTIVE:-0} $([ "${MAX_CONSECUTIVE:-0}" -lt 3 ] && echo '✓ ok' || echo '*** k8s WOULD RESTART POD ***')"
printf " %-30s %s\n" "Max probe latency" "${MAX_LATENCY:-0}ms $([ "${MAX_LATENCY:-0}" -lt 1000 ] && echo '✓ ok' || echo '⚠ high')"
echo ""
printf " %-30s %s\n" "GC collections (delta)" "$GC_COUNT_DELTA"
printf " %-30s %s\n" "GC pause time (delta)" "${GC_TIME_DELTA}s"
echo ""

if [[ -n "$GC_LOG_STATS" && "$GC_LOG_STATS" != "no_pauses_found" && "$GC_LOG_STATS" != "GC_LOG_NOT_FOUND" ]]; then
  echo " GC log pause breakdown:"
  echo "$GC_LOG_STATS" | sed 's/^/   /'
  echo ""
fi

# Build JSON result. Values are passed through the environment instead of
# being interpolated into the Python source: GC_LOG_STATS is multi-line,
# which would break a quoted Python literal. Deltas stay strings because
# they may be "N/A".
RESULT_JSON=$(
  REPORT_TS="$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
  REPORT_SERVER="$SERVER_URL" \
  REPORT_ELAPSED_S="${ELAPSED_S:-0}" \
  REPORT_PROBE_TOTAL="${PROBE_TOTAL:-0}" \
  REPORT_PROBE_FAILURES="${PROBE_FAILURES:-0}" \
  REPORT_PROBES_OVER_1000="${PROBES_OVER_1000:-0}" \
  REPORT_MAX_CONSECUTIVE="${MAX_CONSECUTIVE:-0}" \
  REPORT_MAX_LATENCY="${MAX_LATENCY:-0}" \
  REPORT_GC_COUNT_DELTA="$GC_COUNT_DELTA" \
  REPORT_GC_TIME_DELTA="$GC_TIME_DELTA" \
  REPORT_GC_LOG_STATS="${GC_LOG_STATS:-none}" \
  python3 - <<'PYEOF'
import json, os
result = {
    'timestamp': os.environ['REPORT_TS'],
    'server': os.environ['REPORT_SERVER'],
    'elapsed_s': int(os.environ['REPORT_ELAPSED_S']),
    'probe_total': int(os.environ['REPORT_PROBE_TOTAL']),
    'probe_failures': int(os.environ['REPORT_PROBE_FAILURES']),
    'probes_over_1000ms': int(os.environ['REPORT_PROBES_OVER_1000']),
    'max_consecutive_slow': int(os.environ['REPORT_MAX_CONSECUTIVE']),
    'max_probe_latency_ms': int(os.environ['REPORT_MAX_LATENCY']),
    'gc_collections_delta': os.environ['REPORT_GC_COUNT_DELTA'],
    'gc_pause_time_delta_s': os.environ['REPORT_GC_TIME_DELTA'],
    'gc_log_stats': os.environ['REPORT_GC_LOG_STATS'],
}
print(json.dumps(result, indent=2))
PYEOF
)

# Compare to baseline
if [[ -f "$BASELINE_FILE" ]]; then
  echo " Comparison to baseline ($BASELINE_FILE):"
  # Pass the current result through the environment and parse it as JSON
  # rather than splicing it into the Python source
  RESULT_JSON="$RESULT_JSON" python3 - "$BASELINE_FILE" <<'PYEOF'
import json, os, sys

with open(sys.argv[1]) as f:
    baseline = json.load(f)

current = json.loads(os.environ['RESULT_JSON'])

def pct(new, old):
    if old == 0: return '+∞%'
    delta = new - old
    return f'{delta/old*100:+.1f}%'

print(f" {'Metric':<30} {'Baseline':>12} {'Current':>12} {'Change':>10}")
print(f" {'-'*30} {'-'*12} {'-'*12} {'-'*10}")
for k in ['elapsed_s','probe_failures','probes_over_1000ms','max_consecutive_slow','max_probe_latency_ms']:
    b = baseline.get(k, 'N/A')
    c = current.get(k, 'N/A')
    change = pct(c, b) if isinstance(b, (int,float)) and isinstance(c, (int,float)) else 'N/A'
    print(f" {k:<30} {str(b):>12} {str(c):>12} {change:>10}")
PYEOF
  echo ""
fi

# Save result
OUT_FILE="gc-report-${TIMESTAMP}.json"
echo "$RESULT_JSON" > "$OUT_FILE"
echo " Report saved to: $OUT_FILE"

if [[ "$SAVE_BASELINE" == "true" ]]; then
  cp "$OUT_FILE" "$BASELINE_FILE"
  echo " Saved as baseline: $BASELINE_FILE"
fi

echo ""
echo "══════════════════════════════════════════════════════════════"
echo ""
echo " Tip: Run once with --save-baseline to record a baseline, then"
echo "      run again after code changes to see the delta."
echo ""