
Commit f69d753

Set Indexing related executor threads priority to LOW (#27153)
* Improve memory usage for reindex process and lower priority for
* review comments
* Fix Long Jobs not marked
* Remove not required changes
1 parent 31ce812 commit f69d753

5 files changed: 472 additions & 36 deletions

File shown below: 370 additions & 0 deletions
#!/bin/bash
# Captures GC pause statistics during a reindex run and compares them to a saved baseline.
#
# Usage:
#   ./gc-reindex-report.sh --token TOKEN [--server URL] [--gclog PATH] [--save-baseline]
#
# What it does:
#   1. Reads the server's GC log (or polls /api/v1/system/metrics for JVM stats if no GC log)
#   2. Triggers a full reindex
#   3. Polls until the reindex finishes
#   4. Summarises: total STW pause time, max pause, p99 pause, probe failures, throughput
#   5. Optionally saves the result as a baseline for future comparison
#
# GC log note:
#   Start the server with: -Xlog:gc*:file=/tmp/om-gc.log:time,uptime,tags:filecount=5,filesize=20m
#   Then pass --gclog /tmp/om-gc.log to this script.
#
# Without --gclog the script uses the Prometheus /metrics endpoint to estimate GC pressure.

set -euo pipefail

SERVER_URL="http://localhost:8585"
TOKEN=""
GC_LOG=""
SAVE_BASELINE=false
BASELINE_FILE="$(dirname "$0")/../gc-baseline.json"
POLL_INTERVAL=10 # seconds between status polls

while [[ $# -gt 0 ]]; do
  case $1 in
    --server) SERVER_URL="$2"; shift 2 ;;
    --token) TOKEN="$2"; shift 2 ;;
    --gclog) GC_LOG="$2"; shift 2 ;;
    --save-baseline) SAVE_BASELINE=true; shift ;;
    --baseline) BASELINE_FILE="$2"; shift 2 ;;
    -h|--help)
      # Print the header comment block (script lines 2-18) as help text
      sed -n '2,18p' "$0" | sed 's/^# //;s/^#//'
      exit 0
      ;;
    *) echo "Unknown arg: $1"; exit 1 ;;
  esac
done

if [[ -z "$TOKEN" ]]; then
  echo "ERROR: --token is required"; exit 1
fi

AUTH_HEADER="Authorization: Bearer $TOKEN"
TIMESTAMP=$(date +%Y%m%d-%H%M%S)

# ── helpers ───────────────────────────────────────────────────────────────────

jq_required() {
  if ! command -v jq &>/dev/null; then
    echo "ERROR: jq is required. Install with: brew install jq"; exit 1
  fi
}

curl_json() {
  curl -sf -H "$AUTH_HEADER" -H "Content-Type: application/json" "$@"
}

# ── capture GC snapshot via /metrics ─────────────────────────────────────────

gc_metrics_snapshot() {
  local label="$1"
  local result
  # Try Prometheus endpoint first (if enabled)
  if result=$(curl -sf -H "$AUTH_HEADER" "${SERVER_URL}/api/v1/system/metrics" 2>/dev/null); then
    local gc_count gc_time
    gc_count=$(echo "$result" | grep '^jvm_gc_pause_seconds_count' | awk '{s+=$NF} END{print int(s)}' || echo 0)
    gc_time=$(echo "$result" | grep '^jvm_gc_pause_seconds_sum' | awk '{s+=$NF} END{printf "%.3f", s}' || echo "0.000")
    echo "${label}:gc_count=${gc_count},gc_time_s=${gc_time}"
  else
    echo "${label}:gc_count=N/A,gc_time_s=N/A"
  fi
}
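
# Example of the /metrics lines the greps above are expected to match (standard Micrometer
# JVM GC series; the exact names and labels are an assumption, so adjust them if your build
# exposes different ones):
#   jvm_gc_pause_seconds_count{action="end of minor GC",cause="G1 Evacuation Pause"} 42.0
#   jvm_gc_pause_seconds_sum{action="end of minor GC",cause="G1 Evacuation Pause"} 1.234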

# ── parse GC log (G1GC format) ────────────────────────────────────────────────

parse_gc_log() {
  local logfile="$1"
  if [[ ! -f "$logfile" ]]; then
    echo "GC_LOG_NOT_FOUND"; return
  fi

  python3 - "$logfile" <<'PYEOF'
import re, sys, statistics

log_file = sys.argv[1]
pauses = []

# Match G1GC pause lines: [0.123s][info][gc] GC(0) Pause ... 12.345ms
pause_re = re.compile(r'Pause (?:Young|Full|Initial Mark|Remark|Cleanup)[^0-9]*(\d+\.\d+)ms', re.IGNORECASE)
# Also match: GC(N) Pause ... Nms
generic_re = re.compile(r'\bPause\b.*?(\d+\.\d+)ms')

with open(log_file) as f:
    for line in f:
        m = pause_re.search(line) or generic_re.search(line)
        if m:
            pauses.append(float(m.group(1)))

if not pauses:
    print("no_pauses_found")
    sys.exit(0)

pauses.sort()
p99_idx = max(0, int(len(pauses) * 0.99) - 1)
print(f"count={len(pauses)}")
print(f"total_ms={sum(pauses):.1f}")
print(f"max_ms={max(pauses):.1f}")
print(f"p99_ms={pauses[p99_idx]:.1f}")
print(f"mean_ms={statistics.mean(pauses):.1f}")
PYEOF
}
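
# For reference, a unified-logging line the regexes above are meant to catch. The shape depends
# on the -Xlog decorations used; this example is illustrative, not taken from a real run:
#   [12.345s][info][gc] GC(42) Pause Young (Normal) (G1 Evacuation Pause) 512M->128M(2048M) 7.123ms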

# ── probe health once ─────────────────────────────────────────────────────────

probe_health() {
  local http_code
  # No --fail here: an HTTP error status is a valid result; "000" means no response at all
  http_code=$(curl -o /dev/null -s -w "%{http_code}" \
      --max-time 1 \
      -H "$AUTH_HEADER" \
      "${SERVER_URL}/api/v1/system/health" 2>/dev/null) || http_code="000"
  echo "$http_code"
}
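
# The thresholds used below mirror a typical Kubernetes liveness probe. This shape is an
# assumption, not read from any chart; adjust it to whatever your deployment actually uses:
#   livenessProbe:
#     httpGet: { path: /api/v1/system/health, port: 8585 }
#     periodSeconds: 10     # mirrored by the 10s probe cadence in poll_until_done
#     timeoutSeconds: 1     # mirrored by --max-time 1 in probe_health
#     failureThreshold: 3   # mirrored by the "3 consecutive slow probes" warning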

# ── trigger reindex ───────────────────────────────────────────────────────────

trigger_reindex() {
  local payload='{"recreateIndex":false}'
  curl_json -X POST \
      "${SERVER_URL}/api/v1/apps/trigger/SearchIndexingApplication" \
      -d "$payload" > /dev/null 2>&1 || true
  echo "Reindex triggered"
}

# ── poll reindex status ───────────────────────────────────────────────────────

STATS_FILE="/tmp/om-reindex-stats-$$.txt"

poll_until_done() {
  local start_ts
  start_ts=$(date +%s)
  local probe_failures=0
  local probe_total=0
  local max_latency_ms=0
  local probes_over_1000=0
  local consecutive_slow=0
  local max_consecutive_slow=0
  local last_probe_ts=0

  echo ""
  echo "Polling reindex status every ${POLL_INTERVAL}s ..."
  echo " (probing /system/health every 10s — k8s periodSeconds=10, failureThreshold=3)"
  echo ""

  while true; do
    sleep "$POLL_INTERVAL"
    local now
    now=$(date +%s)

    # Probe health every 10s (k8s default periodSeconds)
    if (( now - last_probe_ts >= 10 )); then
      local t0 code latency
      t0=$(python3 -c "import time; print(int(time.time()*1000))")
      code=$(probe_health)
      latency=$(( $(python3 -c "import time; print(int(time.time()*1000))") - t0 ))
      probe_total=$(( probe_total + 1 ))
      last_probe_ts=$now

      if (( latency > max_latency_ms )); then max_latency_ms=$latency; fi
      if (( latency > 1000 )); then
        probes_over_1000=$(( probes_over_1000 + 1 ))
        consecutive_slow=$(( consecutive_slow + 1 ))
        if (( consecutive_slow > max_consecutive_slow )); then
          max_consecutive_slow=$consecutive_slow
        fi
      else
        consecutive_slow=0
      fi

      if [[ "$code" != "200" ]]; then
        probe_failures=$(( probe_failures + 1 ))
        echo " [$(date +%H:%M:%S)] PROBE FAIL code=$code latency=${latency}ms consecutive=${consecutive_slow}" >&2
      elif (( latency > 1000 )); then
        echo " [$(date +%H:%M:%S)] SLOW PROBE latency=${latency}ms *** >1000ms (k8s timeout risk) consecutive=${consecutive_slow}" >&2
      else
        echo " [$(date +%H:%M:%S)] probe ok latency=${latency}ms" >&2
      fi

      if (( max_consecutive_slow >= 3 )); then
        echo " *** WARNING: ${max_consecutive_slow} consecutive slow probes — k8s WOULD RESTART POD ***" >&2
      fi
    fi

    # Check app status
    local status_json
    status_json=$(curl_json "${SERVER_URL}/api/v1/apps/name/SearchIndexingApplication/status?offset=0&limit=1" 2>/dev/null || echo '{}')
    local app_status
    app_status=$(echo "$status_json" | python3 -c "
import json,sys
try:
    d=json.load(sys.stdin)
    runs=d.get('data',[])
    if runs: print(runs[0].get('status','unknown'))
    else: print('pending')
except: print('unknown')
" 2>/dev/null || echo "unknown")

    if [[ "$app_status" == "success" || "$app_status" == "failed" ]]; then
      local elapsed=$(( now - start_ts ))
      echo "" >&2
      echo " Reindex finished: status=$app_status elapsed=${elapsed}s" >&2
      # Write structured results to stats file (not stdout)
      {
        echo "PROBE_FAILURES=$probe_failures"
        echo "PROBE_TOTAL=$probe_total"
        echo "MAX_LATENCY_MS=$max_latency_ms"
        echo "PROBES_OVER_1000=$probes_over_1000"
        echo "MAX_CONSECUTIVE_SLOW=$max_consecutive_slow"
        echo "ELAPSED_S=$elapsed"
      } > "$STATS_FILE"
      return
    fi
  done
}

# ── main ──────────────────────────────────────────────────────────────────────

jq_required

echo "═══════════════════════════════════════════════════════════════"
echo " GC / Reindex Report — $(date)"
echo " Server: $SERVER_URL"
echo "═══════════════════════════════════════════════════════════════"

# Snapshot before
METRICS_BEFORE=$(gc_metrics_snapshot "before")
GC_LOG_LINES_BEFORE=0
if [[ -n "$GC_LOG" && -f "$GC_LOG" ]]; then
  GC_LOG_LINES_BEFORE=$(wc -l < "$GC_LOG")
  echo "GC log: $GC_LOG (current lines: $GC_LOG_LINES_BEFORE)"
fi

trigger_reindex

# Poll — probe lines print live to stderr, structured results written to STATS_FILE
poll_until_done

PROBE_FAILURES=$(grep '^PROBE_FAILURES=' "$STATS_FILE" | cut -d= -f2)
PROBE_TOTAL=$(grep '^PROBE_TOTAL=' "$STATS_FILE" | cut -d= -f2)
MAX_LATENCY=$(grep '^MAX_LATENCY_MS=' "$STATS_FILE" | cut -d= -f2)
PROBES_OVER_1000=$(grep '^PROBES_OVER_1000=' "$STATS_FILE" | cut -d= -f2)
MAX_CONSECUTIVE=$(grep '^MAX_CONSECUTIVE_SLOW=' "$STATS_FILE" | cut -d= -f2)
ELAPSED_S=$(grep '^ELAPSED_S=' "$STATS_FILE" | cut -d= -f2)
rm -f "$STATS_FILE"

# Snapshot after
METRICS_AFTER=$(gc_metrics_snapshot "after")

# GC log delta
GC_LOG_STATS=""
if [[ -n "$GC_LOG" && -f "$GC_LOG" ]]; then
  # Extract only lines added since reindex started (start at the line after the old EOF)
  TMPLOG="/tmp/om-gc-run-${TIMESTAMP}.log"
  tail -n "+$(( GC_LOG_LINES_BEFORE + 1 ))" "$GC_LOG" > "$TMPLOG"
  GC_LOG_STATS=$(parse_gc_log "$TMPLOG")
  rm -f "$TMPLOG"
fi

# Parse metrics delta
GC_COUNT_BEFORE=$(echo "$METRICS_BEFORE" | sed 's/.*gc_count=\([^,]*\).*/\1/')
GC_TIME_BEFORE=$(echo "$METRICS_BEFORE" | sed 's/.*gc_time_s=\([^,]*\).*/\1/')
GC_COUNT_AFTER=$(echo "$METRICS_AFTER" | sed 's/.*gc_count=\([^,]*\).*/\1/')
GC_TIME_AFTER=$(echo "$METRICS_AFTER" | sed 's/.*gc_time_s=\([^,]*\).*/\1/')

# Compute deltas if numeric
GC_COUNT_DELTA="N/A"
GC_TIME_DELTA="N/A"
if [[ "$GC_COUNT_BEFORE" =~ ^[0-9]+$ && "$GC_COUNT_AFTER" =~ ^[0-9]+$ ]]; then
  GC_COUNT_DELTA=$(( GC_COUNT_AFTER - GC_COUNT_BEFORE ))
fi
if [[ "$GC_TIME_BEFORE" =~ ^[0-9.]+$ && "$GC_TIME_AFTER" =~ ^[0-9.]+$ ]]; then
  GC_TIME_DELTA=$(python3 -c "print(f'{$GC_TIME_AFTER - $GC_TIME_BEFORE:.3f}')")
fi

echo ""
echo "══════════════════ RESULTS ════════════════════════════════════"
echo ""
printf " %-30s %s\n" "Reindex duration" "${ELAPSED_S}s"
printf " %-30s %s\n" "Health probe total" "${PROBE_TOTAL}"
printf " %-30s %s\n" "Probe failures (non-200)" "${PROBE_FAILURES:-0} $([ "${PROBE_FAILURES:-0}" -eq 0 ] && echo '✓ none' || echo '⚠ k8s would have restarted pod')"
printf " %-30s %s\n" "Probes >1000ms" "${PROBES_OVER_1000:-0} $([ "${PROBES_OVER_1000:-0}" -eq 0 ] && echo '✓ none' || echo '⚠ GC pressure detected')"
printf " %-30s %s\n" "Max consecutive >1000ms" "${MAX_CONSECUTIVE:-0} $([ "${MAX_CONSECUTIVE:-0}" -lt 3 ] && echo '✓ ok' || echo '*** k8s WOULD RESTART POD ***')"
printf " %-30s %s\n" "Max probe latency" "${MAX_LATENCY:-0}ms $([ "${MAX_LATENCY:-0}" -lt 1000 ] && echo '✓ ok' || echo '⚠ high')"
echo ""
printf " %-30s %s\n" "GC collections (delta)" "$GC_COUNT_DELTA"
printf " %-30s %s\n" "GC pause time (delta)" "${GC_TIME_DELTA}s"
echo ""

if [[ -n "$GC_LOG_STATS" && "$GC_LOG_STATS" != "no_pauses_found" && "$GC_LOG_STATS" != "GC_LOG_NOT_FOUND" ]]; then
  echo " GC log pause breakdown:"
  echo "$GC_LOG_STATS" | sed 's/^/ /'
  echo ""
fi

# Build JSON result
# Flatten multi-line GC log stats so the value stays a valid single-line Python string literal
GC_LOG_STATS_FLAT=$(echo "${GC_LOG_STATS:-none}" | tr '\n' ' ' | sed 's/ *$//')
RESULT_JSON=$(python3 -c "
import json, sys
result = {
    'timestamp': '$(date -u +%Y-%m-%dT%H:%M:%SZ)',
    'server': '$SERVER_URL',
    'elapsed_s': int('${ELAPSED_S:-0}'),
    'probe_total': int('${PROBE_TOTAL:-0}'),
    'probe_failures': int('${PROBE_FAILURES:-0}'),
    'probes_over_1000ms': int('${PROBES_OVER_1000:-0}'),
    'max_consecutive_slow': int('${MAX_CONSECUTIVE:-0}'),
    'max_probe_latency_ms': int('${MAX_LATENCY:-0}'),
    'gc_collections_delta': '${GC_COUNT_DELTA}',
    'gc_pause_time_delta_s': '${GC_TIME_DELTA}',
    'gc_log_stats': '${GC_LOG_STATS_FLAT}'
}
print(json.dumps(result, indent=2))
")

# Compare to baseline
if [[ -f "$BASELINE_FILE" ]]; then
  echo " Comparison to baseline ($BASELINE_FILE):"
  python3 - "$BASELINE_FILE" <<PYEOF
import json, sys

with open(sys.argv[1]) as f:
    baseline = json.load(f)

current = $RESULT_JSON

def pct(new, old):
    if old == 0: return '+∞%'
    delta = new - old
    return f'{delta/old*100:+.1f}%'

print(f" {'Metric':<30} {'Baseline':>12} {'Current':>12} {'Change':>10}")
print(f" {'-'*30} {'-'*12} {'-'*12} {'-'*10}")
for k in ['elapsed_s','probe_failures','probes_over_1000ms','max_consecutive_slow','max_probe_latency_ms']:
    b = baseline.get(k, 'N/A')
    c = current.get(k, 'N/A')
    change = pct(c, b) if isinstance(b, (int,float)) and isinstance(c, (int,float)) else 'N/A'
    print(f" {k:<30} {str(b):>12} {str(c):>12} {change:>10}")
PYEOF
  echo ""
fi

# Save result
OUT_FILE="gc-report-${TIMESTAMP}.json"
echo "$RESULT_JSON" > "$OUT_FILE"
echo " Report saved to: $OUT_FILE"

if [[ "$SAVE_BASELINE" == "true" ]]; then
  cp "$OUT_FILE" "$BASELINE_FILE"
  echo " Saved as baseline: $BASELINE_FILE"
fi

echo ""
echo "══════════════════════════════════════════════════════════════"
echo ""
echo " Tip: Run once with --save-baseline to record a baseline, then"
echo " run again after code changes to see the delta."
echo ""
