From 754d21000172492a68dd9474388b26dc9cb9b07d Mon Sep 17 00:00:00 2001 From: luotao1 Date: Tue, 19 May 2026 17:50:30 +0800 Subject: [PATCH] Agent: add extraction helper scripts and dedup tool Add graph_net/agent/scripts/ with three utilities: 1. analyze_extraction_log.sh - Analyze batch extraction logs for success/failure stats - Categorize failures (script error, model too large, download failure, etc.) - Support CPU/GPU modes and binary logs (grep -a) - Generate processed/success model lists to /tmp/ 2. check_extraction_progress.sh - Check real-time status of running extraction tasks - Show PID, CPU/memory, worker count, progress, speed estimate, disk space, and sample directory counts - Auto-detect latest log or accept custom log path 3. gen_hash_and_dedup.py - Walk extracted subgraphs, compute SHA256 of model.py files - Generate graph_hash.txt per subgraph and dedup_report.txt - Support --remove to delete duplicate subgraphs (keep first per group) - Default workspace is current directory (.), no hardcoded paths Update graph_net/agent/README.md with usage docs and environment variable overrides (GRAPHNET_LOG_DIR, GRAPHNET_SUCCESS_DIR, GRAPHNET_SAMPLES_DIR). 
--- graph_net/agent/README.md | 52 ++++ .../agent/scripts/analyze_extraction_log.sh | 229 ++++++++++++++++++ .../scripts/check_extraction_progress.sh | 205 ++++++++++++++++ graph_net/agent/scripts/gen_hash_and_dedup.py | 171 +++++++++++++ 4 files changed, 657 insertions(+) create mode 100644 graph_net/agent/scripts/analyze_extraction_log.sh create mode 100644 graph_net/agent/scripts/check_extraction_progress.sh create mode 100644 graph_net/agent/scripts/gen_hash_and_dedup.py diff --git a/graph_net/agent/README.md b/graph_net/agent/README.md index 2fcf1aa7c..b8aa9a65e 100644 --- a/graph_net/agent/README.md +++ b/graph_net/agent/README.md @@ -117,3 +117,55 @@ agent = GraphNetAgent(llm_retry=False) ``` LLM retry 需要 `ducc` 或 `claude` 在 `PATH` 中可用。 + +## 辅助脚本 + +`graph_net/agent/scripts/` 目录下提供批量任务相关的辅助脚本: + +### check_extraction_progress.sh + +一键查看当前抽取任务的运行状态。 + +```bash +# 自动查找最新日志 +bash graph_net/agent/scripts/check_extraction_progress.sh + +# 或指定日志文件 +bash graph_net/agent/scripts/check_extraction_progress.sh $HOME/workspace/logs_and_lists/batch7_safe_run.log +``` + +输出包括:进程状态(PID、CPU/内存、Worker 数)、日志最新进度、成功/失败统计、处理速度估算、预计剩余时间、磁盘空间、样本目录文件数。 + +### analyze_extraction_log.sh + +分析已完成批次的抽取日志,输出失败分布和根因统计。 + +```bash +bash graph_net/agent/scripts/analyze_extraction_log.sh $HOME/workspace/logs_and_lists/batch7_safe_run.log +``` + +输出包括:总体统计(成功率)、失败原因一级分布、模型过大分布、异常类型分布(ValueError/Dynamo/IndexError 等)、HTTP 状态码分布、辅助文件(生成已处理和成功模型列表到 `/tmp/`)。 + +### gen_hash_and_dedup.py + +子图去重脚本。遍历抽取结果目录,为每个子图的 `model.py` 生成 SHA256 哈希,找出内容完全相同的子图并生成去重报告,支持一键删除重复目录。 + +```bash +# 仅分析,不删除 +python graph_net/agent/scripts/gen_hash_and_dedup.py ./success_20260515_merged + +# 分析并直接删除重复目录(保留每组第一个) +python graph_net/agent/scripts/gen_hash_and_dedup.py ./success_20260515_merged --remove +``` + +输出包括:生成的 `graph_hash.txt` 数量、唯一子图数、重复组数、可删除数量,以及详细的去重报告 `dedup_report.txt`。 + +> **为什么去重率高**:同一基础模型的微调变体(fine-tune variants)通常共享完全相同的计算图,只是权重不同。实测可缩减 90%+(85K 子图 → 1.5K 唯一子图,2.3 GB → 172 MB)。 + 
+**环境变量**:`check_extraction_progress.sh` 默认使用 `$HOME/workspace/` 下的目录,可通过环境变量覆盖(`analyze_extraction_log.sh` 与 `gen_hash_and_dedup.py` 不读取这些变量,路径通过命令行参数传入): + +```bash +export GRAPHNET_LOG_DIR=/your/path/logs_and_lists +export GRAPHNET_SUCCESS_DIR=/your/path/success +export GRAPHNET_SAMPLES_DIR=/your/path/samples +``` diff --git a/graph_net/agent/scripts/analyze_extraction_log.sh b/graph_net/agent/scripts/analyze_extraction_log.sh new file mode 100644 index 000000000..e88d5aeaf --- /dev/null +++ b/graph_net/agent/scripts/analyze_extraction_log.sh @@ -0,0 +1,229 @@ +#!/bin/bash +# ============================================================================= +# 模型抽取日志分析脚本 +# ============================================================================= +# 用途:分析 GraphNet Agent 批量模型抽取日志,输出成功/失败分布、异常类型、 +# 模型大小分布、HTTP 状态码等统计信息,便于快速定位批量失败根因。 +# +# 适用场景: +# - 某批次跑完后,快速统计成功率和失败分布 +# - 对比多轮抽取的失败模式变化 +# - 生成已处理/成功模型列表,用于生成下一轮剩余列表 +# +# 用法: +# bash analyze_extraction_log.sh LOG_FILE +# +# 示例: +# bash analyze_extraction_log.sh $HOME/workspace/logs_and_lists/batch7_safe_run.log +# +# 输出内容(10 个部分): +# 一、总体统计 —— 总尝试数、成功数、失败数、成功率、PROGRESS 行数 +# 二、失败原因一级分布 —— Script execution failed / Failed to analyze model / +# Failed to download / timeout / 401 / ducc 等 +# 三、Failed to analyze model 细分 —— Model too large / Unsupported model_type 等 +# 四、Model too large 分布(Top 10) —— 按参数量统计被过滤的大模型 +# 五、Script execution failed 异常类型分布 —— ValueError / Dynamo / IndexError 等 +# 六、ValueError 详细分布(Top 10) —— 最常见的 ValueError 具体信息 +# 七、KeyError 详细分布(Top 10) —— 最常见的 KeyError 具体 key +# 八、Download failure HTTP 状态码分布 —— 404 / 403 等 +# 九、其他异常 —— ducc 超时、ducc 空输出、脚本执行超时 +# 十、辅助文件 —— 生成 /tmp/{日志文件名}_processed.txt 和 /tmp/{日志文件名}_success.txt +# +# 示例输出片段: +# --- 一、总体统计 --- +# 总尝试数: 4973 +# 成功数: 244 +# 失败数: 1791 +# 成功率: 4.9% +# PROGRESS 行数: 2925 +# +# --- 二、失败原因一级分布 --- +# 类型 | 数量 +# ---|--- +# Script execution failed | 1003 +# Failed to analyze model | 718 +# timeout | 2 +# 401 | 3 +# ducc | 66 +# +# --- 五、Script execution failed 异常类型分布 --- +# 异常类型 | 数量 +# ---|--- +# ValueError | 1368 +# torch._dynamo.exc.InternalTorchDynamoError | 940 +# 
#   IndexError                                      | 936
#   ...
#
# Dependencies: bash, grep, sed, awk, sort, uniq
# =============================================================================

set -e

LOG_FILE="${1:-}"
if [ -z "$LOG_FILE" ] || [ ! -f "$LOG_FILE" ]; then
    echo "用法: bash $0 LOG_FILE"
    echo "示例: bash $0 \$HOME/workspace/logs_and_lists/batch7_safe_run.log"
    exit 1
fi

# count_lines PATTERN
#   Print the number of lines in LOG_FILE that match the basic regex PATTERN.
#   `grep -c` already prints "0" (and exits 1) when nothing matches, so the
#   common `grep -c ... || echo 0` idiom yields "0<newline>0"; here the exit
#   status is ignored instead and an empty result falls back to 0.
#   -a forces text mode because some logs contain binary data.
count_lines() {
    local n
    n=$(grep -ac -- "$1" "$LOG_FILE" 2>/dev/null) || true
    echo "${n:-0}"
}

# print_row LABEL COUNT
#   Print one "label | count" table row, but only for non-zero counts.
#   Written as an `if` so a zero count does not trip `set -e`.
print_row() {
    if [ "$2" -gt 0 ]; then
        echo "$1 | $2"
    fi
}

echo "======================================"
echo "批次日志分析报告"
echo "日志文件: $LOG_FILE"
echo "分析时间: $(date '+%Y-%m-%d %H:%M:%S')"
echo "======================================"
echo ""

# 1. Overall statistics
echo "--- 一、总体统计 ---"
# Counts rely on the markers actually present in the log:
#   success: "[CPU X] OK model_name"   or "[GPU X] OK model_name"
#   failure: "[CPU X] FAIL model_name" or "[GPU X] FAIL model_name"
# Both CPU-only and GPU runs are covered by the same pattern.
TOTAL=$(count_lines "Starting extraction for model:")
SUCCESS=$(grep -aoE '\[(CPU|GPU) [0-9]+\] OK [^ ]+' "$LOG_FILE" 2>/dev/null | wc -l | tr -d ' ')
FAILED=$(grep -aoE '\[(CPU|GPU) [0-9]+\] FAIL [^ ]+' "$LOG_FILE" 2>/dev/null | wc -l | tr -d ' ')
PROGRESS_LINES=$(count_lines "PROGRESS")

echo "总尝试数: $TOTAL"
echo "成功数: $SUCCESS"
echo "失败数: $FAILED"
if [ "$TOTAL" -gt 0 ]; then
    # awk does the floating-point division, so bc is not required.
    awk -v s="$SUCCESS" -v t="$TOTAL" 'BEGIN { printf "成功率: %.1f%%\n", s * 100 / t }'
fi
echo "PROGRESS 行数: $PROGRESS_LINES"
echo ""

# 2. First-level failure reasons
# The log uses different prefixes for different error kinds:
#   - Script execution failed / Failed to analyze model / Sample verification
#     failed: "Extraction failed for ...: <reason>"
#   - Failed to download: "Unexpected error for ...: <reason>"
echo "--- 二、失败原因一级分布 ---"
echo "类型 | 数量"
echo "---|---"
print_row "Script execution failed" "$(count_lines 'Extraction failed for.*Script execution failed')"
# Includes "Model too large" and "Unsupported model_type" (broken down in section 3).
print_row "Failed to analyze model" "$(count_lines 'Extraction failed for.*Failed to analyze model')"
print_row "Failed to download" "$(count_lines 'Unexpected error for.*Failed to download')"
print_row "Sample verification failed" "$(count_lines 'Extraction failed for.*Sample verification failed')"
# 401 lines are counted wherever they appear, so they may overlap with the
# download failures above.
print_row "401 Unauthorized" "$(count_lines '401 Client Error')"
print_row "ducc retry" "$(count_lines 'ducc -p')"
echo ""

# 3. Breakdown of "Failed to analyze model"
echo "--- 三、Failed to analyze model 细分 ---"
echo "子类别 | 数量"
echo "---|---"
for sub in "Model too large" "Unsupported model_type" "not supported between instances"; do
    count=$(grep -a "Extraction failed for.*Failed to analyze model" "$LOG_FILE" 2>/dev/null | grep -ac -- "$sub" || true)
    print_row "$sub" "${count:-0}"
done
echo ""

# 4. "Model too large": distribution of the estimated parameter counts (top 10)
echo "--- 四、Model too large 分布(Top 10) ---"
grep -a "Extraction failed for.*Failed to analyze model.*Model too large" "$LOG_FILE" 2>/dev/null | \
    sed 's/.*estimated //;s/ parameters.*//' | \
    sort | uniq -c | sort -rn | head -10 | \
    awk '{print $2 " | " $1}'
echo ""

# 5. Exception types behind "Script execution failed"
# Heuristic: scans the 30 lines following each failure line and counts every
# occurrence of a known exception name, so totals can exceed the number of
# failed models.
echo "--- 五、Script execution failed 异常类型分布 ---"
ERR_TYPES=$(grep -a -A 30 "Extraction failed.*Script execution failed" "$LOG_FILE" 2>/dev/null | \
    grep -aoE "(torch\._dynamo\.exc\.InternalTorchDynamoError|ValueError|AttributeError|TypeError|KeyError|ImportError|ModuleNotFoundError|RuntimeError|IndexError|NameError|OSError|FileNotFoundError)" | \
    sort | uniq -c | sort -rn || true)
if [ -n "$ERR_TYPES" ]; then
    echo "异常类型 | 数量"
    echo "---|---"
    echo "$ERR_TYPES" | awk '{print $2 " | " $1}'
else
    echo "未提取到异常类型(可能日志中无详细 traceback)"
fi
echo ""

# 6. Most frequent ValueError messages (top 10)
echo "--- 六、ValueError 详细分布(Top 10) ---"
grep -a -A 30 "Extraction failed.*Script execution failed" "$LOG_FILE" 2>/dev/null | \
    grep -a "ValueError" | sed 's/.*ValueError: //' | \
    sort | uniq -c | sort -rn | head -10 | \
    awk '{first=$1; $1=""; print substr($0,2) " | " first}'
echo ""

# 7. Most frequent KeyError keys (top 10)
echo "--- 七、KeyError 详细分布(Top 10) ---"
grep -a -A 30 "Extraction failed.*Script execution failed" "$LOG_FILE" 2>/dev/null | \
    grep -a "KeyError" | sed "s/.*KeyError: //;s/'//g" | \
    sort | uniq -c | sort -rn | head -10 | \
    awk '{first=$1; $1=""; print substr($0,2) " | " first}'
echo ""

# 8. HTTP status codes of download failures
# Uses -E rather than -P so grep builds without PCRE support also work.
echo "--- 八、Download failure HTTP 状态码分布 ---"
grep -a "Unexpected error.*Failed to download" "$LOG_FILE" 2>/dev/null | \
    grep -aoE '[0-9]{3} Client Error' | sort | uniq -c | sort -rn | \
    awk '{print $2 " | " $1}'
echo ""

# 9. Timeouts and ducc-related problems
echo "--- 九、其他异常 ---"
DUCC_TIMEOUT=$(count_lines "ducc -p timed out")
DUCC_EMPTY=$(count_lines "ducc -p returned empty")
SCRIPT_TIMEOUT=$(count_lines "Script execution timed out")
if [ "$DUCC_TIMEOUT" -gt 0 ]; then echo "ducc 超时: $DUCC_TIMEOUT"; fi
if [ "$DUCC_EMPTY" -gt 0 ]; then echo "ducc 空输出: $DUCC_EMPTY"; fi
if [ "$SCRIPT_TIMEOUT" -gt 0 ]; then echo "脚本执行超时: $SCRIPT_TIMEOUT"; fi
echo ""

# 10. Helper files: lists of processed and of successful models
echo "--- 十、辅助文件 ---"
BASE_NAME=$(basename "$LOG_FILE" .log)
PROCESSED="/tmp/${BASE_NAME}_processed.txt"
SUCCESS_LIST="/tmp/${BASE_NAME}_success.txt"

# -a matters here: without it a partly binary log makes grep emit
# "Binary file ... matches" instead of the model names.
grep -a "Starting extraction for model:" "$LOG_FILE" 2>/dev/null | \
    sed 's/.*Starting extraction for model: //' > "$PROCESSED"

grep -aoE '\[(CPU|GPU) [0-9]+\] OK [^ ]+' "$LOG_FILE" 2>/dev/null | \
    awk '{print $NF}' | sort -u > "$SUCCESS_LIST"

echo "已处理模型列表: $PROCESSED ($(wc -l < "$PROCESSED" | tr -d ' ') 个)"
echo "成功模型列表: $SUCCESS_LIST ($(wc -l < "$SUCCESS_LIST" | tr -d ' ') 个)"
echo ""

echo "======================================"
echo "分析完成"
echo "======================================"

# -----------------------------------------------------------------------------
# End of analyze_extraction_log.sh. The original span continues with the patch
# metadata and first header lines of the next file; they are preserved verbatim
# below (as comments) so no patch content is lost:
#   diff --git a/graph_net/agent/scripts/check_extraction_progress.sh b/graph_net/agent/scripts/check_extraction_progress.sh
#   new file mode 100644
#   index 000000000..fbf1e21b5
#   --- /dev/null
#   +++ b/graph_net/agent/scripts/check_extraction_progress.sh
#   @@ -0,0 +1,205 @@
#   +#!/bin/bash
#   +# =============================================================================
#   +# 模型抽取进度检查脚本
#   +#
# -----------------------------------------------------------------------------
# =============================================================================
# Purpose: show the state of a running model-extraction job at a glance:
#          process liveness, log progress, success/failure counts, a
#          throughput estimate, free disk space and sample directory sizes.
#
# Usage:
#   bash check_extraction_progress.sh [log_file]
#
#   Without an argument: pick the newest .log under
#                        $HOME/workspace/logs_and_lists/ (or $GRAPHNET_LOG_DIR)
#   With an argument:    use the given log file
#
# Examples:
#   bash check_extraction_progress.sh
#   bash check_extraction_progress.sh $HOME/workspace/logs_and_lists/batch7_safe_run.log
#
# Sample output (the script prints Chinese labels; shown verbatim):
#   ====== 进程状态 ======
#   主进程 PID: 24925 (运行时长: 00:24)
#   CPU 使用率: 132%
#   内存使用率: 1.0%
#   Worker 数量: 7
#
#   ====== 日志进度 ======
#   日志文件: batch7_safe_run_no_llm.log
#   最新进度: [PROGRESS] 1608/63555 done, success rate so far: 24.5%
#   日志时间: 2026-05-18 16:57:48
#   最后更新: 0 秒前
#
#   ====== 统计汇总 ======
#   已处理: 1608 / 63555
#   剩余: 61947
#   成功率: 24.5%
#   成功: 37
#   失败: 1195
#   速度: ~1800 个/小时 (最近 50 条 PROGRESS)
#   预计剩余: ~34.4 小时 (~1.4 天)
#
#   ====== 磁盘空间 ======
#   /home: 可用 67G / 总 3.6T (99% 已用)
#   /tmp: 可用 5.8G / 总 6.5G (6% 已用)
#
#   ====== 样本目录 ======
#   成功样本: 2025 个 (目录: $HOME/workspace/success)
#   样本文件: 3382 个 (目录: $HOME/workspace/samples)
#
# Note: timestamps are converted with `date -d`, which is a GNU date feature;
#       where it is unavailable the time-based lines are reported as
#       unparsable / skipped instead of showing bogus values.
# =============================================================================

# Default locations; each can be overridden through the environment.
LOG_DIR="${GRAPHNET_LOG_DIR:-$HOME/workspace/logs_and_lists}"
SUCCESS_DIR="${GRAPHNET_SUCCESS_DIR:-$HOME/workspace/success}"
SAMPLES_DIR="${GRAPHNET_SAMPLES_DIR:-$HOME/workspace/samples}"

# Leading "YYYY-MM-DD HH:MM:SS" timestamp of a log line (POSIX ERE, no PCRE needed).
TS_RE='^[0-9]{4}-[0-9]{2}-[0-9]{2}[[:space:]]+[0-9]{2}:[0-9]{2}:[0-9]{2}'

# count_lines PATTERN
#   Print the number of lines in LOG_FILE matching PATTERN. `grep -c` prints
#   "0" itself when nothing matches (exit status 1), so no `|| echo 0` is
#   appended - that would produce "0<newline>0". -a treats the log as text
#   even if it contains binary data.
count_lines() {
    local n
    n=$(grep -ac -- "$1" "$LOG_FILE" 2>/dev/null)
    echo "${n:-0}"
}

# to_epoch "YYYY-MM-DD HH:MM:SS"
#   Print the timestamp as seconds since the epoch, or nothing when it cannot
#   be converted, so callers can tell a failure from a real value.
to_epoch() {
    date -d "$1" +%s 2>/dev/null
}

# ---------------------------------------------------------------------------
# 1. Select the log file
# ---------------------------------------------------------------------------
if [ -n "${1:-}" ]; then
    LOG_FILE="$1"
else
    # Newest *.log by modification time.
    LOG_FILE=$(ls -t "$LOG_DIR"/*.log 2>/dev/null | head -1)
fi

if [ -z "$LOG_FILE" ] || [ ! -f "$LOG_FILE" ]; then
    echo "错误: 未找到日志文件"
    echo "用法: bash $0 [log_file]"
    exit 1
fi

LOG_BASENAME=$(basename "$LOG_FILE")

# ---------------------------------------------------------------------------
# 2. Process status
# ---------------------------------------------------------------------------
echo "====== 进程状态 ======"

# Processes whose command line mentions graph_net_agent / run_model.py / parallel_extract.
PIDS=$(pgrep -f "graph_net_agent|run_model\.py|parallel_extract" 2>/dev/null || true)
if [ -n "$PIDS" ]; then
    # The first PID listed by pgrep is reported as the main process.
    MAIN_PID=$(echo "$PIDS" | head -1)
    ELAPSED=$(ps -o etime= -p "$MAIN_PID" 2>/dev/null | tr -d ' ')
    # CPU and memory share of that process; the fields stay empty if it has
    # exited in the meantime, in which case the placeholders below are shown.
    read -r CPU MEM <<< "$(ps -o %cpu=,%mem= -p "$MAIN_PID" 2>/dev/null)"
    # Worker count = number of run_model.py processes.
    WORKERS=$(pgrep -f "run_model\.py" 2>/dev/null | wc -l | tr -d ' ')

    echo "主进程 PID: $MAIN_PID (运行时长: ${ELAPSED:-未知})"
    echo "CPU 使用率: ${CPU:-?}%"
    echo "内存使用率: ${MEM:-?}%"
    echo "Worker 数量: $WORKERS"
else
    echo "状态: 无运行中的抽取进程"
fi
echo ""

# ---------------------------------------------------------------------------
# 3. Log progress
# ---------------------------------------------------------------------------
echo "====== 日志进度 ======"
echo "日志文件: $LOG_BASENAME"

LATEST_PROGRESS=$(grep -a "PROGRESS" "$LOG_FILE" 2>/dev/null | tail -1)
if [ -n "$LATEST_PROGRESS" ]; then
    echo "最新进度: $LATEST_PROGRESS"
else
    echo "最新进度: (日志中无 PROGRESS 记录)"
fi

# Timestamp of the most recent log line that has one (searched in the last 20 lines).
LAST_LOG_TIME=$(tail -20 "$LOG_FILE" 2>/dev/null | grep -aoE "$TS_RE" | tail -1)
if [ -n "$LAST_LOG_TIME" ]; then
    echo "日志时间: $LAST_LOG_TIME"
    LAST_TS=$(to_epoch "$LAST_LOG_TIME")
    if [ -n "$LAST_TS" ]; then
        DELTA=$(( $(date +%s) - LAST_TS ))
        if [ "$DELTA" -lt 60 ]; then
            AGO="${DELTA} 秒前"
        elif [ "$DELTA" -lt 3600 ]; then
            AGO="$((DELTA / 60)) 分钟前"
        elif [ "$DELTA" -lt 86400 ]; then
            AGO="$((DELTA / 3600)) 小时前"
        else
            AGO="$((DELTA / 86400)) 天前"
        fi
        echo "最后更新: $AGO"
    else
        # Conversion failed: say so rather than computing an age from epoch 0.
        echo "最后更新: (无法解析)"
    fi
else
    echo "日志时间: (无法解析)"
fi
echo ""

# ---------------------------------------------------------------------------
# 4. Summary statistics
# ---------------------------------------------------------------------------
echo "====== 统计汇总 ======"

SUCCESS_IN_LOG=$(count_lines "Successfully extracted")
FAILED_IN_LOG=$(count_lines "Extraction failed for")

# Progress figures come from the latest PROGRESS line, which is expected to
# look like "[PROGRESS] 1608/63555 done, success rate so far: 24.5%".
REMAIN=""
if [ -n "$LATEST_PROGRESS" ]; then
    FRACTION=$(printf '%s\n' "$LATEST_PROGRESS" | grep -aoE '[0-9]+/[0-9]+' | head -1)
    RATE=$(printf '%s\n' "$LATEST_PROGRESS" | grep -aoE '[0-9]+\.[0-9]+%' | head -1)
    RATE=${RATE%\%}

    if [ -n "$FRACTION" ]; then
        DONE=${FRACTION%%/*}
        TOTAL=${FRACTION##*/}
        REMAIN=$((TOTAL - DONE))
        echo "已处理: $DONE / $TOTAL"
        echo "剩余: $REMAIN"
    fi
    if [ -n "$RATE" ]; then
        echo "成功率: ${RATE}%"
    fi
fi

echo "成功: $SUCCESS_IN_LOG"
echo "失败: $FAILED_IN_LOG"

# Throughput estimate from the timestamps of the last 50 PROGRESS lines.
# N lines span N-1 intervals, hence the "- 1".
# NOTE(review): assumes one PROGRESS line is logged per processed model - confirm.
PROGRESS_TIMES=$(grep -a "PROGRESS" "$LOG_FILE" 2>/dev/null | tail -50 | grep -aoE "$TS_RE")
PROGRESS_COUNT=$(printf '%s\n' "$PROGRESS_TIMES" | grep -c .)
if [ "$PROGRESS_COUNT" -ge 2 ]; then
    FIRST_TS=$(to_epoch "$(printf '%s\n' "$PROGRESS_TIMES" | head -1)")
    LAST_TS=$(to_epoch "$(printf '%s\n' "$PROGRESS_TIMES" | tail -1)")
    if [ -n "$FIRST_TS" ] && [ -n "$LAST_TS" ] && [ "$LAST_TS" -gt "$FIRST_TS" ]; then
        TIME_DIFF=$((LAST_TS - FIRST_TS))
        SPEED_PER_HOUR=$(( (PROGRESS_COUNT - 1) * 3600 / TIME_DIFF ))
        echo "速度: ~$SPEED_PER_HOUR 个/小时 (最近 $PROGRESS_COUNT 条 PROGRESS)"
        if [ -n "$REMAIN" ] && [ "$SPEED_PER_HOUR" -gt 0 ]; then
            awk -v r="$REMAIN" -v s="$SPEED_PER_HOUR" \
                'BEGIN { h = r / s; printf "预计剩余: ~%.1f 小时 (~%.1f 天)\n", h, h / 24 }'
        fi
    fi
fi
echo ""

# ---------------------------------------------------------------------------
# 5. Disk space
# ---------------------------------------------------------------------------
echo "====== 磁盘空间 ======"
df -h /home /tmp 2>/dev/null | awk 'NR==1 {next} {printf "%s: 可用 %s / 总 %s (%s 已用)\n", $6, $4, $2, $5}'
echo ""

# ---------------------------------------------------------------------------
# 6. Sample directories (reported only when they exist)
# ---------------------------------------------------------------------------
echo "====== 样本目录 ======"
if [ -d "$SUCCESS_DIR" ]; then
    SUCCESS_COUNT=$(ls -1 "$SUCCESS_DIR" 2>/dev/null | wc -l | tr -d ' ')
    echo "成功样本: $SUCCESS_COUNT 个 (目录: $SUCCESS_DIR)"
fi
if [ -d "$SAMPLES_DIR" ]; then
    SAMPLE_COUNT=$(ls -1 "$SAMPLES_DIR" 2>/dev/null | wc -l | tr -d ' ')
    echo "样本文件: $SAMPLE_COUNT 个 (目录: $SAMPLES_DIR)"
fi
echo ""

echo "====== 检查完成 ======"

# -----------------------------------------------------------------------------
# End of check_extraction_progress.sh. The original span continues with the
# patch metadata and the first part of the header of the next file; they are
# preserved verbatim below (as comments) so no patch content is lost:
#   diff --git a/graph_net/agent/scripts/gen_hash_and_dedup.py b/graph_net/agent/scripts/gen_hash_and_dedup.py
#   new file mode 100644
#   index 000000000..a5b628e60
#   --- /dev/null
#   +++ b/graph_net/agent/scripts/gen_hash_and_dedup.py
#   @@ -0,0 +1,171 @@
#   +#!/usr/bin/env python
#   +# =============================================================================
#   +# 子图去重脚本
#   +# =============================================================================
#   +# 用途:遍历工作目录下所有子图的 model.py,计算 SHA256 哈希生成 graph_hash.txt,
#   +# 找出内容完全相同的子图并生成去重报告,支持一键删除重复目录。
#   +#
#   +# 为什么需要去重:
#   +# 同一基础模型的微调变体(fine-tune variants)通常共享完全相同的计算图,
#   +# 只是权重不同。去重后体积可缩减 90%+(实测:85K 子图 -> 1.5K 唯一子图,
#   +# 2.3 GB -> 172 MB)。
#   +#
#   +# 用法:
#   +# python gen_hash_and_dedup.py [--remove]
#   +#
#   +# 参数:
#   +# workspace_dir 抽取结果目录(含 model.py 子图),默认当前目录 "."
#   +# --remove 可选,确认后直接删除重复子图目录
#   +#
#   +# 示例:
#   +# # 1. 仅分析,不删除
#   +# python gen_hash_and_dedup.py ./success_20260515_merged
#   +#
#   +# # 2. 分析并直接删除重复目录(保留每组第一个)
#   +# python gen_hash_and_dedup.py ./success_20260515_merged --remove
#   +#
#   +# 输出示例:
#   +# Found 6919 model.py files under ./success_20260515_merged
#   +# Progress: 2000/6919
#   +# ...
# -----------------------------------------------------------------------------
#
#   Step 1 - Generate graph_hash.txt:
#     Total model.py: 6919
#     Generated/Updated: 6919
#     Failed: 0
#
#   Step 2 - Deduplication analysis:
#     Total subgraphs: 6919
#     Unique graphs: 1486
#     Duplicate groups: 421
#     Subgraphs involved in duplication: 5433
#     Can be removed (keeping one per group): 5012
#
#     Dedup report saved to: ./success_20260515_merged/dedup_report.txt
#
#   To remove duplicates, run:
#     python /path/to/gen_hash_and_dedup.py ./success_20260515_merged --remove
#
#   # With --remove the script additionally prints:
#   Removed 5012 duplicate subgraph directories.
#   Remaining subgraphs: 1907
#
# Output files:
#   WORKSPACE_DIR/dedup_report.txt  dedup report: hash, count and paths of
#                                   every duplicate group
#   SUBGRAPH_DIR/graph_hash.txt     hash file written into each subgraph dir
#
# Notes:
#   - --remove deletes immediately; there is no interactive confirmation.
#   - --remove may be given before or after the workspace directory.
#   - A duplicate directory that contains another subgraph which has to be
#     kept is skipped instead of deleted.
#
# Dependencies: Python 3.6+
# =============================================================================

import argparse
import hashlib
import os
import shutil
import sys
from collections import defaultdict


def get_sha256_hash(content):
    """Return the hexadecimal SHA256 digest of the string *content* (UTF-8 encoded)."""
    return hashlib.sha256(content.encode("utf-8")).hexdigest()


def find_model_files(workspace):
    """Return the sorted paths of all ``model.py`` files found under *workspace*."""
    return sorted(
        os.path.join(root, "model.py")
        for root, _dirs, files in os.walk(workspace)
        if "model.py" in files
    )


def _parse_args(argv):
    """Parse the command line; exit with a usage error if the workspace is missing."""
    parser = argparse.ArgumentParser(
        description="Hash every model.py under a workspace and report or "
        "remove subgraphs whose model.py content is identical."
    )
    parser.add_argument(
        "workspace_dir",
        nargs="?",
        default=".",
        help="extraction result directory containing the subgraphs "
        "(default: current directory)",
    )
    parser.add_argument(
        "--remove",
        action="store_true",
        help="delete duplicate subgraph directories, keeping the first of "
        "each group; no confirmation is asked",
    )
    args = parser.parse_args(argv)
    if not os.path.isdir(args.workspace_dir):
        parser.error("workspace directory not found: %s" % args.workspace_dir)
    return args


def _hash_subgraphs(model_files):
    """Write graph_hash.txt beside each model.py; return (hash -> dirs, failures)."""
    hash_to_paths = defaultdict(list)
    failed = 0
    total = len(model_files)
    for i, model_py in enumerate(model_files, 1):
        subgraph_dir = os.path.dirname(model_py)
        try:
            # Explicit UTF-8 keeps hashes independent of the machine's locale.
            with open(model_py, "r", encoding="utf-8") as f:
                file_hash = get_sha256_hash(f.read())
            hash_file = os.path.join(subgraph_dir, "graph_hash.txt")
            with open(hash_file, "w", encoding="utf-8") as f:
                f.write(file_hash)
        except (OSError, UnicodeError) as e:
            failed += 1
            print("  [FAIL] %s: %s" % (subgraph_dir, e))
        else:
            hash_to_paths[file_hash].append(subgraph_dir)

        if i % 2000 == 0:
            print("  Progress: %d/%d" % (i, total))
    return hash_to_paths, failed


def _write_report(report_path, total, unique_count, dup_groups, removable):
    """Write the human-readable dedup report, largest duplicate groups first."""
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("Deduplication Report\n")
        f.write("====================\n")
        f.write("Total subgraphs: %d\n" % total)
        f.write("Unique graphs: %d\n" % unique_count)
        f.write("Duplicate groups: %d\n" % len(dup_groups))
        f.write("Removable duplicates: %d\n\n" % removable)

        for h, paths in sorted(dup_groups.items(), key=lambda x: -len(x[1])):
            f.write("Hash: %s\n" % h)
            f.write("  Count: %d\n" % len(paths))
            f.write("  Keep: %s\n" % paths[0])
            for p in paths[1:]:
                f.write("  Remove: %s\n" % p)
            f.write("\n")


def _with_ancestors(dirs):
    """Return the absolute paths of *dirs* together with all their parent directories."""
    protected = set()
    for d in dirs:
        cur = os.path.abspath(d)
        # Climb until a directory already recorded (or the filesystem root,
        # whose dirname is itself) is reached.
        while cur not in protected:
            protected.add(cur)
            cur = os.path.dirname(cur)
    return protected


def _remove_duplicates(dup_groups, all_dirs):
    """Delete all but the first directory of each duplicate group.

    Directories are found recursively, so a duplicate may contain another
    subgraph (or be the workspace itself). Such a directory is skipped:
    removing it would also destroy subgraphs that have to be kept.

    Returns the number of directories actually deleted.
    """
    doomed = [p for paths in dup_groups.values() for p in paths[1:]]
    protected = _with_ancestors(set(all_dirs) - set(doomed))
    removed = 0
    for p in doomed:
        if os.path.abspath(p) in protected:
            print("  [SKIP] %s: contains a subgraph that must be kept" % p)
            continue
        if not os.path.exists(p):
            # Already gone, e.g. deleted together with a duplicate parent.
            continue
        try:
            shutil.rmtree(p)
        except OSError as e:
            print("  [FAIL] %s: %s" % (p, e))
        else:
            removed += 1
    return removed


def main(argv=None):
    """Hash all subgraphs, report duplicates and optionally delete them.

    Args:
        argv: command-line arguments without the program name; ``None``
            (the default) means ``sys.argv[1:]``. Accepted arguments are an
            optional workspace directory (default ``"."``) and ``--remove``,
            in any order.
    """
    args = _parse_args(argv)
    workspace = args.workspace_dir

    # Step 1: find all model.py files and generate graph_hash.txt
    model_files = find_model_files(workspace)
    total = len(model_files)
    print("Found %d model.py files under %s" % (total, workspace))

    hash_to_paths, failed = _hash_subgraphs(model_files)

    print("\nStep 1 - Generate graph_hash.txt:")
    print("  Total model.py: %d" % total)
    print("  Generated/Updated: %d" % (total - failed))
    print("  Failed: %d" % failed)

    # Step 2: dedup analysis
    unique_count = len(hash_to_paths)
    dup_groups = {h: paths for h, paths in hash_to_paths.items() if len(paths) > 1}
    dup_subgraph_count = sum(len(p) for p in dup_groups.values())
    removable = dup_subgraph_count - len(dup_groups)

    print("\nStep 2 - Deduplication analysis:")
    print("  Total subgraphs: %d" % total)
    print("  Unique graphs: %d" % unique_count)
    print("  Duplicate groups: %d" % len(dup_groups))
    print("  Subgraphs involved in duplication: %d" % dup_subgraph_count)
    print("  Can be removed (keeping one per group): %d" % removable)

    # Step 3: write the dedup report
    report_path = os.path.join(workspace, "dedup_report.txt")
    _write_report(report_path, total, unique_count, dup_groups, removable)
    print("\n  Dedup report saved to: %s" % report_path)

    # Step 4: remove duplicates when requested (no confirmation prompt)
    if args.remove:
        all_dirs = [os.path.dirname(m) for m in model_files]
        removed = _remove_duplicates(dup_groups, all_dirs)
        # Count what is really left on disk rather than assuming that every
        # planned deletion happened.
        remaining = sum(1 for m in model_files if os.path.isfile(m))
        print("\nRemoved %d duplicate subgraph directories." % removed)
        print("Remaining subgraphs: %d" % remaining)
    else:
        print("\nTo remove duplicates, run:")
        print("  python %s %s --remove" % (os.path.abspath(__file__), workspace))


if __name__ == "__main__":
    main()