From 9ad82e5fbbf6ac9845ef594ac49c488d6a622ee1 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Wed, 20 May 2026 14:43:38 +0800 Subject: [PATCH 1/3] Agent: fix graph hash generation for multi-subgraph models - _generate_graph_hash: generate per-subgraph hashes (subgraph_N/graph_hash.txt) instead of a single top-level hash, avoiding false collisions - is_duplicate_sample: use frozenset of subgraph hashes for multi-subgraph models, preventing rglob false matches on per-subgraph hash files - Single-graph model logic unchanged (root graph_hash.txt) --- graph_net/agent/graph_net_agent.py | 131 ++++++++++++++++++++--------- 1 file changed, 92 insertions(+), 39 deletions(-) diff --git a/graph_net/agent/graph_net_agent.py b/graph_net/agent/graph_net_agent.py index c3cac10a5..ca4f5c5f9 100644 --- a/graph_net/agent/graph_net_agent.py +++ b/graph_net/agent/graph_net_agent.py @@ -349,24 +349,47 @@ def _fix_model_name(self, sample_dir: Path, model_id: str) -> None: self.logger.warning(f"Failed to fix model_name in {json_path}: {e}") def _generate_graph_hash(self, sample_dir: Path) -> None: - """Generate graph_hash.txt from model.py if it doesn't exist""" - graph_hash_path = sample_dir / "graph_hash.txt" - model_py_path = sample_dir / "model.py" + """Generate graph_hash.txt from model.py. - if graph_hash_path.exists(): - return - - if not model_py_path.exists(): - self.logger.warning(f"model.py not found at {model_py_path}") - return - - try: - model_code = model_py_path.read_text() - graph_hash = get_sha256_hash(model_code) - graph_hash_path.write_text(graph_hash) - self.logger.info(f"Generated graph_hash.txt: {graph_hash[:16]}...") - except (OSError, IOError) as e: - self.logger.warning(f"Failed to generate graph_hash.txt: {e}") + - Single-graph: hash root model.py → sample_dir/graph_hash.txt + - Multi-subgraph: hash each subgraph_N/model.py → subgraph_N/graph_hash.txt + (no top-level graph_hash.txt for multi-subgraph models) + """ + root_model = sample_dir / "model.py" + if root_model.exists(): + # Single-graph model + graph_hash_path = sample_dir / "graph_hash.txt" + if graph_hash_path.exists(): + return + try: + graph_hash = get_sha256_hash(root_model.read_text()) + graph_hash_path.write_text(graph_hash) + self.logger.info(f"Generated graph_hash.txt: {graph_hash[:16]}...") + except (OSError, IOError) as e: + self.logger.warning(f"Failed to generate graph_hash.txt: {e}") + else: + # Multi-subgraph model: generate per-subgraph hashes + subgraph_dirs = sorted(sample_dir.glob("subgraph_*")) + if not subgraph_dirs: + self.logger.warning( + f"No model.py or subgraph dirs found in {sample_dir}" + ) + return + for subgraph_dir in subgraph_dirs: + model_py = subgraph_dir / "model.py" + hash_path = subgraph_dir / "graph_hash.txt" + if hash_path.exists() or not model_py.exists(): + continue + try: + graph_hash = get_sha256_hash(model_py.read_text()) + hash_path.write_text(graph_hash) + self.logger.info( + f"Generated {subgraph_dir.name}/graph_hash.txt: {graph_hash[:16]}..." + ) + except (OSError, IOError) as e: + self.logger.warning( + f"Failed to generate graph_hash.txt for {subgraph_dir}: {e}" + ) def _move_sample(self, sample_dir: Path, dest_parent: Path) -> Path: """Move sample_dir into dest_parent/, overwriting if destination exists""" @@ -378,31 +401,61 @@ def _move_sample(self, sample_dir: Path, dest_parent: Path) -> Path: return dest def is_duplicate_sample(self, sample_dir: Path) -> bool: - """Check if the extracted sample is a duplicate of an existing sample""" - graph_hash_path = sample_dir / "graph_hash.txt" - - if not graph_hash_path.exists(): - return False + """Check if the extracted sample is a duplicate of an existing sample. + - Single-graph: compare root graph_hash.txt against existing root hashes. + - Multi-subgraph: compare frozenset of all subgraph_*/graph_hash.txt hashes. + """ try: - current_hash = graph_hash_path.read_text().strip() - - # Search for duplicates in success_dir (where past successful samples live) - for search_root in [self.workspace.success_dir, self.workspace.samples_dir]: - if not search_root.exists(): - continue - for hash_file in search_root.rglob("graph_hash.txt"): - if hash_file == graph_hash_path: + root_model = sample_dir / "model.py" + if root_model.exists(): + # Single-graph + graph_hash_path = sample_dir / "graph_hash.txt" + if not graph_hash_path.exists(): + return False + current_hash = graph_hash_path.read_text().strip() + for search_root in [ + self.workspace.success_dir, + self.workspace.samples_dir, + ]: + if not search_root.exists(): continue - try: - existing_hash = hash_file.read_text().strip() - if existing_hash == current_hash: - self.logger.info(f"Duplicate found: {hash_file.parent}") - return True - except (OSError, IOError): + for existing_dir in search_root.iterdir(): + if not existing_dir.is_dir() or existing_dir == sample_dir: + continue + hash_file = existing_dir / "graph_hash.txt" + if not hash_file.exists(): + continue + try: + if hash_file.read_text().strip() == current_hash: + self.logger.info(f"Duplicate found: {existing_dir}") + return True + except (OSError, IOError): + continue + else: + # Multi-subgraph: compare the set of subgraph hashes + current_hashes = frozenset( + h.read_text().strip() + for h in sample_dir.glob("subgraph_*/graph_hash.txt") + ) + if not current_hashes: + return False + for search_root in [ + self.workspace.success_dir, + self.workspace.samples_dir, + ]: + if not search_root.exists(): continue - - return False + for existing_dir in search_root.iterdir(): + if not existing_dir.is_dir() or existing_dir == sample_dir: + continue + existing_hashes = frozenset( + h.read_text().strip() + for h in existing_dir.glob("subgraph_*/graph_hash.txt") + ) + if existing_hashes and existing_hashes == current_hashes: + self.logger.info(f"Duplicate found: {existing_dir}") + return True except (OSError, IOError) as e: self.logger.warning(f"Failed to check duplicate: {e}") - return False + return False From 31a5e896cdb8bac8044a4edefd6570196d2d8094 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Wed, 20 May 2026 14:51:58 +0800 Subject: [PATCH 2/3] Agent: reuse graph_net.hash_util in gen_hash_and_dedup.py Replace inline SHA256 implementation with the canonical get_sha256_hash from graph_net.hash_util, consistent with the agent extraction pipeline and other modules. --- graph_net/agent/scripts/gen_hash_and_dedup.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/graph_net/agent/scripts/gen_hash_and_dedup.py b/graph_net/agent/scripts/gen_hash_and_dedup.py index a5b628e60..722d912e8 100644 --- a/graph_net/agent/scripts/gen_hash_and_dedup.py +++ b/graph_net/agent/scripts/gen_hash_and_dedup.py @@ -57,16 +57,17 @@ # 依赖:Python 3.6+ # ============================================================================= -import hashlib import os import sys from collections import defaultdict +# Ensure graph_net is importable when running this script standalone +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +_GRAPHNET_ROOT = os.path.join(_SCRIPT_DIR, "..", "..", "..") +if _GRAPHNET_ROOT not in sys.path: + sys.path.insert(0, _GRAPHNET_ROOT) -def get_sha256_hash(content): - m = hashlib.sha256() - m.update(content.encode("utf-8")) - return m.hexdigest() +from graph_net.hash_util import get_sha256_hash # noqa: E402 def find_model_files(workspace): From 5533b16790b2224a84ddffc8d42f69f216af88a4 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Wed, 20 May 2026 15:43:16 +0800 Subject: [PATCH 3/3] Agent: unify single/multi-subgraph handling with _get_subgraph_dirs - Add _get_subgraph_dirs() helper: returns [sample_dir] for single-graph or [subgraph_0, subgraph_1, ...] for multi-subgraph models - Refactor _fix_model_name, _generate_graph_hash, is_duplicate_sample to use the helper, eliminating hardcoded subgraph_xxx globs - is_duplicate_sample now collects hashes from all subgraphs uniformly via frozenset comparison, regardless of model layout --- graph_net/agent/graph_net_agent.py | 146 +++++++++++------------------ 1 file changed, 53 insertions(+), 93 deletions(-) diff --git a/graph_net/agent/graph_net_agent.py b/graph_net/agent/graph_net_agent.py index ca4f5c5f9..c25928e86 100644 --- a/graph_net/agent/graph_net_agent.py +++ b/graph_net/agent/graph_net_agent.py @@ -332,12 +332,19 @@ def _extract_graph(self, script_path: Path, model_id: str) -> Path: self.logger.info(f"Graph extracted to: {sample_dir}") return sample_dir + def _get_subgraph_dirs(self, sample_dir: Path) -> list[Path]: + """Return list of subgraph directories. + + For single-graph models, returns [sample_dir]. + For multi-subgraph models, returns [subgraph_0, subgraph_1, ...] sorted. + """ + subgraph_dirs = sorted(sample_dir.glob("subgraph_*/")) + return subgraph_dirs if subgraph_dirs else [sample_dir] + def _fix_model_name(self, sample_dir: Path, model_id: str) -> None: """Update model_name in graph_net.json to the original HuggingFace model_id (org/model).""" - for json_path in [ - sample_dir / "graph_net.json", - *sample_dir.glob("subgraph_*/graph_net.json"), - ]: + for target_dir in self._get_subgraph_dirs(sample_dir): + json_path = target_dir / "graph_net.json" if not json_path.exists(): continue try: @@ -349,47 +356,20 @@ def _fix_model_name(self, sample_dir: Path, model_id: str) -> None: self.logger.warning(f"Failed to fix model_name in {json_path}: {e}") def _generate_graph_hash(self, sample_dir: Path) -> None: - """Generate graph_hash.txt from model.py. - - - Single-graph: hash root model.py → sample_dir/graph_hash.txt - - Multi-subgraph: hash each subgraph_N/model.py → subgraph_N/graph_hash.txt - (no top-level graph_hash.txt for multi-subgraph models) - """ - root_model = sample_dir / "model.py" - if root_model.exists(): - # Single-graph model - graph_hash_path = sample_dir / "graph_hash.txt" - if graph_hash_path.exists(): - return + """Generate graph_hash.txt from model.py for each subgraph.""" + for target_dir in self._get_subgraph_dirs(sample_dir): + model_py = target_dir / "model.py" + hash_path = target_dir / "graph_hash.txt" + if hash_path.exists() or not model_py.exists(): + continue try: - graph_hash = get_sha256_hash(root_model.read_text()) - graph_hash_path.write_text(graph_hash) - self.logger.info(f"Generated graph_hash.txt: {graph_hash[:16]}...") + graph_hash = get_sha256_hash(model_py.read_text()) + hash_path.write_text(graph_hash) + rel = hash_path.relative_to(sample_dir) + self.logger.info(f"Generated {rel}: {graph_hash[:16]}...") except (OSError, IOError) as e: - self.logger.warning(f"Failed to generate graph_hash.txt: {e}") - else: - # Multi-subgraph model: generate per-subgraph hashes - subgraph_dirs = sorted(sample_dir.glob("subgraph_*")) - if not subgraph_dirs: - self.logger.warning( - f"No model.py or subgraph dirs found in {sample_dir}" - ) - return - for subgraph_dir in subgraph_dirs: - model_py = subgraph_dir / "model.py" - hash_path = subgraph_dir / "graph_hash.txt" - if hash_path.exists() or not model_py.exists(): - continue - try: - graph_hash = get_sha256_hash(model_py.read_text()) - hash_path.write_text(graph_hash) - self.logger.info( - f"Generated {subgraph_dir.name}/graph_hash.txt: {graph_hash[:16]}..." - ) - except (OSError, IOError) as e: - self.logger.warning( - f"Failed to generate graph_hash.txt for {subgraph_dir}: {e}" - ) + rel = hash_path.relative_to(sample_dir) + self.logger.warning(f"Failed to generate {rel}: {e}") def _move_sample(self, sample_dir: Path, dest_parent: Path) -> Path: """Move sample_dir into dest_parent/, overwriting if destination exists""" @@ -403,59 +383,39 @@ def _move_sample(self, sample_dir: Path, dest_parent: Path) -> Path: def is_duplicate_sample(self, sample_dir: Path) -> bool: """Check if the extracted sample is a duplicate of an existing sample. - - Single-graph: compare root graph_hash.txt against existing root hashes. - - Multi-subgraph: compare frozenset of all subgraph_*/graph_hash.txt hashes. + Collects all subgraph graph_hash.txt hashes into a frozenset and compares + against existing samples. Works for both single-graph and multi-subgraph. """ + + def _collect_hashes(path: Path) -> frozenset[str]: + hashes = set() + for target_dir in self._get_subgraph_dirs(path): + hash_path = target_dir / "graph_hash.txt" + if hash_path.exists(): + try: + hashes.add(hash_path.read_text().strip()) + except (OSError, IOError): + pass + return frozenset(hashes) + try: - root_model = sample_dir / "model.py" - if root_model.exists(): - # Single-graph - graph_hash_path = sample_dir / "graph_hash.txt" - if not graph_hash_path.exists(): - return False - current_hash = graph_hash_path.read_text().strip() - for search_root in [ - self.workspace.success_dir, - self.workspace.samples_dir, - ]: - if not search_root.exists(): - continue - for existing_dir in search_root.iterdir(): - if not existing_dir.is_dir() or existing_dir == sample_dir: - continue - hash_file = existing_dir / "graph_hash.txt" - if not hash_file.exists(): - continue - try: - if hash_file.read_text().strip() == current_hash: - self.logger.info(f"Duplicate found: {existing_dir}") - return True - except (OSError, IOError): - continue - else: - # Multi-subgraph: compare the set of subgraph hashes - current_hashes = frozenset( - h.read_text().strip() - for h in sample_dir.glob("subgraph_*/graph_hash.txt") - ) - if not current_hashes: - return False - for search_root in [ - self.workspace.success_dir, - self.workspace.samples_dir, - ]: - if not search_root.exists(): + current_hashes = _collect_hashes(sample_dir) + if not current_hashes: + return False + + for search_root in [ + self.workspace.success_dir, + self.workspace.samples_dir, + ]: + if not search_root.exists(): + continue + for existing_dir in search_root.iterdir(): + if not existing_dir.is_dir() or existing_dir == sample_dir: continue - for existing_dir in search_root.iterdir(): - if not existing_dir.is_dir() or existing_dir == sample_dir: - continue - existing_hashes = frozenset( - h.read_text().strip() - for h in existing_dir.glob("subgraph_*/graph_hash.txt") - ) - if existing_hashes and existing_hashes == current_hashes: - self.logger.info(f"Duplicate found: {existing_dir}") - return True + existing_hashes = _collect_hashes(existing_dir) + if existing_hashes and existing_hashes == current_hashes: + self.logger.info(f"Duplicate found: {existing_dir}") + return True except (OSError, IOError) as e: self.logger.warning(f"Failed to check duplicate: {e}") return False