From 2467054fcb4c169a89eeee56467fd3aaf752d973 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 30 Apr 2026 16:00:38 +0800 Subject: [PATCH 1/2] Agent: reorganize workspace with success/failed/logs_and_lists dirs - workspace_manager.py: add success_dir, failed_dir, logs_and_lists_dir - graph_net_agent.py: auto-move samples to success/ or failed/ after extraction - parallel_extract.py: output JSON to logs_and_lists/ instead of workspace root --- graph_net/agent/graph_net_agent.py | 57 +++++++++++++--------- graph_net/agent/parallel_extract.py | 10 +++- graph_net/agent/utils/workspace_manager.py | 18 +++++++ 3 files changed, 59 insertions(+), 26 deletions(-) diff --git a/graph_net/agent/graph_net_agent.py b/graph_net/agent/graph_net_agent.py index 0cf9718079..62c635edc2 100644 --- a/graph_net/agent/graph_net_agent.py +++ b/graph_net/agent/graph_net_agent.py @@ -2,6 +2,7 @@ import json import os +import shutil from enum import Enum from pathlib import Path from typing import Optional @@ -118,6 +119,7 @@ def extract_sample(self, model_id: str) -> ExtractionStatus: ExtractionStatus.ERROR – unexpected error """ self.last_timeout_success = False + sample_dir: Optional[Path] = None try: self.logger.info(f"Starting extraction for model: {model_id}") @@ -144,6 +146,7 @@ def extract_sample(self, model_id: str) -> ExtractionStatus: if self.is_duplicate_sample(sample_dir): self.logger.info("Duplicate sample detected, skipping verification") + self._move_sample(sample_dir, self.workspace.success_dir) return ExtractionStatus.OK if not self.sample_verifier.verify(sample_dir): @@ -152,6 +155,7 @@ def extract_sample(self, model_id: str) -> ExtractionStatus: model_id, Exception("Sample verification failed"), ) + self._move_sample(sample_dir, self.workspace.failed_dir) return ExtractionStatus.VERIFY_FAILED if getattr(self.sample_verifier, "last_timeout_success", False): @@ -160,12 +164,15 @@ def extract_sample(self, model_id: str) -> ExtractionStatus: f"Sample verification for {model_id} passed via timeout skip" ) + self._move_sample(sample_dir, self.workspace.success_dir) self.logger.info(f"Successfully extracted sample for {model_id}") return ExtractionStatus.OK except SampleVerificationError as e: self.logger.error(f"Extraction failed for {model_id}: {e}") self.error_classifier.classify_and_record(model_id, e) + if sample_dir and sample_dir.exists(): + self._move_sample(sample_dir, self.workspace.failed_dir) return ExtractionStatus.VERIFY_FAILED except ( ModelFetchError, @@ -175,23 +182,16 @@ def extract_sample(self, model_id: str) -> ExtractionStatus: ) as e: self.logger.error(f"Extraction failed for {model_id}: {e}") self.error_classifier.classify_and_record(model_id, e) + if sample_dir and sample_dir.exists(): + self._move_sample(sample_dir, self.workspace.failed_dir) return ExtractionStatus.EXTRACT_FAILED except Exception as e: self.logger.error(f"Unexpected error for {model_id}: {e}", exc_info=True) self.error_classifier.classify_and_record(model_id, e) + if sample_dir and sample_dir.exists(): + self._move_sample(sample_dir, self.workspace.failed_dir) return ExtractionStatus.ERROR - @staticmethod - def _is_llm_fixable_error(err: GraphExtractionError) -> bool: - """Decide whether an extraction error is worth retrying with LLM. - - Only allow LLM retry for script logic errors (non-zero return code). - All other categories (timeout, infrastructure, missing model, etc.) - are not fixable by rewriting the script. - """ - category = GraphExtractionErrorClassifier.classify_from_exception(err) - return category == GraphExtractionErrorCategory.SCRIPT_EXECUTION_FAILED - def _llm_retry( self, first_err: GraphExtractionError, @@ -357,6 +357,15 @@ def _generate_graph_hash(self, sample_dir: Path) -> None: except (OSError, IOError) as e: self.logger.warning(f"Failed to generate graph_hash.txt: {e}") + def _move_sample(self, sample_dir: Path, dest_parent: Path) -> Path: + """Move sample_dir into dest_parent/, overwriting if destination exists""" + dest = dest_parent / sample_dir.name + if dest.exists(): + shutil.rmtree(dest) + shutil.move(str(sample_dir), str(dest)) + self.logger.info(f"Moved sample to: {dest}") + return dest + def is_duplicate_sample(self, sample_dir: Path) -> bool: """Check if the extracted sample is a duplicate of an existing sample""" graph_hash_path = sample_dir / "graph_hash.txt" @@ -366,21 +375,21 @@ def is_duplicate_sample(self, sample_dir: Path) -> bool: try: current_hash = graph_hash_path.read_text().strip() - samples_root = self.workspace.samples_dir - - if not samples_root.exists(): - return False - for hash_file in samples_root.rglob("graph_hash.txt"): - if hash_file == graph_hash_path: - continue - try: - existing_hash = hash_file.read_text().strip() - if existing_hash == current_hash: - self.logger.info(f"Duplicate found: {hash_file.parent}") - return True - except (OSError, IOError): + # Search for duplicates in success_dir (where past successful samples live) + for search_root in [self.workspace.success_dir, self.workspace.samples_dir]: + if not search_root.exists(): continue + for hash_file in search_root.rglob("graph_hash.txt"): + if hash_file == graph_hash_path: + continue + try: + existing_hash = hash_file.read_text().strip() + if existing_hash == current_hash: + self.logger.info(f"Duplicate found: {hash_file.parent}") + return True + except (OSError, IOError): + continue return False except (OSError, IOError) as e: diff --git a/graph_net/agent/parallel_extract.py b/graph_net/agent/parallel_extract.py index 0f93162386..2c6b9fc93f 100644 --- a/graph_net/agent/parallel_extract.py +++ b/graph_net/agent/parallel_extract.py @@ -573,8 +573,14 @@ def main() -> int: ) # --- Save results --- - output_file = ( - args.output or f"parallel_extract_{start_time.strftime('%Y%m%d_%H%M%S')}.json" + from graph_net.agent.utils.workspace_manager import ( + WorkspaceManager as _WorkspaceManager, + ) + + _ws = _WorkspaceManager(workspace) + output_file = args.output or str( + _ws.logs_and_lists_dir + / f"parallel_extract_{start_time.strftime('%Y%m%d_%H%M%S')}.json" ) _save_results(results, output_file) diff --git a/graph_net/agent/utils/workspace_manager.py b/graph_net/agent/utils/workspace_manager.py index a93a114842..950b04a56c 100644 --- a/graph_net/agent/utils/workspace_manager.py +++ b/graph_net/agent/utils/workspace_manager.py @@ -22,6 +22,9 @@ def _ensure_directories(self): self.generated_dir, self.samples_dir, self.logs_dir, + self.success_dir, + self.failed_dir, + self.logs_and_lists_dir, ] for dir_path in dirs: dir_path.mkdir(parents=True, exist_ok=True) @@ -46,6 +49,21 @@ def logs_dir(self) -> Path: """Directory for logs""" return self.workspace_root / "logs" + @property + def success_dir(self) -> Path: + """Directory for successfully extracted samples""" + return self.workspace_root / "success" + + @property + def failed_dir(self) -> Path: + """Directory for failed extraction artifacts""" + return self.workspace_root / "failed" + + @property + def logs_and_lists_dir(self) -> Path: + """Directory for result JSONs, model lists, and run logs""" + return self.workspace_root / "logs_and_lists" + def get_model_dir(self, model_id: str) -> Path: """Get directory path for a specific model""" return self.models_dir / model_id.replace("/", "_") From 5c04ed47e23a79ca78d999717610eb2a2125930c Mon Sep 17 00:00:00 2001 From: luotao1 Date: Tue, 19 May 2026 19:28:16 +0800 Subject: [PATCH 2/2] Agent: isolate extraction output to samples/ subdirectory - Set GRAPH_NET_EXTRACT_WORKSPACE=workspace/samples/ in subprocess env - _get_workspace_path() defaults to samples/ subdir - Prevents clutter in workspace root from redundant model directories --- .../subprocess_graph_extractor.py | 17 +++++++++++++---- graph_net/agent/graph_net_agent.py | 11 +++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/graph_net/agent/graph_extractor/subprocess_graph_extractor.py b/graph_net/agent/graph_extractor/subprocess_graph_extractor.py index b579d7140b..d3ba3c4dea 100644 --- a/graph_net/agent/graph_extractor/subprocess_graph_extractor.py +++ b/graph_net/agent/graph_extractor/subprocess_graph_extractor.py @@ -112,9 +112,13 @@ def extract(self, code_path: Path, model_id: str) -> Path: else: env["PYTHONPATH"] = str(graphnet_root) - # Ensure GRAPH_NET_EXTRACT_WORKSPACE points to our workspace - if "GRAPH_NET_EXTRACT_WORKSPACE" not in env: - env["GRAPH_NET_EXTRACT_WORKSPACE"] = str(self.workspace) + # Ensure GRAPH_NET_EXTRACT_WORKSPACE points to samples dir + # so extraction output goes to workspace/samples/ instead of root + samples_dir = self.workspace / "samples" + samples_dir.mkdir(parents=True, exist_ok=True) + env["GRAPH_NET_EXTRACT_WORKSPACE"] = str(samples_dir) + # Also set in current process env so _get_workspace_path() can find it + os.environ["GRAPH_NET_EXTRACT_WORKSPACE"] = str(samples_dir) # Run script in subprocess via Popen so we can kill on timeout proc = subprocess.Popen( @@ -225,7 +229,12 @@ def _find_output_dir_robust(self, model_id: str) -> Optional[Path]: def _get_workspace_path(self) -> Optional[Path]: """Get workspace path from environment or instance variable""" workspace_env = os.environ.get("GRAPH_NET_EXTRACT_WORKSPACE") - return Path(workspace_env) if workspace_env else self.workspace + if workspace_env: + return Path(workspace_env) + # Default to samples/ subdir to avoid cluttering workspace root + samples_dir = self.workspace / "samples" + samples_dir.mkdir(parents=True, exist_ok=True) + return samples_dir def _find_dir_by_pattern( self, workspace_path: Path, model_id: str, safe_model_id: str diff --git a/graph_net/agent/graph_net_agent.py b/graph_net/agent/graph_net_agent.py index 62c635edc2..c3cac10a56 100644 --- a/graph_net/agent/graph_net_agent.py +++ b/graph_net/agent/graph_net_agent.py @@ -192,6 +192,17 @@ def extract_sample(self, model_id: str) -> ExtractionStatus: self._move_sample(sample_dir, self.workspace.failed_dir) return ExtractionStatus.ERROR + @staticmethod + def _is_llm_fixable_error(err: GraphExtractionError) -> bool: + """Decide whether an extraction error is worth retrying with LLM. + + Only allow LLM retry for script logic errors (non-zero return code). + All other categories (timeout, infrastructure, missing model, etc.) + are not fixable by rewriting the script. + """ + category = GraphExtractionErrorClassifier.classify_from_exception(err) + return category == GraphExtractionErrorCategory.SCRIPT_EXECUTION_FAILED + def _llm_retry( self, first_err: GraphExtractionError,