diff --git a/.claude/settings.json b/.claude/settings.json new file mode 100644 index 000000000..b4255be37 --- /dev/null +++ b/.claude/settings.json @@ -0,0 +1,8 @@ +{ + "permissions": { + "allow": [ + "Bash(sed -n '/class PrecisionPolicy/,/^PRECISION_POLICIES = {/p' deployment/configs/enums.py)", + "Bash(sed -n '/PRECISION_POLICIES = {/,/^}/p' deployment/configs/enums.py)" + ] + } +} diff --git a/deployment/README.md b/deployment/README.md new file mode 100644 index 000000000..32c9a5f9a --- /dev/null +++ b/deployment/README.md @@ -0,0 +1,55 @@ +# AWML Deployment Framework + +The `deployment/` package is the shared path from trained PyTorch checkpoints to ONNX and TensorRT artifacts, with verification and evaluation built into the same run. Shared runtime code lives in the framework packages, while task-specific logic lives under `deployment/projects/<project>/`. + +## Core workflow + +```text +Load checkpoint -> Export -> Verify -> Evaluate +``` + +## Quick start + +```bash +python -m deployment.cli.main <project> <deploy_cfg> <model_cfg> [--log-level INFO] + +# Example (CenterPoint) +python -m deployment.cli.main centerpoint \ + deployment/projects/centerpoint/config/deploy_config.py \ + <model_cfg> \ + --rot-y-axis-reference +``` + +## What to read + +| If you want to... 
| Start here | +| --- | --- | +| Run deployment today | [docs/runbook.md](docs/runbook.md) | +| Edit or author deploy config | [docs/configuration.md](docs/configuration.md) | +| Understand the framework structure | [docs/architecture.md](docs/architecture.md) | +| Troubleshoot or tune runs | [docs/operations.md](docs/operations.md) | +| Add a new project bundle | [docs/contributing.md](docs/contributing.md) | +| Run the current shipped project | [projects/centerpoint/README.md](projects/centerpoint/README.md) | + +## Current status + +- Current first-class project: [CenterPoint](projects/centerpoint/README.md) +- Shared framework responsibilities: CLI, typed config, exporters, runtime orchestration, verification, evaluation, and pipeline creation + +## Repository layout + +```text +deployment/ +├── cli/ # Unified CLI +├── configs/ # Typed deploy config schema +├── core/ # Shared types, evaluators, verification mixins +├── exporters/ # ONNX / TensorRT exporters and export pipeline bases +├── pipelines/ # Inference pipelines and global factory +├── runtime/ # BaseDeploymentRunner, orchestrators, ArtifactManager +├── projects/ # Per-task bundles +└── tests/ # CPU-only unit tests (pytest) +``` + +## License + +See `LICENSE` at the repository root. diff --git a/deployment/__init__.py b/deployment/__init__.py new file mode 100644 index 000000000..9aba18837 --- /dev/null +++ b/deployment/__init__.py @@ -0,0 +1,7 @@ +"""Autoware ML Deployment Framework. + +Task-agnostic export, verification, and evaluation across backends (ONNX, TensorRT). +Import from concrete submodules (for example ``deployment.configs.base``, ``deployment.runtime.runner``). 
_LOG_FORMAT = "%(levelname)s:%(name)s:%(message)s"


def setup_logging(level: str = "INFO") -> logging.Logger:
    """
    Configure root logging and return the ``deployment`` logger.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL).
            Matching is case-insensitive and ignores surrounding whitespace.

    Returns:
        The logger named ``"deployment"``.

    Raises:
        ValueError: If ``level`` is not the name of a numeric ``logging`` level.
    """
    level_name = str(level).strip().upper()
    # Resolve the name explicitly: ``getattr(logging, "info")`` would return the
    # ``logging.info`` function, which ``basicConfig`` rejects with an opaque
    # TypeError, and an unknown name would surface as a bare AttributeError.
    numeric_level = getattr(logging, level_name, None)
    if isinstance(numeric_level, bool) or not isinstance(numeric_level, int):
        raise ValueError(
            f"Invalid log level '{level}'. Expected one of DEBUG, INFO, WARNING, ERROR, CRITICAL."
        )
    logging.basicConfig(level=numeric_level, format=_LOG_FORMAT)
    return logging.getLogger("deployment")


def add_deployment_file_logging(log_file_path: str) -> None:
    """
    Append a UTF-8 file handler to the root logger so log records are also written to disk.

    Idempotent for the same resolved path: if the root logger already has a
    ``FileHandler`` pointing at that file, nothing is added. Parent directories
    are created when needed.

    Args:
        log_file_path: Path to the log file. ``~`` is expanded and relative
            paths are resolved against the current working directory.
    """
    path = Path(log_file_path).expanduser()
    if not path.is_absolute():
        path = Path.cwd() / path
    path = path.resolve(strict=False)
    path.parent.mkdir(parents=True, exist_ok=True)

    root = logging.getLogger()
    for handler in root.handlers:
        if not isinstance(handler, logging.FileHandler):
            continue
        handler_path = getattr(handler, "baseFilename", "")
        if handler_path and Path(handler_path).resolve(strict=False) == path:
            # Already logging to this file; adding a second handler would duplicate records.
            return

    file_handler = logging.FileHandler(str(path), mode="a", encoding="utf-8")
    file_handler.setFormatter(logging.Formatter(_LOG_FORMAT))
    # The handler level is a snapshot of the root level at the time of this call.
    file_handler.setLevel(root.level)
    root.addHandler(file_handler)


def parse_base_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    """
    Add the common deployment arguments to an existing parser.

    Adds the positional ``deploy_cfg`` and ``model_cfg`` arguments and the
    optional ``--log-level`` override.

    Args:
        parser: Existing ArgumentParser to add arguments to.

    Returns:
        The same parser instance, for chaining.
    """
    parser.add_argument("deploy_cfg", help="Deploy config path")
    parser.add_argument("model_cfg", help="Model config path")
    # Optional overrides
    parser.add_argument(
        "--log-level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Logging level",
    )

    return parser
def _discover_project_packages() -> Sequence[str]:
    """Discover project package names under deployment.projects (without importing them).

    Only sub-packages are considered; names starting with ``_`` are skipped.
    The result is sorted by name.
    """
    return sorted(
        mod.name
        for mod in pkgutil.iter_modules(projects_pkg.__path__)
        if mod.ispkg and not mod.name.startswith("_")
    )


def _import_and_register_project(project_name: str) -> None:
    """Import project package, which should register itself into project_registry."""
    importlib.import_module(f"deployment.projects.{project_name}")


def build_parser() -> argparse.ArgumentParser:
    """Build the deployment CLI parser.

    This discovers `deployment.projects.<name>` bundles, imports them to trigger
    registration into ``deployment.projects.registry.project_registry``, then creates a
    subcommand per registered project.

    Returns:
        Parser with one subcommand per successfully imported and registered project.

    Raises:
        RuntimeError: If no project ended up registered; the message includes the
            import errors that were collected, if any.
    """
    parser = argparse.ArgumentParser(
        description="AWML Deployment CLI",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    subparsers = parser.add_subparsers(dest="project", required=True)

    # Discover projects and import them so they can contribute args.
    failed_projects: list[str] = []
    for project_name in _discover_project_packages():
        try:
            _import_and_register_project(project_name)
        except Exception as e:
            # One broken bundle must not take down the whole CLI; remember it and move on.
            failed_projects.append(f"- {project_name}: {e}\n{traceback.format_exc()}")
            continue

        try:
            adapter = project_registry.get(project_name)
        except KeyError:
            # The package imported but registered nothing under its own name: no subcommand.
            continue

        sub = subparsers.add_parser(project_name, help=f"{project_name} deployment")
        parse_base_args(sub)  # adds deploy_cfg, model_cfg, --log-level
        adapter.add_args(sub)
        sub.set_defaults(_adapter_name=project_name)

    if not project_registry.list_projects():
        details = "\n".join(failed_projects) if failed_projects else "(no project packages discovered)"
        raise RuntimeError(
            "No deployment projects were registered. This usually means project imports failed.\n" f"{details}"
        )

    if failed_projects:
        # Partial failure: without this report the broken project would simply be missing
        # from the subcommand list and argparse would only say "invalid choice".
        print(
            "Warning: some deployment projects failed to import and are unavailable:\n"
            + "\n".join(failed_projects),
            file=sys.stderr,
        )

    return parser


def main(argv: Sequence[str]) -> int:
    """CLI entrypoint.

    Usage:
        python -m deployment.cli.main <project> <deploy_cfg> <model_cfg> [project-specific args]

    Args:
        argv: Argv list (without program name).

    Returns:
        Process exit code (0 for success), as returned by the project adapter's ``run``.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    adapter = project_registry.get(args._adapter_name)
    return adapter.run(args)


if __name__ == "__main__":
    raise SystemExit(main(sys.argv[1:]))
class BaseDeploymentConfig:
    """
    Task-agnostic container for the parsed deployment configuration.

    Each section of the deploy config is parsed into its typed counterpart at
    construction time. Task-specific configs should extend this class and add
    their own settings.

    Attributes:
        checkpoint_path: Single source of truth for the PyTorch checkpoint path,
            read from the top level of the deploy config.
    """

    def __init__(self, deploy_cfg: Config) -> None:
        """
        Parse every deploy-config section, then validate the CUDA environment.

        Args:
            deploy_cfg: MMEngine Config object containing deployment settings
        """
        self._deploy_cfg = deploy_cfg
        read = deploy_cfg.get

        self.checkpoint_path = self._validate_checkpoint_path(read("checkpoint_path"))
        self.device_config = DeviceConfig.from_dict(read("devices", {}))
        self.components_cfg = ComponentsConfig.from_dict(read("components"))
        self._onnx_config = OnnxConfig.from_dict(read("onnx_config"))
        self.export_config = ExportConfig.from_dict(read("export"))
        self._tensorrt_config = TensorRTConfig.from_dict(read("tensorrt_config", {}))
        self.evaluation_config = EvaluationConfig.from_dict(read("evaluation", {}))
        self.verification_config = VerificationConfig.from_dict(read("verification", {}))
        self._deploy_log_path = self._parse_deploy_log_path(read("deploy_log_path", "deployment.log"))

        # Runtime/environment validation (torch/cuda)
        self._validate_cuda_device()

    @staticmethod
    def _parse_deploy_log_path(raw: Optional[str]) -> Optional[str]:
        """Return the stripped log path, or None when it is None or blank (file logging disabled)."""
        if raw is None:
            return None
        stripped = raw.strip()
        return stripped or None

    @staticmethod
    def _validate_checkpoint_path(checkpoint_path: str) -> str:
        """Return the ``~``-expanded checkpoint path after checking it is an existing regular file.

        Raises:
            TypeError: If ``checkpoint_path`` is not a string.
            FileNotFoundError: If the path does not point to a regular file.
        """
        if not isinstance(checkpoint_path, str):
            raise TypeError(f"checkpoint_path must be a string, got {type(checkpoint_path).__name__}.")

        path = Path(checkpoint_path).expanduser()
        if path.is_file():
            return str(path)

        raise FileNotFoundError(
            f"Checkpoint file not found: '{checkpoint_path}' (resolved to '{path.resolve()}'). "
            f"Deploy-config paths are relative to the current working directory "
            f"('{Path.cwd()}'); run from the repository root or set an absolute checkpoint_path."
        )

    def _validate_cuda_device(self) -> None:
        """Check CUDA availability once, at config time; a no-op when TensorRT is unused.

        Raises:
            RuntimeError: If no CUDA device is configured or torch reports CUDA unavailable.
            ValueError: If the configured device index is not below the device count.
        """
        if not self._uses_tensorrt():
            return

        cuda_device = self.device_config.cuda
        device_idx = self.device_config.cuda_device_index
        if cuda_device is None or device_idx is None:
            raise RuntimeError(
                "CUDA device is required (TensorRT export/verification/evaluation enabled) but no CUDA device was"
                " configured in devices."
            )

        if not torch.cuda.is_available():
            raise RuntimeError(
                "CUDA device is required (TensorRT export/verification/evaluation enabled) "
                "but torch.cuda.is_available() returned False."
            )

        device_count = torch.cuda.device_count()
        if device_idx >= device_count:
            raise ValueError(
                f"Requested CUDA device '{cuda_device}' but only {device_count} CUDA device(s) are available."
            )

    def _uses_tensorrt(self) -> bool:
        """Whether TensorRT is used by any stage (export, evaluation, or verification)."""
        if self.export_config.should_export_tensorrt:
            return True

        evaluation = self.evaluation_config
        if evaluation.enabled:
            trt_entry = evaluation.backends.get(Backend.TENSORRT.value)
            if trt_entry and trt_entry.get("enabled", False):
                return True

        verification = self.verification_config
        if not verification.enabled:
            return False
        return any(
            Backend.TENSORRT in (scenario.ref_backend, scenario.test_backend)
            for scenario_list in verification.scenarios.values()
            for scenario in scenario_list
        )

    @property
    def resolved_deploy_log_file(self) -> Optional[str]:
        """Absolute path for the deployment log file, or None if file logging is disabled.

        A relative log path is placed under ``export_config.work_dir``.
        """
        if self._deploy_log_path is None:
            return None

        log_path = Path(self._deploy_log_path).expanduser()
        if not log_path.is_absolute():
            log_path = Path(self.export_config.work_dir).expanduser() / log_path
        return str(log_path.resolve(strict=False))

    def get_verification_scenarios(self, export_mode: ExportMode) -> Tuple[VerificationScenario, ...]:
        """
        Look up the verification scenarios configured for an export mode.

        Args:
            export_mode: Export mode (`ExportMode`)

        Returns:
            Tuple of verification scenarios
        """
        return self.verification_config.get_scenarios(export_mode)

    def get_onnx_settings(self, component_name: str) -> ONNXExportConfig:
        """Build the ONNX export settings for a component.

        I/O names, dynamic axes and the output file come from the component
        config; the remaining options come from the shared ``onnx_config``.
        """
        component_cfg = self.components_cfg.get_component(component_name)
        shared = self._onnx_config

        # Fall back to generic tensor names when the component lists none.
        input_names = tuple(spec.name for spec in component_cfg.io.inputs) or ("input",)
        output_names = tuple(spec.name for spec in component_cfg.io.outputs) or ("output",)

        return ONNXExportConfig(
            input_names=input_names,
            output_names=output_names,
            dynamic_axes=component_cfg.io.dynamic_axes,
            simplify=shared.simplify,
            opset_version=shared.opset_version,
            export_params=shared.export_params,
            keep_initializers_as_inputs=shared.keep_initializers_as_inputs,
            verbose=False,
            do_constant_folding=shared.do_constant_folding,
            save_file=component_cfg.onnx_file,
            batch_size=None,
        )

    def get_tensorrt_settings(self, component_name: str) -> TensorRTExportConfig:
        """Build the TensorRT export settings for a component.

        The optimization profile comes from the component config; precision
        policy and workspace size come from the shared ``tensorrt_config``.
        """
        profile = self.components_cfg.get_component(component_name).tensorrt_profile

        # No profile configured -> leave model_input unset.
        model_input: Optional[TensorRTModelInputConfig] = (
            TensorRTModelInputConfig(input_shapes=MappingProxyType(dict(profile))) if profile else None
        )

        return TensorRTExportConfig(
            precision_policy=self._tensorrt_config.precision_policy,
            max_workspace_size=self._tensorrt_config.max_workspace_size,
            model_input=model_input,
        )


# Constants
DEFAULT_WORKSPACE_SIZE = 1 << 30  # 1 GB


class PrecisionPolicy(str, Enum):
    """Precision policy options for TensorRT.

    The concrete TensorRT flags each policy maps to are applied by the TensorRT
    exporter (see ``TensorRTExporter._apply_precision_policy``); this enum is the
    single source of truth for the policy itself.
    """

    AUTO = "auto"
    FP16 = "fp16"
    FP32_TF32 = "fp32_tf32"
    STRONGLY_TYPED = "strongly_typed"

    @classmethod
    def from_value(cls, value: Optional[Union[str, PrecisionPolicy]]) -> PrecisionPolicy:
        """Coerce a string or enum member into a PrecisionPolicy; None maps to AUTO.

        Strings are matched after stripping whitespace and lower-casing.

        Raises:
            ValueError: If ``value`` does not name a policy.
        """
        if value is None:
            return cls.AUTO
        if isinstance(value, cls):
            return value
        if isinstance(value, str):
            by_value = {member.value: member for member in cls}
            found = by_value.get(value.strip().lower())
            if found is not None:
                return found
        raise ValueError(f"Invalid precision_policy '{value}'. Must be one of {[m.value for m in cls]}.")
class ExportMode(str, Enum):
    """Export pipeline modes."""

    ONNX = "onnx"
    TRT = "trt"
    BOTH = "both"
    NONE = "none"

    @classmethod
    def from_value(cls, value: Optional[Union[str, ExportMode]]) -> ExportMode:
        """Coerce a string or enum member into an ExportMode; None maps to BOTH.

        Strings are matched after stripping whitespace and lower-casing.

        Raises:
            ValueError: If ``value`` does not name a mode.
        """
        if value is None:
            return cls.BOTH
        if isinstance(value, cls):
            return value
        if isinstance(value, str):
            by_value = {member.value: member for member in cls}
            found = by_value.get(value.strip().lower())
            if found is not None:
                return found
        raise ValueError(f"Invalid export mode '{value}'. Must be one of {[m.value for m in cls]}.")


def _empty_mapping() -> Mapping[Any, Any]:
    """Return a fresh read-only empty mapping (used as a dataclass default factory)."""
    return MappingProxyType({})


# -----------------------------------------------------------------------------
# Export / Device / Runtime
# -----------------------------------------------------------------------------


@dataclass(frozen=True)
class ExportConfig:
    """Configuration for model export settings."""

    mode: ExportMode  # required: caller must explicitly pick what to export
    work_dir: str = "work_dirs"
    onnx_path: Optional[str] = None
    sample_idx: int = 0

    @classmethod
    def from_dict(cls, config_dict: Optional[Mapping[str, Any]]) -> ExportConfig:
        """Build ExportConfig from deploy_cfg['export'].

        The section is mandatory and must be a mapping with a valid ``mode``;
        every other key falls back to the dataclass default.

        Raises:
            ValueError: If the section is missing, lacks ``mode``, or ``mode`` is invalid.
            TypeError: If the section is not a mapping.
        """
        if config_dict is None:
            raise ValueError("Missing 'export' section in deploy config.")
        if not isinstance(config_dict, Mapping):
            raise TypeError(f"export must be a dict, got {type(config_dict).__name__}")
        if "mode" not in config_dict:
            valid = [m.value for m in ExportMode]
            raise ValueError(f"export.mode is required; must be one of {valid}.")

        export_mode = ExportMode.from_value(config_dict["mode"])
        # ``cls.work_dir`` / ``cls.sample_idx`` are the dataclass defaults declared above.
        return cls(
            mode=export_mode,
            work_dir=config_dict.get("work_dir", cls.work_dir),
            onnx_path=config_dict.get("onnx_path"),
            sample_idx=config_dict.get("sample_idx", cls.sample_idx),
        )

    @property
    def should_export_onnx(self) -> bool:
        """Whether ONNX export is requested (mode ONNX or BOTH)."""
        return self.mode == ExportMode.ONNX or self.mode == ExportMode.BOTH

    @property
    def should_export_tensorrt(self) -> bool:
        """Whether TensorRT export is requested (mode TRT or BOTH)."""
        return self.mode == ExportMode.TRT or self.mode == ExportMode.BOTH


@dataclass(frozen=True)
class DeviceConfig:
    """Parsed device settings shared across deployment stages."""

    cpu: DeviceSpec = field(default_factory=lambda: DeviceSpec.from_value("cpu"))
    cuda: Optional[DeviceSpec] = field(default_factory=lambda: DeviceSpec.from_value("cuda:0"))

    def __post_init__(self) -> None:
        # The dataclass is frozen, so normalised values are written via object.__setattr__.
        normalised_cpu = self._parse_cpu_device(self.cpu)
        object.__setattr__(self, "cpu", normalised_cpu)
        normalised_cuda = self._parse_cuda_device(self.cuda)
        object.__setattr__(self, "cuda", normalised_cuda)

    @classmethod
    def from_dict(cls, config_dict: Mapping[str, Any]) -> DeviceConfig:
        """Create DeviceConfig from a mapping; missing keys default to "cpu" and "cuda:0"."""
        cpu_raw = config_dict.get("cpu", "cpu")
        cuda_raw = config_dict.get("cuda", "cuda:0")
        return cls(cpu=cpu_raw, cuda=cuda_raw)

    @staticmethod
    def _parse_cpu_device(device: Any) -> DeviceSpec:
        """Parse CPU device input into DeviceSpec; None is treated as "cpu".

        Raises:
            ValueError: If the parsed spec is a CUDA device.
        """
        if device is None:
            device = "cpu"
        device_spec = DeviceSpec.from_value(device)
        if device_spec.is_cuda:
            raise ValueError("CPU device cannot be a CUDA device")
        return device_spec

    @staticmethod
    def _parse_cuda_device(device: Any) -> Optional[DeviceSpec]:
        """Parse CUDA device input into DeviceSpec; None stays None (no CUDA device).

        Raises:
            ValueError: If the parsed spec is not a CUDA device.
        """
        if device is None:
            return None
        device_spec = DeviceSpec.from_value(device)
        if device_spec.is_cuda:
            return device_spec
        raise ValueError(f"Invalid CUDA device '{device}'.")

    @property
    def cuda_device_index(self) -> Optional[int]:
        """Return CUDA device index as integer (if configured)."""
        return None if self.cuda is None else self.cuda.index
@dataclass(frozen=True)
class OnnxConfig:
    """ONNX export settings (shared across all components)."""

    opset_version: int = 17
    do_constant_folding: bool = True
    export_params: bool = True
    keep_initializers_as_inputs: bool = False
    simplify: bool = False

    @classmethod
    def from_dict(cls, raw: Optional[Mapping[str, Any]]) -> OnnxConfig:
        """Build OnnxConfig from deploy_cfg['onnx_config'].

        A missing or empty section yields the defaults. Values are coerced with
        ``int`` / ``bool``.

        Raises:
            TypeError: If a non-empty ``raw`` is not a mapping.
        """
        if not raw:
            return cls()
        if not isinstance(raw, Mapping):
            raise TypeError(f"onnx_config must be a dict, got {type(raw).__name__}")

        opset = int(raw.get("opset_version", 17))
        flag_defaults = (
            ("do_constant_folding", True),
            ("export_params", True),
            ("keep_initializers_as_inputs", False),
            ("simplify", False),
        )
        flags = {key: bool(raw.get(key, default)) for key, default in flag_defaults}
        return cls(opset_version=opset, **flags)


@dataclass(frozen=True)
class TensorRTConfig:
    """
    Configuration for TensorRT backend-specific settings.

    Uses config structure:
        tensorrt_config = dict(precision_policy="auto", max_workspace_size=1<<30)

    TensorRT profiles are defined in components.*.tensorrt_profile.

    Note:
        The deploy config key for this section is **`tensorrt_config`**.
    """

    precision_policy: PrecisionPolicy = PrecisionPolicy.AUTO
    max_workspace_size: int = DEFAULT_WORKSPACE_SIZE

    @classmethod
    def from_dict(cls, config_dict: Mapping[str, Any]) -> TensorRTConfig:
        """Build TensorRTConfig from a mapping; missing keys fall back to the defaults."""
        policy = PrecisionPolicy.from_value(config_dict.get("precision_policy"))
        workspace = config_dict.get("max_workspace_size", DEFAULT_WORKSPACE_SIZE)
        return cls(precision_policy=policy, max_workspace_size=workspace)


# -----------------------------------------------------------------------------
# Component config (deploy_cfg["components"])
# -----------------------------------------------------------------------------


@dataclass(frozen=True)
class InputSpec:
    """Single input name/dtype for a component."""

    name: str
    dtype: str = "float32"


@dataclass(frozen=True)
class OutputSpec:
    """Single output name/dtype for a component."""

    name: str
    dtype: str = "float32"


@dataclass(frozen=True)
class ComponentIO:
    """I/O specification for a component (inputs, outputs, dynamic_axes)."""

    inputs: List[InputSpec]
    outputs: List[OutputSpec]
    dynamic_axes: Dict[str, Dict[int, str]]


@dataclass(frozen=True)
class ComponentCfg:
    """Configuration for one deployable component.

    The component identifier is the key in deploy_cfg['components']; ``name`` is always set
    from that key.
    """

    name: str
    onnx_file: str
    engine_file: str
    io: ComponentIO
    tensorrt_profile: Dict[str, TensorRTProfileConfig]


@dataclass(frozen=True)
class ComponentsConfig:
    """Component configuration: mapping of component id -> ComponentCfg.

    The dict key is the component identifier (e.g. "model", "pts_voxel_encoder", "pts_backbone_neck_head").
    """

    _components: Mapping[str, ComponentCfg]

    def get_component(self, component_name: str) -> ComponentCfg:
        """Get component config by name. Raises KeyError if not found."""
        if component_name in self._components:
            return self._components[component_name]
        raise KeyError(f"Unknown component: {component_name}. Available: {list(self._components.keys())}")

    def get_artifact_filename(self, component_name: str, file_key: str) -> Optional[str]:
        """Return the attribute ``file_key`` (e.g. onnx_file or engine_file) of a component.

        Raises:
            KeyError: If the component is unknown.
        """
        entry = self._components.get(component_name)
        if entry is not None:
            return getattr(entry, file_key)
        raise KeyError(f"Unknown component: {component_name}. Available: {list(self._components.keys())}")

    def component_names(self) -> Iterable[str]:
        """Iterate over component names."""
        return self._components.keys()

    def items(self) -> Iterable[Tuple[str, ComponentCfg]]:
        """Iterate (name, ComponentCfg) pairs."""
        return self._components.items()

    @staticmethod
    def _validate_dynamic_axes(raw: Any) -> Dict[str, Dict[int, str]]:
        """Type-check a ``dynamic_axes`` mapping and copy it into plain dicts.

        Values are checked, never coerced. ``None`` yields an empty dict.

        Raises:
            TypeError: On the first entry whose type does not match the schema.
        """

        def _check(value: Any, expected: type, what: str) -> None:
            if isinstance(value, expected):
                return
            raise TypeError(f"{what}, got {type(value).__name__}")

        if raw is None:
            return {}
        _check(raw, Mapping, "dynamic_axes must be a dict")

        validated: Dict[str, Dict[int, str]] = {}
        for tensor_name, axes in raw.items():
            _check(tensor_name, str, "dynamic_axes key must be str")
            _check(axes, Mapping, f"dynamic_axes['{tensor_name}'] must be a dict")

            per_tensor: Dict[int, str] = {}
            for idx, label in axes.items():
                _check(idx, int, f"dynamic_axes['{tensor_name}'] axis index must be int")
                _check(label, str, f"dynamic_axes['{tensor_name}'][{idx}] axis name must be str")
                per_tensor[idx] = label
            validated[tensor_name] = per_tensor
        return validated

    @classmethod
    def from_dict(cls, raw: Optional[Mapping[str, Any]]) -> ComponentsConfig:
        """Build ComponentsConfig from deploy_cfg['components'].

        The section is mandatory and must be a non-empty mapping.

        Raises:
            ValueError: If the section is missing or empty.
            TypeError: If the section is not a mapping.
        """
        if raw is None:
            raise ValueError("Missing 'components' section in deploy config.")
        if not isinstance(raw, Mapping):
            raise TypeError(f"components must be a dict, got {type(raw).__name__}")
        if not raw:
            raise ValueError("deploy config 'components' must define at least one component.")

        parsed = {name: cls._parse_component(body, name) for name, body in raw.items()}
        return cls(_components=MappingProxyType(parsed))

    @staticmethod
    def _parse_io_specs(
        raw_specs: Iterable[Any],
        component_name: str,
        io_kind: str,
        spec_cls: type,
    ) -> List[Any]:
        """Turn an ``io.inputs`` / ``io.outputs`` list into typed name/dtype specs.

        Both lists go through the same validation; ``io_kind`` ('inputs' or
        'outputs') is only used in error messages and ``spec_cls`` is the record
        type to instantiate (`InputSpec` or `OutputSpec`).

        Raises:
            KeyError: If an entry is not a mapping or has no 'name'.
            ValueError: If an entry's name is empty or not a string.
        """
        parsed: List[Any] = []
        for position, entry in enumerate(raw_specs):
            where = f"components['{component_name}'].io.{io_kind}[{position}]"
            if not (isinstance(entry, Mapping) and "name" in entry):
                raise KeyError(f"{where} must define 'name'.")
            spec_name = entry["name"]
            if not (spec_name and isinstance(spec_name, str)):
                raise ValueError(f"{where}.name must be a non-empty string.")
            parsed.append(spec_cls(name=spec_name, dtype=entry.get("dtype", "float32")))
        return parsed

    @classmethod
    def _parse_component(cls, comp_raw: Any, component_name: str) -> ComponentCfg:
        """Validate one raw component entry and convert it into a ComponentCfg.

        Checks run in a fixed order (entry type, required keys, io type,
        outputs, inputs, dynamic axes, TensorRT profile), so the first problem
        found determines the exception.
        """
        where = f"components['{component_name}']"

        if not isinstance(comp_raw, Mapping):
            raise TypeError(f"{where} must be a dict, got {type(comp_raw).__name__}")
        for required in ("onnx_file", "engine_file", "io"):
            if required not in comp_raw:
                raise KeyError(f"{where} must define '{required}'.")

        io_raw = comp_raw["io"]
        if not isinstance(io_raw, Mapping):
            raise TypeError(f"{where}.io must be a dict, got {type(io_raw).__name__}")
        # Outputs are checked before inputs, for both presence and content.
        for io_kind in ("outputs", "inputs"):
            if io_kind not in io_raw or not io_raw[io_kind]:
                raise KeyError(f"{where}.io.{io_kind} must be a non-empty list.")
        output_specs = cls._parse_io_specs(io_raw["outputs"], component_name, "outputs", OutputSpec)
        input_specs = cls._parse_io_specs(io_raw["inputs"], component_name, "inputs", InputSpec)
        component_io = ComponentIO(
            inputs=input_specs,
            outputs=output_specs,
            dynamic_axes=cls._validate_dynamic_axes(io_raw.get("dynamic_axes") or {}),
        )

        profile_raw = comp_raw.get("tensorrt_profile") or {}
        if not isinstance(profile_raw, Mapping):
            raise TypeError(f"{where}.tensorrt_profile must be a dict.")
        profiles = {}
        for input_name, shape_cfg in profile_raw.items():
            if not isinstance(shape_cfg, Mapping):
                raise TypeError(
                    f"{where}.tensorrt_profile['{input_name}'] must be a dict, got {type(shape_cfg).__name__}."
                )
            profiles[input_name] = TensorRTProfileConfig.from_dict(shape_cfg)

        return ComponentCfg(
            name=component_name,
            onnx_file=str(comp_raw["onnx_file"]),
            engine_file=str(comp_raw["engine_file"]),
            io=component_io,
            tensorrt_profile=profiles,
        )


# -----------------------------------------------------------------------------
# Evaluation & Verification
# -----------------------------------------------------------------------------


@dataclass(frozen=True)
class EvaluationConfig:
    """Typed configuration for evaluation settings."""

    enabled: bool = False
    num_samples: int = 10
    num_warmup: int = 0
    verbose: bool = False
    backends: Mapping[Any, Mapping[str, Any]] = field(default_factory=_empty_mapping)

    @classmethod
    def from_dict(cls, config_dict: Mapping[str, Any]) -> EvaluationConfig:
        """Build EvaluationConfig from a mapping, wrapping backend settings in read-only views.

        Raises:
            TypeError: If ``backends`` is present but not a mapping.
        """
        backends_raw = config_dict.get("backends", None)
        if backends_raw is None:
            backends_raw = {}
        if not isinstance(backends_raw, Mapping):
            raise TypeError(f"evaluation.backends must be a dict, got {type(backends_raw).__name__}")

        # Copy each backend's settings, then expose both levels as read-only mappings.
        frozen_backends = {}
        for backend_key, settings in backends_raw.items():
            frozen_backends[backend_key] = MappingProxyType(dict(settings))

        return cls(
            enabled=config_dict.get("enabled", False),
            num_samples=config_dict.get("num_samples", 10),
            num_warmup=config_dict.get("num_warmup", 0),
            verbose=config_dict.get("verbose", False),
            backends=MappingProxyType(frozen_backends),
        )


@dataclass(frozen=True)
class VerificationScenario:
    """Immutable verification scenario specification."""

    ref_backend: Backend
    ref_device: DeviceSpec
    test_backend: Backend
    test_device: DeviceSpec

    @classmethod
    def from_dict(cls, data: Mapping[str, Any]) -> VerificationScenario:
        """Build a scenario from a mapping that carries all four backend/device keys.

        Raises:
            ValueError: If any required key is missing.
        """
        missing_keys = {"ref_backend", "ref_device", "test_backend", "test_device"} - data.keys()
        if missing_keys:
            raise ValueError(f"Verification scenario missing keys: {missing_keys}")

        parse_backend = Backend.from_value
        parse_device = DeviceSpec.from_value
        return cls(
            ref_backend=parse_backend(data["ref_backend"]),
            ref_device=parse_device(data["ref_device"]),
            test_backend=parse_backend(data["test_backend"]),
            test_device=parse_device(data["test_device"]),
        )
@dataclass(frozen=True)
class VerificationConfig:
    """Typed, immutable configuration for the verification stage.

    ``scenarios`` maps each export mode to the tuple of reference/test scenarios
    that should be verified for that mode.
    """

    # Master switch for verification; on unless the config disables it.
    enabled: bool = True
    # Number of samples to run through each verification scenario.
    num_verify_samples: int = 3
    # Numeric tolerance handed to the output comparison.
    tolerance: float = 0.1
    # Read-only mapping: export mode -> tuple of scenarios for that mode.
    scenarios: Mapping[ExportMode, Tuple[VerificationScenario, ...]] = field(default_factory=_empty_mapping)

    @classmethod
    def from_dict(cls, config_dict: Mapping[str, Any]) -> VerificationConfig:
        """Build a ``VerificationConfig`` from a raw config mapping.

        Args:
            config_dict: Raw ``verification`` section. Missing scalar keys fall back to
                the dataclass defaults; a missing or ``None`` ``scenarios`` is treated as
                empty, and a ``None`` scenario list for a mode is treated as no scenarios.

        Returns:
            A frozen ``VerificationConfig`` with a read-only ``scenarios`` mapping.

        Raises:
            TypeError: If ``scenarios`` is not a mapping, a per-mode value is not a
                list/tuple, or an individual scenario entry is not a mapping.
            ValueError: Propagated from ``VerificationScenario.from_dict`` when an entry
                is missing required keys (other errors from ``ExportMode.from_value`` or
                the scenario parser propagate unchanged).
        """
        scenarios_raw = config_dict.get("scenarios")
        if scenarios_raw is None:
            scenarios_raw = {}
        if not isinstance(scenarios_raw, Mapping):
            raise TypeError(f"verification.scenarios must be a dict, got {type(scenarios_raw).__name__}")

        scenario_map: Dict[ExportMode, Tuple[VerificationScenario, ...]] = {}
        for mode_key, scenario_list in scenarios_raw.items():
            mode = ExportMode.from_value(mode_key)
            if scenario_list is None:
                scenario_list = []
            elif not isinstance(scenario_list, (list, tuple)):
                raise TypeError(
                    f"verification.scenarios.{mode_key} must be a list or tuple, got {type(scenario_list).__name__}"
                )

            parsed = []
            for index, entry in enumerate(scenario_list):
                # Reject non-dict entries here, with their position, instead of letting
                # the scenario parser fail with an AttributeError on ``.keys()``.
                if not isinstance(entry, Mapping):
                    raise TypeError(
                        f"verification.scenarios.{mode_key}[{index}] must be a dict, got {type(entry).__name__}"
                    )
                parsed.append(VerificationScenario.from_dict(entry))
            scenario_map[mode] = tuple(parsed)

        return cls(
            enabled=config_dict.get("enabled", True),
            num_verify_samples=config_dict.get("num_verify_samples", 3),
            tolerance=config_dict.get("tolerance", 0.1),
            scenarios=MappingProxyType(scenario_map),
        )

    def get_scenarios(self, mode: ExportMode) -> Tuple[VerificationScenario, ...]:
        """Return the scenarios configured for ``mode`` (an empty tuple when none are set)."""
        return self.scenarios.get(mode, ())
def resolve_artifact_path(
    *,
    base_dir: str,
    components_cfg: Optional[Mapping[str, Any]],
    component_name: str,
    file_key: str,
) -> str:
    """Resolve the on-disk path of one component's exported artifact.

    Resolution is strictly config-driven; there is a single supported mode:

    1. ``base_dir`` must be an existing directory (e.g. ``.../onnx`` or ``.../tensorrt``).
    2. ``components_cfg[component_name][file_key]`` must be a non-empty, *relative*
       filename; it is resolved under ``base_dir``.
    3. The resolved path must stay inside ``base_dir`` and must be an existing file.

    This function intentionally does NOT scan directories for matching extensions,
    fall back to default filenames, accept ``base_dir`` as a file path, or accept
    absolute paths in ``components``.

    Args:
        base_dir: Directory that holds the artifacts (onnx_dir or tensorrt_dir).
        components_cfg: The ``components`` section of the deploy config: either a
            mapping of component id -> settings, or an object exposing
            ``get_artifact_filename(component_name, file_key)``. ``None`` is accepted
            by the signature but always results in ``KeyError`` because no filename
            can be looked up.
        component_name: Component id (e.g. ``'model'``, ``'pts_voxel_encoder'``).
        file_key: Key to look up (``'onnx_file'`` or ``'engine_file'``).

    Returns:
        Absolute, resolved path to the artifact file, as a string.

    Raises:
        ValueError: If ``base_dir`` is not a directory, the configured filename is
            absolute, or the resolved path escapes ``base_dir``.
        KeyError: If no filename is configured for ``component_name``/``file_key``.
        FileNotFoundError: If the resolved path is not an existing file.

    Example:
        resolve_artifact_path(
            base_dir="work_dirs/centerpoint/tensorrt",
            components_cfg={"pts_voxel_encoder": {"engine_file": "pts_voxel_encoder.engine"}},
            component_name="pts_voxel_encoder",
            file_key="engine_file",
        )
    """
    base_path = Path(base_dir)
    if not base_path.is_dir():
        raise ValueError(
            "Artifact resolution requires `base_dir` to be a directory. "
            f"Got: {base_dir}. "
            "Set evaluation.backends.<backend>.{model_dir|engine_dir} to the artifact directory, "
            "and set the artifact filename in deploy config under components.*.{onnx_file|engine_file}."
        )

    # The filename must come from the components config; there is no fallback.
    filename = _get_filename_from_config(components_cfg, component_name, file_key)
    if not filename:
        raise KeyError(
            "Missing artifact filename in deploy config. "
            f"Expected components['{component_name}']['{file_key}'] to be set."
        )

    if Path(filename).is_absolute():
        raise ValueError(
            "Absolute artifact paths are not allowed. "
            f"Set components['{component_name}']['{file_key}'] to a relative filename under base_dir instead. "
            f"(got: {filename})"
        )

    base_abs = base_path.resolve(strict=False)
    path = (base_abs / filename).resolve(strict=False)
    # Prevent escaping base_dir via '../' (checked after resolving both sides).
    try:
        path.relative_to(base_abs)
    except ValueError as exc:
        # Chain explicitly so the traceback reads as one deliberate error rather than
        # "another exception occurred during handling".
        raise ValueError(
            "Artifact path must stay within base_dir. "
            f"Got components['{component_name}']['{file_key}']={filename} which resolves to {path} outside {base_abs}."
        ) from exc
    if not path.is_file():
        raise FileNotFoundError(
            f"Configured artifact file not found: {path}. "
            f"(base_dir={base_dir}, component_name={component_name}, file_key={file_key})"
        )
    return str(path)


def _get_filename_from_config(
    components_cfg: Optional[Mapping[str, Any]],
    component_name: str,
    file_key: str,
) -> Optional[str]:
    """Return the configured filename, or ``None`` when it is absent or unusable.

    Supports both a plain mapping and any object that exposes
    ``get_artifact_filename(component_name, file_key)``. Anything that is not a
    non-empty string is reported as ``None`` so the caller raises one uniform error.
    """
    if components_cfg is None:
        return None
    if hasattr(components_cfg, "get_artifact_filename"):
        out = components_cfg.get_artifact_filename(component_name, file_key)
        return out if isinstance(out, str) and out else None
    component_cfg = components_cfg.get(component_name, {})
    if not isinstance(component_cfg, Mapping):
        return None
    filename = component_cfg.get(file_key)
    if isinstance(filename, str) and filename:
        return filename
    return None
class Backend(str, Enum):
    """Closed set of backends a deployment model can run on."""

    PYTORCH = "pytorch"
    ONNX = "onnx"
    TENSORRT = "tensorrt"

    @classmethod
    def from_value(cls, value: Union[str, Backend]) -> Backend:
        """Coerce a config value into a ``Backend`` member.

        Strings are matched after stripping whitespace and lower-casing; an existing
        ``Backend`` is returned unchanged.

        Args:
            value: Backend name or ``Backend`` member.

        Returns:
            The matching ``Backend`` member.

        Raises:
            ValueError: If the string does not name a supported backend.
            TypeError: If ``value`` is neither a string nor a ``Backend``.
        """
        # Members are themselves ``str`` instances, so this check has to come first.
        if isinstance(value, cls):
            return value

        if not isinstance(value, str):
            raise TypeError(f"Backend must be a string or Backend enum, got {type(value)}")

        key = value.strip().lower()
        try:
            member = cls(key)
        except ValueError as exc:
            supported = [candidate.value for candidate in cls]
            raise ValueError(f"Unsupported backend '{value}'. Expected one of {supported}.") from exc
        return member

    @property
    def requires_cuda(self) -> bool:
        """Whether this backend can only run on a CUDA device.

        Single source of truth for the runtime constraint enforced by config validation, evaluation, and verification.
        """
        cuda_only = Backend.TENSORRT
        return self is cuda_only

    def __str__(self) -> str:  # pragma: no cover - convenience for logging
        return self.value
@dataclass(frozen=True)
class DeviceSpec:
    """Validated runtime device representation.

    This type is intentionally runtime-focused (single concrete device), unlike
    config-level `DeviceConfig` which stores defaults and aliases.
    """

    # Device family; only "cpu" and "cuda" are accepted.
    kind: Literal["cpu", "cuda"]
    # CUDA ordinal; always normalised to 0 for CPU.
    index: int = 0

    def __post_init__(self) -> None:
        """Normalise the CPU index and validate the CUDA kind/index."""
        if self.kind == "cpu":
            # The dataclass is frozen, so bypass the generated __setattr__ to normalise.
            object.__setattr__(self, "index", 0)
            return

        if self.kind != "cuda":
            raise ValueError(f"Unsupported device kind '{self.kind}'.")
        if self.index < 0:
            raise ValueError("CUDA device index must be non-negative.")

    @classmethod
    def from_value(cls, value: Union[str, torch.device, DeviceSpec]) -> DeviceSpec:
        """Build a DeviceSpec from a string, a torch.device, or an existing DeviceSpec.

        Accepted strings (case-insensitive, surrounding whitespace ignored) are
        ``"cpu"``, ``"cuda"`` and ``"cuda:<N>"`` where ``<N>`` is a run of ASCII digits.

        Args:
            value: Device descriptor to normalise.

        Returns:
            The equivalent ``DeviceSpec`` (``value`` itself when it already is one).

        Raises:
            ValueError: For an unsupported torch device type, an unrecognised string,
                or a malformed CUDA index.
            TypeError: If ``value`` is none of the supported types.
        """
        if isinstance(value, cls):
            return value

        if isinstance(value, torch.device):
            if value.type == "cuda":
                # A bare torch.device("cuda") has index None; treat it as device 0.
                return cls(kind="cuda", index=0 if value.index is None else int(value.index))
            if value.type == "cpu":
                return cls(kind="cpu", index=0)
            raise ValueError(f"Unsupported torch device type '{value.type}'.")

        if isinstance(value, str):
            normalized = value.strip().lower()
            if normalized == "cpu":
                return cls(kind="cpu", index=0)
            if normalized == "cuda":
                return cls(kind="cuda", index=0)
            if normalized.startswith("cuda:"):
                suffix = normalized.split(":", 1)[1].strip()
                # ``str.isdigit`` alone also accepts non-ASCII digits (e.g. Arabic-Indic
                # numerals or superscripts), which either parse to a surprising index
                # or make ``int()`` fail with a generic message. Require ASCII digits.
                if not (suffix.isascii() and suffix.isdigit()):
                    raise ValueError(f"Invalid CUDA device index in '{value}'.")
                return cls(kind="cuda", index=int(suffix))
            raise ValueError(f"Unrecognized device string '{value}'.")

        raise TypeError(f"Unsupported device value type: {type(value)}")

    @property
    def is_cuda(self) -> bool:
        """Whether this spec refers to a CUDA device."""
        return self.kind == "cuda"

    def to_torch_device(self) -> torch.device:
        """Return torch.device equivalent."""
        return torch.device(str(self))

    def to_ort_provider(self) -> list[str]:
        """Return ONNX Runtime execution providers for this device.

        For CUDA the CPU provider is listed after the CUDA provider.
        """
        if self.is_cuda:
            return ["CUDAExecutionProvider", "CPUExecutionProvider"]
        return ["CPUExecutionProvider"]

    def __str__(self) -> str:
        if self.is_cuda:
            return f"cuda:{self.index}"
        return "cpu"
class BackendExecutor(ABC):
    """Run a backend on a device for one sample (pipeline / input / device handling).

    Holds the loaded reference PyTorch model (set via `set_pytorch_model`), which is
    needed both to move the model onto a device and to build PyTorch/ONNX/TensorRT
    pipelines. Subclasses implement the two task-specific hooks: `create_pipeline`
    and `prepare_input`.
    """

    def __init__(self) -> None:
        # Populated later through set_pytorch_model(); None means "not attached yet".
        self.pytorch_model: Any = None

    def set_pytorch_model(self, pytorch_model: Any) -> None:
        """Attach the loaded PyTorch module used for reference runs and pipeline creation."""
        self.pytorch_model = pytorch_model

    def ensure_model_on_device(self, device: DeviceSpec) -> Any:
        """Ensure ``pytorch_model`` lives on ``device`` (used before infer / pipeline creation).

        The current placement is read from the model's first parameter. A model that
        exposes no parameters cannot be inspected that way, so it is moved
        unconditionally.

        Args:
            device: Target device.

        Returns:
            The (possibly moved) model, which is also stored back on ``pytorch_model``.

        Raises:
            RuntimeError: If no model has been attached via ``set_pytorch_model``.
        """
        if self.pytorch_model is None:
            raise RuntimeError(
                f"{self.__class__.__name__}.pytorch_model is None. "
                "DeploymentRunner must call set_pytorch_model() before verify/evaluate."
            )

        target_device = device.to_torch_device()
        # Use a default so a parameter-less module does not raise a bare StopIteration.
        first_param = next(self.pytorch_model.parameters(), None)
        if first_param is None:
            self.pytorch_model = self.pytorch_model.to(target_device)
            return self.pytorch_model

        current_device = first_param.device
        if current_device != target_device:
            logger.info("Moving PyTorch model from %s to %s", current_device, target_device)
            self.pytorch_model = self.pytorch_model.to(target_device)

        return self.pytorch_model

    def validate_device(self, backend: Backend, device: DeviceSpec) -> DeviceSpec:
        """Validate backend runtime constraints on a concrete DeviceSpec and return it.

        Raises:
            ValueError: If ``backend`` requires CUDA and ``device`` is not a CUDA device.
        """
        if backend.requires_cuda and not device.is_cuda:
            raise ValueError(f"{backend.value} verification requires CUDA, got '{device}'.")
        return device

    def get_output_names(self) -> Optional[List[str]]:
        """Optional names for list/tuple raw outputs during verification comparison.

        Override when the backend's pipeline returns a sequence of tensors with known
        semantic names (e.g. detection heads). The names are forwarded to the
        `~deployment.core.evaluation.output_comparator.OutputComparator` to label
        positions in diagnostic paths.

        Returns:
            Names aligned with output index order, or `None` to fall back to
            `output_0`, `output_1`, ...
        """
        return None

    @abstractmethod
    def create_pipeline(self, model_spec: ModelSpec, device: DeviceSpec) -> BaseInferencePipeline:
        """Create an inference pipeline for ``model_spec.backend`` on ``device``.

        Args:
            model_spec: Backend, device, and artifact path for the deployment model.
            device: Concrete device for this run.

        Returns:
            A ``BaseInferencePipeline`` subclass exposing ``infer()`` and ``cleanup()``.
        """
        raise NotImplementedError

    @abstractmethod
    def prepare_input(
        self,
        sample: Mapping[str, Any],
        data_loader: BaseDataLoader,
        device: DeviceSpec,
    ) -> InferenceInput:
        """Build an `InferenceInput` for ``sample`` on ``device``.

        Verification calls this once per side (reference and test) with each backend's
        own device, so implementations should create tensors directly on ``device``
        rather than relying on downstream moves.

        Args:
            sample: Sample data from the data loader.
            data_loader: Data loader to load the sample from.
            device: Device to prepare the input on.

        Returns:
            InferenceInput containing the actual input data and metadata.
        """
        raise NotImplementedError
+""" + +from __future__ import annotations + +import logging +import math +from dataclasses import dataclass +from typing import List, Optional + +import torch + +from deployment.core.backend import Backend +from deployment.core.device import DeviceSpec +from deployment.core.evaluation.backend_executor import BackendExecutor +from deployment.core.evaluation.evaluator_types import ( + ModelSpec, + VerifyResultDict, +) +from deployment.core.evaluation.output_comparator import ( + OutputComparator, + OutputDiffSummary, + TensorDiffDetail, +) +from deployment.core.io.base_data_loader import BaseDataLoader +from deployment.pipelines.base_pipeline import BaseInferencePipeline + +logger = logging.getLogger(__name__) + + +def _fmt_finite_diff(value: float) -> str: + """Format a diff for logs; ``inf`` is spelled ``inf`` (not ``inf`` via ``%f`` quirks).""" + return "inf" if math.isinf(value) else f"{value:.6f}" + + +@dataclass(frozen=True) +class SampleVerificationResult: + """Result of verifying a single sample. + + Attributes: + sample_idx: Index used with ``data_loader.load_sample``. + passed: Whether reference and test outputs match within tolerance. + max_diff: Maximum absolute difference observed. + mean_diff: Mean absolute difference weighted by element count. + reason: First discovered mismatch description (``None`` when passed). + """ + + sample_idx: int + passed: bool + max_diff: float + mean_diff: float + reason: Optional[str] = None + + +class BackendVerifier: + """Drive a reference vs test verification run over ``N`` samples. + + Args: + executor: `BackendExecutor` providing pipeline creation, input + preparation, and device handling for each side. + comparator: Pure comparator used on each sample's raw outputs. 
+ """ + + def __init__(self, executor: BackendExecutor, comparator: OutputComparator) -> None: + self._executor = executor + self._comparator = comparator + + def run( + self, + reference: ModelSpec, + test: ModelSpec, + data_loader: BaseDataLoader, + num_samples: int, + tolerance: float, + ) -> VerifyResultDict: + """Run verification for `min(num_samples, data_loader.num_samples)` samples. + + Args: + reference: Reference backend model specification. + test: Backend-under-test specification. + data_loader: Same loader used for evaluation. + num_samples: Requested sample count (capped by loader length). + tolerance: Per-element absolute tolerance for numeric comparison. + + Returns: + `VerifyResultDict` with ``summary`` + per-sample pass map. ``error`` + is set when device normalization fails before any inference runs. + """ + results: VerifyResultDict = { + "summary": {"passed": 0, "failed": 0, "total": 0}, + "samples": {}, + } + + try: + ref_device = self._executor.validate_device(reference.backend, reference.device) + test_device = self._executor.validate_device(test.backend, test.device) + except ValueError as exc: + results["error"] = str(exc) + return results + + self._log_header(reference, test, ref_device, test_device, num_samples, tolerance) + + actual_samples = min(num_samples, data_loader.num_samples) + sample_results: List[SampleVerificationResult] = [] + ref_pipeline = None + test_pipeline = None + try: + logger.info("\nInitializing %s reference pipeline...", reference.backend.value) + self._executor.ensure_model_on_device(ref_device) + ref_pipeline = self._executor.create_pipeline(reference, ref_device) + + logger.info("\nInitializing %s test pipeline...", test.backend.value) + self._executor.ensure_model_on_device(test_device) + test_pipeline = self._executor.create_pipeline(test, test_device) + + for i in range(actual_samples): + sr = self._run_single_sample( + i, + ref_pipeline, + test_pipeline, + data_loader, + ref_device, + test_device, + 
reference.backend, + test.backend, + tolerance, + ) + sample_results.append(sr) + results["samples"][f"sample_{i}"] = sr.passed + self._log_sample_result(sr) + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + finally: + for pipeline in (ref_pipeline, test_pipeline): + if pipeline is None: + continue + try: + pipeline.cleanup() + except Exception as e: + logger.warning("Error during pipeline cleanup in verification: %s", e) + + passed_count = sum(1 for r in sample_results if r.passed) + failed_count = sum(1 for r in sample_results if not r.passed) + results["summary"] = { + "passed": passed_count, + "failed": failed_count, + "total": len(sample_results), + } + + self._log_summary(sample_results) + return results + + def _run_single_sample( + self, + sample_idx: int, + ref_pipeline: BaseInferencePipeline, + test_pipeline: BaseInferencePipeline, + data_loader: BaseDataLoader, + ref_device: DeviceSpec, + test_device: DeviceSpec, + ref_backend: Backend, + test_backend: Backend, + tolerance: float, + ) -> SampleVerificationResult: + """Run both pipelines on one sample and compare their raw outputs. + + Each side calls ``prepare_input`` with its own device so that tensors + are created directly on the right device (no post-hoc ``.to(device)`` + shuffling). 
+ """ + executor = self._executor + + logger.info("\n%s", "=" * 60) + logger.info("Verifying sample %s", sample_idx) + logger.info("%s", "=" * 60) + + sample = data_loader.load_sample(sample_idx) + + executor.ensure_model_on_device(ref_device) + ref_input = executor.prepare_input(sample, data_loader, ref_device) + ref_label = f"{ref_backend.value} ({ref_device})" + logger.info("Running %s reference...", ref_label) + ref_result = ref_pipeline.infer( + ref_input.data, + metadata=ref_input.metadata, + return_raw_outputs=True, + ) + logger.info(" %s latency: %.2f ms", ref_label, ref_result.latency_ms) + + executor.ensure_model_on_device(test_device) + test_input = executor.prepare_input(sample, data_loader, test_device) + test_label = f"{test_backend.value} ({test_device})" + logger.info("Running %s test...", test_label) + test_result = test_pipeline.infer( + test_input.data, + metadata=test_input.metadata, + return_raw_outputs=True, + ) + logger.info(" %s latency: %.2f ms", test_label, test_result.latency_ms) + + summary, per_tensor = self._comparator.compare(ref_result.output, test_result.output, tolerance) + self._log_per_output_comparison(test_label, per_tensor, summary) + + return SampleVerificationResult( + sample_idx=sample_idx, + passed=summary.passed, + max_diff=summary.max_diff, + mean_diff=summary.mean_diff, + reason=summary.reason, + ) + + def _log_per_output_comparison( + self, + test_label: str, + per_tensor: List[TensorDiffDetail], + summary: OutputDiffSummary, + ) -> None: + """Emit one line per tensor, then overall max/mean, then a verification line.""" + logger.info("") + for d in per_tensor: + logger.info( + " %s: shape=%s, max_diff=%s, mean_diff=%s", + d.path, + d.shape, + _fmt_finite_diff(d.max_diff), + _fmt_finite_diff(d.mean_diff), + ) + logger.info(" Overall Max difference: %s", _fmt_finite_diff(summary.max_diff)) + logger.info(" Overall Mean difference: %s", _fmt_finite_diff(summary.mean_diff)) + verdict = "PASSED ✓" if summary.passed else 
"FAILED ✗" + logger.info(" %s verification %s", test_label, verdict) + + def _log_header( + self, + reference: ModelSpec, + test: ModelSpec, + ref_device: DeviceSpec, + test_device: DeviceSpec, + num_samples: int, + tolerance: float, + ) -> None: + """Emit a banner with models, devices, sample count and tolerance.""" + logger.info("\n" + "=" * 60) + logger.info("Model Verification") + logger.info("=" * 60) + logger.info("Reference: %s on %s - %s", reference.backend.value, ref_device, reference.artifact.path) + logger.info("Test: %s on %s - %s", test.backend.value, test_device, test.artifact.path) + logger.info("Number of samples: %s", num_samples) + logger.info("Tolerance: %s", tolerance) + logger.info("=" * 60) + + def _log_sample_result(self, result: SampleVerificationResult) -> None: + """Log a single sample's pass/fail verdict plus max/mean diff (and reason on fail).""" + if result.passed: + logger.info( + " sample_%s PASSED ✓ (max_diff=%.6f, mean_diff=%.6f)", + result.sample_idx, + result.max_diff, + result.mean_diff, + ) + else: + logger.warning( + " sample_%s FAILED ✗ (max_diff=%.6f, mean_diff=%.6f) - %s", + result.sample_idx, + result.max_diff, + result.mean_diff, + result.reason or "no diagnostic", + ) + + def _log_summary(self, sample_results: List[SampleVerificationResult]) -> None: + """Log per-sample verdicts then an aggregate pass/fail counter.""" + logger.info("\n" + "=" * 60) + logger.info("Verification Summary") + logger.info("=" * 60) + + for r in sample_results: + status = "PASSED" if r.passed else "FAILED" + logger.info(" sample_%s: %s", r.sample_idx, status) + + total = len(sample_results) + passed = sum(1 for r in sample_results if r.passed) + failed = total - passed + logger.info("=" * 60) + logger.info("Total: %s/%s passed, %s/%s failed", passed, total, failed, total) + logger.info("=" * 60) diff --git a/deployment/core/evaluation/base_evaluator.py b/deployment/core/evaluation/base_evaluator.py new file mode 100644 index 000000000..e22274062 
--- /dev/null +++ b/deployment/core/evaluation/base_evaluator.py @@ -0,0 +1,262 @@ +""" +Base evaluator for model evaluation in deployment. + +All project evaluators should extend `BaseEvaluator` and implement the +required hooks for their specific task. The base class provides: + +- A unified evaluation loop (iterate samples -> infer -> accumulate -> metrics) +- Common utilities (latency stats, model device management) + +Module constants: + + LOG_INTERVAL + Sample interval for verbose progress logs in `BaseEvaluator.evaluate`. +""" + +from __future__ import annotations + +import logging +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Mapping + +import numpy as np +from mmengine.config import Config + +from deployment.core.evaluation.backend_executor import BackendExecutor +from deployment.core.evaluation.evaluator_types import ( + EvalResultDict, + LatencyBreakdown, + LatencyStats, + ModelSpec, +) +from deployment.core.io.base_data_loader import BaseDataLoader +from deployment.core.metrics.base_metrics_interface import BaseMetricsInterface +from deployment.pipelines.base_pipeline import BaseInferencePipeline + +logger = logging.getLogger(__name__) + +# Verbose ``evaluate()`` logs every LOG_INTERVAL samples. +LOG_INTERVAL = 50 + + +class BaseEvaluator(ABC): + """ + Base class for all task-specific evaluators. + + Backend execution (pipeline creation, input preparation, device handling) is + delegated to a `BackendExecutor`. 
Subclasses implement task-specific metrics hooks: + + - _parse_predictions: Convert pipeline output to the format the metrics interface expects + - _parse_ground_truths: Extract ground truth from sample + - _add_to_interface: Feed a single frame to the metrics interface + - _build_results: Construct final results dict from interface metrics + - print_results: Format and display results + """ + + def __init__( + self, + metrics_interface: BaseMetricsInterface, + model_cfg: Config, + executor: BackendExecutor, + ) -> None: + """Wire task metrics, model configuration, and backend executor into the evaluator. + + Args: + metrics_interface: Task-specific metrics accumulator (reset per ``evaluate()`` run). + model_cfg: MMEngine config for the model (class names, heads, etc.). + executor: Backend execution primitives (pipeline / input / device handling), + shared with `BackendVerifier`. + """ + self.metrics_interface = metrics_interface + self.model_cfg = model_cfg + self._executor = executor + + # ================== Abstract Methods (Task-Specific) ================== + + @abstractmethod + def _parse_predictions(self, pipeline_output: Any) -> Any: + """Convert raw pipeline output into the format `_add_to_interface` expects.""" + raise NotImplementedError + + @abstractmethod + def _parse_ground_truths(self, gt_data: Mapping[str, Any]) -> Any: + """Parse `sample["ground_truth"]` into ground-truth structures for metrics.""" + raise NotImplementedError + + @abstractmethod + def _add_to_interface(self, predictions: Any, ground_truths: Any) -> None: + """Feed one sample's predictions and labels into ``metrics_interface``.""" + raise NotImplementedError + + @abstractmethod + def _build_results( + self, + latencies: List[float], + latency_breakdowns: List[Dict[str, float]], + num_samples: int, + ) -> EvalResultDict: + """Aggregate metrics and latencies into the final `EvalResultDict`.""" + raise NotImplementedError + + @abstractmethod + def print_results(self, results: 
def evaluate(
    self,
    model: ModelSpec,
    data_loader: BaseDataLoader,
    num_samples: int,
    verbose: bool = False,
    num_warmup: int = 0,
) -> EvalResultDict:
    """Run inference over samples and compute task metrics via ``metrics_interface``.

    Args:
        model: Backend, device, and artifact for the model under test.
        data_loader: Provides ``load_sample(i)`` with ``ground_truth`` for each sample.
        num_samples: Requested sample count (capped by ``data_loader.num_samples``;
            a negative request is treated as zero).
        verbose: If True, log progress every :data:`LOG_INTERVAL` samples.
        num_warmup: Number of warm-up inferences to run before the measured loop;
            forwarded to ``_run_warmup``.

    Returns:
        Task-specific evaluation dict from ``_build_results``.

    Raises:
        KeyError: If a loaded sample lacks ``"ground_truth"``.
    """
    logger.info("\nEvaluating %s model: %s", model.backend.value, model.artifact.path)
    logger.info("Number of samples: %s", num_samples)

    # Clamp at zero so a negative request never reaches _build_results as a negative count.
    actual_samples = max(0, min(num_samples, data_loader.num_samples))

    latencies: List[float] = []
    latency_breakdowns: List[Dict[str, float]] = []

    # Reset metrics before the pipeline exists: if reset() fails there is nothing to
    # clean up, whereas failing after pipeline creation would leak the pipeline.
    self.metrics_interface.reset()

    self._executor.ensure_model_on_device(model.device)
    pipeline = self._executor.create_pipeline(model, model.device)

    try:
        self._run_warmup(pipeline, data_loader, model, num_warmup, verbose)

        for idx in range(actual_samples):
            if verbose and idx % LOG_INTERVAL == 0:
                logger.info("Processing sample %s/%s", idx + 1, actual_samples)

            sample = data_loader.load_sample(idx)
            # Fail fast on a malformed sample before paying for input preparation.
            if "ground_truth" not in sample:
                raise KeyError("DataLoader.load_sample() must return 'ground_truth' for evaluation.")

            inference_input = self._executor.prepare_input(sample, data_loader, model.device)
            ground_truths = self._parse_ground_truths(sample["ground_truth"])

            infer_result = pipeline.infer(inference_input.data, metadata=inference_input.metadata)
            latencies.append(infer_result.latency_ms)
            # Breakdowns are optional; only samples that report one are recorded.
            if infer_result.breakdown:
                latency_breakdowns.append(infer_result.breakdown)

            predictions = self._parse_predictions(infer_result.output)
            self._add_to_interface(predictions, ground_truths)

            pipeline.periodic_cleanup(idx)
    finally:
        # Cleanup is best-effort: a cleanup failure must not mask the real error/result.
        try:
            pipeline.cleanup()
        except Exception as e:
            logger.warning("Error during pipeline cleanup: %s", e)

    return self._build_results(latencies, latency_breakdowns, actual_samples)
def compute_latency_stats(self, latencies: List[float]) -> LatencyStats:
    """Summarize per-sample latencies (milliseconds) as mean/std/min/max/median.

    Args:
        latencies: Per-inference ``latency_ms`` values. An empty list produces
            ``LatencyStats.empty()``.

    Returns:
        A ``LatencyStats`` built from plain Python floats.
    """
    if latencies:
        samples = np.array(latencies)
        mean_value = float(np.mean(samples))
        std_value = float(np.std(samples))
        lowest = float(np.min(samples))
        highest = float(np.max(samples))
        middle = float(np.median(samples))
        return LatencyStats(
            mean_ms=mean_value,
            std_ms=std_value,
            min_ms=lowest,
            max_ms=highest,
            median_ms=middle,
        )

    # Nothing was measured.
    return LatencyStats.empty()
+ """ + if not latency_breakdowns: + return LatencyBreakdown.empty() + + stage_order = list(dict.fromkeys(stage for bd in latency_breakdowns for stage in bd.keys())) + + return LatencyBreakdown( + stages={ + stage: self.compute_latency_stats([bd[stage] for bd in latency_breakdowns if stage in bd]) + for stage in stage_order + } + ) diff --git a/deployment/core/evaluation/evaluator_types.py b/deployment/core/evaluation/evaluator_types.py new file mode 100644 index 000000000..58fadce88 --- /dev/null +++ b/deployment/core/evaluation/evaluator_types.py @@ -0,0 +1,143 @@ +""" +Type definitions for model evaluation in deployment. + +This module contains the shared type definitions used by evaluators, +runners, and orchestrators. +""" + +from __future__ import annotations + +from dataclasses import asdict, dataclass, field +from typing import Any, Dict, Mapping, Optional, TypedDict + +from deployment.core.artifacts import Artifact +from deployment.core.backend import Backend +from deployment.core.device import DeviceSpec + + +class EvalResultDict(TypedDict, total=False): + """ + Structured evaluation result produced by ``BaseEvaluator._build_results``. + + Every key is optional (``total=False``): each task surfaces only the subset + relevant to it (detection emits ``mAP_*``; classification emits ``accuracy``). + + Attributes: + mAP_by_mode: Detection mAP keyed by evaluation mode/distance bucket. + mAPH_by_mode: Detection mAPH (heading-aware) keyed by mode. + per_class_ap_by_mode: Per-class AP nested by mode. + accuracy: Top-line scalar for classification tasks. + detailed_metrics: Raw task-specific metric payload for deep inspection. + latency: End-to-end latency statistics from ``compute_latency_stats``. + latency_breakdown: Per-stage latency statistics (optional). + num_samples: Number of samples actually evaluated. + error: Set instead of metrics when evaluation failed for this backend. 
class VerifyResultDict(TypedDict, total=False):
    """Verification outcome passed between runners and evaluators.

    All keys are optional (``total=False``).

    Attributes:
        summary: Aggregate pass/fail counts.
        samples: Mapping of sample identifiers to boolean pass/fail states.
        error: Error message string.
    """

    summary: Dict[str, int]
    samples: Dict[str, bool]
    error: str


@dataclass(frozen=True)
class LatencyStats:
    """Frozen bundle of latency statistics, all in milliseconds.

    A typed replacement for loose dictionaries; ``to_dict`` converts back to a
    plain mapping for call sites that still expect one.
    """

    mean_ms: float
    std_ms: float
    min_ms: float
    max_ms: float
    median_ms: float

    @classmethod
    def empty(cls) -> LatencyStats:
        """Build a stats object whose every field is ``0.0``."""
        return cls(mean_ms=0.0, std_ms=0.0, min_ms=0.0, max_ms=0.0, median_ms=0.0)

    def to_dict(self) -> Dict[str, float]:
        """Return the fields as a plain dictionary for serialization."""
        as_mapping = asdict(self)
        return as_mapping


@dataclass(frozen=True)
class LatencyBreakdown:
    """Per-stage latency statistics, keyed by stage name.

    Holds a ``stage -> LatencyStats`` mapping; ``to_dict`` flattens it into nested
    plain dictionaries for dictionary-based consumers.
    """

    stages: Dict[str, LatencyStats]

    @classmethod
    def empty(cls) -> LatencyBreakdown:
        """Build a breakdown with no stages."""
        return cls(stages={})

    def to_dict(self) -> Dict[str, Dict[str, float]]:
        """Return ``{stage: stats-as-dict}`` in the stored stage order."""
        flattened: Dict[str, Dict[str, float]] = {}
        for stage_name, stage_stats in self.stages.items():
            flattened[stage_name] = stage_stats.to_dict()
        return flattened
+ metadata: Sample metadata forwarded to postprocess(). + """ + + data: Any + metadata: Mapping[str, Any] = field(default_factory=dict) + + +@dataclass(frozen=True) +class InferenceResult: + """Standard inference return payload.""" + + output: Any + latency_ms: float + breakdown: Optional[Dict[str, float]] = None + + +@dataclass(frozen=True) +class ModelSpec: + """ + Minimal description of a concrete model artifact to evaluate or verify. + + Attributes: + backend: Backend identifier such as 'pytorch', 'onnx', or 'tensorrt'. + device: Target runtime device. + artifact: Filesystem representation of the produced model. + """ + + backend: Backend + device: DeviceSpec + artifact: Artifact diff --git a/deployment/core/evaluation/output_comparator.py b/deployment/core/evaluation/output_comparator.py new file mode 100644 index 000000000..760c2c246 --- /dev/null +++ b/deployment/core/evaluation/output_comparator.py @@ -0,0 +1,242 @@ +""" +Pure output comparison for model verification. + +This module contains `OutputComparator`, a stateless recursive comparator +for structured model outputs. Deployment verification expects pipeline raw +outputs that are **sequences (list/tuple) of tensors/arrays** and/or tensor +leaves; dict and bare scalar outputs are not handled (they fail with a type +mismatch). + +Naming: + - **OutputDiffSummary**: one object for the **whole output** — whether it + passed, overall max/mean diff (aggregated), and first failure reason. + - **TensorDiffDetail**: one row per **tensor** in the structure — path, + shape, and that tensor's max/mean diff (for per-head logging). + +Design notes: + - No logging here; callers (e.g. ``BackendVerifier``) render logs. + - :meth:`OutputComparator.compare` returns ``(OutputDiffSummary, list of + TensorDiffDetail)`` in a single traversal. 
@dataclass(frozen=True)
class OutputDiffSummary:
    """Rolled-up comparison for an entire structured output (or subtree).

    Use this for pass/fail and **global** max/mean diff. For each tensor's
    own stats, see :class:`TensorDiffDetail`.

    Attributes:
        passed: True if the full structure is within ``tolerance``.
        max_diff: Largest per-tensor max diff anywhere in the tree.
        mean_diff: Element-weighted mean of absolute differences over the tree;
            ``inf`` when a subtree failed before any element could be compared.
        num_elements: Total tensor elements compared (for weighted mean).
        reason: First failing tensor's message, or ``None`` if passed.
    """

    passed: bool
    max_diff: float
    mean_diff: float
    num_elements: int = 0
    reason: Optional[str] = None


@dataclass(frozen=True)
class TensorDiffDetail:
    """Stats for **one tensor** at a path (used for verbose per-output logs).

    Attributes:
        path: Bracket path (e.g. ``output[heatmap]``).
        shape: NumPy shape of the reference tensor.
        max_diff: Max absolute difference on this tensor.
        mean_diff: Mean absolute difference on this tensor.
        passed: Whether this tensor alone satisfies ``tolerance``.
    """

    path: str
    shape: Tuple[int, ...]
    max_diff: float
    mean_diff: float
    passed: bool


class OutputComparator:
    """Recursively compare structured outputs within an absolute tolerance.

    Supported structures are lists/tuples (nested) whose leaves are
    ``torch.Tensor`` or ``numpy.ndarray``; anything else is reported as a type
    mismatch. The comparator does no logging.

    Args:
        output_names: Names aligned with sequence indices; children become
            ``output[name]`` instead of ``output[output_0]``, ``output[output_1]``, ...
    """

    def __init__(self, output_names: Optional[Sequence[str]] = None) -> None:
        self._output_names: Optional[Tuple[str, ...]] = tuple(output_names) if output_names else None

    def compare(
        self,
        reference: Any,
        test: Any,
        tolerance: float,
        path: str = "output",
    ) -> Tuple[OutputDiffSummary, List[TensorDiffDetail]]:
        """Compare two structured outputs in a single traversal.

        Args:
            reference: Reference output.
            test: Output under test, expected to mirror ``reference``.
            tolerance: Maximum allowed absolute element-wise difference.
            path: Label used as the root of every reported path.

        Returns:
            ``(summary, details)``: the rolled-up result and one
            :class:`TensorDiffDetail` per compared tensor leaf.
        """
        tensor_details: List[TensorDiffDetail] = []
        summary = self._compare_nested(reference, test, tolerance, path, tensor_details)
        return summary, tensor_details

    def _compare_nested(
        self,
        reference: Any,
        test: Any,
        tolerance: float,
        path: str,
        tensor_details: List[TensorDiffDetail],
    ) -> OutputDiffSummary:
        """Dispatch on node type; appends one :class:`TensorDiffDetail` per tensor leaf."""
        if reference is None and test is None:
            return OutputDiffSummary(passed=True, max_diff=0.0, mean_diff=0.0)

        if reference is None or test is None:
            return _fail(path, "one side is None while the other is not")

        if isinstance(reference, (list, tuple)) and isinstance(test, (list, tuple)):
            return self._compare_sequences(reference, test, tolerance, path, tensor_details)

        if self._is_array_like(reference) and self._is_array_like(test):
            return self._compare_arrays(reference, test, tolerance, path, tensor_details)

        return _fail(
            path,
            f"type mismatch {type(reference).__name__} vs {type(test).__name__}",
        )

    def _compare_sequences(
        self,
        reference: Union[List, Tuple],
        test: Union[List, Tuple],
        tolerance: float,
        path: str,
        tensor_details: List[TensorDiffDetail],
    ) -> OutputDiffSummary:
        """Compare list/tuple outputs element-wise using ``output_names`` when provided."""
        if len(reference) != len(test):
            return _fail(path, f"length mismatch {len(reference)} vs {len(test)}")

        names = self._output_names

        def _child_summaries():
            for idx, (ref_item, test_item) in enumerate(zip(reference, test)):
                # Fall back to a positional label when no name covers this slot.
                name = names[idx] if names and idx < len(names) else f"output_{idx}"
                yield self._compare_nested(ref_item, test_item, tolerance, f"{path}[{name}]", tensor_details)

        return self._merge_summaries(_child_summaries())

    def _compare_arrays(
        self,
        reference: Any,
        test: Any,
        tolerance: float,
        path: str,
        tensor_details: List[TensorDiffDetail],
    ) -> OutputDiffSummary:
        """Compare tensor/ndarray leaves (same shape required)."""
        ref_np = self._to_numpy(reference)
        test_np = self._to_numpy(test)

        if ref_np.shape != test_np.shape:
            tensor_details.append(
                TensorDiffDetail(
                    path=path,
                    shape=tuple(int(x) for x in ref_np.shape),
                    max_diff=float("inf"),
                    mean_diff=float("inf"),
                    passed=False,
                )
            )
            return _fail(path, f"shape mismatch {ref_np.shape} vs {test_np.shape}")

        # Diff in float64 regardless of the input dtypes.
        diff = np.abs(ref_np.astype(np.float64) - test_np.astype(np.float64))
        # np.max raises on empty arrays; an empty tensor has zero difference.
        max_diff = float(np.max(diff)) if diff.size else 0.0
        mean_diff = float(np.mean(diff)) if diff.size else 0.0
        num_elements = int(diff.size)

        passed = max_diff <= tolerance
        reason = (
            None if passed else f"{path}: max_diff={max_diff:.6f} > tolerance={tolerance:.6f} (shape={ref_np.shape})"
        )
        tensor_details.append(
            TensorDiffDetail(
                path=path,
                shape=tuple(int(x) for x in ref_np.shape),
                max_diff=max_diff,
                mean_diff=mean_diff,
                passed=passed,
            )
        )
        return OutputDiffSummary(
            passed=passed,
            max_diff=max_diff,
            mean_diff=mean_diff,
            num_elements=num_elements,
            reason=reason,
        )

    @staticmethod
    def _merge_summaries(results) -> OutputDiffSummary:
        """Combine child :class:`OutputDiffSummary` values into one rollup.

        ``max_diff`` is the largest child value, ``mean_diff`` is weighted by each
        child's ``num_elements``, and ``reason`` is taken from the first failing
        child. A child that failed without comparing any element (``mean_diff`` of
        ``inf`` with ``num_elements == 0``) makes the merged ``mean_diff`` ``inf``.
        """
        max_diff = 0.0
        total_diff = 0.0
        total_elements = 0
        all_passed = True
        first_reason: Optional[str] = None
        unbounded = False

        for result in results:
            max_diff = max(max_diff, result.max_diff)
            if result.num_elements > 0:
                total_diff += result.mean_diff * result.num_elements
                total_elements += result.num_elements
            elif result.mean_diff == float("inf"):
                # Structural failure: weighting it would compute inf * 0 == nan,
                # so record it separately and report inf below.
                unbounded = True
            if not result.passed and all_passed:
                all_passed = False
                first_reason = result.reason

        if unbounded:
            mean_diff = float("inf")
        elif total_elements > 0:
            mean_diff = total_diff / total_elements
        else:
            mean_diff = 0.0

        return OutputDiffSummary(
            passed=all_passed,
            max_diff=max_diff,
            mean_diff=mean_diff,
            num_elements=total_elements,
            reason=first_reason,
        )

    @staticmethod
    def _is_array_like(obj: Any) -> bool:
        """Return True when ``obj`` is a tensor or ndarray (leaf comparison path)."""
        return isinstance(obj, (torch.Tensor, np.ndarray))

    @staticmethod
    def _to_numpy(tensor: Any) -> np.ndarray:
        """Convert tensors to CPU NumPy arrays; pass through ``ndarray``."""
        if isinstance(tensor, torch.Tensor):
            return tensor.detach().cpu().numpy()
        if isinstance(tensor, np.ndarray):
            return tensor
        return np.array(tensor)


def _fail(path: str, reason: str) -> OutputDiffSummary:
    """Build a failing summary with infinite diffs, zero elements, and ``"<path>: <reason>"``."""
    return OutputDiffSummary(
        passed=False,
        max_diff=float("inf"),
        mean_diff=float("inf"),
        reason=f"{path}: {reason}",
    )
class BaseDataLoader(ABC):
    """Interface every task-specific deployment data loader implements.

    Concrete loaders read raw samples (``load_sample``), turn them into model
    inputs (``preprocess``), and report the dataset size (``num_samples``).
    """

    def __init__(self, config: Optional[Mapping[str, Any]] = None) -> None:
        """Store the loader configuration.

        Args:
            config: Task-specific settings; a falsy value is replaced by ``{}``.
        """
        self.config = config if config else {}

    @abstractmethod
    def load_sample(self, index: int) -> SampleData:
        """Load one raw sample from the dataset.

        Args:
            index: Sample index to load.

        Returns:
            Task-specific sample dictionary, typically holding the raw input,
            ground-truth annotations (if available), and evaluation metadata.

        Raises:
            IndexError: If ``index`` is out of range.
            FileNotFoundError: If the sample's data files do not exist.
        """
        raise NotImplementedError

    @abstractmethod
    def preprocess(self, sample: SampleData) -> Any:
        """Turn a raw sample into the model's input format.

        Args:
            sample: Raw sample data returned by ``load_sample()``.

        Returns:
            Preprocessed model input; type and shape are task-specific.

        Raises:
            ValueError: If the sample format is invalid.
        """
        raise NotImplementedError

    @property
    @abstractmethod
    def num_samples(self) -> int:
        """Total number of samples available in the dataset."""
        raise NotImplementedError

    def get_shape_sample(self, index: int = 0) -> Any:
        """Return a representative preprocessed sample for export shape configuration.

        The sample is loaded with ``load_sample()`` and passed through
        ``preprocess()``. A non-empty list/tuple result is reduced to its first
        element; every other result, including an empty list/tuple, is returned
        unchanged. Subclasses may override this when that rule does not fit.

        Args:
            index: Sample index to use (default: 0).

        Returns:
            The representative sample; its exact type is task-specific.
        """
        prepared = self.preprocess(self.load_sample(index))

        if not isinstance(prepared, (list, tuple)):
            return prepared

        # Only a non-empty sequence has a first element to hand back.
        if len(prepared) > 0:
            return prepared[0]
        return prepared
+ +This module provides the abstract base class that all task-specific metrics interfaces +must implement. It ensures a consistent contract across 3D detection, 2D detection, +and classification tasks. + +All metric interfaces use autoware_perception_evaluation as the underlying computation +engine to ensure consistency between training (T4MetricV2) and deployment evaluation. +""" + +import logging +import re +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, Dict, List, Mapping, Optional + +from perception_eval.common.label import AutowareLabel, Label + +logger = logging.getLogger(__name__) + +# Valid 2D frame IDs for camera-based tasks (2D detection, classification) +VALID_2D_FRAME_IDS = [ + "cam_front", + "cam_front_right", + "cam_front_left", + "cam_front_lower", + "cam_back", + "cam_back_left", + "cam_back_right", + "cam_traffic_light_near", + "cam_traffic_light_far", + "cam_traffic_light", +] + + +@dataclass(frozen=True) +class BaseMetricsConfig: + """Base configuration for all metrics interfaces. + + Attributes: + class_names: List of class names for evaluation. + frame_id: Frame ID for evaluation (e.g., "base_link" for 3D, "camera" for 2D). 
@dataclass(frozen=True)
class ClassificationSummary:
    """Frozen container for classification metrics; every field has a default."""

    accuracy: float = 0.0
    precision: float = 0.0
    recall: float = 0.0
    f1score: float = 0.0
    per_class_accuracy: Dict[str, float] = field(default_factory=dict)
    confusion_matrix: List[List[int]] = field(default_factory=list)
    num_samples: int = 0
    detailed_metrics: Dict[str, float] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a serializable dictionary with copied containers."""
        payload: Dict[str, Any] = {}
        payload["accuracy"] = self.accuracy
        payload["precision"] = self.precision
        payload["recall"] = self.recall
        payload["f1score"] = self.f1score
        # Containers are copied so callers cannot mutate the summary's own data.
        payload["per_class_accuracy"] = dict(self.per_class_accuracy)
        payload["confusion_matrix"] = [list(row) for row in self.confusion_matrix]
        payload["num_samples"] = self.num_samples
        payload["detailed_metrics"] = dict(self.detailed_metrics)
        return payload


@dataclass(frozen=True)
class DetectionSummary:
    """Frozen container for detection metrics (2D/3D).

    ``mAP_by_mode`` and ``mAPH_by_mode`` hold one value per matching mode;
    ``per_class_ap_by_mode`` nests per-class AP under each mode.
    """

    mAP_by_mode: Dict[str, float] = field(default_factory=dict)
    mAPH_by_mode: Dict[str, float] = field(default_factory=dict)
    per_class_ap_by_mode: Dict[str, Dict[str, float]] = field(default_factory=dict)
    num_frames: int = 0
    detailed_metrics: Dict[str, float] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain dictionary with copied (one level deep) containers."""
        per_class: Dict[str, Dict[str, float]] = {}
        for mode, class_scores in self.per_class_ap_by_mode.items():
            per_class[mode] = dict(class_scores)

        payload: Dict[str, Any] = {}
        payload["mAP_by_mode"] = dict(self.mAP_by_mode)
        payload["mAPH_by_mode"] = dict(self.mAPH_by_mode)
        payload["per_class_ap_by_mode"] = per_class
        payload["num_frames"] = self.num_frames
        payload["detailed_metrics"] = dict(self.detailed_metrics)
        return payload
+ + This class defines the common interface that all metric interfaces must implement. + Each interface wraps autoware_perception_evaluation to compute metrics consistent + with training evaluation (T4MetricV2). + + The workflow is: + 1. Create interface with task-specific config + 2. Call reset() to start a new evaluation session + 3. Call add_frame() for each sample + 4. Call compute_metrics() to get final metrics + 5. Optionally call get_summary() for a human-readable summary + + Example: + interface = SomeMetricsInterface(config) + interface.reset() + for pred, gt in data: + interface.add_frame(pred, gt) + metrics = interface.compute_metrics() + """ + + _UNKNOWN = "unknown" + + def __init__(self, config: BaseMetricsConfig) -> None: + """ + Initialize the metrics interface. + + Args: + config: Configuration for the metrics interface. + """ + self.config = config + self.class_names = config.class_names + self.frame_id = config.frame_id + self._frame_count = 0 + + @abstractmethod + def reset(self) -> None: + """ + Reset the interface for a new evaluation session. + + This method should clear all accumulated frame data and reinitialize + the underlying evaluator. + """ + pass + + @abstractmethod + def add_frame(self, *args) -> None: + """ + Add a frame of predictions and ground truths for evaluation. + + The specific arguments depend on the task type: + - 3D Detection: predictions: List[Dict], ground_truths: List[Dict] + - 2D Detection: predictions: List[Dict], ground_truths: List[Dict] + - Classification: prediction: int, ground_truth: int, probabilities: List[float] + """ + pass + + @abstractmethod + def compute_metrics(self) -> Dict[str, float]: + """ + Compute metrics from all added frames. + + Returns: + Dictionary of metric names to values. + """ + pass + + @property + @abstractmethod + def summary(self) -> Any: + """ + Get a summary of the evaluation including primary metrics. + + Returns: + Dictionary with summary metrics and additional information. 
+ """ + pass + + def _convert_index_to_label(self, label_index: int) -> Label: + """Convert a label index to a perception_eval Label object. + + Args: + label_index: Index of the label in class_names. + + Returns: + Label object with AutowareLabel (UNKNOWN for out-of-range indices). + """ + if 0 <= label_index < len(self.class_names): + class_name = self.class_names[label_index] + else: + class_name = self._UNKNOWN + + autoware_label = AutowareLabel.__members__.get(class_name.upper(), AutowareLabel.UNKNOWN) + return Label(label=autoware_label, name=class_name) + + @staticmethod + def _extract_matching_modes(metrics: Mapping[str, float]) -> List[str]: + """Extract matching modes from metric keys, preserving order and de-duping. + + Handles both non-prefixed (``mAP_center_distance_bev``) and prefixed + (``bev_center_0.0-50.0_mAP_center_distance_bev``) key formats. + """ + pat = re.compile(r"(?:^|_)mAP_(.+)$") + modes: List[str] = [] + for key in metrics.keys(): + match = pat.search(key) + if match: + modes.append(match.group(1)) + return list(dict.fromkeys(modes)) + + def format_metrics_report(self) -> Optional[str]: + """Format the metrics report as a human-readable string. + + This is an optional method that can be overridden by subclasses to provide + task-specific formatting. By default, returns None. + + Returns: + Formatted metrics report string. None if not implemented. + """ + return None diff --git a/deployment/core/metrics/classification_metrics.py b/deployment/core/metrics/classification_metrics.py new file mode 100644 index 000000000..9df68df6a --- /dev/null +++ b/deployment/core/metrics/classification_metrics.py @@ -0,0 +1,368 @@ +""" +Classification Metrics Interface using autoware_perception_evaluation. + +This module provides an interface to compute classification metrics (accuracy, precision, +recall, F1) using autoware_perception_evaluation, ensuring consistent metrics between +training evaluation and deployment evaluation. 
+ +Usage: + config = ClassificationMetricsConfig( + class_names=["miscalibrated", "calibrated"], + ) + interface = ClassificationMetricsInterface(config) + + for pred_label, gt_label in zip(predictions, ground_truths): + interface.add_frame(prediction=pred_label, ground_truth=gt_label) + + metrics = interface.compute_metrics() + # Returns: {"accuracy": 0.95, "precision": 0.94, "recall": 0.96, "f1score": 0.95, ...} +""" + +import logging +import time +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np +from perception_eval.common.dataset import FrameGroundTruth +from perception_eval.common.object2d import DynamicObject2D +from perception_eval.common.schema import FrameID +from perception_eval.config.perception_evaluation_config import PerceptionEvaluationConfig +from perception_eval.evaluation.metrics import MetricsScore +from perception_eval.evaluation.result.perception_frame_config import ( + CriticalObjectFilterConfig, + PerceptionPassFailConfig, +) +from perception_eval.manager import PerceptionEvaluationManager + +from deployment.core.metrics.base_metrics_interface import ( + VALID_2D_FRAME_IDS, + BaseMetricsConfig, + BaseMetricsInterface, + ClassificationSummary, +) + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class ClassificationMetricsConfig(BaseMetricsConfig): + """Configuration for classification metrics. + + Attributes: + class_names: List of class names for evaluation. + frame_id: Camera frame ID for evaluation (default: "cam_front"). + evaluation_config_dict: Configuration dict for perception evaluation. + critical_object_filter_config: Config for filtering critical objects. + frame_pass_fail_config: Config for pass/fail criteria. 
+ """ + + frame_id: str = "cam_front" + evaluation_config_dict: Optional[Dict[str, Any]] = None + critical_object_filter_config: Optional[Dict[str, Any]] = None + frame_pass_fail_config: Optional[Dict[str, Any]] = None + + def __post_init__(self): + if self.frame_id not in VALID_2D_FRAME_IDS: + raise ValueError( + f"Invalid frame_id '{self.frame_id}' for classification. " f"Valid options: {VALID_2D_FRAME_IDS}" + ) + + if self.evaluation_config_dict is None: + object.__setattr__( + self, + "evaluation_config_dict", + { + "evaluation_task": "classification2d", + "target_labels": self.class_names, + "center_distance_thresholds": None, + "center_distance_bev_thresholds": None, + "plane_distance_thresholds": None, + "iou_2d_thresholds": None, + "iou_3d_thresholds": None, + "label_prefix": "autoware", + }, + ) + + if self.critical_object_filter_config is None: + object.__setattr__( + self, + "critical_object_filter_config", + { + "target_labels": self.class_names, + "ignore_attributes": None, + }, + ) + + if self.frame_pass_fail_config is None: + object.__setattr__( + self, + "frame_pass_fail_config", + { + "target_labels": self.class_names, + "matching_threshold_list": [1.0] * len(self.class_names), + "confidence_threshold_list": None, + }, + ) + + +class ClassificationMetricsInterface(BaseMetricsInterface): + """Interface for computing classification metrics using autoware_perception_evaluation. + + Metrics computed: + - Accuracy: TP / (num_predictions + num_gt - TP) + - Precision: TP / (TP + FP) + - Recall: TP / num_gt + - F1 Score: 2 * precision * recall / (precision + recall) + - Per-class accuracy, precision, recall, F1 + """ + + def __init__( + self, + config: ClassificationMetricsConfig, + data_root: str = "data/t4dataset/", + result_root_directory: str = "/tmp/perception_eval_classification/", + ) -> None: + """Initialize the classification metrics interface. + + Args: + config: Configuration for classification metrics. 
def add_frame(
    self,
    prediction: int,
    ground_truth: int,
    probabilities: Optional[List[float]] = None,
    frame_name: Optional[str] = None,
) -> None:
    """Register one classified sample (prediction plus ground truth) with the evaluator.

    The evaluator is created on first use through ``reset()``. Any exception
    raised while handing the sample to the evaluator is logged as a warning
    and the sample is not counted.

    Args:
        prediction: Predicted class index.
        ground_truth: Ground truth class index.
        probabilities: Optional per-class probabilities; when it covers
            ``prediction`` the matching entry is used as the confidence score,
            otherwise the score is 1.0.
        frame_name: Optional sample name; defaults to the running frame count.
    """
    if self.evaluator is None:
        self.reset()

    # One microsecond timestamp shared by the prediction and its ground truth.
    timestamp_us = int(time.time() * 1e6)
    sample_name = str(self._frame_count) if frame_name is None else frame_name

    has_score = probabilities is not None and 0 <= prediction < len(probabilities)
    confidence = float(probabilities[prediction]) if has_score else 1.0

    # Image-level classification: both objects are built without an ROI.
    predicted_object = self._create_dynamic_object_2d(
        label_index=prediction, unix_time=timestamp_us, score=confidence, uuid=sample_name
    )
    expected_object = self._create_dynamic_object_2d(
        label_index=ground_truth, unix_time=timestamp_us, score=1.0, uuid=sample_name
    )

    ground_truth_frame = FrameGroundTruth(
        unix_time=timestamp_us,
        frame_name=sample_name,
        objects=[expected_object],
        transforms=None,
        raw_data=None,
    )

    try:
        self.evaluator.add_frame_result(
            unix_time=timestamp_us,
            ground_truth_now_frame=ground_truth_frame,
            estimated_objects=[predicted_object],
            critical_object_filter_config=self.critical_object_filter_config,
            frame_pass_fail_config=self.frame_pass_fail_config,
        )
        self._frame_count += 1
    except Exception as error:
        # Best effort: a single bad sample must not abort the whole evaluation.
        logger.warning("Failed to add frame %s: %s", sample_name, error)
+ """ + if self.evaluator is None or self._frame_count == 0: + logger.warning("No samples to evaluate") + return {} + + try: + metrics_score: MetricsScore = self.evaluator.get_scene_result() + return self._process_metrics_score(metrics_score) + except Exception: + logger.exception("Error computing metrics") + return {} + + @staticmethod + def _summarize_classification_score(classification_score: Any) -> Tuple[float, float, float, float]: + """Read overall (accuracy, precision, recall, f1) from a perception_eval score.""" + summarize = getattr(classification_score, "_summarize", None) + if not callable(summarize): + raise AttributeError( + "perception_eval classification score no longer exposes '_summarize'; " + "update ClassificationMetricsInterface to the current perception_eval API." + ) + return summarize() + + @staticmethod + def _finite_or_zero(value: float) -> float: + """Coerce inf/nan (e.g. from empty divisions or 0/0) to 0.0.""" + return float(value) if np.isfinite(value) else 0.0 + + def _process_metrics_score(self, metrics_score: MetricsScore) -> Dict[str, float]: + """Process MetricsScore into a flat dictionary.""" + metric_dict = {} + + for classification_score in metrics_score.classification_scores: + # Get overall metrics + accuracy, precision, recall, f1score = self._summarize_classification_score(classification_score) + metric_dict["accuracy"] = self._finite_or_zero(accuracy) + metric_dict["precision"] = self._finite_or_zero(precision) + metric_dict["recall"] = self._finite_or_zero(recall) + metric_dict["f1score"] = self._finite_or_zero(f1score) + + # Process per-class metrics + for acc in classification_score.accuracies: + if not acc.target_labels: + continue + + target_label = acc.target_labels[0] + class_name = getattr(target_label, "name", str(target_label)) + + metric_dict[f"{class_name}_accuracy"] = self._finite_or_zero(acc.accuracy) + metric_dict[f"{class_name}_precision"] = self._finite_or_zero(acc.precision) + 
# TODO(vividf): Remove after autoware_perception_evaluation supports confusion matrix.
@property
def confusion_matrix(self) -> np.ndarray:
    """Build the confusion matrix from the evaluator's stored frame results.

    Labels are matched to ``self.class_names`` case-insensitively. Object
    results without a ground truth object, or whose predicted / ground truth
    label is not one of ``self.class_names``, are ignored.

    Returns:
        2D integer numpy array where cm[i][j] = count of ground truth i
        predicted as j. All zeros when no frame has been evaluated.
    """
    num_classes = len(self.class_names)
    matrix = np.zeros((num_classes, num_classes), dtype=int)
    if self.evaluator is None or self._frame_count == 0:
        return matrix

    # Build the case-insensitive name -> index table once instead of scanning
    # class_names twice for every object result. setdefault keeps the first
    # index when two class names only differ by case.
    index_by_name: Dict[str, int] = {}
    for class_index, class_name in enumerate(self.class_names):
        index_by_name.setdefault(class_name.lower(), class_index)

    for frame_result in self.evaluator.frame_results:
        for obj_result in frame_result.object_results or ():
            if obj_result.ground_truth_object is None:
                continue

            pred_idx = index_by_name.get(obj_result.estimated_object.semantic_label.name.lower())
            gt_idx = index_by_name.get(obj_result.ground_truth_object.semantic_label.name.lower())

            if pred_idx is not None and gt_idx is not None:
                matrix[gt_idx, pred_idx] += 1

    return matrix
+ """ + metrics = self.compute_metrics() + + if not metrics: + return ClassificationSummary() + + per_class_accuracy = { + name: metrics[f"{name}_accuracy"] for name in self.class_names if f"{name}_accuracy" in metrics + } + + return ClassificationSummary( + accuracy=metrics.get("accuracy", 0.0), + precision=metrics.get("precision", 0.0), + recall=metrics.get("recall", 0.0), + f1score=metrics.get("f1score", 0.0), + per_class_accuracy=per_class_accuracy, + confusion_matrix=self.confusion_matrix.tolist(), + num_samples=self._frame_count, + detailed_metrics=metrics, + ) diff --git a/deployment/core/metrics/detection_2d_metrics.py b/deployment/core/metrics/detection_2d_metrics.py new file mode 100644 index 000000000..ed582b800 --- /dev/null +++ b/deployment/core/metrics/detection_2d_metrics.py @@ -0,0 +1,472 @@ +""" +2D Detection Metrics Interface using autoware_perception_evaluation. + +This module provides an interface to compute 2D detection metrics (mAP) +using autoware_perception_evaluation in 2D mode, ensuring consistent metrics +between training evaluation and deployment evaluation. 
+ +For 2D detection, the interface uses: +- IoU 2D thresholds for matching (e.g., 0.5, 0.75) +- Only AP is computed (no APH since there's no heading in 2D) + +Usage: + config = Detection2DMetricsConfig( + class_names=["car", "truck", "bus", "bicycle", "pedestrian", "motorcycle", "trailer", "unknown"], + frame_id="cam_front", + ) + interface = Detection2DMetricsInterface(config) + + # Add frames + for pred, gt in zip(predictions_list, ground_truths_list): + interface.add_frame( + predictions=pred, # List[Dict] with bbox (x1,y1,x2,y2), label, score + ground_truths=gt, # List[Dict] with bbox (x1,y1,x2,y2), label + ) + + # Compute metrics + metrics = interface.compute_metrics() + # Returns: {"mAP_iou_2d": 0.7, "car_AP_iou_2d_0.5": 0.75, "car_AP_iou_2d_0.75": 0.65, ...} +""" + +import logging +import time +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +import numpy as np +from perception_eval.common.dataset import FrameGroundTruth +from perception_eval.common.object2d import DynamicObject2D +from perception_eval.common.schema import FrameID +from perception_eval.config.perception_evaluation_config import PerceptionEvaluationConfig +from perception_eval.evaluation.metrics import MetricsScore +from perception_eval.evaluation.result.perception_frame_config import ( + CriticalObjectFilterConfig, + PerceptionPassFailConfig, +) +from perception_eval.manager import PerceptionEvaluationManager + +from deployment.core.metrics.base_metrics_interface import ( + VALID_2D_FRAME_IDS, + BaseMetricsConfig, + BaseMetricsInterface, + DetectionSummary, +) + +logger = logging.getLogger(__name__) + + +@dataclass(frozen=True) +class Detection2DMetricsConfig(BaseMetricsConfig): + """Configuration for 2D detection metrics. + + Attributes: + class_names: List of class names for evaluation. + frame_id: Frame ID for evaluation. 
Valid values for 2D: + "cam_front", "cam_front_right", "cam_front_left", "cam_front_lower", + "cam_back", "cam_back_left", "cam_back_right", + "cam_traffic_light_near", "cam_traffic_light_far", "cam_traffic_light" + iou_thresholds: List of IoU thresholds for evaluation. + evaluation_config_dict: Configuration dict for perception evaluation. + critical_object_filter_config: Config for filtering critical objects. + frame_pass_fail_config: Config for pass/fail criteria. + """ + + # Override default frame_id for 2D detection (camera frame instead of base_link) + frame_id: str = "cam_front" + iou_thresholds: List[float] = field(default_factory=lambda: [0.5, 0.75]) + evaluation_config_dict: Optional[Dict[str, Any]] = None + critical_object_filter_config: Optional[Dict[str, Any]] = None + frame_pass_fail_config: Optional[Dict[str, Any]] = None + + def __post_init__(self): + # Validate frame_id for 2D detection + if self.frame_id not in VALID_2D_FRAME_IDS: + raise ValueError( + f"Invalid frame_id '{self.frame_id}' for 2D detection. 
Valid options: {VALID_2D_FRAME_IDS}" + ) + + # Set default evaluation config if not provided + if self.evaluation_config_dict is None: + default_eval_config = { + "evaluation_task": "detection2d", + "target_labels": self.class_names, + "iou_2d_thresholds": self.iou_thresholds, + "center_distance_bev_thresholds": None, + "plane_distance_thresholds": None, + "iou_3d_thresholds": None, + "label_prefix": "autoware", + } + object.__setattr__(self, "evaluation_config_dict", default_eval_config) + + # Set default critical object filter config if not provided + if self.critical_object_filter_config is None: + default_filter_config = { + "target_labels": self.class_names, + "ignore_attributes": None, + } + object.__setattr__(self, "critical_object_filter_config", default_filter_config) + + # Set default frame pass fail config if not provided + if self.frame_pass_fail_config is None: + num_classes = len(self.class_names) + default_pass_fail_config = { + "target_labels": self.class_names, + "matching_threshold_list": [0.5] * num_classes, + "confidence_threshold_list": None, + } + object.__setattr__(self, "frame_pass_fail_config", default_pass_fail_config) + + +class Detection2DMetricsInterface(BaseMetricsInterface): + """ + Interface for computing 2D detection metrics using autoware_perception_evaluation. + + This interface provides a simplified interface for the deployment framework to + compute mAP for 2D object detection tasks (YOLOX, etc.). 
+ + Unlike 3D detection, 2D detection: + - Uses IoU 2D for matching (based on bounding box overlap) + - Does not compute APH (no heading information in 2D) + - Works with image-space bounding boxes [x1, y1, x2, y2] + + Example usage: + config = Detection2DMetricsConfig( + class_names=["car", "truck", "bus", "bicycle", "pedestrian"], + iou_thresholds=[0.5, 0.75], + ) + interface = Detection2DMetricsInterface(config) + + # Add frames + for pred, gt in zip(predictions_list, ground_truths_list): + interface.add_frame( + predictions=pred, # List[Dict] with bbox, label, score + ground_truths=gt, # List[Dict] with bbox, label + ) + + # Compute metrics + metrics = interface.compute_metrics() + """ + + def __init__( + self, + config: Detection2DMetricsConfig, + data_root: str = "data/t4dataset/", + result_root_directory: str = "/tmp/perception_eval_2d/", + ) -> None: + """ + Initialize the 2D detection metrics interface. + + Args: + config: Configuration for 2D detection metrics. + data_root: Root directory of the dataset. + result_root_directory: Directory for saving evaluation results. 
@staticmethod
def _bbox_to_roi(bbox: Any, source: str) -> Optional[tuple]:
    """Validate an ``[x1, y1, x2, y2]`` box and convert it to an integer ROI.

    Args:
        bbox: Box coordinates in image space; extra trailing values are ignored.
        source: Human-readable origin used in log messages
            ("prediction" or "ground truth").

    Returns:
        ``(xmin, ymin, width, height)`` as required by ``DynamicObject2D.roi``,
        or ``None`` when the box has fewer than four values, contains NaN/inf,
        or is degenerate/inverted (``x2 <= x1`` or ``y2 <= y1``).
    """
    if len(bbox) < 4:
        return None

    x1, y1, x2, y2 = bbox[0], bbox[1], bbox[2], bbox[3]

    # NaN compares False against everything, so it would slip past the
    # degenerate check below and make int() raise; reject it explicitly.
    if not np.all(np.isfinite([x1, y1, x2, y2])):
        logger.warning("Skipping %s with non-finite bbox: %s", source, bbox)
        return None

    if x2 <= x1 or y2 <= y1:
        logger.warning("Skipping %s with degenerate bbox: %s", source, bbox)
        return None

    return int(x1), int(y1), int(x2 - x1), int(y2 - y1)

def _predictions_to_dynamic_objects_2d(
    self,
    predictions: List[Dict[str, Any]],
    unix_time: int,
) -> List[DynamicObject2D]:
    """Convert prediction dicts to DynamicObject2D instances.

    Predictions whose bbox is missing, too short, non-finite or degenerate
    are skipped (the last two with a warning).

    Args:
        predictions: List of prediction dicts with keys:
            - bbox: [x1, y1, x2, y2] (image coordinates)
            - label: int (class index, defaults to 0)
            - score: float (confidence score, defaults to 0.0)
        unix_time: Unix timestamp in microseconds.

    Returns:
        List of DynamicObject2D instances.
    """
    estimated_objects = []
    frame_id = FrameID.from_value(self.frame_id)

    for pred in predictions:
        roi = self._bbox_to_roi(pred.get("bbox", []), "prediction")
        if roi is None:
            continue

        semantic_label = self._convert_index_to_label(int(pred.get("label", 0)))
        score = float(pred.get("score", 0.0))

        estimated_objects.append(
            DynamicObject2D(
                unix_time=unix_time,
                frame_id=frame_id,
                semantic_score=score,
                semantic_label=semantic_label,
                roi=roi,  # (xmin, ymin, width, height)
                uuid=None,
            )
        )

    return estimated_objects

def _ground_truths_to_dynamic_objects_2d(
    self,
    ground_truths: List[Dict[str, Any]],
    unix_time: int,
) -> List[DynamicObject2D]:
    """Convert ground truth dicts to DynamicObject2D instances.

    Ground truths whose bbox is missing, too short, non-finite or degenerate
    are skipped (the last two with a warning).

    Args:
        ground_truths: List of ground truth dicts with keys:
            - bbox: [x1, y1, x2, y2] (image coordinates)
            - label: int (class index, defaults to 0)
        unix_time: Unix timestamp in microseconds.

    Returns:
        List of DynamicObject2D instances.
    """
    gt_objects = []
    frame_id = FrameID.from_value(self.frame_id)

    for gt in ground_truths:
        roi = self._bbox_to_roi(gt.get("bbox", []), "ground truth")
        if roi is None:
            continue

        semantic_label = self._convert_index_to_label(int(gt.get("label", 0)))

        gt_objects.append(
            DynamicObject2D(
                unix_time=unix_time,
                frame_id=frame_id,
                semantic_score=1.0,  # GT always has score 1.0
                semantic_label=semantic_label,
                roi=roi,  # (xmin, ymin, width, height)
                uuid=None,
            )
        )

    return gt_objects
def compute_metrics(self) -> Dict[str, float]:
    """Aggregate every frame added so far into a flat metric dictionary.

    Returns:
        The dictionary produced by ``_process_metrics_score``, with keys of
        the form:
            - mAP_<matching_mode> (one mAP per matching mode, no threshold
              suffix; presumably ``mAP_iou_2d`` for 2D IoU matching)
            - <label>_AP_<matching_mode>_<threshold> (per-class AP, e.g.
              ``car_AP_iou_2d_0.5``)
        An empty dict is returned when no frame has been added, or when the
        evaluation raises (the error is logged with its traceback).
    """
    nothing_to_evaluate = self.evaluator is None or self._frame_count == 0
    if nothing_to_evaluate:
        logger.warning("No frames to evaluate")
        return {}

    try:
        # Scene result = metrics aggregated over all frames given to the evaluator.
        scene_score: MetricsScore = self.evaluator.get_scene_result()
        return self._process_metrics_score(scene_score)
    except Exception:
        logger.exception("Error computing metrics")
        return {}
@property
def summary(self) -> DetectionSummary:
    """Summarise the evaluation: mAP and mean per-class AP for every matching mode.

    Per-class AP is the mean over all thresholds reported for that class and
    mode. ``mAPH_by_mode`` is always empty because 2D detection has no heading.
    """
    metrics = self.compute_metrics()

    mean_ap_by_mode: Dict[str, float] = {}
    class_ap_by_mode: Dict[str, Dict[str, float]] = {}

    # With no matching modes the loop is skipped and an empty summary results.
    for mode in self._extract_matching_modes(metrics) or []:
        mean_ap_by_mode[mode] = float(metrics.get(f"mAP_{mode}", 0.0))

        # Group "<class>_AP_<mode>_<threshold>" values by class name.
        infix = f"_AP_{mode}_"
        values_by_class: Dict[str, List[float]] = {}
        for key, value in metrics.items():
            if infix in key and not key.startswith("mAP"):
                values_by_class.setdefault(key.split("_AP_", 1)[0], []).append(float(value))

        if values_by_class:
            class_ap_by_mode[mode] = {
                class_name: float(np.mean(values)) for class_name, values in values_by_class.items()
            }

    return DetectionSummary(
        mAP_by_mode=mean_ap_by_mode,
        mAPH_by_mode={},  # 2D detection doesn't have mAPH
        per_class_ap_by_mode=class_ap_by_mode,
        num_frames=self._frame_count,
        detailed_metrics=metrics,
    )
@dataclass(frozen=True)
class Detection3DMetricsConfig(BaseMetricsConfig):
    """Configuration for 3D detection metrics.

    Every config left as ``None`` is replaced in ``__post_init__`` by a default
    derived from ``class_names``.

    Attributes:
        class_names: List of class names for evaluation.
        frame_id: Frame ID for evaluation (e.g., "base_link").
        evaluation_config_dict: Configuration dict for perception evaluation.
            Example:
                {
                    "evaluation_task": "detection",
                    "target_labels": ["car", "truck", "bus", "bicycle", "pedestrian"],
                    "center_distance_bev_thresholds": [0.5, 1.0, 2.0, 4.0],
                    "plane_distance_thresholds": [2.0, 4.0],
                    "iou_2d_thresholds": None,
                    "iou_3d_thresholds": None,
                    "label_prefix": "autoware",
                    "max_distance": 121.0,
                    "min_distance": -121.0,
                    "min_point_numbers": 0,
                }
        critical_object_filter_config: Config for filtering critical objects.
            Example:
                {
                    "target_labels": ["car", "truck", "bus", "bicycle", "pedestrian"],
                    "ignore_attributes": None,
                    "max_distance_list": [121.0, 121.0, 121.0, 121.0, 121.0],
                    "min_distance_list": [-121.0, -121.0, -121.0, -121.0, -121.0],
                }
        frame_pass_fail_config: Config for pass/fail criteria.
            Example:
                {
                    "target_labels": ["car", "truck", "bus", "bicycle", "pedestrian"],
                    "matching_threshold_list": [2.0, 2.0, 2.0, 2.0, 2.0],
                    "confidence_threshold_list": None,
                }
    """

    evaluation_config_dict: Optional[Dict[str, Any]] = None
    critical_object_filter_config: Optional[Dict[str, Any]] = None
    frame_pass_fail_config: Optional[Dict[str, Any]] = None

    def __post_init__(self):
        """Fill in each config that was not provided by the caller."""

        def fill_if_missing(field_name, make_default):
            # Frozen dataclass: assignment has to go through object.__setattr__.
            if getattr(self, field_name) is None:
                object.__setattr__(self, field_name, make_default())

        fill_if_missing(
            "evaluation_config_dict",
            lambda: {
                "evaluation_task": "detection",
                "target_labels": self.class_names,
                "center_distance_bev_thresholds": [0.5, 1.0, 2.0, 4.0],
                "plane_distance_thresholds": [2.0, 4.0],
                "iou_2d_thresholds": None,
                "iou_3d_thresholds": None,
                "label_prefix": "autoware",
                "max_distance": 121.0,
                "min_distance": -121.0,
                "min_point_numbers": 0,
            },
        )
        fill_if_missing(
            "critical_object_filter_config",
            lambda: {
                "target_labels": self.class_names,
                "ignore_attributes": None,
                "max_distance_list": [121.0] * len(self.class_names),
                "min_distance_list": [-121.0] * len(self.class_names),
            },
        )
        fill_if_missing(
            "frame_pass_fail_config",
            lambda: {
                "target_labels": self.class_names,
                "matching_threshold_list": [2.0] * len(self.class_names),
                "confidence_threshold_list": None,
            },
        )
def __init__(
    self,
    config: Detection3DMetricsConfig,
    data_root: str = "data/t4dataset/",
    result_root_directory: str = "/tmp/perception_eval/",
) -> None:
    """
    Initialize the 3D detection metrics interface.

    ``min_distance`` / ``max_distance`` from ``config.evaluation_config_dict``
    may both be scalars (one distance range) or both be lists of equal,
    non-zero length (one evaluator configuration per range).

    Args:
        config: Configuration for 3D detection metrics.
        data_root: Root directory of the dataset.
        result_root_directory: Directory for saving evaluation results.

    Raises:
        TypeError: If ``evaluation_config_dict`` is not a mapping.
        ValueError: If the distance bounds are neither both scalars nor both
            lists, differ in length, or are empty.
    """
    super().__init__(config)
    self.data_root = data_root
    self.result_root_directory = result_root_directory

    raw_cfg = config.evaluation_config_dict
    cfg_dict = {} if raw_cfg is None else raw_cfg
    if not isinstance(cfg_dict, Mapping):
        raise TypeError(f"evaluation_config_dict must be a dict, got {type(cfg_dict).__name__}")

    # Distance bounds drive how many evaluators are configured (like T4MetricV2).
    min_distance = cfg_dict.get("min_distance")
    max_distance = cfg_dict.get("max_distance")

    both_scalars = isinstance(min_distance, (int, float)) and isinstance(max_distance, (int, float))
    both_lists = isinstance(min_distance, list) and isinstance(max_distance, list)

    if both_scalars:
        # Normalise the single-range case to one-element lists.
        min_distance = [float(min_distance)]
        max_distance = [float(max_distance)]
    elif not both_lists:
        raise ValueError(
            "min_distance and max_distance must be either scalars (int/float) or lists for multi-evaluator mode. "
            f"Got min_distance={type(min_distance)}, max_distance={type(max_distance)}"
        )

    if len(min_distance) != len(max_distance):
        raise ValueError(
            "min_distance and max_distance must have the same length. "
            f"Got len(min_distance)={len(min_distance)}, len(max_distance)={len(max_distance)}"
        )

    # Lengths are equal at this point, so checking one list covers both.
    if not min_distance:
        raise ValueError("min_distance and max_distance lists cannot be empty")

    self._bev_distance_ranges = list(zip(min_distance, max_distance))
    self.evaluators: Dict[str, Dict[str, Any]] = {}
    self._create_evaluators(config)

    # Per-frame objects are buffered here and replayed per distance range at
    # compute time; see reset()/compute_metrics for the memory rationale.
    self._frames: List[Tuple[float, List[DynamicObject], FrameGroundTruth]] = []
    self._cached_all_metrics: Optional[Dict[str, float]] = None
    self._last_metrics_by_eval_name: Dict[str, MetricsScore] = {}
def _predictions_to_dynamic_objects(
    self,
    predictions: List[Dict[str, Any]],
    unix_time: float,
) -> List[DynamicObject]:
    """Convert prediction dicts to DynamicObject instances.

    Predictions with fewer than seven box values are skipped silently;
    predictions whose pose or size contains NaN/inf are skipped with a warning.

    Args:
        predictions: List of prediction dicts with keys:
            - bbox_3d: [x, y, z, l, w, h, yaw] or [x, y, z, l, w, h, yaw, vx, vy]
              (Same format as mmdet3d LiDARInstance3DBoxes)
            - label: int (class index, defaults to 0)
            - score: float (confidence score, defaults to 0.0)
        unix_time: Unix timestamp for the frame.

    Returns:
        List of DynamicObject instances.
    """
    converted = []
    for item in predictions:
        box = item.get("bbox_3d", [])
        if len(box) < 7:
            continue

        # mmdet3d LiDARInstance3DBoxes layout: [x, y, z, l, w, h, yaw, vx, vy]
        x, y, z = box[0], box[1], box[2]
        length, width, height = box[3], box[4], box[5]
        yaw = box[6]

        if not np.isfinite([x, y, z, length, width, height, yaw]).all():
            logger.warning("Skipping prediction with non-finite bbox_3d: %s", box)
            continue

        # Velocity components are optional trailing entries.
        vx = box[7] if len(box) > 7 else 0.0
        vy = box[8] if len(box) > 8 else 0.0

        # Rotation by yaw about the z axis, as a (w, x, y, z) quaternion.
        half_yaw = yaw / 2
        orientation = Quaternion(np.cos(half_yaw), 0, 0, np.sin(half_yaw))

        semantic_label = self._convert_index_to_label(int(item.get("label", 0)))
        score = float(item.get("score", 0.0))

        # Shape size follows autoware_perception_evaluation convention: (length, width, height)
        converted.append(
            DynamicObject(
                unix_time=unix_time,
                frame_id=self.frame_id,
                position=(x, y, z),
                orientation=orientation,
                shape=Shape(shape_type=ShapeType.BOUNDING_BOX, size=(length, width, height)),
                velocity=(vx, vy, 0.0),
                semantic_score=score,
                semantic_label=semantic_label,
            )
        )

    return converted
def add_frame(
    self,
    predictions: List[Dict[str, Any]],
    ground_truths: List[Dict[str, Any]],
    frame_name: Optional[str] = None,
) -> None:
    """Buffer one frame of predictions and ground truths for later evaluation.

    The frame is converted once and appended to ``self._frames``; nothing is
    evaluated here.

    Args:
        predictions: List of prediction dicts with keys:
            - bbox_3d: [x, y, z, l, w, h, yaw] or [x, y, z, l, w, h, yaw, vx, vy]
            - label: int (class index)
            - score: float (confidence score)
        ground_truths: List of ground truth dicts with keys:
            - bbox_3d: [x, y, z, l, w, h, yaw] or [x, y, z, l, w, h, yaw, vx, vy]
            - label: int (class index)
            - num_lidar_pts: int (optional)
        frame_name: Optional name for the frame. Defaults to the current frame
            count rendered as a string.
    """
    # Predictions and ground truth of one frame share the same timestamp.
    unix_time = time.time()
    if frame_name is None:
        frame_name = str(self._frame_count)

    # Build the evaluation objects once and buffer them.
    estimated_objects = self._predictions_to_dynamic_objects(predictions, unix_time)
    frame_ground_truth = self._ground_truths_to_frame_ground_truth(ground_truths, unix_time, frame_name)

    self._frames.append((unix_time, estimated_objects, frame_ground_truth))
    self._frame_count += 1

    # Invalidate BOTH result caches. format_metrics_report() only looks at
    # _last_metrics_by_eval_name, so leaving it populated would make a report
    # requested after this call describe the frames seen before it.
    self._cached_all_metrics = None
    self._last_metrics_by_eval_name = {}
def format_metrics_report(self) -> str:
    """Format the metrics report as a human-readable string.

    One section is produced per evaluator, each headed by its distance range.

    Returns:
        The per-evaluator sections joined by newlines.

    Raises:
        RuntimeError: If no section could be produced (no scene results are
            available, or every section failed to format).
    """
    # Always go through compute_metrics(): it returns immediately when its own
    # cache is valid and recomputes otherwise. Checking only whether
    # _last_metrics_by_eval_name is non-empty is not enough, because those
    # scene results can outlive an invalidation of the metrics cache and would
    # then be reported stale.
    self.compute_metrics()

    reports = []
    for eval_name, metrics_score in self._last_metrics_by_eval_name.items():
        try:
            # Extract distance range from evaluator name (e.g., "bev_center_0.0-50.0" -> "0.0-50.0")
            distance_range = eval_name.replace("bev_center_", "")
            reports.append(
                f"\n{'=' * 80}\n" f"Distance Range: {distance_range} m\n" f"{'=' * 80}\n" f"{metrics_score}"
            )
        except Exception as e:
            # Best effort: one unprintable result must not hide the others.
            logger.warning("Error formatting report for %s: %s", eval_name, e)

    if not reports:
        raise RuntimeError("Failed to generate metrics report. Ensure that metrics have been computed.")

    return "\n".join(reports)
@property
def summary(self) -> DetectionSummary:
    """Summarize the evaluation: mAP, mAPH and per-class AP for every matching mode.

    The aggregate values come from the last entry of ``self._bev_distance_ranges``
    only; ``detailed_metrics`` still carries the full ``compute_metrics()`` output.
    An otherwise empty summary is returned when no distance ranges are configured,
    when the last range has no scene result, or when no matching mode is found.
    """
    detailed = self.compute_metrics()

    def _build(map_scores, maph_scores, class_scores) -> DetectionSummary:
        # Single construction point shared by the empty and the populated result.
        return DetectionSummary(
            mAP_by_mode=map_scores,
            mAPH_by_mode=maph_scores,
            per_class_ap_by_mode=class_scores,
            num_frames=self._frame_count,
            detailed_metrics=detailed,
        )

    if not self._bev_distance_ranges:
        return _build({}, {}, {})

    # The last configured range is the one summarized (should be the full range).
    range_start, range_end = self._bev_distance_ranges[-1]
    bucket_score = self._last_metrics_by_eval_name.get(f"bev_center_{range_start}-{range_end}")
    if bucket_score is None:
        return _build({}, {}, {})

    # Re-flatten without a prefix so keys read "mAP_<mode>" / "<class>_AP_<mode>_<thr>".
    bucket = self._process_metrics_score(bucket_score, prefix=None)

    modes = self._extract_matching_modes(bucket)
    if not modes:
        return _build({}, {}, {})

    mAP_by_mode: Dict[str, float] = {}
    mAPH_by_mode: Dict[str, float] = {}
    per_class_ap_by_mode: Dict[str, Dict[str, float]] = {}

    for mode in modes:
        mAP_by_mode[mode] = bucket.get(f"mAP_{mode}", 0.0)
        mAPH_by_mode[mode] = bucket.get(f"mAPH_{mode}", 0.0)

        # Group this mode's AP values by class. The class name is everything
        # before the first "_AP_{mode}_", so names containing underscores
        # (e.g. "traffic_light") stay intact.
        separator = f"_AP_{mode}_"
        ap_values_by_class: Dict[str, List[float]] = {}
        for metric_name, metric_value in bucket.items():
            class_name, found, _ = metric_name.partition(separator)
            if not found or not class_name:
                continue
            ap_values_by_class.setdefault(class_name, []).append(float(metric_value))

        if ap_values_by_class:
            # Average each class over its matching thresholds.
            per_class_ap_by_mode[mode] = {
                name: float(np.mean(values)) for name, values in ap_values_by_class.items() if values
            }

    return _build(mAP_by_mode, mAPH_by_mode, per_class_ap_by_mode)
+- Project-specific flags belong in the project bundle, not in the shared CLI. + +### Runtime layer + +- `deployment/runtime/runner.py` owns the shared sequence: load, export, verify, evaluate. +- `ExportOrchestrator`, `VerificationOrchestrator`, and `EvaluationOrchestrator` keep the runner thin. +- `ArtifactManager` records ONNX and TensorRT artifacts so later stages resolve them consistently. + +### Execution layer + +- `deployment/exporters/` owns ONNX and TensorRT export mechanics. +- `deployment/pipelines/` owns shared inference pipeline abstractions and factory registration. +- Project `pipelines/` implement backend-specific inference. +- Evaluators own metrics, result reporting, and verification behavior. + +## Package map + +| Path | Responsibility | +| --- | --- | +| `deployment/cli/` | Unified CLI and shared argument helpers | +| `deployment/configs/` | Typed deployment config and schema | +| `deployment/core/` | Shared contexts, base evaluator, verification/output-comparison helpers, data-loader base, backend/device types, and metrics interfaces | +| `deployment/exporters/` | Shared ONNX and TensorRT exporters plus export pipeline bases | +| `deployment/pipelines/` | Global pipeline registry and factory | +| `deployment/runtime/` | Base runner, orchestrators, and artifact management | +| `deployment/projects//` | Project-specific entrypoint, runner, config, IO, eval, pipelines, and optional export logic | + +## Extension contract + +This section replaces the old standalone core contract page. + +### Runner responsibilities + +- `BaseDeploymentRunner` owns the end-to-end deployment flow. +- Project runners inject project-specific loaders, evaluators, wrappers, and optional export pipelines. +- Runners must not own task-specific preprocessing, postprocessing, or metrics logic. + +### Evaluator responsibilities + +- `BaseEvaluator` is the shared base for task evaluators. +- Evaluators create backend pipelines through `BasePipelineFactory`. 
+- Evaluators prepare inputs, normalize outputs, compute metrics, and report results. +- Evaluators should log summaries through `logging`, not `print`. + +### Pipeline responsibilities + +- `BaseInferencePipeline` owns `preprocess -> run_model -> postprocess`. +- Pipelines execute inference only. +- Pipelines must not load artifacts on their own and must not compute metrics. + +### Metrics responsibilities + +- Metrics interfaces convert predictions and ground truths into task metrics. +- Metrics code should not depend on runners, exporters, or pipeline factories directly. + +## Allowed dependencies + +| Dependency | Allowed | +| --- | --- | +| Runner -> Evaluator | Yes | +| Evaluator -> BasePipelineFactory / Pipelines / Metrics | Yes | +| BasePipelineFactory -> Pipelines | Yes | +| Pipelines -> Metrics | No | +| Metrics -> Runner / BasePipelineFactory | No | + +## Project shape + +Each project bundle should follow this layout: + +```text +deployment/projects// +├── __init__.py +├── entrypoint.py +├── runner.py +├── config/ +├── io/ +├── eval/ +├── pipelines/ +└── export/ # optional +``` + +CenterPoint is the current reference implementation of this structure. diff --git a/deployment/docs/configuration.md b/deployment/docs/configuration.md new file mode 100644 index 000000000..f92eb6d2c --- /dev/null +++ b/deployment/docs/configuration.md @@ -0,0 +1,251 @@ +# Configuration reference + +This is the single source of truth for deploy config fields: top-level keys, `components`, devices, `export`, `onnx_config`, `tensorrt_config`, `evaluation`, `verification`, and logging. + +Use [runbook.md](./runbook.md) for execution behavior and [architecture.md](./architecture.md) for framework structure. This page is intentionally reference-first. + +Deploy configs are plain Python dicts loaded with MMEngine `Config.fromfile`. `BaseDeploymentConfig` wraps them with typed dataclasses in `deployment.configs.schema` for validation and IDE-friendly access. 
+ +## How to read this config + +Read the deploy config in this order: + +1. `checkpoint_path`, `devices`, and `export` define the run boundary. +2. `components` defines what artifacts are produced and how each subgraph is named. +3. `onnx_config` and `tensorrt_config` tune exporter behavior. +4. `evaluation` and `verification` define the post-export quality gates. + +## Top-level keys + +| Key | Required | Purpose | +| --- | --- | --- | +| `checkpoint_path` | Yes | Path to the PyTorch checkpoint (must exist). Single source for load + PyTorch backend. | +| `deploy_log_path` | No | File for deployment logs. Default `"deployment.log"`. Relative paths are resolved under `export.work_dir`. `None` or `""` disables file logging. | +| `devices` | Yes | `cpu` / `cuda` device strings shared by export, verification, and evaluation. | +| `export` | Yes | Export mode, `work_dir`, optional `onnx_path` (e.g. when `mode="trt"`), and `sample_idx` (dataset index of the sample used to trace/shape the exported model). | +| `components` | Yes | Multi-component I/O and artifact names (see below). Current runner expects this section. | +| `onnx_config` | No | Shared ONNX export flags (`opset_version`, `simplify`, …). Per-component filenames live under `components.*.onnx_file`. | +| `tensorrt_config` | No | Shared TensorRT build flags. Per-component profiles live under `components.*.tensorrt_profile`. | +| `evaluation` | No | Backend toggles, sample counts, optional `model_dir` / `engine_dir` for ONNX and TensorRT. | +| `verification` | No | Scenarios per export mode and tolerance. | + +> The evaluation/verification dataset is taken from the **model config's** `test_dataloader.dataset` (its `ann_file` is the test info). There is no separate `runtime_io`/`info_file` in the deploy config. 
+ +## Logging (`deploy_log_path`) + +After `BaseDeploymentConfig` loads, the CenterPoint entrypoint attaches a root `FileHandler` via `deployment.cli.args.add_deployment_file_logging` when `resolved_deploy_log_file` is set. All standard `logging` output for the process (console + libraries that log to the root logger) is mirrored to that file. + +```python +# Relative → join(export.work_dir, "deployment.log") +deploy_log_path = "deployment.log" + +# Absolute path +# deploy_log_path = "/var/log/centerpoint_deploy.log" + +# Disable file logging +# deploy_log_path = None +``` + +## Single ONNX / engine (one component) + +The schema is still the unified `components` map: use **one** entry when the graph exports to a single ONNX and engine. + +```python +checkpoint_path = "work_dirs/model/best.pth" +deploy_log_path = "deployment.log" + +devices = dict(cpu="cpu", cuda="cuda:0") + +export = dict( + mode="both", + work_dir="work_dirs/my_deployment", + onnx_path=None, + sample_idx=0, +) + +components = dict( + model=dict( + onnx_file="model.onnx", + engine_file="model.engine", + io=dict( + inputs=[dict(name="input", dtype="float32")], + outputs=[dict(name="output", dtype="float32")], + dynamic_axes={"input": {0: "batch"}, "output": {0: "batch"}}, + ), + tensorrt_profile=dict( + input=dict( + min_shape=[1, 3, 960, 960], + opt_shape=[1, 3, 960, 960], + max_shape=[4, 3, 960, 960], + ), + ), + ), +) + +onnx_config = dict( + opset_version=17, + do_constant_folding=True, + export_params=True, + keep_initializers_as_inputs=False, + simplify=False, +) + +tensorrt_config = dict( + precision_policy="auto", + max_workspace_size=1 << 30, +) +``` + +There is **no** top-level `model_io` in the current `BaseDeploymentConfig`; I/O and filenames belong to `components`. + +## Multi-component export (CenterPoint-style) + +The **dict key** is the component id (e.g. `pts_voxel_encoder`, `pts_backbone_neck_head`). 
Typed parsing sets `ComponentCfg.name` from that key—do **not** add a separate `name` field inside each component dict. + +Align shapes and dynamic axes with your model config. Example structure (abbreviated; see `deployment/projects/centerpoint/config/deploy_config.py` for the full reference): + +```python +checkpoint_path = "work_dirs/centerpoint/best_checkpoint.pth" +deploy_log_path = "deployment.log" + +devices = dict(cpu="cpu", cuda="cuda:0") + +_WORK_DIR = "work_dirs/centerpoint_deployment" +export = dict( + mode="both", + work_dir=_WORK_DIR, + onnx_path=f"{_WORK_DIR}/onnx", + sample_idx=0, +) + +components = dict( + pts_voxel_encoder=dict( + onnx_file="pts_voxel_encoder.onnx", + engine_file="pts_voxel_encoder.engine", + io=dict( + inputs=[dict(name="input_features", dtype="float32")], + outputs=[dict(name="pillar_features", dtype="float32")], + dynamic_axes={ + "input_features": {0: "num_voxels", 1: "num_max_points"}, + "pillar_features": {0: "num_voxels"}, + }, + ), + tensorrt_profile=dict( + input_features=dict( + min_shape=[1000, 32, 11], + opt_shape=[20000, 32, 11], + max_shape=[96000, 32, 11], + ), + ), + ), + pts_backbone_neck_head=dict( + onnx_file="pts_backbone_neck_head.onnx", + engine_file="pts_backbone_neck_head.engine", + io=dict( + inputs=[dict(name="spatial_features", dtype="float32")], + outputs=[ + dict(name="heatmap", dtype="float32"), + dict(name="reg", dtype="float32"), + # ... remaining heads + ], + dynamic_axes={ + "spatial_features": {0: "batch_size", 2: "height", 3: "width"}, + # ... 
align per output + }, + ), + tensorrt_profile=dict( + spatial_features=dict( + min_shape=[1, 32, 1020, 1020], + opt_shape=[1, 32, 1020, 1020], + max_shape=[1, 32, 1020, 1020], + ), + ), + ), +) + +onnx_config = dict( + opset_version=17, + do_constant_folding=True, + export_params=True, + keep_initializers_as_inputs=False, + simplify=False, +) + +tensorrt_config = dict( + precision_policy="auto", + max_workspace_size=2 << 30, +) +``` + +## Evaluation + +`ArtifactManager` resolves ONNX/TensorRT paths in this order: registered export artifacts, then explicit evaluation paths, then fallbacks (`export.onnx_path`, etc.). For a typical post-export evaluation, set directories explicitly: + +```python +_ONNX_DIR = f"{_WORK_DIR}/onnx" +_TENSORRT_DIR = f"{_WORK_DIR}/tensorrt" + +evaluation = dict( + enabled=True, + num_samples=100, + verbose=False, + backends=dict( + pytorch=dict(enabled=True, device=devices["cuda"]), + onnx=dict(enabled=True, device=devices["cuda"], model_dir=_ONNX_DIR), + tensorrt=dict(enabled=True, device=devices["cuda"], engine_dir=_TENSORRT_DIR), + ), +) +``` + +Optional `num_warmup` (default `0`) adds warmup iterations before latency is measured. Per-backend path overrides live inside each `backends` entry (`model_dir` / `engine_dir`), not in a separate top-level field. + +## Verification + +Scenarios are grouped by **export mode** (`both`, `onnx`, `trt`, `none`), matching `deployment.configs.enums.ExportMode`. Only scenarios for the active export mode run. 
+ +```python +verification = dict( + enabled=True, + tolerance=0.1, + num_verify_samples=3, + scenarios=dict( + both=[ + dict(ref_backend="pytorch", ref_device="cpu", test_backend="onnx", test_device="cpu"), + dict(ref_backend="onnx", ref_device="cuda", test_backend="tensorrt", test_device="cuda"), + ], + onnx=[ + dict(ref_backend="pytorch", ref_device="cpu", test_backend="onnx", test_device="cpu"), + ], + trt=[ + dict(ref_backend="onnx", ref_device="cuda", test_backend="tensorrt", test_device="cuda"), + ], + none=[], + ), +) +``` + +## Device aliases + +Keep a single top-level `devices` dict and reference it from `evaluation.backends` (and reuse the same CPU/CUDA strings in each `verification` scenario's `ref_device`/`test_device`) so device strings stay consistent. + +## Backend enum + +```python +from deployment.core.backend import Backend + +evaluation = dict( + backends={ + Backend.PYTORCH.value: {"enabled": True, "device": devices["cpu"]}, + Backend.ONNX.value: {"enabled": True, "device": devices["cpu"], "model_dir": _ONNX_DIR}, + Backend.TENSORRT.value: {"enabled": True, "device": devices["cuda"], "engine_dir": _TENSORRT_DIR}, + } +) +``` + +## Typed exporter configs + +Low-level typed classes live in `deployment.exporters.common.configs`. `BaseDeploymentConfig.get_onnx_settings(component_name)` / `get_tensorrt_settings(component_name)` merge shared `onnx_config` / `tensorrt_config` with each `components` entry. + +## Example on disk + +- `deployment/projects/centerpoint/config/deploy_config.py` — full multi-component deploy config (with comments on tolerance and verification). diff --git a/deployment/docs/contributing.md b/deployment/docs/contributing.md new file mode 100644 index 000000000..269613fd1 --- /dev/null +++ b/deployment/docs/contributing.md @@ -0,0 +1,46 @@ +# Contributing to deployment + +Use this page when adding a new project bundle or changing shared deployment infrastructure. 
+ +Before changing shared runners, evaluators, `BasePipelineFactory`, metrics interfaces, or orchestrators, read [architecture.md](./architecture.md). It contains the framework structure and extension contract. + +## Minimal project checklist + +1. Create `deployment/projects/<project_name>/__init__.py` and register a `ProjectAdapter`. +2. Add `entrypoint.py` to build `BaseDeploymentConfig`, the data loader, evaluator, and runner. +3. Add `runner.py` as a thin `BaseDeploymentRunner` subclass. +4. Add `config/deploy_config.py` with the required deploy config sections described in [configuration.md](./configuration.md). +5. Add `io/` and `eval/` for project-specific loading and evaluation logic. +6. Add `pipelines/` with backend-specific inference pipelines and register a project pipeline factory. +7. Add a project `README.md` with the project-specific quick start and links back to shared docs. + +Add `export/` only when the project needs multi-stage or multi-file export orchestration. + +## Implementation notes + +### Evaluator and data loader + +- Subclass `BaseEvaluator` with task-specific metrics and output parsing. +- Subclass `BaseDataLoader` for project dataset and preprocessing needs. +- Keep metrics inside evaluators and metrics interfaces, not inside pipelines. + +### Runner + +- Project runners should focus on project model loading, wrappers, and optional export pipeline wiring. +- Keep export sequencing in the shared runtime instead of reimplementing it per project. + +### Inference pipelines + +- Add backend-specific pipelines under `deployment/projects/<project_name>/pipelines/`. +- Register a project `BasePipelineFactory` subclass with `@pipeline_registry.register`. +- Use `components_cfg` from `BaseDeploymentConfig` instead of raw config dicts where possible. + +### CLI + +- The shared entrypoint remains `python -m deployment.cli.main <project_name> <deploy_cfg.py> <model_cfg.py>`. +- Project-specific flags should be added through the project adapter, not by editing the shared CLI for one project. 
+ +### Documentation + +- Keep `deployment/README.md` short and user-facing. +- Put shared behavior in shared docs and project-specific details in the project README. diff --git a/deployment/docs/operations.md b/deployment/docs/operations.md new file mode 100644 index 000000000..01475c75f --- /dev/null +++ b/deployment/docs/operations.md @@ -0,0 +1,45 @@ +# Deployment operations + +Troubleshooting and practical deployment guidance. This page is for day-to-day operation, not for architecture reference. + +Use [runbook.md](./runbook.md) for the execution flow, [configuration.md](./configuration.md) for config fields, and [architecture.md](./architecture.md) for framework boundaries. + +## Configuration + +- Keep deploy config separate from the training model config. +- Prefer paths rooted under one `export.work_dir` so artifacts and logs stay together. +- Set `deploy_log_path` when you need a persistent deployment log next to the exported artifacts. +- Keep one top-level `devices` map and reuse it in evaluation and verification settings. + +## Export + +- Start with a valid ONNX export before tuning TensorRT. +- Keep `components` aligned with the actual deployable subgraphs of the model. +- Use project export pipelines only when a project needs multi-stage or multi-file export orchestration. +- Match `precision_policy` to the deployment target instead of treating it as a generic speed knob. + +## Verification + +- Start with strict tolerance and relax only when there is a clear backend-driven reason. +- Keep preprocessing identical across reference and test backends. +- Group scenarios by `export.mode` so partial runs do not execute irrelevant backend pairs. + +## Evaluation + +- Compare latency only when devices and sample counts are aligned across backends. +- Set `model_dir` and `engine_dir` explicitly when evaluating previously exported artifacts. +- Log evaluation summaries through `logging`, not `print`, so they are captured by `deploy_log_path`. 
+ +## Pipeline development + +- Pipelines should only own inference mechanics and tensor shaping. +- Evaluators own metrics, verification input preparation, and result reporting. +- Register project pipeline factories through `pipeline_registry` so all backends are created consistently. + +## Troubleshooting + +1. ONNX export fails: check unsupported ops, dynamic axes, and representative export inputs first. +2. TensorRT build fails: validate the ONNX graph, shape profiles, and workspace limits. +3. Verification fails: check tolerance, backend pairing, and preprocessing parity before assuming export is broken. +4. Evaluation cannot find artifacts: confirm current export registration or set explicit ONNX and TensorRT directories. +5. Log file is empty: confirm `deploy_log_path` is enabled and summaries use `logging`. diff --git a/deployment/docs/runbook.md b/deployment/docs/runbook.md new file mode 100644 index 000000000..ad0eb698a --- /dev/null +++ b/deployment/docs/runbook.md @@ -0,0 +1,105 @@ +# Deployment runbook + +Practical guide for running deployment end to end: CLI syntax, required inputs, export modes, and what the framework does during a run. + +For deploy config keys and copy-paste examples, use [configuration.md](./configuration.md). For internal structure and extension rules, use [architecture.md](./architecture.md). + +## Quick start + +```bash +python -m deployment.cli.main centerpoint \ + \ + \ + [--log-level INFO] + +# CenterPoint-specific flag +python -m deployment.cli.main centerpoint \ + \ + \ + --rot-y-axis-reference +``` + +## Required inputs + +- `project_name` must match a registered `ProjectAdapter` under `deployment/projects/`. +- `deploy_cfg.py` is the deployment config loaded into `BaseDeploymentConfig`. +- `model_cfg.py` is the project model or training config used by the project entrypoint. 
+ +## What happens during a run + +Every project follows the same high-level flow: + +```text +Load checkpoint -> Export -> Verify -> Evaluate +``` + +More concretely: + +1. `deployment.cli.main` resolves the project adapter and parses shared plus project-specific flags. +2. The project `entrypoint.py` loads configs, builds the data loader, evaluator, and project runner. +3. `BaseDeploymentRunner` executes export, verification, and evaluation in sequence. +4. Evaluators build backend-specific inference pipelines through `BasePipelineFactory`. + +## Export modes + +| `export.mode` | What runs | Typical use | +| --- | --- | --- | +| `both` | ONNX export, TensorRT export, then verification/evaluation | Full deployment quality gate | +| `onnx` | ONNX export, then ONNX-relevant verification/evaluation | Validate export before TRT | +| `trt` | TensorRT build from existing ONNX, then TRT-relevant verification/evaluation | Rebuild engines from a known ONNX layout | +| `none` | Skip export and only verify/evaluate against existing artifacts | Re-run checks on saved artifacts | + +## Verification and evaluation + +Verification and evaluation are part of the same deployment run, but they answer different questions. + +- Verification asks whether backend outputs still match within tolerance. +- Evaluation asks how each backend performs on task metrics and latency. + +### Verification + +Verification uses scenario lists grouped by export mode. Only the scenarios for the active `export.mode` run. + +Typical flow: + +1. Build reference and test pipelines for the chosen backends. +2. Run paired inference on shared samples. +3. Compare nested outputs with `verification.tolerance`. +4. Report pass/fail statistics. + +Use verification when you need to catch numerical or tensor-shape regressions before trusting exported artifacts. + +### Evaluation + +Evaluation runs the configured backends and reports task metrics plus latency summaries. 
+ +- CenterPoint uses 3D detection metrics through the shared evaluator interfaces. +- ONNX and TensorRT artifacts are resolved from the current export run or from explicit directories in `evaluation.backends`. + +If you run with `export.mode="none"`, set `model_dir` or `engine_dir` explicitly so the evaluator can resolve artifacts without relying on in-process export state. + +## Export mental model + +The framework exports one or more model components defined in the `components` section of the deploy config. + +- Single-component projects export one ONNX file and optionally one TensorRT engine. +- Multi-component projects such as CenterPoint export several artifacts, one per component. + +For each component: + +- `onnx_file` and `io` drive ONNX export. +- `engine_file` and `tensorrt_profile` drive TensorRT engine build. +- Shared `onnx_config` and `tensorrt_config` are merged with per-component settings. + +## Logging + +- `--log-level` controls the root logging level. +- `deploy_log_path` mirrors deployment logs to a file, usually under `export.work_dir` when given as a relative path. +- Evaluators should log summaries through the `logging` module so console and file output stay aligned. + +## Where to go next + +- [configuration.md](./configuration.md) for deploy config fields and examples +- [architecture.md](./architecture.md) for CLI, runner, orchestrators, pipelines, and extension boundaries +- [operations.md](./operations.md) for troubleshooting and practical advice +- [../projects/centerpoint/README.md](../projects/centerpoint/README.md) for the current project-specific guide diff --git a/deployment/exporters/__init__.py b/deployment/exporters/__init__.py new file mode 100644 index 000000000..eb06d8a6d --- /dev/null +++ b/deployment/exporters/__init__.py @@ -0,0 +1 @@ +"""Model exporters. 
Import from concrete modules under ``deployment.exporters.common.*``.""" diff --git a/deployment/exporters/common/base_exporter.py b/deployment/exporters/common/base_exporter.py new file mode 100644 index 000000000..aa1215f4f --- /dev/null +++ b/deployment/exporters/common/base_exporter.py @@ -0,0 +1,95 @@ +""" +Abstract base class for model exporters. + +Provides a interface for exporting models to different formats. +""" + +import logging +from abc import ABC, abstractmethod +from typing import Any, Optional + +import torch + +from deployment.core.artifacts import Artifact +from deployment.exporters.common.configs import BaseExporterConfig +from deployment.exporters.common.model_wrappers import BaseModelWrapper + + +class BaseExporter(ABC): + """ + Abstract base class for model exporters. + + This class defines a unified interface for exporting models + to different backend formats (ONNX, TensorRT, TorchScript, etc.). + + Enhanced features: + - Support for model wrappers (preprocessing before export) + - Flexible configuration with overrides + - Better logging and error handling + """ + + def __init__( + self, + config: BaseExporterConfig, + logger: logging.Logger, + model_wrapper: Optional[BaseModelWrapper] = None, + ) -> None: + """ + Initialize exporter. + + Args: + config: Typed export configuration dataclass (e.g., ``ONNXExportConfig``, + ``TensorRTExportConfig``). This ensures type safety and clear schema. + logger: Logger instance for export progress and diagnostics. + model_wrapper: Optional model wrapper class or callable. + If a class is provided, it will be instantiated with the model. + If an instance is provided, it should be a callable that takes a model. + """ + self.config: BaseExporterConfig = config + self.logger = logger + self._model_wrapper = model_wrapper + + def prepare_model(self, model: torch.nn.Module) -> torch.nn.Module: + """ + Prepare model for export (apply wrapper if configured). 
@dataclass(frozen=True)
class TensorRTProfileConfig:
    """Min/opt/max shape envelope for a single TensorRT input tensor.

    Each field is a tuple of ints; an empty tuple means the shape was not provided.
    """

    min_shape: Tuple[int, ...] = field(default_factory=tuple)
    opt_shape: Tuple[int, ...] = field(default_factory=tuple)
    max_shape: Tuple[int, ...] = field(default_factory=tuple)

    @classmethod
    def from_dict(cls, data: Mapping[str, Any]) -> "TensorRTProfileConfig":
        """Build a profile from a mapping with optional ``min_shape``/``opt_shape``/``max_shape`` keys.

        Missing keys (or keys mapped to ``None``) become empty tuples.
        """
        shapes = {
            key: cls._to_shape_tuple(data.get(key))
            for key in ("min_shape", "opt_shape", "max_shape")
        }
        return cls(**shapes)

    @staticmethod
    def _to_shape_tuple(shape: Optional[Iterable[int]]) -> Tuple[int, ...]:
        """Return ``shape`` as a tuple of ints, or an empty tuple when ``shape`` is None."""
        if shape is None:
            return ()
        return tuple(map(int, shape))
@dataclass(frozen=True)
class TensorRTExportConfig(BaseExporterConfig):
    """
    Immutable settings consumed by the TensorRT exporter.

    Attributes:
        precision_policy: Precision policy; the exporter translates it into concrete
            TensorRT network/builder flags.
        max_workspace_size: Workspace memory limit in bytes (default is 1 GiB).
        model_input: Optimization-profile shapes keyed by input name through
            ``input_shapes``; ``None`` means no profile shapes are configured.
    """

    precision_policy: PrecisionPolicy = PrecisionPolicy.AUTO
    max_workspace_size: int = 2**30
    model_input: Optional[TensorRTModelInputConfig] = None
class BaseModelWrapper(nn.Module, ABC):
    """
    Abstract ``nn.Module`` that holds a model and defines the forward pass used for export.

    Subclasses implement ``forward`` to shape the wrapped model's outputs into the
    format the export target expects. Projects that need no conversion can use
    ``IdentityWrapper``.
    """

    def __init__(self, model: nn.Module) -> None:
        """
        Register ``model`` as the wrapped submodule.

        Args:
            model: PyTorch model to wrap
        """
        super().__init__()
        self.model = model

    @abstractmethod
    def forward(self, *args) -> Any:
        """
        Run the export-time forward pass.

        Subclasses must override this to produce the export-specific output format.
        """
        raise NotImplementedError


class IdentityWrapper(BaseModelWrapper):
    """
    Pass-through wrapper: forwards its inputs to the wrapped model unchanged.

    Intended for models whose native outputs are already suitable for export.
    """

    def forward(self, *args) -> Any:
        """Call the wrapped model with the given positional inputs and return its result."""
        wrapped = self.model
        return wrapped(*args)
    def _do_onnx_export(
        self,
        model: torch.nn.Module,
        sample_input: Any,
        output_path: str,
        export_cfg: ONNXExportConfig,
    ) -> None:
        """
        Perform ONNX export using torch.onnx.export.

        The model is exported into a hidden staging directory next to ``output_path``
        and then moved into place with ``_publish``. The staging directory is removed
        afterwards whether the export succeeded or failed.

        Args:
            model: Prepared PyTorch model
            sample_input: Sample input tensor
            output_path: Path to save ONNX model
            export_cfg: Export configuration

        Raises:
            RuntimeError: If export or publishing fails (the original exception is
                chained as the cause)
        """
        self.logger.info("Exporting model to ONNX format...")
        # Only log the shape when the sample exposes one (e.g. a single tensor).
        if hasattr(sample_input, "shape"):
            self.logger.info(" Input shape: %s", sample_input.shape)
        self.logger.info(" Output path: %s", output_path)

        self.logger.info(" Opset version: %s", export_cfg.opset_version)

        # Ensure output directory exists
        output = Path(output_path)
        output.parent.mkdir(parents=True, exist_ok=True)

        # Export into a private staging directory, then publish the result into place.
        # torch.onnx.export may emit external-data sidecar files next to the .onnx (for models
        # whose weights exceed the 2GB protobuf limit). Staging the whole set and publishing it
        # together means a failed/interrupted export never leaves a partial model in the target
        # directory, and the .onnx never becomes visible before the data files it references.
        staging = output.parent / f".{output.name}.staging"
        produced = staging / output.name
        # Runs outside the try block, so a failure here propagates as-is (not as RuntimeError).
        self._reset_dir(staging)
        try:
            with torch.no_grad():
                torch.onnx.export(
                    model,
                    sample_input,
                    str(produced),
                    export_params=export_cfg.export_params,
                    keep_initializers_as_inputs=export_cfg.keep_initializers_as_inputs,
                    opset_version=export_cfg.opset_version,
                    do_constant_folding=export_cfg.do_constant_folding,
                    # Config stores tuples; converted to lists for the torch.onnx call.
                    input_names=list(export_cfg.input_names),
                    output_names=list(export_cfg.output_names),
                    dynamic_axes=export_cfg.dynamic_axes,
                    verbose=export_cfg.verbose,
                )
            self._publish(staging, produced, output)

            self.logger.info("ONNX export completed: %s", output_path)

        except Exception as exc:
            # logger.exception records the traceback before the error is re-raised.
            self.logger.exception("ONNX export failed: %s", output_path)
            raise RuntimeError(f"ONNX export failed: {output_path}") from exc
        finally:
            # Best-effort cleanup; after a successful publish the directory is already empty.
            shutil.rmtree(staging, ignore_errors=True)
+ staging = target.parent / f".{target.name}.simplify.staging" + produced = staging / target.name + try: + model_simplified, success = onnxsim.simplify(onnx_path) + if not success: + self.logger.error("ONNX model simplification failed; keeping unsimplified model") + return + self._reset_dir(staging) + onnx.save(model_simplified, str(produced)) + self._publish(staging, produced, target) + self.logger.info("ONNX model simplified successfully") + except Exception as e: + self.logger.error("ONNX simplification error: %s", e) + finally: + shutil.rmtree(staging, ignore_errors=True) + + @staticmethod + def _reset_dir(path: Path) -> None: + """Create an empty staging directory, removing any leftovers from a prior run.""" + shutil.rmtree(path, ignore_errors=True) + path.mkdir(parents=True) + + @staticmethod + def _publish(staging: Path, produced: Path, target: Path) -> None: + """Move a freshly produced ONNX (and any external-data sidecars) into place. + + Sidecar files are moved first and the main ``.onnx`` (``produced``) last, so a reader + that observes the model file always sees the data files it references. ``os.replace`` + is atomic within the destination directory. + + Args: + staging: Directory holding the freshly produced files. + produced: The main ``.onnx`` file inside ``staging``. + target: Final path for the main ``.onnx`` file. 
+ """ + dest_dir = target.parent + for item in sorted(staging.iterdir()): + if item == produced: + continue + os.replace(item, dest_dir / item.name) + os.replace(produced, target) diff --git a/deployment/exporters/common/tensorrt_exporter.py b/deployment/exporters/common/tensorrt_exporter.py new file mode 100644 index 000000000..c9f3c3260 --- /dev/null +++ b/deployment/exporters/common/tensorrt_exporter.py @@ -0,0 +1,313 @@ +"""TensorRT model exporter.""" + +import logging +import os +from pathlib import Path +from typing import Any, List, Optional, Sequence, Tuple + +import tensorrt as trt +import torch + +from deployment.configs.enums import PrecisionPolicy +from deployment.core.artifacts import Artifact +from deployment.exporters.common.base_exporter import BaseExporter +from deployment.exporters.common.configs import TensorRTExportConfig + + +class TensorRTExporter(BaseExporter): + """ + TensorRT model exporter. + + Converts ONNX models to TensorRT engine format with precision policy support. + """ + + def __init__( + self, + config: TensorRTExportConfig, + logger: logging.Logger, + model_wrapper: Optional[Any] = None, + ) -> None: + """ + Initialize TensorRT exporter. + + Args: + config: TensorRT export configuration dataclass instance. + logger: Logger instance for export progress and diagnostics. + model_wrapper: Optional model wrapper class (usually not needed for TensorRT) + """ + super().__init__(config, logger=logger, model_wrapper=model_wrapper) + + def export( + self, + model: Optional[torch.nn.Module], # Not used for TensorRT, kept for interface compatibility + sample_input: Any, + output_path: str, + onnx_path: Optional[str] = None, + ) -> Artifact: + """ + Export ONNX model to TensorRT engine. 
    def _do_tensorrt_export(
        self,
        onnx_path: str,
        output_path: str,
    ) -> Artifact:
        """
        Export a single ONNX file to TensorRT engine.

        Steps, in order: create the builder, build the config/network/parser, parse the
        ONNX file, configure the optimization profile, build the serialized engine, and
        write it to ``output_path``.

        Args:
            onnx_path: Path to source ONNX model
            output_path: Path to save TensorRT engine

        Returns:
            Artifact object representing the exported TensorRT engine

        Raises:
            RuntimeError: If export fails
            ValueError: If profile shapes are missing from the config (raised by the
                profile-configuration step)
        """
        # Initialize TensorRT
        trt_logger = trt.Logger(trt.Logger.WARNING)
        trt.init_libnvinfer_plugins(trt_logger, "")

        builder = trt.Builder(trt_logger)
        try:
            builder_config, network, parser = self._create_builder_and_network(builder, trt_logger)
            try:
                self._parse_onnx(parser, onnx_path)
                self._configure_input_profiles(builder, builder_config)
                serialized_engine = self._build_engine(builder, builder_config, network)
                self._save_engine(serialized_engine, output_path)
                return Artifact(path=output_path)
            finally:
                # Drop local references: parser first, then the network it was created from.
                # NOTE(review): presumably this ordering is for TensorRT object teardown - confirm.
                del parser
                del network
        finally:
            # The builder reference is dropped last, after the objects created from it.
            del builder
``STRONGLY_TYPED`` is a + network-creation flag and must be folded in before the network is created; + ``FP16``/``TF32`` are builder flags set on the builder config. ``AUTO`` adds nothing + and lets TensorRT decide. + """ + policy = self.config.precision_policy + if policy is PrecisionPolicy.STRONGLY_TYPED: + network_flags |= 1 << int(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED) + self.logger.info("Using strongly typed TensorRT network creation") + elif policy is PrecisionPolicy.FP16: + builder_config.set_flag(trt.BuilderFlag.FP16) + self.logger.info("BuilderFlag.FP16 enabled") + elif policy is PrecisionPolicy.FP32_TF32: + builder_config.set_flag(trt.BuilderFlag.TF32) + self.logger.info("BuilderFlag.TF32 enabled") + return network_flags + + def _parse_onnx( + self, + parser: trt.OnnxParser, + onnx_path: str, + ) -> None: + """ + Parse ONNX model into the TensorRT network bound to `parser`. + + Args: + parser: TensorRT ONNX parser instance + onnx_path: Path to ONNX model file + + Raises: + RuntimeError: If parsing fails + """ + with open(onnx_path, "rb") as f: + if not parser.parse(f.read()): + self._log_parser_errors(parser) + raise RuntimeError("TensorRT export failed: unable to parse ONNX file") + self.logger.info("Successfully parsed ONNX file") + + def _configure_input_profiles( + self, + builder: trt.Builder, + builder_config: trt.IBuilderConfig, + ) -> None: + """ + Configure TensorRT optimization profiles for input shapes. + + Creates an optimization profile and configures min/opt/max shapes for each input. + See `_configure_input_shapes` for details on shape configuration. + + Note: + ONNX `dynamic_axes` and TensorRT profiles serve different purposes: + + - **ONNX dynamic_axes**: Used during ONNX export to define which dimensions + are symbolic (dynamic) in the ONNX graph. This allows the ONNX model to + accept inputs of varying sizes at those dimensions. 
+ + - **TensorRT profile**: Defines the runtime shape envelope (min/opt/max) that + TensorRT will optimize for. TensorRT builds kernels optimized for shapes + within this envelope. The profile must be compatible with the ONNX dynamic + axes, but they are configured separately and serve different roles: + - dynamic_axes: Export-time graph structure + - TRT profile: Runtime optimization envelope + + They are related but not equivalent. The ONNX model may have dynamic axes, + but TensorRT still needs explicit min/opt/max shapes to build optimized kernels. + + Args: + builder: TensorRT builder instance + builder_config: TensorRT builder config + """ + profile = builder.create_optimization_profile() + self._configure_input_shapes(profile) + builder_config.add_optimization_profile(profile) + + def _build_engine( + self, + builder: trt.Builder, + builder_config: trt.IBuilderConfig, + network: trt.INetworkDefinition, + ) -> bytes: + """ + Build TensorRT engine from network. + + Args: + builder: TensorRT builder instance + builder_config: TensorRT builder config + network: TensorRT network definition + + Returns: + Serialized engine as bytes + + Raises: + RuntimeError: If engine building fails + """ + self.logger.info("Building TensorRT engine (this may take a while)...") + serialized_engine = builder.build_serialized_network(network, builder_config) + + if serialized_engine is None: + self.logger.error("Failed to build TensorRT engine") + raise RuntimeError("TensorRT export failed: builder returned None") + + return serialized_engine + + def _save_engine( + self, + serialized_engine: bytes, + output_path: str, + ) -> None: + """ + Save serialized TensorRT engine to file. + + Args: + serialized_engine: Serialized engine bytes + output_path: Path to save engine file + """ + # Write to a temp file in the same directory, then atomically replace the target. 
+ # Building an engine can take many minutes; a crash mid-write must not leave a + # truncated .engine that a later run would treat as a valid artifact. + output = Path(output_path) + output.parent.mkdir(parents=True, exist_ok=True) + tmp_path = output.with_name(f"{output.name}.tmp") + try: + with open(tmp_path, "wb") as f: + f.write(serialized_engine) + f.flush() + os.fsync(f.fileno()) + os.replace(tmp_path, output) + finally: + tmp_path.unlink(missing_ok=True) + + max_workspace_size = self.config.max_workspace_size + self.logger.info("TensorRT engine saved to %s", output_path) + self.logger.info("Engine max workspace size: %.2f GB", max_workspace_size / (1024**3)) + + def _configure_input_shapes( + self, + profile: trt.IOptimizationProfile, + ) -> None: + """Configure TensorRT optimization profile shapes from config.""" + model_input_cfg = self.config.model_input + if model_input_cfg is None or not model_input_cfg.input_shapes: + raise ValueError( + "TensorRT export requires 'model_input' with 'input_shapes' (min/opt/max per " + "input tensor), but none were configured." 
+ ) + + for input_name, profile_cfg in model_input_cfg.input_shapes.items(): + min_shape = self._to_int_list(profile_cfg.min_shape, input_name, "min") + opt_shape = self._to_int_list(profile_cfg.opt_shape, input_name, "opt") + max_shape = self._to_int_list(profile_cfg.max_shape, input_name, "max") + self.logger.info( + "Setting %s shapes - min: %s, opt: %s, max: %s", + input_name, + min_shape, + opt_shape, + max_shape, + ) + profile.set_shape(input_name, min_shape, opt_shape, max_shape) + + def _log_parser_errors(self, parser: trt.OnnxParser) -> None: + """Log TensorRT parser errors.""" + self.logger.error("Failed to parse ONNX model") + for error in range(parser.num_errors): + self.logger.error("Parser error: %s", parser.get_error(error)) + + @staticmethod + def _to_int_list(shape: Sequence[int], input_name: str, bucket: str) -> List[int]: + """Coerce a configured profile shape to a list of ints; fail loud if missing.""" + if not shape: + raise ValueError(f"{bucket}_shape missing for TensorRT input '{input_name}'.") + return [int(dim) for dim in shape] diff --git a/deployment/exporters/export_pipelines/__init__.py b/deployment/exporters/export_pipelines/__init__.py new file mode 100644 index 000000000..c98085e7f --- /dev/null +++ b/deployment/exporters/export_pipelines/__init__.py @@ -0,0 +1 @@ +"""Export pipeline helpers. Import from ``deployment.exporters.export_pipelines.base`` and ``.interfaces``.""" diff --git a/deployment/exporters/export_pipelines/base.py b/deployment/exporters/export_pipelines/base.py new file mode 100644 index 000000000..ddeef2714 --- /dev/null +++ b/deployment/exporters/export_pipelines/base.py @@ -0,0 +1,71 @@ +""" +Base export pipeline interfaces for specialized export flows. 
class OnnxExportPipeline(ABC):
    """
    Abstract contract for a project-specific ONNX export flow.
    """

    @abstractmethod
    def export(
        self,
        *,
        model: Any,
        data_loader: BaseDataLoader,
        output_dir: str,
        config: BaseDeploymentConfig,
        sample_idx: int = 0,
    ) -> Artifact:
        """
        Run the ONNX export and describe what was written.

        All arguments are keyword-only.

        Args:
            model: PyTorch model to export
            data_loader: Data loader that supplies samples
            output_dir: Directory that receives the exported files
            config: Deployment configuration
            sample_idx: Index of the sample used for tracing

        Returns:
            Artifact describing the exported ONNX output
        """


class TensorRTExportPipeline(ABC):
    """
    Abstract contract for a project-specific TensorRT export flow.
    """

    @abstractmethod
    def export(
        self,
        *,
        onnx_path: str,
        output_dir: str,
        config: BaseDeploymentConfig,
        device: DeviceSpec,
    ) -> Artifact:
        """
        Run the TensorRT export and describe what was written.

        All arguments are keyword-only.

        Args:
            onnx_path: Path to the source ONNX model file or directory
            output_dir: Directory that receives the built engine(s)
            config: Deployment configuration
            device: CUDA device specification

        Returns:
            Artifact describing the exported TensorRT output
        """
@dataclass(frozen=True)
class ExportableComponent:
    """Immutable bundle describing one model component to export to ONNX.

    Attributes:
        name: Component identifier; matches the key under ``components`` in the deploy
            config and is used for config lookup, output filename, and logs.
        module: PyTorch module to export.
        sample_input: Sample input used for tracing.
    """

    name: str
    module: torch.nn.Module
    sample_input: Any


class ExportSampleAdapter(ABC):
    """Contract for turning a model plus a dataset sample into an export sample.

    Implementations hide the model-specific feature extraction and hand back an
    object that component builders can consume.
    """

    @abstractmethod
    def extract_sample(
        self,
        model: torch.nn.Module,
        data_loader: Any,
        sample_idx: int,
    ) -> Any:
        """Produce the model-specific sample payload used for export.

        Args:
            model: PyTorch model used for feature extraction
            data_loader: Data loader that provides the sample
            sample_idx: Index of the sample used for tracing/feature extraction

        Returns:
            Model-specific typed sample payload.
        """


class ModelComponentBuilder(ABC):
    """Contract for splitting a model into its ONNX-exportable components."""

    @abstractmethod
    def build_components(
        self,
        model: torch.nn.Module,
        sample: Any,
    ) -> list[ExportableComponent]:
        """Assemble every component that should be exported to ONNX.

        Args:
            model: PyTorch model to build components from
            sample: Typed sample payload used to prepare component inputs

        Returns:
            List of exportable model components ready for ONNX export.
        """
class BasePipelineFactory(ABC):
    """Contract every project implements to build its inference pipelines.

    The factory is used purely through classmethods; no instance is needed.
    Projects are expected to register their subclass with
    `deployment.pipelines.registry.pipeline_registry`, which looks factories up
    by the name returned from `get_project_name()`.
    """

    @classmethod
    @abstractmethod
    def get_project_name(cls) -> str:
        """Return the unique project identifier used for registry lookup."""
        raise NotImplementedError

    @classmethod
    @abstractmethod
    def create_pipeline(
        cls,
        model_spec: ModelSpec,
        pytorch_model: torch.nn.Module,
        device: DeviceSpec,
        components_cfg: ComponentsConfig,
    ) -> BaseInferencePipeline:
        """Build and return a pipeline instance for the given model spec.

        Implementations typically:
        - Validate/dispatch based on `model_spec.backend`
        - Wrap `pytorch_model` or load an ONNX/TensorRT runtime
        - Construct a `BaseInferencePipeline` subclass configured for the backend

        Args:
            model_spec: Describes the model path/device/backend and any metadata.
            pytorch_model: A loaded PyTorch model (used for PYTORCH backends).
            device: Device the pipeline is created for. The argument is required by
                this signature; how it relates to any device recorded in
                `model_spec` is left to the implementation.
            components_cfg: Project-specific component configuration (e.g., file paths, IO specs).

        Returns:
            The pipeline instance built for `model_spec`.
        """
        raise NotImplementedError

    @classmethod
    def get_supported_backends(cls) -> list:
        """Return the backends this factory can instantiate.

        The default advertises PyTorch, ONNX and TensorRT; override to narrow it.
        """
        return [Backend.PYTORCH, Backend.ONNX, Backend.TENSORRT]

    @classmethod
    def _validate_backend(cls, backend: Backend) -> None:
        """Reject backends missing from `get_supported_backends()`.

        Raises:
            ValueError: If `backend` is not supported by this factory.
        """
        allowed = cls.get_supported_backends()
        if backend in allowed:
            return

        # Report the enum values (not the members) so the message stays readable.
        supported_names = [candidate.value for candidate in allowed]
        raise ValueError(
            f"Unsupported backend '{backend.value}' for {cls.get_project_name()}. Supported backends: {supported_names}"
        )
    def __init__(
        self,
        model: torch.nn.Module,
        backend_type: Backend,
        device: DeviceSpec,
    ) -> None:
        """Create a pipeline bound to a model, a backend, and a device.

        Args:
            model: Backend-specific callable/model wrapper used by `run_model`.
            backend_type: Deployment backend enum. Stored on the instance as
                ``backend_type``.
            device: Target runtime device. Stored as-is; it must provide
                ``to_torch_device()``, which the ``torch_device`` property calls.
        """
        self.model = model
        self.device = device
        self.backend_type = backend_type

        # Lazy %-style arguments: the message is only formatted if INFO is enabled.
        logger.info("Initialized %s on device: %s", self.__class__.__name__, self.device)
+ - ``stage_latencies``: Per-substage timings in milliseconds; merged into + `~deployment.core.evaluation.evaluator_types.InferenceResult` + ``breakdown`` (e.g. ``voxel_encoder_ms``). + """ + raise NotImplementedError + + @abstractmethod + def postprocess( + self, + model_output: Any, + metadata: Optional[Mapping[str, Any]] = None, + ) -> Any: + """Convert raw model outputs into final predictions/results. + + Args: + model_output: Value returned by :meth:`run_model` (first element of its tuple). + metadata: Merged dict from ``infer(..., metadata=...)`` plus + ``preprocess_metadata`` from :meth:`preprocess`. May be empty. + """ + raise NotImplementedError + + def infer( + self, input_data: Any, metadata: Optional[Mapping[str, Any]] = None, return_raw_outputs: bool = False + ) -> InferenceResult: + """Run end-to-end inference with latency breakdown. + + Flow: + 1) preprocess(input_data) + 2) run_model(model_input) + 3) postprocess(model_output, merged_metadata) unless `return_raw_outputs=True` + + Args: + input_data: Raw input sample(s) in a project-defined format. + metadata: Optional auxiliary context merged with preprocess metadata. + return_raw_outputs: If True, skip `postprocess` and return raw model output. + + Returns: + InferenceResult with `output`, total latency, and per-stage breakdown. 
+ """ + latency_breakdown: Dict[str, float] = {} + + try: + # Preprocess + start_time = time.perf_counter() + model_input, preprocess_metadata = self.preprocess(input_data) + preprocess_time = time.perf_counter() + latency_breakdown["preprocessing_ms"] = (preprocess_time - start_time) * 1000 + # Build a new dict + metadata = {**(metadata or {}), **preprocess_metadata} + + # Run model + model_start = time.perf_counter() + model_output, model_latency = self.run_model(model_input) + model_time = time.perf_counter() + latency_breakdown["model_ms"] = (model_time - model_start) * 1000 + + latency_breakdown.update(model_latency) + + total_latency = (time.perf_counter() - start_time) * 1000 + + if return_raw_outputs: + return InferenceResult(output=model_output, latency_ms=total_latency, breakdown=latency_breakdown) + + # Postprocess + postprocess_start = time.perf_counter() + postprocess_output = self.postprocess(model_output, metadata) + postprocess_time = time.perf_counter() + latency_breakdown["postprocessing_ms"] = (postprocess_time - postprocess_start) * 1000 + + total_latency = (time.perf_counter() - start_time) * 1000 + return InferenceResult(output=postprocess_output, latency_ms=total_latency, breakdown=latency_breakdown) + + except Exception: + logger.exception("Inference failed.") + raise + + def periodic_cleanup(self, sample_idx: int) -> None: + """Per-sample cleanup hook, always called once per sample by the evaluation loop. + + The default does nothing; overriding is optional. Backends with their own + caching concerns (e.g. TensorRT freeing the CUDA cache every N samples) + override this so the loop never has to special-case a backend. + """ + + def cleanup(self) -> None: + """Release resources owned by the pipeline. + + Subclasses should override when they hold external resources (e.g., CUDA + buffers, TensorRT engines/contexts, file handles). `infer()` does not call + this automatically; use the context manager (`with pipeline:`) or call it + explicitly. 
logger = logging.getLogger(__name__)


def clear_cuda_memory() -> None:
    """Best-effort CUDA memory cleanup for long-running deployment workflows.

    Does nothing when CUDA is not available.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()


class GPUResourceMixin(ABC):
    """Mixin that provides idempotent GPU resource cleanup.

    Subclasses implement `_release_gpu_resources()`. `cleanup()` may be called any
    number of times (explicitly, via ``with`` or from the destructor); once the
    release has succeeded it becomes a no-op.
    """

    # Class-level default; the first successful cleanup sets it on the instance.
    _cleanup_called: bool = False

    @abstractmethod
    def _release_gpu_resources(self) -> None:
        """Release backend-specific GPU resources owned by the instance."""
        raise NotImplementedError

    def cleanup(self) -> None:
        """Release GPU resources once and clear CUDA caches (best effort).

        Errors are logged as warnings and never propagated. If the release itself
        fails, a later call tries again; if only the cache clearing fails, the
        resources are still considered released.
        """
        if self._cleanup_called:
            return

        try:
            self._release_gpu_resources()
            # Mark as done as soon as the backend resources are gone, so a failure
            # while clearing caches can never trigger a second release.
            self._cleanup_called = True
            clear_cuda_memory()
            logger.debug("%s: GPU resources released", self.__class__.__name__)
        except Exception as e:
            logger.warning("Error during GPU resource cleanup: %s", e)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.cleanup()
        # Never suppress an exception raised inside the ``with`` body.
        return False

    def __del__(self):
        # Destructors must not raise; module globals may already be gone at
        # interpreter shutdown, so this stays deliberately silent.
        try:
            self.cleanup()
        except Exception:
            pass


class TensorRTResourceManager:
    """Helper that tracks CUDA allocations/stream for TensorRT inference.

    This is intentionally minimal: allocate device buffers, provide a stream,
    and free everything on context exit.
    """

    def __init__(self) -> None:
        """Create an empty manager (no allocations and no stream)."""
        self._allocations: List[Any] = []
        self._stream: Optional[Any] = None

    def allocate(self, nbytes: int) -> Any:
        """Allocate `nbytes` on the device and track it for automatic cleanup."""
        allocation = cuda.mem_alloc(nbytes)
        self._allocations.append(allocation)
        return allocation

    @property
    def stream(self) -> Any:
        """Return a lazily-created CUDA stream shared by the manager."""
        if self._stream is None:
            self._stream = cuda.Stream()
        return self._stream

    def synchronize(self) -> None:
        """Synchronize the tracked CUDA stream (if created)."""
        if self._stream is not None:
            self._stream.synchronize()

    def _release_all(self) -> None:
        """Free all tracked allocations and drop the stream reference.

        Freeing is best effort: a failure on one allocation is logged at debug
        level and does not stop the remaining ones from being freed.
        """
        for allocation in self._allocations:
            try:
                allocation.free()
            except Exception:
                logger.debug("Failed to free a tracked CUDA allocation", exc_info=True)
        self._allocations.clear()
        self._stream = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Release even when synchronization fails; otherwise every tracked buffer
        # would leak. The synchronization error still propagates to the caller.
        try:
            self.synchronize()
        finally:
            self._release_all()
        return False


def release_tensorrt_resources(
    engines: Optional[Dict[str, Any]] = None,
    contexts: Optional[Dict[str, Any]] = None,
) -> None:
    """Drop references to TensorRT engines/contexts so they are released.

    Contexts are cleared before engines (TensorRT requires execution contexts to be
    released before their parent engine). Destruction itself is refcount/GC-driven.

    Args:
        engines: Mapping of engines to empty in place; ``None`` or empty is a no-op.
        contexts: Mapping of execution contexts to empty in place; ``None`` or empty
            is a no-op.
    """
    if contexts:
        contexts.clear()
    if engines:
        engines.clear()
logger = logging.getLogger(__name__)


class PipelineRegistry:
    """Registry for mapping project names to pipeline factories.

    Factories are responsible for creating a `BaseInferencePipeline` instance
    given a `ModelSpec`, a loaded PyTorch model, and a `DeviceSpec`.
    """

    def __init__(self) -> None:
        """Initialize an empty registry.

        Project modules are expected to fill it by registering their
        `BasePipelineFactory` subclasses (typically via a decorator call to
        `pipeline_registry.register(...)`).
        """
        self._factories: Dict[str, Type[BasePipelineFactory]] = {}

    def register(self, factory_cls: Type[BasePipelineFactory]) -> Type[BasePipelineFactory]:
        """Register a project factory class.

        Registering a second factory under an already-used project name replaces
        the first one and logs a warning.

        Args:
            factory_cls: A subclass of `BasePipelineFactory`.

        Returns:
            The same class, enabling decorator usage:
                `@pipeline_registry.register`
                `class MyFactory(BasePipelineFactory): ...`

        Raises:
            TypeError: If `factory_cls` is not a class derived from
                `BasePipelineFactory`.
        """
        # Check "is a class" first: issubclass() itself raises a generic TypeError
        # for non-class arguments, which would hide the message below.
        if not (isinstance(factory_cls, type) and issubclass(factory_cls, BasePipelineFactory)):
            offender = getattr(factory_cls, "__name__", repr(factory_cls))
            raise TypeError(f"Factory class must inherit from BasePipelineFactory, got {offender}")

        project_name = factory_cls.get_project_name()

        if project_name in self._factories:
            logger.warning(
                "Overwriting existing factory for project '%s': %s -> %s",
                project_name,
                self._factories[project_name].__name__,
                factory_cls.__name__,
            )

        self._factories[project_name] = factory_cls
        logger.debug("Registered pipeline factory: %s -> %s", project_name, factory_cls.__name__)
        return factory_cls

    def get_factory(self, project_name: str) -> Type[BasePipelineFactory]:
        """Return the registered factory for a project name.

        Raises:
            KeyError: If no factory is registered for the given project. The message
                lists the projects that are available.
        """
        try:
            return self._factories[project_name]
        except KeyError:
            available = list(self._factories)
            # ``from None``: the bare lookup failure adds nothing to the message.
            raise KeyError(
                f"No factory registered for project '{project_name}'. Available projects: {available}"
            ) from None

    def create_pipeline(
        self,
        project_name: str,
        model_spec: ModelSpec,
        pytorch_model: torch.nn.Module,
        device: DeviceSpec,
        components_cfg: ComponentsConfig,
    ) -> BaseInferencePipeline:
        """Create a project-specific pipeline instance using the registered factory.

        All arguments after `project_name` are forwarded unchanged, by keyword, to
        the factory's `create_pipeline`.

        Raises:
            KeyError: If `project_name` has no registered factory.
        """
        factory = self.get_factory(project_name)
        return factory.create_pipeline(
            model_spec=model_spec,
            pytorch_model=pytorch_model,
            device=device,
            components_cfg=components_cfg,
        )

    def list_projects(self) -> list:
        """List registered project names (a new list on every call)."""
        return list(self._factories)

    def is_registered(self, project_name: str) -> bool:
        """Return True if a project is registered."""
        return project_name in self._factories

    def reset(self) -> None:
        """Clear all registrations (primarily useful for tests)."""
        self._factories.clear()


# Process-wide registry instance shared by all projects.
pipeline_registry = PipelineRegistry()
+ +## Quick start + +From the repository root: + +```bash +python -m deployment.cli.main centerpoint \ + deployment/projects/centerpoint/config/deploy_config.py \ + <path/to/model_config.py> \ + --rot-y-axis-reference \ + [--log-level INFO] +``` + +Example: + +```bash +python -m deployment.cli.main centerpoint \ + deployment/projects/centerpoint/config/deploy_config.py \ + projects/CenterPoint/configs/t4dataset/Centerpoint/second_secfpn_8xb16_121m_j6gen2_base_amp_t4metric_v2.py \ + --rot-y-axis-reference +``` + +## What is project-specific here + +- Multi-component export with `pts_voxel_encoder` and `pts_backbone_neck_head` +- CenterPoint-specific CLI flag `--rot-y-axis-reference` +- CenterPoint evaluator, loaders, export pipelines, and backend inference pipelines + +## Config file + +The reference deploy config is `deployment/projects/centerpoint/config/deploy_config.py`. + +Adjust at least: + +- `checkpoint_path` +- `export.work_dir`, `export.mode`, and `export.sample_idx` +- `components` + +Required component keys are `pts_voxel_encoder` and `pts_backbone_neck_head`. + +The evaluation/verification dataset comes from the **model config's** `test_dataloader.dataset.ann_file` (the test info), not from the deploy config.
+ +## Project layout + +| Path | Role | +| --- | --- | +| `entrypoint.py` | Builds config, loader, evaluator, runner, and export context | +| `runner.py` | `CenterPointDeploymentRunner` | +| `cli.py` | Project-specific CLI flags | +| `config/` | Deploy config | +| `io/` | Data loading and model loading helpers | +| `eval/` | CenterPoint evaluator and metrics helpers | +| `pipelines/` | PyTorch, ONNX, and TensorRT pipelines | +| `export/` | CenterPoint export orchestration | +| `onnx_models/` | Export-time ONNX wrappers | + +## Shared docs + +- [../../docs/runbook.md](../../docs/runbook.md) for CLI behavior and run flow +- [../../docs/configuration.md](../../docs/configuration.md) for shared config reference +- [../../docs/architecture.md](../../docs/architecture.md) for framework structure +- [../../docs/operations.md](../../docs/operations.md) for troubleshooting diff --git a/deployment/projects/centerpoint/__init__.py b/deployment/projects/centerpoint/__init__.py new file mode 100644 index 000000000..d6d1df8cb --- /dev/null +++ b/deployment/projects/centerpoint/__init__.py @@ -0,0 +1,23 @@ +"""CenterPoint deployment bundle. + +Import concrete modules (``deployment.projects.centerpoint.runner``, …). This ``__init__`` only +registers the project with ``deployment.projects.registry`` when the package is imported. +""" + +from __future__ import annotations + +from deployment.projects.centerpoint.cli import add_args +from deployment.projects.centerpoint.entrypoint import run + +# Trigger pipeline factory registration for this project. 
def add_args(parser: argparse.ArgumentParser) -> None:
    """Attach the CenterPoint-only options to the given project subparser.

    Adds one boolean switch, ``--rot-y-axis-reference``; it is ``False`` unless
    the flag is present on the command line.
    """
    switch_options = {
        "action": "store_true",
        "help": "Convert rotation to y-axis clockwise reference (CenterPoint ONNX-compatible format)",
    }
    parser.add_argument("--rot-y-axis-reference", **switch_options)
# ============================================================================
# 1. SHARED VALUES (single source of truth)
#    Change a path/device/shape here once; every section below references it.
# ============================================================================

# Checkpoint - single source of truth for the PyTorch model (used by export + PyTorch eval).
checkpoint_path = "work_dirs/centerpoint/best_checkpoint.pth"

# Log file path (relative paths are resolved under export.work_dir). None disables file logging.
deploy_log_path = "deployment.log"

# Device settings (shared by export, evaluation, verification).
# The verification section reuses this very dict object (`devices=devices`).
devices = dict(
    cpu="cpu",
    cuda="cuda:0",
)
# Alias reused by the per-backend evaluation settings below so the CUDA device is written once.
_CUDA = devices["cuda"]

# Deployment output layout. _ONNX_DIR / _TENSORRT_DIR are the single source for both the
# export outputs and the evaluation backends' model_dir / engine_dir (kept in sync here).
_DEPLOY_WORK_DIR = "work_dirs/centerpoint_deployment"
# Strip any trailing "/" so the f-strings below never produce a "//" in the derived paths.
_WORK_DIR = _DEPLOY_WORK_DIR.rstrip("/")
_ONNX_DIR = f"{_WORK_DIR}/onnx"
_TENSORRT_DIR = f"{_WORK_DIR}/tensorrt"

# TensorRT profile shapes (hoisted so repeated/grid-derived literals live in one place).
# Voxel encoder input: [num_voxels, num_points_per_voxel, voxel_feature_dim].
_NUM_POINTS_PER_VOXEL = 32
_VOXEL_FEATURE_DIM = 11
# Backbone/neck/head input: [batch, channels, grid_h, grid_w] (check grid size in model config).
# min == opt == max here because the BEV grid is fixed for this model.
# NOTE: the same list object is used for min_shape, opt_shape and max_shape below, so it
# must not be mutated in place.
_SPATIAL_FEATURE_SHAPE = [1, 32, 1020, 1020]
EXPORT +# ============================================================================ + +# Export Configuration +# mode: "onnx", "trt", "both", "none" +# work_dir: path to the deployment output root +# onnx_path: path to the ONNX output directory (if mode="trt" and ONNX already exists) +# sample_idx: dataset index of the sample used to trace/shape the exported model +export = dict( + mode="both", + work_dir=_DEPLOY_WORK_DIR, + onnx_path=_ONNX_DIR, + sample_idx=0, +) + +# ONNX Export Settings (shared across all components). +onnx_config = dict( + opset_version=17, + do_constant_folding=True, + export_params=True, + keep_initializers_as_inputs=False, + simplify=False, +) + +# TensorRT Build Settings (shared across all components). +# Supports `auto`, `fp16`, `fp32_tf32`, and `strongly_typed`. +tensorrt_config = dict( + precision_policy="fp16", + max_workspace_size=2 << 30, +) + +# Unified Component Configuration (Single Source of Truth) +# +# Component key is the unique identifier (used for config lookup, filenames, logs). +# Each component defines: +# - onnx_file: Output ONNX filename +# - engine_file: Output TensorRT engine filename +# - io: Input/output specification for ONNX export +# - tensorrt_profile: TensorRT optimization profile (min/opt/max shapes) +components = dict( + pts_voxel_encoder=dict( + onnx_file="pts_voxel_encoder.onnx", + engine_file="pts_voxel_encoder.engine", + io=dict( + inputs=[ + dict(name="input_features", dtype="float32"), + ], + outputs=[ + dict(name="pillar_features", dtype="float32"), + ], + dynamic_axes={ + "input_features": {0: "num_voxels", 1: "num_max_points"}, + "pillar_features": {0: "num_voxels"}, + }, + ), + tensorrt_profile=dict( + input_features=dict( + # Make sure to match the shape of the input to the model. 
+ # [num_voxels, num_points_per_voxel, voxel_feature_dim] + min_shape=[1000, _NUM_POINTS_PER_VOXEL, _VOXEL_FEATURE_DIM], + opt_shape=[20000, _NUM_POINTS_PER_VOXEL, _VOXEL_FEATURE_DIM], + max_shape=[96000, _NUM_POINTS_PER_VOXEL, _VOXEL_FEATURE_DIM], + ), + ), + ), + pts_backbone_neck_head=dict( + onnx_file="pts_backbone_neck_head.onnx", + engine_file="pts_backbone_neck_head.engine", + io=dict( + inputs=[ + dict(name="spatial_features", dtype="float32"), + ], + outputs=[ + dict(name="heatmap", dtype="float32"), + dict(name="reg", dtype="float32"), + dict(name="height", dtype="float32"), + dict(name="dim", dtype="float32"), + dict(name="rot", dtype="float32"), + dict(name="vel", dtype="float32"), + ], + dynamic_axes={ + "spatial_features": {0: "batch_size", 2: "height", 3: "width"}, + "heatmap": {0: "batch_size", 2: "height", 3: "width"}, + "reg": {0: "batch_size", 2: "height", 3: "width"}, + "height": {0: "batch_size", 2: "height", 3: "width"}, + "dim": {0: "batch_size", 2: "height", 3: "width"}, + "rot": {0: "batch_size", 2: "height", 3: "width"}, + "vel": {0: "batch_size", 2: "height", 3: "width"}, + }, + ), + tensorrt_profile=dict( + spatial_features=dict( + # Make sure to match the shape of the input to the model. + # check grid size in the model config + min_shape=_SPATIAL_FEATURE_SHAPE, + opt_shape=_SPATIAL_FEATURE_SHAPE, + max_shape=_SPATIAL_FEATURE_SHAPE, + ), + ), + ), +) + +# ============================================================================ +# 3. EVALUATION +# ============================================================================ +evaluation = dict( + enabled=True, + num_samples=5, + num_warmup=3, + verbose=True, + backends=dict( + pytorch=dict( + enabled=True, + device=_CUDA, + ), + onnx=dict( + enabled=True, + device=_CUDA, + model_dir=_ONNX_DIR, + ), + tensorrt=dict( + enabled=True, + device=_CUDA, + engine_dir=_TENSORRT_DIR, + ), + ), +) + +# ============================================================================ +# 4. 
VERIFICATION +# +# Tolerance is backend- and machine-dependent: +# - The same scenario can show very different max/mean diffs on different machines: GPU +# architecture, driver, ORT/CUDA/TRT versions, and ORT's CUDA graph partitioning (CPU +# fallback nodes for small ops) all change numerics. ONNX on CPU, ONNX on CUDA, and +# TensorRT on CUDA are not directly comparable to each other as "one true" references. +# - Additionally, the verification configuration should use a precision-aware tolerance, +# especially when FP16 is enabled. +# ============================================================================ +verification = dict( + enabled=False, + # TODO(vividf): double check the tolerance value + tolerance=1, + num_verify_samples=1, + devices=devices, + scenarios=dict( + both=[ + dict(ref_backend="pytorch", ref_device="cpu", test_backend="onnx", test_device="cpu"), + dict(ref_backend="onnx", ref_device="cuda", test_backend="tensorrt", test_device="cuda"), + ], + onnx=[ + dict(ref_backend="pytorch", ref_device="cpu", test_backend="onnx", test_device="cpu"), + ], + trt=[ + dict(ref_backend="onnx", ref_device="cuda", test_backend="tensorrt", test_device="cuda"), + ], + none=[], + ), +) diff --git a/deployment/projects/centerpoint/entrypoint.py b/deployment/projects/centerpoint/entrypoint.py new file mode 100644 index 000000000..335a3e42c --- /dev/null +++ b/deployment/projects/centerpoint/entrypoint.py @@ -0,0 +1,74 @@ +"""CenterPoint deployment entrypoint invoked by the unified CLI.""" + +from __future__ import annotations + +import argparse + +from mmengine.config import Config + +from deployment.cli.args import add_deployment_file_logging, setup_logging +from deployment.configs.base import BaseDeploymentConfig +from deployment.core.contexts import CenterPointExportContext +from deployment.projects.centerpoint.eval.evaluator import CenterPointEvaluator +from deployment.projects.centerpoint.eval.executor import CenterPointExecutor +from 
def run(args: argparse.Namespace) -> int:
    """Drive the full CenterPoint deployment flow for the unified CLI.

    Steps, in order: configure logging, load both config files, attach the
    optional log file, validate the required components, build the data loader,
    metrics config, executor, evaluator and runner, then execute the runner.

    Args:
        args: Parsed command-line arguments. Reads ``log_level``, ``deploy_cfg``,
            ``model_cfg`` and, if present, ``rot_y_axis_reference``.

    Returns:
        Exit code (0 for success). Failures surface as exceptions, not as a
        non-zero return value.
    """
    logger = setup_logging(args.log_level)

    # Load both config files before building the typed deployment config.
    deploy_cfg = Config.fromfile(args.deploy_cfg)
    model_cfg = Config.fromfile(args.model_cfg)
    config = BaseDeploymentConfig(deploy_cfg)

    log_file = config.resolved_deploy_log_file
    if log_file:
        add_deployment_file_logging(log_file)
        logger.info("Deployment log file: %s", log_file)

    project_registry.validate_required_components("centerpoint", config.components_cfg)

    separator = "=" * 80
    for banner_line in (separator, "CenterPoint Deployment Pipeline", separator):
        logger.info(banner_line)

    data_loader = CenterPointDataLoader(model_cfg=model_cfg)
    logger.info("Loaded %s samples", data_loader.num_samples)

    metrics_config = extract_t4metric_v2_config(model_cfg, logger=logger)

    # A single executor is handed to both the evaluator and the runner so they
    # operate on the same instance.
    executor = CenterPointExecutor(components_cfg=config.components_cfg)
    evaluator = CenterPointEvaluator(
        model_cfg=model_cfg,
        metrics_config=metrics_config,
        executor=executor,
    )
    runner = CenterPointDeploymentRunner(
        data_loader=data_loader,
        evaluator=evaluator,
        executor=executor,
        config=config,
        model_cfg=model_cfg,
        logger=logger,
    )

    # The flag may be missing from the namespace; treat that as "off".
    use_y_axis_reference = bool(getattr(args, "rot_y_axis_reference", False))
    runner.run(context=CenterPointExportContext(rot_y_axis_reference=use_y_axis_reference))
    return 0
@override
def _parse_ground_truths(self, gt_data: Mapping[str, object]) -> List[Dict]:
    """Convert raw ground-truth arrays into per-object dicts for the metrics interface.

    Args:
        gt_data: Mapping holding ``gt_bboxes_3d`` (array-like of boxes, either
            multi-dimensional with the box values on the last axis, or a flat
            sequence) and ``gt_labels_3d`` (array-like of integer class ids).

    Returns:
        One ``{"bbox_3d": list[float], "label": int}`` dict per box, in input order.

    Raises:
        KeyError: If ``gt_bboxes_3d`` or ``gt_labels_3d`` is missing.
        ValueError: If the number of labels differs from the number of boxes.
    """
    for required_key in ("gt_bboxes_3d", "gt_labels_3d"):
        if required_key not in gt_data:
            raise KeyError(f"{required_key} not found in ground truth data.")

    # Convert once; the width of the last axis is the same regardless of dtype.
    boxes = np.asarray(gt_data["gt_bboxes_3d"], dtype=np.float32)
    # Flat or empty input carries no per-box width, so fall back to 7 values per
    # box (presumably x, y, z, dx, dy, dz, yaw -- confirm against the dataset).
    default_box_dim = 7
    box_dim = boxes.shape[-1] if boxes.ndim > 1 else default_box_dim
    boxes = boxes.reshape(-1, box_dim)

    labels = np.asarray(gt_data["gt_labels_3d"], dtype=np.int64).reshape(-1)

    # A count mismatch means the annotation is inconsistent; fail loudly instead
    # of silently dropping labels or crashing with an opaque IndexError.
    if len(labels) != len(boxes):
        raise ValueError(
            f"gt_labels_3d has {len(labels)} entries but gt_bboxes_3d has {len(boxes)} boxes."
        )

    return [{"bbox_3d": box.tolist(), "label": int(label)} for box, label in zip(boxes, labels)]
@override
def summarize_for_comparison(self, results: EvalResultDict) -> List[str]:
    """Build the per-mode mAP/mAPH lines used in the cross-backend comparison.

    Args:
        results: Evaluation results. ``mAP_by_mode`` and ``mAPH_by_mode`` are read
            when present; a missing or empty entry contributes no lines.

    Returns:
        The mAP lines, then the mAPH lines, then the lines produced by the parent
        implementation.
    """
    summary: List[str] = []

    map_by_mode = results.get("mAP_by_mode") or {}
    summary.extend(f" mAP ({mode}): {score:.4f}" for mode, score in map_by_mode.items())

    maph_by_mode = results.get("mAPH_by_mode") or {}
    summary.extend(f" mAPH ({mode}): {score:.4f}" for mode, score in maph_by_mode.items())

    summary.extend(super().summarize_for_comparison(results))
    return summary
+ ) + latency_stats = results["latency"] + latency_dict = latency_stats.to_dict() + logger.info("") + logger.info("Latency Statistics:") + logger.info(" Mean: %.2f ms", latency_dict["mean_ms"]) + logger.info(" Std: %.2f ms", latency_dict["std_ms"]) + logger.info(" Min: %.2f ms", latency_dict["min_ms"]) + logger.info(" Max: %.2f ms", latency_dict["max_ms"]) + logger.info(" Median: %.2f ms", latency_dict["median_ms"]) + + if "latency_breakdown" in results: + breakdown = results["latency_breakdown"] + breakdown_dict = breakdown.to_dict() if hasattr(breakdown, "to_dict") else breakdown + + if breakdown_dict: + logger.info("") + logger.info("Stage-wise Latency Breakdown:") + top_level_stages = {"preprocessing_ms", "model_ms", "postprocessing_ms"} + for stage, stats in breakdown_dict.items(): + stats_dict = stats.to_dict() if hasattr(stats, "to_dict") else stats + stage_name = stage.replace("_ms", "").replace("_", " ").title() + + output_format = ( + " %-18s: %.2f ± %.2f ms" if stage in top_level_stages else " %-16s: %.2f ± %.2f ms" + ) + logger.info( + output_format, + stage_name, + stats_dict["mean_ms"], + stats_dict["std_ms"], + ) + + logger.info("") + logger.info("Total Samples: %s", results["num_samples"]) diff --git a/deployment/projects/centerpoint/eval/executor.py b/deployment/projects/centerpoint/eval/executor.py new file mode 100644 index 000000000..fae011b02 --- /dev/null +++ b/deployment/projects/centerpoint/eval/executor.py @@ -0,0 +1,88 @@ +""" +CenterPoint backend executor. + +Implements the task-specific backend execution primitives (pipeline creation and +input preparation) for CenterPoint, shared by the evaluator and the verification +runner via `~deployment.core.evaluation.backend_executor.BackendExecutor`. 
@override
def get_output_names(self) -> Optional[List[str]]:
    """List the output names declared for the ``pts_backbone_neck_head`` component.

    Returns:
        The ``name`` of every entry in that component's ``io.outputs``, in
        configuration order.
    """
    head_component = self._components_cfg.get_component("pts_backbone_neck_head")
    declared_outputs = head_component.io.outputs
    return [output_spec.name for output_spec in declared_outputs]
def extract_t4metric_v2_config(
    model_cfg: Config,
    logger: logging.Logger,
) -> Detection3DMetricsConfig:
    """Build a `Detection3DMetricsConfig` from the ``val_evaluator`` of a model config.

    Args:
        model_cfg: MMEngine model configuration providing ``class_names`` and a
            ``val_evaluator`` entry whose ``type`` is ``T4MetricV2``.
        logger: Logger instance (accepted for interface compatibility; not used
            by this function).

    Returns:
        Detection3DMetricsConfig populated from the evaluator settings.

    Raises:
        ValueError: If any required key/attribute is missing, or if the evaluator
            type is not ``T4MetricV2``.
    """

    def _require(source: Config | ConfigDict, name: str) -> Any:
        """Return ``source[name]`` or ``source.name``; raise ValueError if neither exists."""
        if name in source:
            return source[name]
        if not hasattr(source, name):
            raise ValueError(f"Missing required key/attribute '{name}'")
        return getattr(source, name)

    # Values are read in a fixed order so the first missing key is the one reported.
    settings = {"class_names": _require(model_cfg, "class_names")}
    evaluator_cfg = _require(model_cfg, "val_evaluator")

    evaluator_type = _require(evaluator_cfg, "type")
    if evaluator_type != _T4METRIC_V2_EVALUATOR_TYPE:
        raise ValueError(f"Evaluator type is '{evaluator_type}', not '{_T4METRIC_V2_EVALUATOR_TYPE}'")

    perception_cfg = _require(evaluator_cfg, "perception_evaluator_configs")
    settings["evaluation_config_dict"] = _require(perception_cfg, "evaluation_config_dict")
    settings["frame_id"] = _require(perception_cfg, "frame_id")

    settings["critical_object_filter_config"] = _require(evaluator_cfg, "critical_object_filter_config")
    settings["frame_pass_fail_config"] = _require(evaluator_cfg, "frame_pass_fail_config")

    return Detection3DMetricsConfig(**settings)
def build_components(
    self,
    model: torch.nn.Module,
    sample: CenterPointFeatureSample,
) -> list[ExportableComponent]:
    """Assemble the two exportable CenterPoint components from a typed sample.

    Args:
        model: CenterPoint model holding the submodules to export.
        sample: Typed export sample forwarded to each component factory.

    Returns:
        ``[voxel encoder component, backbone/neck/head component]``, in that order.
    """
    log = self.logger
    log.info("Extracting CenterPoint components for export...")

    # Voxel encoder first, then backbone/neck/head: the list order is the export order.
    factories = (
        self._create_voxel_encoder_component,
        self._create_backbone_component,
    )
    components = [make_component(model, sample) for make_component in factories]

    log.info("Extracted 2 components: pts_voxel_encoder, pts_backbone_neck_head")
    return components
def _create_backbone_module(self, model: torch.nn.Module) -> torch.nn.Module:
    """Combine backbone, neck, and bbox head into a single `CenterPointHeadONNX` module.

    Args:
        model: CenterPoint model exposing ``pts_backbone``, ``pts_neck`` and
            ``pts_bbox_head``.

    Returns:
        `CenterPointHeadONNX` constructed from those three submodules.
    """
    backbone = model.pts_backbone
    neck = model.pts_neck
    bbox_head = model.pts_bbox_head
    return CenterPointHeadONNX(backbone, neck, bbox_head)
+""" + +from __future__ import annotations + +import logging +from pathlib import Path + +import torch +from typing_extensions import override + +from deployment.configs.base import BaseDeploymentConfig +from deployment.core.artifacts import Artifact +from deployment.core.io.base_data_loader import BaseDataLoader +from deployment.exporters.common.factory import ExporterFactory +from deployment.exporters.common.model_wrappers import IdentityWrapper +from deployment.exporters.common.onnx_exporter import ONNXExporter +from deployment.exporters.export_pipelines.base import OnnxExportPipeline +from deployment.exporters.export_pipelines.interfaces import ( + ExportableComponent, + ExportSampleAdapter, + ModelComponentBuilder, +) +from deployment.projects.centerpoint.io.sample_types import CenterPointFeatureSample + + +class CenterPointONNXExportPipeline(OnnxExportPipeline): + """ONNX export pipeline for CenterPoint (multi-file export). + + Uses a sample adapter + component builder to split the model into exportable + components and exports each with the configured ONNX exporter. + """ + + def __init__( + self, + exporter_factory: type[ExporterFactory], + sample_adapter: ExportSampleAdapter, + component_builder: ModelComponentBuilder, + logger: logging.Logger | None = None, + ) -> None: + """Initialize the pipeline with exporter factory, adapter, and builder. + + Args: + exporter_factory: Factory used to create ONNX exporters per component. + sample_adapter: Adapter that extracts typed sample payload. + component_builder: Builder that creates exportable components from sample. + logger: Optional logger; defaults to module logger if not provided. 
@override
def export(
    self,
    *,
    model: torch.nn.Module,
    data_loader: BaseDataLoader,
    output_dir: str,
    config: BaseDeploymentConfig,
    sample_idx: int = 0,
) -> Artifact:
    """Export CenterPoint as multiple ONNX files, one per component.

    Creates the output directory if needed, extracts a typed sample, asks the
    component builder to split the model, and exports every component.

    Args:
        model: CenterPoint model to export.
        data_loader: Loader that supplies the sample used for export.
        output_dir: Directory that receives the ONNX files (created if missing).
        config: Deployment config forwarded to the per-component exporter.
        sample_idx: Index of the sample to use (default 0).

    Returns:
        Artifact whose ``path`` is the output directory.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    self._log_header(target_dir, sample_idx)

    export_sample = self._extract_sample_data(model, data_loader, sample_idx)
    export_components = self.component_builder.build_components(model, export_sample)

    written_files = self._export_components(export_components, target_dir, config)
    self._log_summary(written_files)

    return Artifact(path=str(target_dir))
def _export_components(
    self,
    components: list[ExportableComponent],
    output_dir: Path,
    config: BaseDeploymentConfig,
) -> list[str]:
    """Write one ``<component name>.onnx`` file per component into ``output_dir``.

    Args:
        components: Exportable components (name, module, sample_input).
        output_dir: Directory that receives the ``.onnx`` files.
        config: Deployment config used to build each component's exporter.

    Returns:
        String paths of the exported files (``output_dir`` joined with the file
        name), in component order.

    Raises:
        RuntimeError: If exporting any component fails; the original exception is
            chained as the cause.
    """
    total = len(components)
    written: list[str] = []

    for position, component in enumerate(components, start=1):
        name = component.name
        self.logger.info("\n[%s/%s] Exporting %s...", position, total, name)

        onnx_file = output_dir / f"{name}.onnx"
        exporter = self._build_onnx_exporter(config, component_name=name)

        try:
            exporter.export(
                model=component.module,
                sample_input=component.sample_input,
                output_path=str(onnx_file),
            )
        except Exception as exc:
            # Log with traceback, then surface a uniform error type to the caller.
            self.logger.error("Failed to export %s", name, exc_info=True)
            raise RuntimeError(f"{name} export failed") from exc

        written.append(str(onnx_file))
        self.logger.info("Exported %s: %s", name, onnx_file)

    return written
+ """ + self.logger.info("\n" + "=" * 80) + self.logger.info("CenterPoint ONNX export successful") + self.logger.info("=" * 80) + for path in exported_paths: + self.logger.info(" • %s", Path(path).name) diff --git a/deployment/projects/centerpoint/export/tensorrt_export_pipeline.py b/deployment/projects/centerpoint/export/tensorrt_export_pipeline.py new file mode 100644 index 000000000..026a94d1b --- /dev/null +++ b/deployment/projects/centerpoint/export/tensorrt_export_pipeline.py @@ -0,0 +1,147 @@ +""" +CenterPoint TensorRT export pipeline using composition. + +Reads ONNX paths from ``deploy_config`` ``components`` (same rules as +``resolve_artifact_path``) and builds one TensorRT engine per component into +``output_dir``. +""" + +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Optional + +import torch +from typing_extensions import override + +from deployment.configs.base import BaseDeploymentConfig +from deployment.configs.schema import ComponentsConfig +from deployment.core.artifacts import Artifact, resolve_artifact_path +from deployment.core.device import DeviceSpec +from deployment.exporters.common.factory import ExporterFactory +from deployment.exporters.export_pipelines.base import TensorRTExportPipeline + + +class CenterPointTensorRTExportPipeline(TensorRTExportPipeline): + """TensorRT export pipeline for CenterPoint. + + Iterates ``components`` in deploy config order and builds one engine per + component from the configured ``onnx_file`` under ``onnx_path``. + """ + + def __init__( + self, + exporter_factory: type[ExporterFactory], + components_cfg: ComponentsConfig, + logger: Optional[logging.Logger] = None, + ) -> None: + """Initialize the pipeline with exporter factory and components config. + + Args: + exporter_factory: Factory used to create TensorRT exporters per component. + components_cfg: Config defining component names, onnx_file and engine_file paths. 
@override
def export(
    self,
    *,
    onnx_path: str,
    output_dir: str,
    config: BaseDeploymentConfig,
    device: DeviceSpec,
) -> Artifact:
    """Build one TensorRT engine per configured component under ``output_dir``.

    For each entry of the components config, the ONNX file is resolved under
    ``onnx_path`` via ``resolve_artifact_path`` and the engine is written to
    ``output_dir`` joined with the component's ``engine_file``.

    Args:
        onnx_path: Existing directory that contains the component ONNX files.
        output_dir: Directory that receives the engine files (created if missing).
        config: Deployment config forwarded to the TensorRT exporter factory.
        device: CUDA device on which the engines are built.

    Returns:
        Artifact whose ``path`` is the output directory.

    Raises:
        ValueError: If ``onnx_path`` is not a directory, the components config is
            empty, or ``device`` is not a CUDA device.
        FileNotFoundError: Presumably raised by ``resolve_artifact_path`` when a
            configured ONNX file is missing -- that helper is not visible here.
    """
    onnx_root = Path(onnx_path)
    if not onnx_root.is_dir():
        raise ValueError(f"onnx_path must be a directory for multi-file export, got: {onnx_path}")

    entries = list(self._components_cfg.items())
    if not entries:
        raise ValueError("components config is empty; nothing to export to TensorRT.")

    cuda_index = self._validate_cuda_device(device)
    self.logger.info("Using CUDA device: %s", device)

    engine_root = Path(output_dir)
    engine_root.mkdir(parents=True, exist_ok=True)

    onnx_root_str = str(onnx_root)
    total = len(entries)
    # Select the CUDA device through a context manager so the choice applies only
    # for the duration of this export.
    with torch.cuda.device(cuda_index):
        # Count from 1 so progress reads [1/N] ... [N/N].
        for step, (name, component) in enumerate(entries, start=1):
            source_onnx = resolve_artifact_path(
                base_dir=onnx_root_str,
                components_cfg=self._components_cfg,
                component_name=name,
                file_key="onnx_file",
            )
            engine_path = engine_root / component.engine_file
            # engine_file may contain sub-directories; make sure they exist.
            engine_path.parent.mkdir(parents=True, exist_ok=True)

            self.logger.info(
                "\n[%s/%s] Converting %s → %s...",
                step,
                total,
                Path(source_onnx).name,
                engine_path.name,
            )

            trt_exporter = self.exporter_factory.create_tensorrt_exporter(
                config=config,
                logger=self.logger,
                component_name=name,
            )
            built = trt_exporter.export(
                model=None,
                sample_input=None,
                output_path=str(engine_path),
                onnx_path=source_onnx,
            )
            self.logger.info("TensorRT engine saved: %s", built.path)

    self.logger.info("\nAll TensorRT engines exported successfully to %s", engine_root)
    return Artifact(path=str(engine_root))
def _build_dataset(self, model_cfg: Config) -> torch.utils.data.Dataset:
    """Build the dataset described by ``model_cfg.test_dataloader.dataset``.

    The dataset config is deep-copied before ``test_mode`` is forced to ``True``,
    so the caller's config is left untouched.

    Args:
        model_cfg: MMEngine model config with ``test_dataloader.dataset``.

    Returns:
        Dataset instance built through the ``DATASETS`` registry.

    Raises:
        AttributeError: If ``model_cfg.test_dataloader`` is missing.
    """
    # Switch the default registry scope to mmdet3d before building, so that
    # dataset and transform types resolve from that scope.
    init_default_scope("mmdet3d")

    test_dataset_cfg = copy.deepcopy(model_cfg.test_dataloader.dataset)
    test_dataset_cfg["test_mode"] = True

    return DATASETS.build(test_dataset_cfg)
+ ground_truth = dict(eval_ann_info) + + return CenterPointSample( + points=points_tensor, + metainfo=dict(metainfo), + ground_truth=ground_truth, + ) + + @override + def preprocess(self, sample: CenterPointSample) -> CenterPointModelInput: + """Extract points and metainfo from loaded sample. + + This is a lightweight operation - pipeline already ran in load_sample(). + + Args: + sample: Result of :meth:`load_sample` with keys ``points`` and ``metainfo``. + + Returns: + Dict with keys ``points`` and ``metainfo`` for inference. + """ + return CenterPointModelInput( + points=sample["points"], + metainfo=sample["metainfo"], + ) + + @property + @override + def num_samples(self) -> int: + """Return the number of samples in the dataset.""" + return len(self.dataset) diff --git a/deployment/projects/centerpoint/io/model_loader.py b/deployment/projects/centerpoint/io/model_loader.py new file mode 100644 index 000000000..f2b816ced --- /dev/null +++ b/deployment/projects/centerpoint/io/model_loader.py @@ -0,0 +1,122 @@ +""" +CenterPoint model loading utilities. + +This module provides ONNX-compatible model building from MMEngine configs. +""" + +from __future__ import annotations + +import copy +from typing import Tuple + +import torch +from mmengine.config import Config +from mmengine.registry import MODELS, init_default_scope +from mmengine.runner import load_checkpoint + +from deployment.core.device import DeviceSpec +from deployment.projects.centerpoint.onnx_models import ( # noqa: F401 - register MODELS + centerpoint_head_onnx, + centerpoint_onnx, + pillar_encoder_onnx, +) + + +def create_onnx_model_cfg( + model_cfg: Config, + device: DeviceSpec, + rot_y_axis_reference: bool = False, +) -> Config: + """Create a model config that swaps modules to ONNX-friendly variants. + + This mutates the `model_cfg.model` subtree to reference classes registered by + `deployment.projects.centerpoint.onnx_models` (e.g., `CenterPointONNX`). 
def build_model_from_cfg(
    model_cfg: Config,
    checkpoint_path: str,
    device: DeviceSpec,
) -> torch.nn.Module:
    """Instantiate the model described by ``model_cfg.model`` and load its weights.

    Args:
        model_cfg: MMEngine configuration; only its ``model`` subtree is built.
        checkpoint_path: Path to the checkpoint file.
        device: Target device specification.

    Returns:
        The built model, moved to the target device, with the checkpoint loaded,
        switched to eval mode, and with ``model_cfg`` attached as ``.cfg``.
    """
    # The registry lookup below relies on the mmdet3d default scope.
    init_default_scope("mmdet3d")

    # Build from a deep copy of the ``model`` subtree so model_cfg is not the object handed to the registry.
    net = MODELS.build(copy.deepcopy(model_cfg.model))

    target_device = device.to_torch_device()
    net.to(target_device)
    load_checkpoint(net, checkpoint_path, map_location=target_device)

    net.eval()
    net.cfg = model_cfg
    return net
def extract_sample(
    self,
    model: torch.nn.Module,
    data_loader: BaseDataLoader,
    sample_idx: int,
) -> CenterPointFeatureSample:
    """Extract a typed export sample from the model and data loader.

    Calls ``model._extract_features(data_loader, sample_idx)``, validates the
    returned pair, and keeps only the required voxel keys.

    Args:
        model: CenterPoint model exposing ``_extract_features``.
        data_loader: Loader forwarded to ``model._extract_features``.
        sample_idx: Index of the sample to extract.

    Returns:
        ``CenterPointFeatureSample`` holding ``input_features`` and a voxel dict
        restricted to ``_REQUIRED_VOXEL_KEYS``.

    Raises:
        AttributeError: If model does not have ``_extract_features``.
        AssertionError: If ``input_features`` is not a tensor or ``voxel_dict``
            is not a mapping.
        KeyError: If ``voxel_dict`` is missing required keys.
        TypeError: If a required ``voxel_dict`` entry is not a tensor.
    """
    if not hasattr(model, "_extract_features"):
        raise AttributeError(
            "CenterPoint model must have _extract_features method for ONNX export. "
            "Please ensure the model is built with ONNX compatibility."
        )

    input_features, voxel_dict = model._extract_features(data_loader, sample_idx)

    # Raise explicitly instead of using ``assert`` so the checks still run under
    # ``python -O``; AssertionError is kept because it is the documented type.
    if not isinstance(input_features, torch.Tensor):
        raise AssertionError(f"input_features must be torch.Tensor, got {type(input_features).__name__}")
    if not isinstance(voxel_dict, Mapping):
        raise AssertionError(f"voxel_dict must be Mapping, got {type(voxel_dict).__name__}")

    missing = [key for key in self._REQUIRED_VOXEL_KEYS if key not in voxel_dict]
    if missing:
        raise KeyError(f"voxel_dict missing keys: {missing}")

    invalid = {
        key: type(voxel_dict[key]).__name__
        for key in self._REQUIRED_VOXEL_KEYS
        if not isinstance(voxel_dict[key], torch.Tensor)
    }
    if invalid:
        raise TypeError(f"voxel_dict invalid tensor fields: {invalid}")

    # Drop any extra entries so downstream code sees exactly the required keys.
    validated_voxel_dict: VoxelDict = {k: voxel_dict[k] for k in self._REQUIRED_VOXEL_KEYS}

    return CenterPointFeatureSample(
        input_features=input_features,
        voxel_dict=validated_voxel_dict,
    )
def compute_batch_size(coors: torch.Tensor) -> int:
    """Infer batch size from voxel coordinates.

    Column 0 of ``coors`` is read as the batch index. The batch size is the
    largest batch index plus one, so the result does not depend on the rows
    being sorted by batch index.

    Args:
        coors: Voxel coordinate tensor with the batch index in column 0.

    Returns:
        ``max(coors[:, 0]) + 1``, or 1 when ``coors`` has no rows.
    """
    if len(coors) == 0:
        return 1
    # Use the maximum rather than the last row: identical for sorted input,
    # and still correct if the rows arrive in another order.
    return int(coors[:, 0].max().item()) + 1
@MODELS.register_module()
class SeparateHeadONNX(SeparateHead):
    """ONNX-export variant of ``SeparateHead`` that pins the order of ``self.heads``."""

    def __init__(self, **kwargs) -> None:
        """Build the base head, then replace ``self.heads`` with a fixed key order.

        The resulting key order is ``heatmap``, ``reg``, ``height``, ``dim``,
        every original head whose name contains ``"rot"``, then ``vel``.
        All values are ``None``; only the key order is kept.

        Args:
            **kwargs: Forwarded unchanged to ``SeparateHead``.
        """
        super().__init__(**kwargs)
        self._logger = MMLogger.get_current_instance()
        self._logger.info("Running SeparateHeadONNX!")

        # Note: to fix the output order
        ordered_heads: Dict[str, None] = dict.fromkeys(("heatmap", "reg", "height", "dim"))
        # Rotation heads are taken from the heads the base class was configured with.
        ordered_heads.update((name, None) for name in self.heads if "rot" in name)
        ordered_heads["vel"] = None

        self.heads: Dict[str, None] = ordered_heads
def _export_forward_rot_y_axis_reference(self, head_tensors: Dict[str, torch.Tensor]) -> Tuple[torch.Tensor, ...]:
    """Return the six head outputs converted to the y-axis rotation reference.

    Every head goes through the same two operations, a channel gather followed
    by a per-channel multiply, even where both are identities.

    TODO(KokSeang): This is a dirty and quick fix, we need to add the same operation to all
    outputs to prevent reordering from ONNX export. However, we probably should use onnx_graphsurgeon
    to modify them manually.

    Args:
        head_tensors: Head name to tensor; must contain ``heatmap``, ``reg``,
            ``height``, ``dim``, ``rot`` and ``vel``. Channels are indexed on
            dimension 1.

    Returns:
        Tuple ``(heatmap, reg, height, dim, rot, vel)`` where ``dim`` has its
        first two channels swapped and ``rot`` has its two channels swapped and
        negated; the other heads are numerically unchanged.

    Raises:
        KeyError: If a required head is missing from ``head_tensors``.
    """
    # (head name, channel gather order, per-channel scale), listed in output order.
    # NOTE: heatmap gathers exactly channels 0-4, i.e. five heatmap channels are assumed.
    layout = (
        ("heatmap", [0, 1, 2, 3, 4], [1.0, 1.0, 1.0, 1.0, 1.0]),
        ("reg", [0, 1], [1.0, 1.0]),
        ("height", [0], [1.0]),
        # Swap length, width, height to width, length, height
        ("dim", [1, 0, 2], [1.0, 1.0, 1.0]),
        # Swap sin(y), cos(x) to cos(x), sin(y), then negate both to change direction
        ("rot", [1, 0], [-1.0, -1.0]),
        ("vel", [0, 1], [1.0, 1.0]),
    )

    outputs = []
    for head_name, channel_order, scales in layout:
        gathered = head_tensors[head_name][:, torch.tensor(channel_order, dtype=torch.int), :, :]
        scale_factors = torch.tensor(scales).to(device=gathered.device)
        # Shape [1, C, 1, 1] so the multiply broadcasts over batch and spatial dims.
        scale_factors = scale_factors.view([1, -1, 1, 1])
        outputs.append(torch.mul(gathered, scale_factors))

    return tuple(outputs)
class CenterPointHeadONNX(nn.Module):
    """Single module that chains backbone, optional neck, and bbox head."""

    def __init__(self, backbone: nn.Module, neck: nn.Module, bbox_head: nn.Module):
        """Store the three stages so they can be exported as one module.

        Args:
            backbone: pts_backbone module.
            neck: pts_neck module (may be None, in which case it is skipped).
            bbox_head: pts_bbox_head module.
        """
        super().__init__()
        self.backbone: nn.Module = backbone
        self.neck: nn.Module = neck
        self.bbox_head: nn.Module = bbox_head
        self._logger = MMLogger.get_current_instance()
        self._logger.info("Running CenterPointHeadONNX!")

    def forward(self, x: torch.Tensor) -> Tuple[List[Dict[str, torch.Tensor]]]:
        """Run ``x`` through backbone, neck (when present), and bbox head.

        Note:
            torch.onnx.export() doesn't support triple-nested output

        Args:
            x (torch.Tensor): (B, C, H, W)

        Returns:
            Whatever ``bbox_head`` returns for the backbone/neck features; this
            wrapper does not reshape or repackage it.
        """
        features = self.backbone(x)
        if self.neck is not None:
            features = self.neck(features)
        return self.bbox_head(features)
def _extract_features(self, data_loader, sample_idx=0) -> Tuple[torch.Tensor, Dict[str, Any]]:
    """Extract (input_features, voxel_dict) using a sample from the data loader.

    Fetches inputs via ``self._get_inputs``, voxelizes them with
    ``self.data_preprocessor.voxelize``, and passes the voxel tensors to
    ``self.pts_voxel_encoder.get_input_features``.

    Args:
        data_loader: Loader forwarded to ``self._get_inputs``; must not be None.
        sample_idx: Index of the sample to use (default 0).

    Returns:
        Tuple of (input_features, voxel_dict). ``voxel_dict`` is the dict
        returned by ``voxelize``, with its ``voxels``/``num_points``/``coors``
        tensors moved to ``self._torch_device`` in place.

    Raises:
        ValueError: If data_loader is None.
        AttributeError: If ``data_preprocessor`` is None or has no ``voxelize``,
            or ``pts_voxel_encoder`` is None or has no ``get_input_features``.
        KeyError: If the voxel dict lacks ``voxels``, ``num_points`` or ``coors``.
    """
    if data_loader is None:
        raise ValueError("data_loader is required to extract features")

    # Explicit raise instead of ``assert``: the check must survive ``python -O``
    # and AttributeError is the type this method documents for invalid components.
    preprocessor = self.data_preprocessor
    if preprocessor is None or not hasattr(preprocessor, "voxelize"):
        raise AttributeError("data_preprocessor with a 'voxelize' method is required to extract features")

    # Ensure data preprocessor is on the correct device
    if hasattr(preprocessor, "to"):
        preprocessor.to(self._torch_device)

    inputs = self._get_inputs(data_loader, sample_idx)
    voxel_dict = preprocessor.voxelize(points=inputs["points"], data_samples=inputs["data_samples"])

    # Ensure all voxel tensors are on the correct device
    for key in ("voxels", "num_points", "coors"):
        if key in voxel_dict and isinstance(voxel_dict[key], torch.Tensor):
            voxel_dict[key] = voxel_dict[key].to(self._torch_device)

    encoder = self.pts_voxel_encoder
    if encoder is None or not hasattr(encoder, "get_input_features"):
        raise AttributeError("pts_voxel_encoder with a 'get_input_features' method is required to extract features")

    input_features = encoder.get_input_features(voxel_dict["voxels"], voxel_dict["num_points"], voxel_dict["coors"])
    return input_features, voxel_dict
@MODELS.register_module()
class PillarFeatureNetONNX(PillarFeatureNet):
    """ONNX-export variant of ``PillarFeatureNet``.

    Feature decoration lives in ``get_input_features`` and ``forward`` only
    runs the PFN layers, so the two steps can be called separately.
    """

    def __init__(self, **kwargs) -> None:
        """Initialize PillarFeatureNetONNX; arguments passed to PillarFeatureNet."""
        super().__init__(**kwargs)
        self._logger = MMLogger.get_current_instance()
        self._logger.info("Running PillarFeatureNetONNX!")

    def get_input_features(
        self,
        features: Tensor,
        num_points: Tensor,
        coors: Tensor,
        *args,
        **kwargs,
    ) -> Tensor:
        """Decorate raw pillar points with the extra per-point features.

        Depending on the flags set by the base class, the xyz offset from the
        pillar mean, the xyz offset from the pillar center, and the Euclidean
        norm of xyz are appended along the last dimension; the result is then
        multiplied by a padding mask.

        Args:
            features (torch.Tensor): Point features or raw points in shape
                (N, M, C).
            num_points (torch.Tensor): Number of points in each pillar.
            coors (torch.Tensor): Coordinates of each voxel. Columns 3, 2 and 1
                are used as the x, y and z grid indices.
            *args: Ignored.
            **kwargs: Ignored.

        Returns:
            torch.Tensor: Decorated features, concatenated on the last dimension.
        """
        features_ls = [features]
        # Find distance of x, y, and z from cluster center
        if self._with_cluster_center:
            # Mean over the point dimension, dividing by the real point count per pillar.
            points_mean = features[:, :, :3].sum(dim=1, keepdim=True) / num_points.type_as(features).view(-1, 1, 1)
            f_cluster = features[:, :, :3] - points_mean
            features_ls.append(f_cluster)

        # Find distance of x, y, and z from pillar center
        dtype = features.dtype
        if self._with_voxel_center:
            if not self.legacy:
                # Fresh tensor: the input ``features`` is left untouched in this branch.
                f_center = torch.zeros_like(features[:, :, :3])
                f_center[:, :, 0] = features[:, :, 0] - (coors[:, 3].to(dtype).unsqueeze(1) * self.vx + self.x_offset)
                f_center[:, :, 1] = features[:, :, 1] - (coors[:, 2].to(dtype).unsqueeze(1) * self.vy + self.y_offset)
                f_center[:, :, 2] = features[:, :, 2] - (coors[:, 1].to(dtype).unsqueeze(1) * self.vz + self.z_offset)
            else:
                # NOTE(review): this slice is a view of ``features``, so the in-place
                # writes below also overwrite xyz in ``features`` (already stored as
                # features_ls[0]). Presumably intentional legacy behavior - confirm
                # against the base class before changing.
                f_center = features[:, :, :3]
                f_center[:, :, 0] = f_center[:, :, 0] - (
                    coors[:, 3].type_as(features).unsqueeze(1) * self.vx + self.x_offset
                )
                f_center[:, :, 1] = f_center[:, :, 1] - (
                    coors[:, 2].type_as(features).unsqueeze(1) * self.vy + self.y_offset
                )
                f_center[:, :, 2] = f_center[:, :, 2] - (
                    coors[:, 1].type_as(features).unsqueeze(1) * self.vz + self.z_offset
                )
            features_ls.append(f_center)

        if self._with_distance:
            # L2 norm of xyz over the last dimension, kept as a trailing channel.
            points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
            features_ls.append(points_dist)

        # Combine together feature decorations
        features = torch.cat(features_ls, dim=-1)
        # The feature decorations were calculated without regard to whether
        # pillar was empty. Need to ensure that
        # empty pillars remain set to zeros.
        voxel_count = features.shape[1]
        mask = get_paddings_indicator(num_points, voxel_count, axis=0)
        mask = torch.unsqueeze(mask, -1).type_as(features)
        features *= mask
        return features

    def forward(
        self,
        features: torch.Tensor,
    ) -> torch.Tensor:
        """Run already-decorated pillar features through every PFN layer.

        Args:
            features (torch.Tensor): Point features in shape (N, M, C),
                presumably the output of ``get_input_features`` - confirm
                against the caller.

        Returns:
            torch.Tensor: Output of the last PFN layer.
        """

        for pfn in self.pfn_layers:
            features = pfn(features)

        return features
def forward(
    self,
    features: torch.Tensor,
) -> torch.Tensor:
    """Apply every layer in ``self.pfn_layers`` to the features, in order.

    Args:
        features (torch.Tensor): Point features or raw points in shape
            (N, M, C).

    Returns:
        torch.Tensor: Output of the last PFN layer, or ``features`` itself when
        there are no layers.
    """
    encoded = features
    for layer in self.pfn_layers:
        encoded = layer(encoded)
    return encoded
+ """ + + def __init__( + self, + pytorch_model: torch.nn.Module, + backend_type: Backend, + device: DeviceSpec, + ) -> None: + """Initialize CenterPoint pipeline. + + Args: + pytorch_model: PyTorch model for preprocessing/postprocessing. + device: Target runtime device (DeviceSpec). + backend_type: Deployment backend enum. Required. + + Raises: + ValueError: If class_names not found in pytorch_model.cfg. + """ + cfg = pytorch_model.cfg + + class_names = cfg.class_names + point_cloud_range = cfg.point_cloud_range + voxel_size = cfg.voxel_size + + if class_names is None: + raise ValueError("class_names not found in pytorch_model.cfg") + if point_cloud_range is None: + raise ValueError("point_cloud_range not found in pytorch_model.cfg") + if voxel_size is None: + raise ValueError("voxel_size not found in pytorch_model.cfg") + + super().__init__( + model=pytorch_model, + backend_type=backend_type, + device=device, + ) + + self.class_names: List[str] = class_names + self.point_cloud_range: List[float] = point_cloud_range + self.voxel_size: List[float] = voxel_size + self.pytorch_model: torch.nn.Module = pytorch_model + self._rot_y_axis_reference: bool = pytorch_model.pts_bbox_head.rot_y_axis_reference + + def to_device_tensor(self, data: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: + """Convert data to tensor on the pipeline's device. + + Args: + data: Input data (torch.Tensor or np.ndarray). + + Returns: + Tensor on pipeline torch device. + """ + if isinstance(data, np.ndarray): + data = torch.from_numpy(data) + return data.to(self.torch_device) + + def to_numpy(self, data: torch.Tensor, dtype: np.dtype = np.float32) -> np.ndarray: + """Convert tensor to contiguous numpy array. + + Args: + data: Input tensor. + dtype: Target numpy dtype. + + Returns: + Contiguous numpy array. 
+ """ + arr = data.cpu().numpy().astype(dtype) + if not arr.flags["C_CONTIGUOUS"]: + arr = np.ascontiguousarray(arr) + return arr + + @staticmethod + def squeeze_voxel_features(voxel_features: torch.Tensor) -> torch.Tensor: + """Collapse the singleton channel of the voxel-encoder output ``[N, 1, F] -> [N, F]``. + + All backends (PyTorch/ONNX/TensorRT) emit ``[N, 1, F]`` for CenterPoint; the guard + fails loud if a future model variant changes that, instead of silently squeezing + the wrong axis. + """ + if voxel_features.ndim != 3 or voxel_features.shape[1] != 1: + raise RuntimeError(f"Expected voxel encoder output [N, 1, F], got shape {tuple(voxel_features.shape)}.") + return voxel_features.squeeze(1) + + @staticmethod + def order_head_outputs(actual_names: Sequence[str], expected_names: Sequence[str]) -> List[str]: + """Validate backbone-head output names and return them in the configured order. + + ONNX/TensorRT may report outputs in arbitrary order, but CenterPoint postprocess + depends on the exact head order from the component config. This checks for any + missing/extra outputs and returns ``expected_names`` (the config order). + """ + expected_set, actual_set = set(expected_names), set(actual_names) + missing = expected_set - actual_set + extra = actual_set - expected_set + if missing or extra: + raise ValueError( + f"Backbone-head output mismatch: missing={sorted(missing)}, extra={sorted(extra)}; " + f"expected={sorted(expected_set)}, got={sorted(actual_set)}." + ) + return list(expected_names) + + @override + def preprocess( + self, + points: torch.Tensor, + ) -> Tuple[Dict[str, torch.Tensor], Dict[str, object]]: + """Preprocess point cloud data for inference. + + Performs voxelization and feature extraction using the data_preprocessor + and pts_voxel_encoder from the PyTorch model. + + Args: + points: Point cloud tensor of shape [N, point_features]. + + Returns: + Tuple of (preprocessed_dict, metadata_dict). 
+ preprocessed_dict contains: input_features, voxels, num_points, coors. + """ + points_tensor = self.to_device_tensor(points) + + data_samples = [Det3DDataSample()] + # Run data preprocessor + with torch.no_grad(): + batch_inputs = self.pytorch_model.data_preprocessor( + {"inputs": {"points": [points_tensor]}, "data_samples": data_samples} + ) + + voxel_dict = batch_inputs["inputs"]["voxels"] + voxels = voxel_dict["voxels"] + num_points = voxel_dict["num_points"] + coors = voxel_dict["coors"] + + with torch.no_grad(): + input_features = self.pytorch_model.pts_voxel_encoder.get_input_features(voxels, num_points, coors) + + preprocessed_dict = { + "input_features": input_features, + "voxels": voxels, + "num_points": num_points, + "coors": coors, + } + + # Second tuple element: preprocess_metadata for BaseInferencePipeline.infer() + # (merged with caller metadata, then passed to postprocess). Empty here. + return preprocessed_dict, {} + + def process_middle_encoder( + self, + voxel_features: torch.Tensor, + coors: torch.Tensor, + ) -> torch.Tensor: + """Process voxel features through middle encoder (scatter to BEV). + + This step runs on PyTorch regardless of backend because it involves + sparse-to-dense conversion that's not easily exportable to ONNX. + + Args: + voxel_features: Encoded voxel features [N, feature_dim]. + coors: Voxel coordinates [N, 4] (batch_idx, z, y, x). + + Returns: + Spatial features tensor [B, C, H, W]. + """ + voxel_features = self.to_device_tensor(voxel_features) + coors = self.to_device_tensor(coors) + + batch_size = compute_batch_size(coors) + + with torch.no_grad(): + spatial_features = self.pytorch_model.pts_middle_encoder(voxel_features, coors, batch_size) + + return spatial_features + + @override + def run_model( + self, + preprocessed_input: Dict[str, torch.Tensor], + ) -> Tuple[List[torch.Tensor], Dict[str, float]]: + """Run the full model pipeline with latency tracking. 
+ + Args: + preprocessed_input: Dict with keys: input_features, coors. + + Returns: + Tuple of (head_outputs, stage_latencies). + """ + stage_latencies: Dict[str, float] = {} + + start = time.perf_counter() + voxel_features = self.run_voxel_encoder(preprocessed_input["input_features"]) + stage_latencies["voxel_encoder_ms"] = (time.perf_counter() - start) * 1000 + + start = time.perf_counter() + spatial_features = self.process_middle_encoder(voxel_features, preprocessed_input["coors"]) + stage_latencies["middle_encoder_ms"] = (time.perf_counter() - start) * 1000 + + start = time.perf_counter() + head_outputs = self.run_backbone_head(spatial_features) + stage_latencies["backbone_head_ms"] = (time.perf_counter() - start) * 1000 + + return head_outputs, stage_latencies + + @override + def postprocess( + self, + head_outputs: List[torch.Tensor], + sample_meta: Dict[str, object], + ) -> List[Dict[str, Union[List[float], float, int]]]: + """Postprocess head outputs to detection results. + + Args: + head_outputs: List of 6 tensors [heatmap, reg, height, dim, rot, vel]. + sample_meta: Sample metadata dict. + + Returns: + List of detection dicts with keys: bbox_3d, score, label. + + Raises: + ValueError: If head_outputs doesn't contain exactly 6 tensors. + """ + head_outputs = [self.to_device_tensor(out) for out in head_outputs] + + if len(head_outputs) != 6: + raise ValueError(f"Expected 6 head outputs, got {len(head_outputs)}") + + heatmap, reg, height, dim, rot, vel = head_outputs + + # Apply rotation axis correction to mirror the head's export-time convention. + if self._rot_y_axis_reference: + dim = dim[:, [1, 0, 2], :, :] + rot = rot * (-1.0) + rot = rot[:, [1, 0], :, :] + + preds_dict = { + "heatmap": heatmap, + "reg": reg, + "height": height, + "dim": dim, + "rot": rot, + "vel": vel, + } + preds_dicts = ([preds_dict],) + + # Build a new dict instead of mutating the caller's metadata (the same sample_meta + # may be reused across backends for the same frame). 
+ batch_input_metas = [{**sample_meta, "box_type_3d": sample_meta.get("box_type_3d", LiDARInstance3DBoxes)}] + + with torch.no_grad(): + predictions_list = self.pytorch_model.pts_bbox_head.predict_by_feat( + preds_dicts=preds_dicts, batch_input_metas=batch_input_metas + ) + + results: List[Dict[str, Union[List[float], float, int]]] = [] + for pred_instances in predictions_list: + bboxes_3d = pred_instances.bboxes_3d.tensor.cpu().numpy() + scores_3d = pred_instances.scores_3d.cpu().numpy() + labels_3d = pred_instances.labels_3d.cpu().numpy() + + for i in range(len(bboxes_3d)): + results.append( + { + "bbox_3d": bboxes_3d[i][:7].tolist(), + "score": float(scores_3d[i]), + "label": int(labels_3d[i]), + } + ) + + return results + + @abstractmethod + def run_voxel_encoder(self, input_features: torch.Tensor) -> torch.Tensor: + """Run voxel encoder inference. + + Args: + input_features: Input features [N, max_points, C]. + + Returns: + Voxel features [N, feature_dim]. + """ + raise NotImplementedError + + @abstractmethod + def run_backbone_head(self, spatial_features: torch.Tensor) -> List[torch.Tensor]: + """Run backbone and head inference. + + Args: + spatial_features: Spatial features [B, C, H, W]. + + Returns: + List of 6 head output tensors. + """ + raise NotImplementedError + + def __repr__(self) -> str: + """Return string representation with class name, device, and backend.""" + return f"{self.__class__.__name__}(device={self.device}, backend={self.backend_type})" diff --git a/deployment/projects/centerpoint/pipelines/factory.py b/deployment/projects/centerpoint/pipelines/factory.py new file mode 100644 index 000000000..c2b538ec7 --- /dev/null +++ b/deployment/projects/centerpoint/pipelines/factory.py @@ -0,0 +1,97 @@ +""" +CenterPoint Pipeline Factory. + +Registers CenterPoint pipelines into the global pipeline_registry so evaluators can create pipelines +via `pipeline_registry.create_pipeline(...)`. 
@pipeline_registry.register
class CenterPointPipelineFactory(BasePipelineFactory):
    """Factory that builds the CenterPoint pipeline matching a model's backend.

    Registered in ``pipeline_registry`` under the name returned by
    ``get_project_name``. ``components_cfg`` is forwarded to the ONNX and
    TensorRT pipelines; the PyTorch pipeline does not receive it.
    """

    @classmethod
    @override
    def get_project_name(cls) -> str:
        """Return the project name used in pipeline registry and deploy config."""
        return "centerpoint"

    @classmethod
    @override
    def create_pipeline(
        cls,
        model_spec: ModelSpec,
        pytorch_model: torch.nn.Module,
        device: DeviceSpec,
        components_cfg: ComponentsConfig,
    ) -> BaseInferencePipeline:
        """Build the pipeline for ``model_spec.backend``.

        Args:
            model_spec: Model specification (backend/device/artifact).
            pytorch_model: PyTorch model instance handed to every pipeline.
            device: Device override; when falsy, ``model_spec.device`` is used.
            components_cfg: Component configuration from deploy_config, passed to
                the ONNX and TensorRT pipelines.

        Returns:
            Pipeline instance for the requested backend.

        Raises:
            ValueError: If the backend is not PYTORCH, ONNX, or TENSORRT.
        """
        target_device = device or model_spec.device
        requested_backend = model_spec.backend

        cls._validate_backend(requested_backend)

        if requested_backend is Backend.PYTORCH:
            logger.info("Creating CenterPoint PyTorch pipeline on %s", target_device)
            return CenterPointPyTorchPipeline(pytorch_model, device=target_device)

        if requested_backend is Backend.ONNX:
            # The artifact path is read only on the artifact-backed branches.
            artifact_dir = model_spec.artifact.path
            logger.info("Creating CenterPoint ONNX pipeline from %s on %s", artifact_dir, target_device)
            return CenterPointONNXPipeline(
                pytorch_model,
                onnx_dir=artifact_dir,
                device=target_device,
                components_cfg=components_cfg,
            )

        if requested_backend is Backend.TENSORRT:
            artifact_dir = model_spec.artifact.path
            logger.info("Creating CenterPoint TensorRT pipeline from %s on %s", artifact_dir, target_device)
            return CenterPointTensorRTPipeline(
                pytorch_model,
                tensorrt_dir=artifact_dir,
                device=target_device,
                components_cfg=components_cfg,
            )

        raise ValueError(f"Unsupported backend: {requested_backend.value}")
+ + Loads separate ONNX models for pts_voxel_encoder and pts_backbone_neck_head components + and runs inference using ONNXRuntime. + + Attributes: + onnx_dir: Directory containing ONNX model files. + voxel_encoder_session: ONNXRuntime session for voxel encoder. + backbone_head_session: ONNXRuntime session for backbone + head. + """ + + def __init__( + self, + pytorch_model: torch.nn.Module, + onnx_dir: str, + device: DeviceSpec, + components_cfg: ComponentsConfig, + ) -> None: + """Initialize ONNX pipeline. + + Args: + pytorch_model: Reference PyTorch model for preprocessing. + onnx_dir: Directory containing ONNX model files. + device: Target runtime device (DeviceSpec). + components_cfg: Component configuration from deploy_config (use ComponentsConfig.from_dict). + If None, raises. + """ + super().__init__(pytorch_model=pytorch_model, backend_type=Backend.ONNX, device=device) + + self.onnx_dir = onnx_dir + self._components_cfg = components_cfg + self.voxel_encoder_session, self.backbone_head_session = self._load_onnx_models() + logger.info("ONNX pipeline initialized with models from: %s", onnx_dir) + + def _load_onnx_models(self) -> Tuple[ort.InferenceSession, ort.InferenceSession]: + """Load ONNX models for each component (voxel encoder and backbone+head). + + Uses self.onnx_dir, self._components_cfg, and self.device to resolve paths + and select execution providers. + + Returns: + The (voxel_encoder_session, backbone_head_session) ONNXRuntime sessions. + + Raises: + FileNotFoundError: If ONNX model files are not found. + RuntimeError: If model loading fails. 
+ """ + voxel_encoder_path = resolve_artifact_path( + base_dir=self.onnx_dir, + components_cfg=self._components_cfg, + component_name="pts_voxel_encoder", + file_key="onnx_file", + ) + backbone_head_path = resolve_artifact_path( + base_dir=self.onnx_dir, + components_cfg=self._components_cfg, + component_name="pts_backbone_neck_head", + file_key="onnx_file", + ) + + # Configure session options + so = ort.SessionOptions() + so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL + so.log_severity_level = 2 # Warning + + # Select execution providers based on device + providers = self.device.to_ort_provider() + device_message = "CUDA" if self.device.is_cuda else "CPU" + logger.info("Using %s execution provider for ONNX", device_message) + + try: + voxel_encoder_session = ort.InferenceSession(voxel_encoder_path, sess_options=so, providers=providers) + logger.info("Loaded voxel encoder: %s", voxel_encoder_path) + backbone_head_session = ort.InferenceSession(backbone_head_path, sess_options=so, providers=providers) + logger.info("Loaded backbone+head: %s", backbone_head_path) + except Exception as e: + raise RuntimeError(f"Failed to load ONNX model: {e}") from e + + return voxel_encoder_session, backbone_head_session + + @override + def run_voxel_encoder(self, input_features: torch.Tensor) -> torch.Tensor: + """Run voxel encoder using ONNXRuntime. + + Args: + input_features: Input features [N, max_points, C]. + + Returns: + Voxel features [N, feature_dim]. 
+ """ + input_array = self.to_numpy(input_features, dtype=np.float32) + input_name = self.voxel_encoder_session.get_inputs()[0].name + output_name = self.voxel_encoder_session.get_outputs()[0].name + + outputs = self.voxel_encoder_session.run([output_name], {input_name: input_array}) + + voxel_features = torch.from_numpy(outputs[0]).to(self.torch_device) + + return self.squeeze_voxel_features(voxel_features) + + @override + def run_backbone_head(self, spatial_features: torch.Tensor) -> List[torch.Tensor]: + """Run backbone and head using ONNXRuntime. + + Args: + spatial_features: Spatial features [B, C, H, W]. + + Returns: + List of head output tensors in configured order. + + Raises: + ValueError: If the ONNX outputs don't match the configured head outputs. + """ + input_array = self.to_numpy(spatial_features, dtype=np.float32) + + input_name = self.backbone_head_session.get_inputs()[0].name + onnx_output_names = [output.name for output in self.backbone_head_session.get_outputs()] + expected_output_names = [ + out.name for out in self._components_cfg.get_component("pts_backbone_neck_head").io.outputs + ] + output_names = self.order_head_outputs(onnx_output_names, expected_output_names) + + # Run inference with ordered output names (ONNX Runtime returns outputs in the same order) + outputs = self.backbone_head_session.run(output_names, {input_name: input_array}) + return [torch.from_numpy(out).to(self.torch_device) for out in outputs] diff --git a/deployment/projects/centerpoint/pipelines/pytorch.py b/deployment/projects/centerpoint/pipelines/pytorch.py new file mode 100644 index 000000000..02b8034bf --- /dev/null +++ b/deployment/projects/centerpoint/pipelines/pytorch.py @@ -0,0 +1,105 @@ +""" +CenterPoint PyTorch Pipeline Implementation. 
class CenterPointPyTorchPipeline(CenterPointInferencePipeline):
    """CenterPoint pipeline that runs every stage on the native PyTorch model.

    Execution is split into the same voxel-encoder / backbone+head stages as the
    ONNX and TensorRT pipelines so that the stage outputs line up across backends.
    """

    def __init__(
        self,
        pytorch_model: torch.nn.Module,
        device: DeviceSpec,
    ) -> None:
        """Initialize PyTorch pipeline.

        Args:
            pytorch_model: PyTorch model for inference.
            device: Target runtime device.
        """
        super().__init__(pytorch_model=pytorch_model, backend_type=Backend.PYTORCH, device=device)
        logger.info("PyTorch pipeline initialized (ONNX-compatible staged inference)")

    @override
    def run_voxel_encoder(self, input_features: torch.Tensor) -> torch.Tensor:
        """Run ``pts_voxel_encoder`` under ``torch.no_grad``.

        Args:
            input_features: Input features [N, max_points, C].

        Returns:
            Voxel features [N, feature_dim].

        Raises:
            RuntimeError: If the encoder output is not ``[N, 1, F]``
                (raised by ``squeeze_voxel_features``).
        """
        device_features = self.to_device_tensor(input_features)

        with torch.no_grad():
            encoded = self.pytorch_model.pts_voxel_encoder(device_features)

        return self.squeeze_voxel_features(encoded)

    @override
    def run_backbone_head(self, spatial_features: torch.Tensor) -> List[torch.Tensor]:
        """Run backbone, optional neck, and bbox head under ``torch.no_grad``.

        The head result must be a non-empty tuple. If its first element is a tensor
        the tuple is returned as a list; if it is a non-empty list, the first entry
        is read as a dict and its heatmap/reg/height/dim/rot/vel values are returned
        in that order.

        Args:
            spatial_features: Spatial features [B, C, H, W].

        Returns:
            List of head output tensors.

        Raises:
            ValueError: If head output format is unexpected.
        """
        bev_features = self.to_device_tensor(spatial_features)
        model = self.pytorch_model

        with torch.no_grad():
            feature_maps = model.pts_backbone(bev_features)

            # The neck is optional: skipped when the attribute is absent or None.
            neck = getattr(model, "pts_neck", None)
            if neck is not None:
                feature_maps = neck(feature_maps)

            raw_head_output = model.pts_bbox_head(feature_maps)

        if not isinstance(raw_head_output, tuple) or len(raw_head_output) == 0:
            raise ValueError(f"Unexpected head_outputs format: {type(raw_head_output)}")

        leading = raw_head_output[0]
        if isinstance(leading, torch.Tensor):
            return list(raw_head_output)

        if isinstance(leading, list) and len(leading) > 0:
            task_preds = leading[0]
            return [task_preds[key] for key in ("heatmap", "reg", "height", "dim", "rot", "vel")]

        raise ValueError(f"Unexpected task_outputs format: {type(leading)}")
class CenterPointTensorRTPipeline(GPUResourceMixin, CenterPointInferencePipeline):
    """TensorRT-based CenterPoint pipeline (engine-per-component inference).

    Loads separate TensorRT engines for pts_voxel_encoder and pts_backbone_neck_head components
    and runs inference using TensorRT execution contexts.

    Attributes:
        tensorrt_dir: Directory containing TensorRT engine files.
    """

    # Free the CUDA cache every N evaluated samples
    _GPU_CLEANUP_INTERVAL = 10

    def __init__(
        self,
        pytorch_model: torch.nn.Module,
        tensorrt_dir: str,
        components_cfg: ComponentsConfig,
        device: DeviceSpec,
    ) -> None:
        """Initialize TensorRT pipeline and load both engines.

        Args:
            pytorch_model: Reference PyTorch model for preprocessing.
            tensorrt_dir: Directory containing TensorRT engine files.
            components_cfg: Component configuration from deploy_config (use ComponentsConfig.from_dict).
            device: Target CUDA device ('cuda:N').

        Raises:
            RuntimeError: If an engine cannot be deserialized or its execution context
                cannot be created.

        NOTE(review): the device type and ``components_cfg`` are not validated here;
        if a ValueError is expected for a non-CUDA device or a None config, confirm
        that it is raised upstream.
        """
        super().__init__(pytorch_model=pytorch_model, backend_type=Backend.TENSORRT, device=device)

        self.tensorrt_dir = tensorrt_dir
        self._components_cfg = components_cfg
        self._engines: Dict[str, trt.ICudaEngine] = {}
        self._contexts: Dict[str, trt.IExecutionContext] = {}
        self._logger = trt.Logger(trt.Logger.WARNING)

        # Per-stage pure-GPU times (ms), filled by each stage while its CUDA stream is
        # still alive and read back in run_model.
        self._gpu_stage_ms: Dict[str, float] = {}

        self._load_tensorrt_engines()
        logger.info("TensorRT pipeline initialized with engines from: %s", tensorrt_dir)

    def _load_tensorrt_engines(self) -> None:
        """Load TensorRT engines for each component.

        Raises:
            OSError: If an engine file cannot be opened (e.g. FileNotFoundError).
            RuntimeError: If engine loading or context creation fails.
        """
        trt.init_libnvinfer_plugins(self._logger, "")
        runtime = trt.Runtime(self._logger)

        engine_files = {
            component_name: resolve_artifact_path(
                base_dir=self.tensorrt_dir,
                components_cfg=self._components_cfg,
                component_name=component_name,
                file_key="engine_file",
            )
            for component_name in ("pts_voxel_encoder", "pts_backbone_neck_head")
        }

        for component_name, engine_path in engine_files.items():
            with open(engine_path, "rb") as f:
                engine = runtime.deserialize_cuda_engine(f.read())
            if engine is None:
                raise RuntimeError(f"Failed to deserialize engine: {engine_path}")

            context = engine.create_execution_context()
            if context is None:
                raise RuntimeError(
                    f"Failed to create execution context for {component_name}. "
                    "This is likely due to GPU out-of-memory."
                )

            self._engines[component_name] = engine
            self._contexts[component_name] = context
            logger.info("Loaded TensorRT engine: %s", component_name)

    def _get_engine_and_context(self, component_name: str) -> Tuple[trt.ICudaEngine, trt.IExecutionContext]:
        """Return the engine/context pair for a component.

        Uses ``dict.get`` so that a missing entry (e.g. after the resources were
        released) raises the documented RuntimeError rather than a KeyError.

        Raises:
            RuntimeError: If the engine or context is not available.
        """
        engine = self._engines.get(component_name)
        context = self._contexts.get(component_name)
        if engine is None or context is None:
            raise RuntimeError(f"{component_name} context is None - likely failed to initialize due to GPU OOM")
        return engine, context

    def _get_io_names(
        self,
        engine: trt.ICudaEngine,
        single_output: bool = False,
    ) -> Tuple[str, Union[str, List[str]]]:
        """Get input and output tensor names from engine.

        Args:
            engine: TensorRT engine.
            single_output: If True, return single output name instead of list.

        Returns:
            Tuple of (input_name, output_name(s)).

        Raises:
            RuntimeError: If the engine has no input, more than one input, or no output.
        """
        input_names: List[str] = []
        output_names: List[str] = []

        for i in range(engine.num_io_tensors):
            tensor_name = engine.get_tensor_name(i)
            tensor_mode = engine.get_tensor_mode(tensor_name)
            if tensor_mode == trt.TensorIOMode.INPUT:
                input_names.append(tensor_name)
            elif tensor_mode == trt.TensorIOMode.OUTPUT:
                output_names.append(tensor_name)

        if not input_names:
            raise RuntimeError("Could not find input tensor name")
        if len(input_names) > 1:
            # Only one input buffer is ever bound; silently picking one of several
            # would leave the others unbound and fail later.
            raise RuntimeError(f"Expected exactly one input tensor, found {len(input_names)}: {input_names}")
        if not output_names:
            raise RuntimeError("Could not find output tensor names")

        if single_output:
            return input_names[0], output_names[0]
        return input_names[0], output_names

    @staticmethod
    def _allocate_output_array(context: trt.IExecutionContext, tensor_name: str) -> np.ndarray:
        """Allocate a float32 host buffer matching the context's current output shape.

        Must be called after ``set_input_shape`` so the output shape is resolved.

        Raises:
            RuntimeError: If the reported shape still contains a negative dimension.
        """
        shape = tuple(context.get_tensor_shape(tensor_name))
        if any(dim < 0 for dim in shape):
            raise RuntimeError(f"Output shape for '{tensor_name}' is not fully resolved: {shape}")
        # np.empty already returns a C-contiguous array; no extra copy is needed.
        return np.empty(shape, dtype=np.float32)

    def _run_engine(
        self,
        context: trt.IExecutionContext,
        input_name: str,
        input_array: np.ndarray,
        output_arrays: Dict[str, np.ndarray],
    ) -> float:
        """Copy the input to the GPU, execute the context, and copy outputs back.

        Args:
            context: Execution context whose input shape has already been set.
            input_name: Name of the single input tensor.
            input_array: Host input buffer.
            output_arrays: Host output buffers keyed by tensor name; filled in place.

        Returns:
            Execution time in ms between the CUDA events recorded around the
            inference call (host/device copies are excluded).

        Raises:
            RuntimeError: If TensorRT reports that the inference could not be enqueued.
        """
        with TensorRTResourceManager() as manager:
            d_input = manager.allocate(input_array.nbytes)
            d_outputs = {name: manager.allocate(arr.nbytes) for name, arr in output_arrays.items()}
            stream = manager.stream

            context.set_tensor_address(input_name, int(d_input))
            for name, d_output in d_outputs.items():
                context.set_tensor_address(name, int(d_output))

            # Memory transfer: CPU -> GPU
            cuda.memcpy_htod_async(d_input, input_array, stream)

            # Record start event and execute inference
            start_event = cuda.Event()
            end_event = cuda.Event()
            start_event.record(stream)
            launched = context.execute_async_v3(stream_handle=stream.handle)
            end_event.record(stream)

            if not launched:
                # Drain pending work before the manager releases the buffers, then fail
                # loudly: the output buffers would otherwise hold uninitialized data.
                manager.synchronize()
                raise RuntimeError(f"TensorRT execute_async_v3 failed for input '{input_name}'")

            # Memory transfer: GPU -> CPU
            for name, arr in output_arrays.items():
                cuda.memcpy_dtoh_async(arr, d_outputs[name], stream)
            manager.synchronize()

            # Read GPU timing while the stream is still alive (events are complete after
            # synchronize); avoids reading across a stream that has been released.
            return end_event.time_since(start_event)

    @override
    def run_voxel_encoder(self, input_features: torch.Tensor) -> torch.Tensor:
        """Run voxel encoder using TensorRT.

        Also stores the pure-GPU execution time under
        ``self._gpu_stage_ms["voxel_encoder_ms"]``.

        Args:
            input_features: Input features [N, max_points, C].

        Returns:
            Voxel features [N, feature_dim].

        Raises:
            RuntimeError: If the engine/context is unavailable, the engine I/O is not
                as expected, execution fails, or the output is not ``[N, 1, F]``.
        """
        engine, context = self._get_engine_and_context("pts_voxel_encoder")

        input_array = self.to_numpy(input_features, dtype=np.float32)

        input_name, output_name = self._get_io_names(engine, single_output=True)
        context.set_input_shape(input_name, input_array.shape)
        output_array = self._allocate_output_array(context, output_name)

        self._gpu_stage_ms["voxel_encoder_ms"] = self._run_engine(
            context, input_name, input_array, {output_name: output_array}
        )

        voxel_features = torch.from_numpy(output_array).to(self.torch_device)
        return self.squeeze_voxel_features(voxel_features)

    @override
    def run_backbone_head(self, spatial_features: torch.Tensor) -> List[torch.Tensor]:
        """Run backbone and head using TensorRT.

        Also stores the pure-GPU execution time under
        ``self._gpu_stage_ms["backbone_head_ms"]``.

        Args:
            spatial_features: Spatial features [B, C, H, W].

        Returns:
            Head output tensors in the order configured for ``pts_backbone_neck_head``.

        Raises:
            RuntimeError: If the engine/context is unavailable or execution fails.
            ValueError: If the engine outputs don't match the configured head outputs.
        """
        engine, context = self._get_engine_and_context("pts_backbone_neck_head")

        input_array = self.to_numpy(spatial_features, dtype=np.float32)

        input_name, trt_output_names = self._get_io_names(engine, single_output=False)
        context.set_input_shape(input_name, input_array.shape)

        expected_output_names = [
            out.name for out in self._components_cfg.get_component("pts_backbone_neck_head").io.outputs
        ]
        # Validate and order outputs (CenterPoint postprocess depends on the config order).
        output_names = self.order_head_outputs(trt_output_names, expected_output_names)

        output_arrays = {name: self._allocate_output_array(context, name) for name in output_names}

        self._gpu_stage_ms["backbone_head_ms"] = self._run_engine(context, input_name, input_array, output_arrays)

        return [torch.from_numpy(output_arrays[name]).to(self.torch_device) for name in output_names]

    @override
    def run_model(
        self,
        preprocessed_input: Dict[str, torch.Tensor],
    ) -> Tuple[List[torch.Tensor], Dict[str, float]]:
        """Run complete multi-stage model inference with GPU timing using CUDA events.

        The TensorRT stages report the CUDA-event time recorded around the engine
        execution; the middle encoder (PyTorch) is timed with the host wall clock.

        Args:
            preprocessed_input: Dict from preprocess(); the keys read here are
                'input_features' and 'coors'.

        Returns:
            Tuple of (head_outputs, stage_latencies):
                - head_outputs: List of head outputs [heatmap, reg, height, dim, rot, vel]
                - stage_latencies: Dict mapping stage names to latency in ms
                    - 'voxel_encoder_ms': Pure GPU inference time (CUDA events)
                    - 'middle_encoder_ms': Wall-clock time (PyTorch)
                    - 'backbone_head_ms': Pure GPU inference time (CUDA events)
        """
        stage_latencies: Dict[str, float] = {}

        # Stage 1: Voxel Encoder (pure-GPU time recorded inside run_voxel_encoder).
        voxel_features = self.run_voxel_encoder(preprocessed_input["input_features"])
        stage_latencies["voxel_encoder_ms"] = self._gpu_stage_ms["voxel_encoder_ms"]

        # Stage 2: Middle Encoder (PyTorch, wall-clock).
        start = time.perf_counter()
        spatial_features = self.process_middle_encoder(voxel_features, preprocessed_input["coors"])
        stage_latencies["middle_encoder_ms"] = (time.perf_counter() - start) * 1000

        # Stage 3: Backbone + Head (pure-GPU time recorded inside run_backbone_head).
        head_outputs = self.run_backbone_head(spatial_features)
        stage_latencies["backbone_head_ms"] = self._gpu_stage_ms["backbone_head_ms"]

        return head_outputs, stage_latencies

    @override
    def periodic_cleanup(self, sample_idx: int) -> None:
        """Free the CUDA cache every ``_GPU_CLEANUP_INTERVAL`` samples during long eval loops."""
        if sample_idx > 0 and sample_idx % self._GPU_CLEANUP_INTERVAL == 0 and torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _release_gpu_resources(self) -> None:
        """Release TensorRT resources (engines and contexts)."""
        release_tensorrt_resources(
            engines=self._engines,
            contexts=self._contexts,
        )
CenterPointTensorRTExportPipeline +from deployment.projects.centerpoint.io.model_loader import build_centerpoint_onnx_model +from deployment.projects.centerpoint.io.sample_adapter import CenterPointSampleAdapter +from deployment.runtime.runner import BaseDeploymentRunner + + +class CenterPointDeploymentRunner(BaseDeploymentRunner): + """CenterPoint deployment runner. + + Implements project-specific model loading and wiring to export pipelines, + while reusing the project-agnostic orchestration in `BaseDeploymentRunner`. + + Attributes: + model_cfg: Training MMEngine config (from checkpoint experiment file); not replaced after load. + evaluator: CenterPoint evaluator instance. + """ + + def __init__( + self, + data_loader: BaseDataLoader, + evaluator: CenterPointEvaluator, + executor: BackendExecutor, + config: BaseDeploymentConfig, + model_cfg: Config, + logger: logging.Logger, + onnx_pipeline: Optional[OnnxExportPipeline] = None, + tensorrt_pipeline: Optional[TensorRTExportPipeline] = None, + ) -> None: + """Initialize CenterPoint deployment runner. + + Args: + data_loader: Data loader for loading samples. + evaluator: Evaluator for computing metrics. + executor: Backend execution primitives shared with the evaluator/verification runner. + config: Deployment configuration. + model_cfg: MMEngine model configuration. + logger: Logger instance. + onnx_pipeline: Optional custom ONNX export pipeline. + tensorrt_pipeline: Optional custom TensorRT export pipeline. 
+ """ + + if onnx_pipeline is None: + sample_adapter = CenterPointSampleAdapter(logger=logger) + component_builder = CenterPointComponentBuilder(components_cfg=config.components_cfg, logger=logger) + onnx_pipeline = CenterPointONNXExportPipeline( + exporter_factory=ExporterFactory, + sample_adapter=sample_adapter, + component_builder=component_builder, + logger=logger, + ) + + if tensorrt_pipeline is None: + tensorrt_pipeline = CenterPointTensorRTExportPipeline( + exporter_factory=ExporterFactory, + components_cfg=config.components_cfg, + logger=logger, + ) + + super().__init__( + data_loader=data_loader, + evaluator=evaluator, + executor=executor, + config=config, + model_cfg=model_cfg, + logger=logger, + onnx_wrapper_cls=IdentityWrapper, + onnx_pipeline=onnx_pipeline, + tensorrt_pipeline=tensorrt_pipeline, + ) + + def load_pytorch_model(self, checkpoint_path: str, context: ExportContext) -> torch.nn.Module: + """Load and return the PyTorch model for export. + + Args: + checkpoint_path: Path to the checkpoint file. + context: Export context with additional parameters. + + Returns: + Loaded PyTorch model. + """ + rot_y_axis_reference = self._extract_rot_y_axis_reference(context) + self.logger.info("Export option rot_y_axis_reference = %s", rot_y_axis_reference) + + model, _ = build_centerpoint_onnx_model( + base_model_cfg=self.model_cfg, + checkpoint_path=checkpoint_path, + device=DeviceSpec.from_value("cpu"), + rot_y_axis_reference=rot_y_axis_reference, + ) + return model + + def _extract_rot_y_axis_reference(self, context: ExportContext) -> bool: + """Extract rot_y_axis_reference from the export context. + + Args: + context: Export context; must be a ``CenterPointExportContext``. + + Returns: + Boolean value for rot_y_axis_reference. 
+ """ + if not isinstance(context, CenterPointExportContext): + raise TypeError(f"CenterPoint export requires a CenterPointExportContext, got {type(context).__name__}.") + return context.rot_y_axis_reference diff --git a/deployment/projects/registry.py b/deployment/projects/registry.py new file mode 100644 index 000000000..ce2a8118a --- /dev/null +++ b/deployment/projects/registry.py @@ -0,0 +1,80 @@ +""" +Project registry for deployment bundles. + +Each deployment project registers an adapter that knows how to: +- add its CLI args +- construct data_loader / evaluator / runner +- execute the deployment workflow + +This keeps `deployment/cli/main.py` project-agnostic. +""" + +from __future__ import annotations + +import argparse +from dataclasses import dataclass +from typing import Callable, Dict, Tuple + + +@dataclass(frozen=True) +class ProjectAdapter: + """Minimal adapter interface for a deployment project.""" + + name: str + add_args: Callable[[argparse.ArgumentParser], None] + run: Callable[[argparse.Namespace], int] + required_components: Tuple[str, ...] = () + + +class ProjectRegistry: + """In-memory registry of deployment project adapters. + + The unified CLI discovers and imports `deployment.projects.` packages; + each package registers a `ProjectAdapter` here. This keeps core/cli code + project-agnostic while enabling project-specific argument wiring and run logic. + """ + + def __init__(self) -> None: + self._adapters: Dict[str, ProjectAdapter] = {} + + def register(self, adapter: ProjectAdapter) -> None: + name = adapter.name.strip().lower() + if not name: + raise ValueError("ProjectAdapter.name must be non-empty") + self._adapters[name] = adapter + + def get(self, name: str) -> ProjectAdapter: + key = (name or "").strip().lower() + if key not in self._adapters: + available = ", ".join(sorted(self._adapters.keys())) + raise KeyError(f"Unknown project '{name}'. 
Available: [{available}]") + return self._adapters[key] + + def list_projects(self) -> list[str]: + return sorted(self._adapters.keys()) + + def validate_required_components(self, project_name: str, components_cfg) -> None: + """Validate required component keys for a registered project.""" + adapter = self.get(project_name) + if not adapter.required_components: + return + + missing = [] + for component_name in adapter.required_components: + try: + components_cfg.get_component(component_name) + except KeyError: + missing.append(component_name) + + if not missing: + return + + available = sorted(list(components_cfg.component_names())) + missing_str = ", ".join(missing) + available_str = ", ".join(available) + raise KeyError( + f"{adapter.name} requires components [{missing_str}], " f"but available components are [{available_str}]." + ) + + +project_registry = ProjectRegistry() diff --git a/deployment/runtime/__init__.py b/deployment/runtime/__init__.py new file mode 100644 index 000000000..38b92d05a --- /dev/null +++ b/deployment/runtime/__init__.py @@ -0,0 +1 @@ +"""Runtime orchestration. Import from concrete modules under ``deployment.runtime.*``.""" diff --git a/deployment/runtime/artifact_manager.py b/deployment/runtime/artifact_manager.py new file mode 100644 index 000000000..ab67d3459 --- /dev/null +++ b/deployment/runtime/artifact_manager.py @@ -0,0 +1,101 @@ +""" +Artifact management for deployment workflows. + +This module handles registration and resolution of model artifacts (PyTorch checkpoints, +ONNX models, TensorRT engines) across different backends. +""" + +import logging +from collections.abc import Mapping +from typing import Dict, Optional, Tuple + +from deployment.configs.base import BaseDeploymentConfig +from deployment.core.artifacts import Artifact +from deployment.core.backend import Backend + + +class ArtifactManager: + """ + Manages model artifacts and path resolution for deployment workflows. 
+ + Resolution Order (consistent for all backends): + 1. Registered artifacts (from export operations) - highest priority + 2. Explicit paths from evaluation.backends. config: + - ONNX: evaluation.backends.onnx.model_dir + - TensorRT: evaluation.backends.tensorrt.engine_dir + 3. Backend-specific fallback paths: + - PyTorch: checkpoint_path + - ONNX: export.onnx_path + """ + + def __init__(self, config: BaseDeploymentConfig, logger: logging.Logger) -> None: + """ + Initialize artifact manager. + + Args: + config: Deployment configuration + logger: Logger instance + """ + self.config = config + self.logger = logger + self.artifacts: Dict[str, Artifact] = {} + + def register_artifact(self, backend: Backend, artifact: Artifact) -> None: + """ + Register an artifact for a given backend. + + Args: + backend: Backend to register the artifact for + artifact: Artifact to register + """ + self.artifacts[backend.value] = artifact + self.logger.debug("Registered %s artifact: %s", backend.value, artifact.path) + + def resolve_artifact(self, backend: Backend) -> Tuple[Optional[Artifact], bool]: + """ + Resolve an artifact for a given backend. + + Args: + backend: Backend to resolve the artifact for + Returns: + Tuple containing the artifact and a boolean indicating if the artifact exists + """ + artifact = self.artifacts.get(backend.value) + if artifact: + return artifact, artifact.exists + + config_path = self._get_config_path(backend) + if config_path: + artifact = Artifact(path=config_path) + return artifact, artifact.exists + + return None, False + + def _get_config_path(self, backend: Backend) -> Optional[str]: + """ + Get the configuration path for a given backend. + + Args: + backend: Backend to get the configuration path for + Returns: + Configuration path for the given backend + """ + eval_backends = self.config.evaluation_config.backends + # Backend is a str Enum, so a str-keyed lookup matches both str and Backend keys. 
+ backend_cfg = eval_backends.get(backend.value) if eval_backends else None + if backend_cfg and isinstance(backend_cfg, Mapping): + if backend == Backend.ONNX: + path = backend_cfg.get("model_dir") + if path: + return path + elif backend == Backend.TENSORRT: + path = backend_cfg.get("engine_dir") + if path: + return path + + if backend == Backend.PYTORCH: + return self.config.checkpoint_path + if backend == Backend.ONNX: + return self.config.export_config.onnx_path + + return None diff --git a/deployment/runtime/evaluation_orchestrator.py b/deployment/runtime/evaluation_orchestrator.py new file mode 100644 index 000000000..bf889bcfb --- /dev/null +++ b/deployment/runtime/evaluation_orchestrator.py @@ -0,0 +1,208 @@ +""" +Evaluation orchestration for deployment workflows. + +This module handles cross-backend evaluation with consistent metrics. +""" + +from __future__ import annotations + +import logging +from typing import Any, Dict, List, Mapping, Optional + +from deployment.configs.base import BaseDeploymentConfig +from deployment.core.backend import Backend +from deployment.core.device import DeviceSpec +from deployment.core.evaluation.base_evaluator import BaseEvaluator +from deployment.core.evaluation.evaluator_types import ModelSpec +from deployment.core.io.base_data_loader import BaseDataLoader +from deployment.pipelines.gpu_resource_mixin import clear_cuda_memory +from deployment.runtime.artifact_manager import ArtifactManager + + +class EvaluationOrchestrator: + """ + Orchestrates evaluation across backends with consistent metrics. 
+ + This class handles: + - Resolving models to evaluate from configuration + - Running evaluation for each enabled backend + - Collecting and formatting evaluation results + - Logging evaluation progress and results + - Cross-backend metric comparison + """ + + def __init__( + self, + config: BaseDeploymentConfig, + evaluator: BaseEvaluator, + data_loader: BaseDataLoader, + artifact_manager: ArtifactManager, + logger: logging.Logger, + ): + """ + Initialize the evaluation orchestrator. + + Args: + config: Deployment configuration + evaluator: Evaluator instance for running evaluation + data_loader: Data loader for loading samples + artifact_manager: Artifact manager for resolving model paths + logger: Logger instance + """ + self.config = config + self.evaluator = evaluator + self.data_loader = data_loader + self.artifact_manager = artifact_manager + self.logger = logger + + def run(self) -> Dict[str, Any]: + """ + Run the evaluation orchestration. + + Returns: + Dictionary of evaluation results + """ + eval_config = self.config.evaluation_config + + if not eval_config.enabled: + self.logger.info("Evaluation disabled, skipping...") + return {} + + self.logger.info("=" * 80) + self.logger.info("Running Evaluation") + self.logger.info("=" * 80) + + model_specs = self._resolve_model_specs() + if not model_specs: + self.logger.warning("No models found for evaluation") + return {} + + num_samples = eval_config.num_samples + if num_samples == -1: + num_samples = self.data_loader.num_samples + + verbose = eval_config.verbose + all_results: Dict[str, Any] = {} + + for model_spec in model_specs: + backend = model_spec.backend + self.logger.info("\nEvaluating %s on %s...", backend.value, model_spec.device) + try: + results = self.evaluator.evaluate( + model=model_spec, + data_loader=self.data_loader, + num_samples=num_samples, + verbose=verbose, + num_warmup=eval_config.num_warmup, + ) + all_results[backend.value] = results + self.logger.info("\n%s Results:", 
backend.value.upper()) + self.evaluator.print_results(results) + except Exception as e: + self.logger.error("Evaluation failed for %s: %s", backend.value, e, exc_info=True) + all_results[backend.value] = {"error": str(e)} + finally: + clear_cuda_memory() + + if len(all_results) > 1: + self._print_cross_backend_comparison(all_results) + + return all_results + + def _resolve_model_specs(self) -> List[ModelSpec]: + """ + Resolve the model specs to evaluate from the configuration. + + For each enabled backend, resolves its device and artifact, keeping only + backends whose artifact exists on disk. + + Returns: + List of model specifications + """ + backend_configs = self.config.evaluation_config.backends + model_specs: List[ModelSpec] = [] + + for backend_key, backend_cfg in backend_configs.items(): + backend_enum = Backend.from_value(backend_key) + if not backend_cfg.get("enabled", False): + continue + + device = self._resolve_device_for_backend(backend_enum, backend_cfg.get("device")) + artifact, artifact_exists = self.artifact_manager.resolve_artifact(backend_enum) + + if artifact_exists and artifact: + model_specs.append(ModelSpec(backend=backend_enum, device=device, artifact=artifact)) + self.logger.info(" - %s: %s (device: %s)", backend_enum.value, artifact.path, device) + elif artifact is not None: + self.logger.warning( + " - %s: %s (not found or invalid, skipping)", + backend_enum.value, + artifact.path, + ) + + return model_specs + + def _resolve_device_for_backend(self, backend: Backend, configured_device: Optional[Any]) -> DeviceSpec: + """ + Resolve the single device a backend will run on (called once per backend). + + Falls back to the backend default when nothing is configured, and enforces + backend constraints: a CUDA-only backend handed a non-CUDA device is overridden + (with a warning) to the default CUDA device. + + Args: + backend: Backend the device is being resolved for + configured_device: Raw device from config (e.g. 
"cuda:0"), or None/blank to + use the backend default + Returns: + The device the backend will actually use + """ + resolved_device = ( + DeviceSpec.from_value(configured_device) if configured_device else self._get_default_device(backend) + ) + + if backend.requires_cuda and not resolved_device.is_cuda: + default_device = self._get_default_device(backend) + self.logger.warning( + "%s evaluation requires CUDA device. Overriding device from '%s' to '%s'.", + backend.value, + resolved_device, + default_device, + ) + resolved_device = default_device + + return resolved_device + + def _get_default_device(self, backend: Backend) -> DeviceSpec: + """ + Get the default device for a backend. + + Args: + backend: Backend to get the default device for + Returns: + Default device (DeviceSpec) for the backend + """ + if backend is Backend.TENSORRT: + if self.config.device_config.cuda is None: + raise RuntimeError("TensorRT backend requires a configured CUDA device.") + return self.config.device_config.cuda + return self.config.device_config.cpu + + def _print_cross_backend_comparison(self, all_results: Mapping[str, Any]) -> None: + """ + Print the cross-backend comparison results. + + Args: + all_results: Dictionary of all results + """ + self.logger.info("\n" + "=" * 80) + self.logger.info("Cross-Backend Comparison") + self.logger.info("=" * 80) + + for backend_label, results in all_results.items(): + self.logger.info("\n%s:", backend_label.upper()) + if results and "error" not in results: + for line in self.evaluator.summarize_for_comparison(results): + self.logger.info(line) + else: + self.logger.info(" No results available") diff --git a/deployment/runtime/export_orchestrator.py b/deployment/runtime/export_orchestrator.py new file mode 100644 index 000000000..dff5f53f6 --- /dev/null +++ b/deployment/runtime/export_orchestrator.py @@ -0,0 +1,415 @@ +""" +Export orchestration for deployment workflows. 
+ +This module handles all model export logic (PyTorch loading, ONNX export, TensorRT export) +in a unified orchestrator, keeping the deployment runner thin. +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable, Optional, Type + +import torch + +from deployment.configs.base import BaseDeploymentConfig +from deployment.core.artifacts import Artifact +from deployment.core.backend import Backend +from deployment.core.contexts import ExportContext +from deployment.core.io.base_data_loader import BaseDataLoader +from deployment.exporters.common.factory import ExporterFactory +from deployment.exporters.common.model_wrappers import BaseModelWrapper +from deployment.exporters.common.onnx_exporter import ONNXExporter +from deployment.exporters.common.tensorrt_exporter import TensorRTExporter +from deployment.exporters.export_pipelines.base import OnnxExportPipeline, TensorRTExportPipeline +from deployment.runtime.artifact_manager import ArtifactManager + + +@dataclass +class ExportResult: + """ + Result of the export orchestration. + + Attributes: + pytorch_model: Loaded PyTorch model (if loaded) + onnx_path: Path to exported ONNX artifact + tensorrt_path: Path to exported TensorRT engine + """ + + pytorch_model: Optional[Any] = None + onnx_path: Optional[str] = None + tensorrt_path: Optional[str] = None + + +class ExportOrchestrator: + """ + Orchestrates model export workflows (PyTorch loading, ONNX, TensorRT). + + This class centralizes all export-related logic: + - Loading PyTorch from checkpoint_path (required for this deployment stack) + - ONNX / TensorRT export (pipeline or per-component) and artifact registration + + By extracting this logic from the runner, the runner becomes a thin + orchestrator that coordinates Export, Verification, and Evaluation. 
+ """ + + ONNX_DIR_NAME = "onnx" + TENSORRT_DIR_NAME = "tensorrt" + DEFAULT_ENGINE_FILENAME = "model.engine" + + def __init__( + self, + config: BaseDeploymentConfig, + data_loader: BaseDataLoader, + artifact_manager: ArtifactManager, + logger: logging.Logger, + model_loader: Callable[..., Any], + onnx_wrapper_cls: Optional[Type[BaseModelWrapper]] = None, + onnx_pipeline: Optional[OnnxExportPipeline] = None, + tensorrt_pipeline: Optional[TensorRTExportPipeline] = None, + ) -> None: + """ + Initialize export orchestrator. + + Args: + config: Deployment configuration + data_loader: Data loader for loading samples + artifact_manager: Artifact manager for resolving model paths + logger: Logger instance + model_loader: Model loader for loading PyTorch model + onnx_wrapper_cls: ONNX wrapper class for exporting ONNX model + onnx_pipeline: ONNX export pipeline + tensorrt_pipeline: TensorRT export pipeline + """ + self.config = config + self.data_loader = data_loader + self.artifact_manager = artifact_manager + self.logger = logger + self._model_loader = model_loader + self._onnx_wrapper_cls = onnx_wrapper_cls + self._onnx_pipeline = onnx_pipeline + self._tensorrt_pipeline = tensorrt_pipeline + + def run(self, context: Optional[ExportContext] = None) -> ExportResult: + """ + Execute the complete export workflow. + + This method: + 1. Loads PyTorch model from checkpoint_path + 2. Exports to ONNX if configured + 3. Exports to TensorRT if configured + 4. Resolves external artifact paths + + Args: + context: Typed export context with parameters. If None, a default + ExportContext is created. 
+ + Returns: + ExportResult containing model and artifact paths + """ + if context is None: + context = ExportContext() + + result = ExportResult() + + should_export_onnx = self.config.export_config.should_export_onnx + should_export_trt = self.config.export_config.should_export_tensorrt + external_onnx_path = self.config.export_config.onnx_path + + pytorch_model = self._load_and_register_pytorch_model(self.config.checkpoint_path, context) + result.pytorch_model = pytorch_model + + if should_export_onnx: + result.onnx_path = self._run_onnx_export(pytorch_model) + if not result.onnx_path: + # ONNX export was explicitly requested for this run but produced nothing. + # Failing here is critical: otherwise the TensorRT stage below would silently + # fall back to `export.onnx_path` (often the same dir) and build an engine from + # a STALE ONNX left by a previous run, yielding an engine that does not match + # the current checkpoint with no error surfaced. + raise RuntimeError( + "ONNX export was requested (export.mode includes ONNX) but no ONNX artifact " + "was produced. Refusing to continue, as TensorRT export would otherwise reuse " + "a stale ONNX file. Check the ONNX export logs above." + ) + + if should_export_trt: + # When this run also produced ONNX, reuse that fresh path (guaranteed present by the + # raise above). In trt-only mode fall back to the externally configured ONNX path. + onnx_path = result.onnx_path if should_export_onnx else external_onnx_path + if not onnx_path: + raise RuntimeError( + "TensorRT export requires an ONNX path but none is available. " + "Set export.onnx_path in config or enable ONNX export (export.mode)." 
+ ) + result.onnx_path = onnx_path + self._register_external_onnx_artifact(onnx_path) + result.tensorrt_path = self._run_tensorrt_export(onnx_path) + + self._resolve_external_artifacts(result) + return result + + def _load_and_register_pytorch_model(self, checkpoint_path: str, context: ExportContext) -> Any: + """ + Load and register a PyTorch model from checkpoint. + + Args: + checkpoint_path: Path to the PyTorch checkpoint + context: Export context with sample index + Returns: + Loaded PyTorch model + Raises: + RuntimeError: If the checkpoint cannot be loaded. + """ + self.logger.info("\nLoading PyTorch model...") + try: + pytorch_model = self._model_loader(checkpoint_path, context) + self.artifact_manager.register_artifact(Backend.PYTORCH, Artifact(path=checkpoint_path)) + return pytorch_model + except Exception as e: + raise RuntimeError(f"Failed to load PyTorch model from '{checkpoint_path}': {e}") from e + + def _run_onnx_export(self, pytorch_model: Any) -> Optional[str]: + """ + Run the ONNX export workflow. + + Args: + pytorch_model: PyTorch model to export + Returns: + Path to the exported ONNX artifact or None if export failed + """ + onnx_artifact = self._export_onnx(pytorch_model) + if onnx_artifact: + return onnx_artifact.path + self.logger.error("ONNX export requested but no artifact was produced.") + return None + + def _register_external_onnx_artifact(self, onnx_path: str) -> None: + """ + Register an external ONNX artifact. + + Args: + onnx_path: Path to the ONNX artifact + """ + if not Path(onnx_path).exists(): + return + self.artifact_manager.register_artifact(Backend.ONNX, Artifact(path=onnx_path)) + + def _run_tensorrt_export(self, onnx_path: str) -> Optional[str]: + """ + Run the TensorRT export workflow. 
+ + Args: + onnx_path: Path to the ONNX artifact + Returns: + Path to the exported TensorRT engine or None if export failed + """ + trt_artifact = self._export_tensorrt(onnx_path) + if trt_artifact: + return trt_artifact.path + self.logger.error("TensorRT export requested but no artifact was produced.") + return None + + def _export_onnx(self, pytorch_model: Any) -> Optional[Artifact]: + """ + Export a PyTorch model to ONNX. + + Args: + pytorch_model: PyTorch model to export + Returns: + Artifact representing the exported ONNX model + """ + if self._onnx_pipeline is None and self._onnx_wrapper_cls is None: + raise RuntimeError("ONNX export requested but no wrapper class or export pipeline provided.") + + sample_idx = self.config.export_config.sample_idx + onnx_dir = Path(self.config.export_config.work_dir) / self.ONNX_DIR_NAME + onnx_dir.mkdir(parents=True, exist_ok=True) + + if self._onnx_pipeline is not None: + self.logger.info("=" * 80) + self.logger.info("Exporting to ONNX via pipeline (%s)", type(self._onnx_pipeline).__name__) + self.logger.info("=" * 80) + artifact = self._onnx_pipeline.export( + model=pytorch_model, + data_loader=self.data_loader, + output_dir=str(onnx_dir), + config=self.config, + sample_idx=sample_idx, + ) + self.artifact_manager.register_artifact(Backend.ONNX, artifact) + self.logger.info("ONNX export successful: %s", artifact.path) + return artifact + + # Per-component export path (no pipeline) + sample = self.data_loader.load_sample(sample_idx) + single_input = self.data_loader.preprocess(sample) + + component_names = list(self.config.components_cfg.component_names()) + self.logger.info("=" * 80) + self.logger.info("Exporting %s component(s) to ONNX", len(component_names)) + self.logger.info("=" * 80) + + for component_name in component_names: + onnx_settings = self.config.get_onnx_settings(component_name) + output_path = onnx_dir / onnx_settings.save_file + exporter = self._build_onnx_exporter(component_name) + + batch_size = 
onnx_settings.batch_size + if batch_size is None: + input_tensor = single_input + else: + if isinstance(single_input, (list, tuple)): + input_tensor = tuple( + inp.repeat(batch_size, *([1] * (len(inp.shape) - 1))) if len(inp.shape) > 0 else inp + for inp in single_input + ) + else: + input_tensor = single_input.repeat(batch_size, *([1] * (len(single_input.shape) - 1))) + + self.logger.info("Exporting component '%s' → %s", component_name, output_path) + exporter.export(pytorch_model, input_tensor, str(output_path)) + + artifact = Artifact(path=str(onnx_dir)) + self.artifact_manager.register_artifact(Backend.ONNX, artifact) + self.logger.info("ONNX export successful: %s", artifact.path) + return artifact + + def _export_tensorrt(self, onnx_path: str) -> Optional[Artifact]: + """ + Export an ONNX model to TensorRT. + + Args: + onnx_path: Path to the ONNX artifact + Returns: + Artifact representing the exported TensorRT engine + """ + self.logger.info("=" * 80) + if self._tensorrt_pipeline: + self.logger.info("Exporting to TensorRT via pipeline (%s)", type(self._tensorrt_pipeline).__name__) + else: + self.logger.info("Exporting to TensorRT (per-component)") + self.logger.info("=" * 80) + + tensorrt_dir = Path(self.config.export_config.work_dir) / self.TENSORRT_DIR_NAME + tensorrt_dir.mkdir(parents=True, exist_ok=True) + + cuda_device = self.config.device_config.cuda + if cuda_device is None: + raise RuntimeError("TensorRT export requires a CUDA device. Set deploy_cfg.devices['cuda'].") + device_id = cuda_device.index + self.logger.info("Using CUDA device for TensorRT export: %s", cuda_device) + + sample_idx = self.config.export_config.sample_idx + sample_input = self.data_loader.get_shape_sample(sample_idx) + + # Scope the active CUDA device to this export rather than mutating the process-global + # device via torch.cuda.set_device(), so concurrent/repeat exports stay isolated. 
+ with torch.cuda.device(device_id): + if self._tensorrt_pipeline is not None: + artifact = self._tensorrt_pipeline.export( + onnx_path=onnx_path, + output_dir=str(tensorrt_dir), + config=self.config, + device=cuda_device, + ) + self.artifact_manager.register_artifact(Backend.TENSORRT, artifact) + self.logger.info("TensorRT export successful: %s", artifact.path) + return artifact + + component_names = list(self.config.components_cfg.component_names()) + for component_name in component_names: + output_path = self._get_tensorrt_output_path(onnx_path, str(tensorrt_dir), component_name) + exporter = self._build_tensorrt_exporter(component_name) + self.logger.info("Exporting component '%s' → %s", component_name, output_path) + exporter.export( + model=None, + sample_input=sample_input, + output_path=output_path, + onnx_path=onnx_path, + ) + + artifact = Artifact(path=str(tensorrt_dir)) + self.artifact_manager.register_artifact(Backend.TENSORRT, artifact) + self.logger.info("TensorRT export successful: %s", artifact.path) + return artifact + + def _build_onnx_exporter(self, component_name: str) -> ONNXExporter: + """Build an ONNX exporter for the given component.""" + if self._onnx_wrapper_cls is None: + raise RuntimeError("ONNX wrapper class not provided. Cannot create ONNX exporter.") + return ExporterFactory.create_onnx_exporter( + config=self.config, + wrapper_cls=self._onnx_wrapper_cls, + logger=self.logger, + component_name=component_name, + ) + + def _build_tensorrt_exporter(self, component_name: str) -> TensorRTExporter: + """Build a TensorRT exporter for the given component.""" + return ExporterFactory.create_tensorrt_exporter( + config=self.config, + logger=self.logger, + component_name=component_name, + ) + + def _get_tensorrt_output_path( + self, onnx_path: str, tensorrt_dir: str, component_name: Optional[str] = None + ) -> str: + """ + Get the output path for the TensorRT engine. 
+ + Args: + onnx_path: Path to the ONNX artifact + tensorrt_dir: Directory for TensorRT output + component_name: Component being exported. When provided, the engine filename + comes from that component's ``engine_file`` so multiple components do not + overwrite each other. + Returns: + Path to the TensorRT engine file + """ + tensorrt_dir_obj = Path(tensorrt_dir) + if component_name is not None: + engine_file = self.config.components_cfg.get_component(component_name).engine_file + return str(tensorrt_dir_obj / Path(engine_file).name) + + onnx_path_obj = Path(onnx_path) + if onnx_path_obj.is_dir(): + return str(tensorrt_dir_obj / self.DEFAULT_ENGINE_FILENAME) + engine_filename = onnx_path_obj.with_suffix(".engine").name + return str(tensorrt_dir_obj / engine_filename) + + def _resolve_external_artifacts(self, result: ExportResult) -> None: + """ + Fill in artifact paths not produced by this run from configured fallbacks. + + Config-based artifact lookup is delegated to ``ArtifactManager.resolve_artifact`` so + that the resolution rules (registered artifacts, then ``evaluation.backends``, then + per-backend fallbacks) live in exactly one place rather than being duplicated here. + + Args: + result: Export result object to store the artifacts + """ + if not result.onnx_path: + result.onnx_path = self._resolve_configured_artifact(Backend.ONNX) + + if not result.tensorrt_path: + result.tensorrt_path = self._resolve_configured_artifact(Backend.TENSORRT) + + def _resolve_configured_artifact(self, backend: Backend) -> Optional[str]: + """ + Resolve a backend artifact path from configuration, if one exists on disk. + + Args: + backend: Backend to resolve the artifact for + Returns: + The artifact path if it is configured and exists, otherwise None. 
+ """ + artifact, exists = self.artifact_manager.resolve_artifact(backend) + if artifact and exists: + return artifact.path + if artifact: + self.logger.warning("%s artifact path from config does not exist: %s", backend.value, artifact.path) + return None diff --git a/deployment/runtime/runner.py b/deployment/runtime/runner.py new file mode 100644 index 000000000..206d7689d --- /dev/null +++ b/deployment/runtime/runner.py @@ -0,0 +1,111 @@ +""" +Unified deployment runner for common deployment workflows. + +Project-agnostic runtime runner that orchestrates: +- Export (PyTorch -> ONNX -> TensorRT) +- Verification (scenario-based comparisons) +- Evaluation (metrics/latency across backends) +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from typing import Any, Dict, Optional, Type + +from mmengine.config import Config + +from deployment.configs.base import BaseDeploymentConfig +from deployment.core.contexts import ExportContext +from deployment.core.evaluation.backend_executor import BackendExecutor +from deployment.core.evaluation.backend_verifier import BackendVerifier +from deployment.core.evaluation.base_evaluator import BaseEvaluator +from deployment.core.evaluation.output_comparator import OutputComparator +from deployment.core.io.base_data_loader import BaseDataLoader +from deployment.exporters.common.model_wrappers import BaseModelWrapper +from deployment.exporters.export_pipelines.base import OnnxExportPipeline, TensorRTExportPipeline +from deployment.runtime.artifact_manager import ArtifactManager +from deployment.runtime.evaluation_orchestrator import EvaluationOrchestrator +from deployment.runtime.export_orchestrator import ExportOrchestrator +from deployment.runtime.verification_orchestrator import VerificationOrchestrator + + +@dataclass +class DeploymentResult: + """Standardized structure returned by `BaseDeploymentRunner.run()`.""" + + pytorch_model: Optional[Any] = None + onnx_path: Optional[str] 
= None + tensorrt_path: Optional[str] = None + verification_results: Dict[str, Any] = field(default_factory=dict) + evaluation_results: Dict[str, Any] = field(default_factory=dict) + + +class BaseDeploymentRunner: + """Base deployment runner for common deployment pipelines.""" + + def __init__( + self, + data_loader: BaseDataLoader, + evaluator: BaseEvaluator, + executor: BackendExecutor, + config: BaseDeploymentConfig, + model_cfg: Config, + logger: logging.Logger, + onnx_wrapper_cls: Optional[Type[BaseModelWrapper]] = None, + onnx_pipeline: Optional[OnnxExportPipeline] = None, + tensorrt_pipeline: Optional[TensorRTExportPipeline] = None, + ) -> None: + self.data_loader = data_loader + self.evaluator = evaluator + self._executor = executor + self.config = config + self.model_cfg = model_cfg + self.logger = logger + + self.artifact_manager = ArtifactManager(config, logger) + + self.export_orchestrator = ExportOrchestrator( + config=config, + data_loader=data_loader, + artifact_manager=self.artifact_manager, + logger=logger, + model_loader=self.load_pytorch_model, + onnx_wrapper_cls=onnx_wrapper_cls, + onnx_pipeline=onnx_pipeline, + tensorrt_pipeline=tensorrt_pipeline, + ) + comparator = OutputComparator(output_names=executor.get_output_names()) + verifier = BackendVerifier(executor, comparator) + self.verification_orchestrator = VerificationOrchestrator( + config, verifier, data_loader, self.artifact_manager, logger + ) + self.evaluation_orchestrator = EvaluationOrchestrator( + config, evaluator, data_loader, self.artifact_manager, logger + ) + + def load_pytorch_model(self, checkpoint_path: str, context: ExportContext) -> Any: + raise NotImplementedError(f"{self.__class__.__name__}.load_pytorch_model() must be implemented by subclasses.") + + def run(self, context: Optional[ExportContext] = None) -> DeploymentResult: + if context is None: + context = ExportContext() + + results = DeploymentResult() + + export_result = self.export_orchestrator.run(context) + 
results.pytorch_model = export_result.pytorch_model + results.onnx_path = export_result.onnx_path + results.tensorrt_path = export_result.tensorrt_path + + # Hand the loaded reference model to the executor shared by verification and evaluation. + self._executor.set_pytorch_model(export_result.pytorch_model) + + results.verification_results = self.verification_orchestrator.run() + results.evaluation_results = self.evaluation_orchestrator.run() + + self.logger.info("\n" + "=" * 80) + self.logger.info("Deployment Complete!") + self.logger.info("=" * 80) + + return results diff --git a/deployment/runtime/verification_orchestrator.py b/deployment/runtime/verification_orchestrator.py new file mode 100644 index 000000000..19f34cb4e --- /dev/null +++ b/deployment/runtime/verification_orchestrator.py @@ -0,0 +1,172 @@ +""" +Verification orchestration for deployment workflows. + +This module handles scenario-based verification across different backends. +""" + +from __future__ import annotations + +import logging +from typing import Any, Dict + +from deployment.configs.base import BaseDeploymentConfig +from deployment.core.backend import Backend +from deployment.core.evaluation.backend_verifier import BackendVerifier +from deployment.core.evaluation.evaluator_types import ModelSpec +from deployment.core.io.base_data_loader import BaseDataLoader +from deployment.runtime.artifact_manager import ArtifactManager + + +class VerificationOrchestrator: + """ + Orchestrates verification across backends using scenario-based verification. 
+ + This class handles: + - Running verification scenarios from config + - Resolving model paths via ArtifactManager + - Collecting and aggregating verification results + - Logging verification progress and results + """ + + def __init__( + self, + config: BaseDeploymentConfig, + verifier: BackendVerifier, + data_loader: BaseDataLoader, + artifact_manager: ArtifactManager, + logger: logging.Logger, + ) -> None: + """ + Initialize verification orchestrator. + + Args: + config: Deployment configuration + verifier: Backend verifier that runs reference-vs-test comparisons + data_loader: Data loader for loading samples + artifact_manager: Artifact manager for resolving model paths + logger: Logger instance + """ + self.config = config + self.verifier = verifier + self.data_loader = data_loader + self.artifact_manager = artifact_manager + self.logger = logger + + def run(self) -> Dict[str, Any]: + """ + Run verification on exported models using policy-based verification. + + Returns: + Verification results dictionary + """ + verification_cfg = self.config.verification_config + + if not verification_cfg.enabled: + self.logger.info("Verification disabled (verification.enabled=False), skipping...") + return {} + + export_mode = self.config.export_config.mode + scenarios = self.config.get_verification_scenarios(export_mode) + + if not scenarios: + self.logger.info( + "No verification scenarios for export mode '%s', skipping...", + export_mode.value, + ) + return {} + + _, pytorch_valid = self.artifact_manager.resolve_artifact(Backend.PYTORCH) + if not pytorch_valid: + self.logger.warning( + "PyTorch checkpoint not registered or missing; verification needs it for preprocessing/decode. " + "Skipping verification." 
+ ) + return {} + + num_verify_samples = verification_cfg.num_verify_samples + tolerance = verification_cfg.tolerance + self.logger.info("=" * 80) + self.logger.info("Running Verification (mode: %s)", export_mode.value) + self.logger.info("=" * 80) + + all_results: Dict[str, Any] = {} + total_passed = 0 + total_failed = 0 + + for i, policy in enumerate(scenarios): + ref_device = policy.ref_device + test_device = policy.test_device + + self.logger.info( + "\nScenario %s/%s: %s(%s) vs %s(%s)", + i + 1, + len(scenarios), + policy.ref_backend.value, + ref_device, + policy.test_backend.value, + test_device, + ) + + ref_artifact, ref_valid = self.artifact_manager.resolve_artifact(policy.ref_backend) + test_artifact, test_valid = self.artifact_manager.resolve_artifact(policy.test_backend) + + if not ref_valid or not test_valid: + ref_path = ref_artifact.path if ref_artifact else None + test_path = test_artifact.path if test_artifact else None + self.logger.warning( + " Skipping: missing or invalid artifacts (ref=%s, valid=%s, test=%s, valid=%s)", + ref_path, + ref_valid, + test_path, + test_valid, + ) + continue + + reference_spec = ModelSpec(backend=policy.ref_backend, device=ref_device, artifact=ref_artifact) + test_spec = ModelSpec(backend=policy.test_backend, device=test_device, artifact=test_artifact) + + verification_results = self.verifier.run( + reference=reference_spec, + test=test_spec, + data_loader=self.data_loader, + num_samples=num_verify_samples, + tolerance=tolerance, + ) + + policy_key = f"{policy.ref_backend.value}_{ref_device}_vs_{policy.test_backend.value}_{test_device}" + all_results[policy_key] = verification_results + + if "summary" in verification_results: + summary = verification_results["summary"] + passed = summary.get("passed", 0) + failed = summary.get("failed", 0) + total_passed += passed + total_failed += failed + if failed == 0: + self.logger.info("Scenario %s passed (%s comparisons)", i + 1, passed) + else: + self.logger.warning( + 
"Scenario %s failed (%s/%s comparisons)", + i + 1, + failed, + passed + failed, + ) + + self.logger.info("\n" + "=" * 80) + if total_failed == 0: + self.logger.info("All verifications passed! (%s total)", total_passed) + else: + self.logger.warning( + "%s/%s verifications failed", + total_failed, + total_passed + total_failed, + ) + self.logger.info("=" * 80) + + all_results["summary"] = { + "passed": total_passed, + "failed": total_failed, + "total": total_passed + total_failed, + } + + return all_results diff --git a/deployment/tests/__init__.py b/deployment/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/deployment/tests/test_artifacts.py b/deployment/tests/test_artifacts.py new file mode 100644 index 000000000..58a0acf40 --- /dev/null +++ b/deployment/tests/test_artifacts.py @@ -0,0 +1,117 @@ +"""Unit tests for artifact path resolution and ArtifactManager priority order.""" + +from __future__ import annotations + +import logging +from types import SimpleNamespace + +import pytest + +from deployment.core.artifacts import Artifact, resolve_artifact_path +from deployment.core.backend import Backend +from deployment.runtime.artifact_manager import ArtifactManager + + +# -------------------------------------------------------------------------------------- +# resolve_artifact_path (pure path logic) +# -------------------------------------------------------------------------------------- +class TestResolveArtifactPath: + def test_resolves_existing_file(self, tmp_path): + (tmp_path / "model.onnx").write_bytes(b"x") + out = resolve_artifact_path( + base_dir=str(tmp_path), + components_cfg={"model": {"onnx_file": "model.onnx"}}, + component_name="model", + file_key="onnx_file", + ) + assert out == str((tmp_path / "model.onnx").resolve()) + + def test_base_dir_must_be_directory(self, tmp_path): + f = tmp_path / "not_a_dir" + f.write_bytes(b"x") + with pytest.raises(ValueError): + resolve_artifact_path( + base_dir=str(f), + 
components_cfg={"model": {"onnx_file": "model.onnx"}}, + component_name="model", + file_key="onnx_file", + ) + + def test_missing_filename_raises_keyerror(self, tmp_path): + with pytest.raises(KeyError): + resolve_artifact_path( + base_dir=str(tmp_path), + components_cfg={"model": {}}, + component_name="model", + file_key="onnx_file", + ) + + def test_absolute_filename_rejected(self, tmp_path): + with pytest.raises(ValueError): + resolve_artifact_path( + base_dir=str(tmp_path), + components_cfg={"model": {"onnx_file": "/abs/model.onnx"}}, + component_name="model", + file_key="onnx_file", + ) + + def test_escaping_base_dir_rejected(self, tmp_path): + with pytest.raises(ValueError): + resolve_artifact_path( + base_dir=str(tmp_path), + components_cfg={"model": {"onnx_file": "../escape.onnx"}}, + component_name="model", + file_key="onnx_file", + ) + + def test_missing_file_raises_filenotfound(self, tmp_path): + with pytest.raises(FileNotFoundError): + resolve_artifact_path( + base_dir=str(tmp_path), + components_cfg={"model": {"onnx_file": "absent.onnx"}}, + component_name="model", + file_key="onnx_file", + ) + + +# -------------------------------------------------------------------------------------- +# ArtifactManager resolution priority +# -------------------------------------------------------------------------------------- +def _stub_config(tmp_path): + return SimpleNamespace( + checkpoint_path=str(tmp_path / "ckpt.pth"), + export_config=SimpleNamespace(onnx_path=str(tmp_path / "onnx_export")), + evaluation_config=SimpleNamespace( + backends={ + "onnx": {"model_dir": str(tmp_path / "cfg_onnx")}, + "tensorrt": {"engine_dir": str(tmp_path / "cfg_trt")}, + } + ), + ) + + +class TestArtifactManager: + def test_registered_artifact_takes_priority(self, tmp_path): + ckpt = tmp_path / "real_ckpt.pth" + ckpt.write_bytes(b"x") + mgr = ArtifactManager(_stub_config(tmp_path), logging.getLogger("test")) + mgr.register_artifact(Backend.PYTORCH, Artifact(path=str(ckpt))) + + 
artifact, exists = mgr.resolve_artifact(Backend.PYTORCH) + assert artifact is not None + assert artifact.path == str(ckpt) + assert exists is True + + def test_falls_back_to_eval_backend_config(self, tmp_path): + mgr = ArtifactManager(_stub_config(tmp_path), logging.getLogger("test")) + artifact, exists = mgr.resolve_artifact(Backend.TENSORRT) + assert artifact is not None + assert artifact.path == str(tmp_path / "cfg_trt") + # Path does not actually exist -> exists is False (no silent success). + assert exists is False + + def test_pytorch_falls_back_to_checkpoint_path(self, tmp_path): + mgr = ArtifactManager(_stub_config(tmp_path), logging.getLogger("test")) + artifact, _ = mgr.resolve_artifact(Backend.PYTORCH) + assert artifact is not None + assert artifact.path == str(tmp_path / "ckpt.pth") diff --git a/deployment/tests/test_config_schema.py b/deployment/tests/test_config_schema.py new file mode 100644 index 000000000..ed37c77e9 --- /dev/null +++ b/deployment/tests/test_config_schema.py @@ -0,0 +1,80 @@ +"""Unit tests for deploy-config schema parsing/validation. + +These are pure-Python (no GPU) and exercise the validation branches in +``deployment.configs.schema`` that previously had no coverage. 
+""" + +from __future__ import annotations + +import pytest + +from deployment.configs.schema import ComponentsConfig, VerificationScenario + + +def _valid_component() -> dict: + return { + "onnx_file": "model.onnx", + "engine_file": "model.engine", + "io": { + "inputs": [{"name": "x", "dtype": "float32"}], + "outputs": [{"name": "y", "dtype": "float32"}], + "dynamic_axes": {"x": {0: "batch"}}, + }, + } + + +class TestComponentsConfig: + def test_parses_valid_component(self): + cfg = ComponentsConfig.from_dict({"model": _valid_component()}) + comp = cfg.get_component("model") + assert comp.name == "model" + assert comp.onnx_file == "model.onnx" + assert comp.engine_file == "model.engine" + assert [i.name for i in comp.io.inputs] == ["x"] + assert [o.name for o in comp.io.outputs] == ["y"] + assert comp.io.dynamic_axes == {"x": {0: "batch"}} + + def test_missing_onnx_file_raises(self): + comp = _valid_component() + del comp["onnx_file"] + with pytest.raises(KeyError): + ComponentsConfig.from_dict({"model": comp}) + + def test_empty_outputs_raises(self): + comp = _valid_component() + comp["io"]["outputs"] = [] + with pytest.raises(KeyError): + ComponentsConfig.from_dict({"model": comp}) + + def test_bad_dynamic_axes_type_raises(self): + comp = _valid_component() + comp["io"]["dynamic_axes"] = {"x": {"not_an_int": "batch"}} + with pytest.raises(TypeError): + ComponentsConfig.from_dict({"model": comp}) + + def test_unknown_component_lookup_raises(self): + cfg = ComponentsConfig.from_dict({"model": _valid_component()}) + with pytest.raises(KeyError): + cfg.get_component("does_not_exist") + + def test_get_artifact_filename(self): + cfg = ComponentsConfig.from_dict({"model": _valid_component()}) + assert cfg.get_artifact_filename("model", "engine_file") == "model.engine" + + +class TestVerificationScenario: + def test_parses_valid_scenario(self): + scenario = VerificationScenario.from_dict( + { + "ref_backend": "pytorch", + "ref_device": "cpu", + "test_backend": "onnx", + 
"test_device": "cpu", + } + ) + assert scenario.ref_backend.value == "pytorch" + assert scenario.test_backend.value == "onnx" + + def test_missing_keys_raises(self): + with pytest.raises(ValueError): + VerificationScenario.from_dict({"ref_backend": "pytorch"}) diff --git a/deployment/tests/test_export_orchestrator.py b/deployment/tests/test_export_orchestrator.py new file mode 100644 index 000000000..64d0fe9d7 --- /dev/null +++ b/deployment/tests/test_export_orchestrator.py @@ -0,0 +1,86 @@ +"""Unit tests for ExportOrchestrator control flow. + +Covers the two behaviors this stack relies on for correctness: + * the stale-ONNX guard (a requested ONNX export that produces nothing must abort the run, + never fall through to TensorRT with a stale ONNX); + * external-artifact resolution delegating to ArtifactManager (single source of truth). + +These are pure control-flow tests: model loading and the actual export steps are stubbed. +""" + +from __future__ import annotations + +import logging +from types import SimpleNamespace +from unittest.mock import Mock + +import pytest + +from deployment.core.artifacts import Artifact +from deployment.core.backend import Backend +from deployment.core.contexts import ExportContext +from deployment.runtime.export_orchestrator import ExportOrchestrator, ExportResult + + +def _orchestrator(export_config, artifact_manager=None) -> ExportOrchestrator: + config = SimpleNamespace(checkpoint_path="unused.pth", export_config=export_config) + return ExportOrchestrator( + config=config, + data_loader=Mock(), + artifact_manager=artifact_manager or Mock(), + logger=logging.getLogger("test"), + model_loader=Mock(), + ) + + +class TestStaleOnnxGuard: + def test_requested_onnx_producing_nothing_aborts_run(self): + export_config = SimpleNamespace(should_export_onnx=True, should_export_tensorrt=True, onnx_path="stale/onnx") + orch = _orchestrator(export_config) + # Bypass real model loading and force ONNX export to produce nothing. 
+ orch._load_and_register_pytorch_model = lambda ckpt, ctx: object() + orch._run_onnx_export = lambda model: None + + with pytest.raises(RuntimeError, match="stale"): + orch.run(ExportContext()) + + def test_does_not_reach_tensorrt_when_onnx_fails(self): + export_config = SimpleNamespace(should_export_onnx=True, should_export_tensorrt=True, onnx_path="stale/onnx") + orch = _orchestrator(export_config) + orch._load_and_register_pytorch_model = lambda ckpt, ctx: object() + orch._run_onnx_export = lambda model: None + orch._run_tensorrt_export = Mock(side_effect=AssertionError("TensorRT must not run on stale ONNX")) + + with pytest.raises(RuntimeError): + orch.run(ExportContext()) + orch._run_tensorrt_export.assert_not_called() + + +class TestExternalArtifactResolution: + def test_resolution_delegates_to_artifact_manager(self): + manager = Mock() + + def resolve(backend): + if backend == Backend.ONNX: + return Artifact(path="/models/model.onnx"), True + return None, False + + manager.resolve_artifact.side_effect = resolve + orch = _orchestrator(SimpleNamespace(), artifact_manager=manager) + + result = ExportResult() + orch._resolve_external_artifacts(result) + + assert result.onnx_path == "/models/model.onnx" + assert result.tensorrt_path is None + + def test_configured_path_that_does_not_exist_is_ignored(self): + manager = Mock() + manager.resolve_artifact.return_value = (Artifact(path="/missing/model.engine"), False) + orch = _orchestrator(SimpleNamespace(), artifact_manager=manager) + + result = ExportResult() + orch._resolve_external_artifacts(result) + + assert result.onnx_path is None + assert result.tensorrt_path is None diff --git a/deployment/tests/test_onnx_exporter.py b/deployment/tests/test_onnx_exporter.py new file mode 100644 index 000000000..bd355dae9 --- /dev/null +++ b/deployment/tests/test_onnx_exporter.py @@ -0,0 +1,88 @@ +"""Unit tests for ONNXExporter's atomic, external-data-safe write path. 
+ +These exercise the staging-dir publish logic in ``_do_onnx_export`` without running a real +torch.onnx export: ``torch.onnx.export`` is monkeypatched to simulate what it writes (a main +``.onnx`` plus optional external-data sidecars), so the tests stay CPU-only and fast. +""" + +from __future__ import annotations + +import logging +from pathlib import Path + +import pytest +import torch + +from deployment.exporters.common.configs import ONNXExportConfig +from deployment.exporters.common.onnx_exporter import ONNXExporter + + +def _exporter() -> ONNXExporter: + return ONNXExporter(ONNXExportConfig(), logging.getLogger("test")) + + +def _no_staging_left(target: Path) -> bool: + """No leftover staging directory next to the target after publish/cleanup.""" + return not (target.parent / f".{target.name}.staging").exists() + + +class TestAtomicExport: + def test_successful_export_moves_into_place(self, tmp_path, monkeypatch): + target = tmp_path / "out" / "model.onnx" + cfg = ONNXExportConfig() + + def fake_export(model, args, f, **kwargs): + Path(f).write_bytes(b"onnx-bytes") + + monkeypatch.setattr(torch.onnx, "export", fake_export) + _exporter()._do_onnx_export(model=object(), sample_input=object(), output_path=str(target), export_cfg=cfg) + + assert target.read_bytes() == b"onnx-bytes" + assert _no_staging_left(target) + + def test_external_data_sidecar_published_too(self, tmp_path, monkeypatch): + target = tmp_path / "model.onnx" + cfg = ONNXExportConfig() + + def fake_export(model, args, f, **kwargs): + # Simulate a >2GB export: a main file plus an external-data sidecar next to it. 
+ Path(f).write_bytes(b"graph") + Path(f).with_name(Path(f).name + ".data").write_bytes(b"weights") + + monkeypatch.setattr(torch.onnx, "export", fake_export) + _exporter()._do_onnx_export(model=object(), sample_input=object(), output_path=str(target), export_cfg=cfg) + + assert target.read_bytes() == b"graph" + assert (tmp_path / "model.onnx.data").read_bytes() == b"weights" + assert _no_staging_left(target) + + def test_failed_export_leaves_no_partial_artifact(self, tmp_path, monkeypatch): + target = tmp_path / "model.onnx" + cfg = ONNXExportConfig() + + def fake_export(model, args, f, **kwargs): + Path(f).write_bytes(b"partial") # something is written... + raise ValueError("boom") # ...then export fails + + monkeypatch.setattr(torch.onnx, "export", fake_export) + with pytest.raises(RuntimeError, match="ONNX export failed"): + _exporter()._do_onnx_export(model=object(), sample_input=object(), output_path=str(target), export_cfg=cfg) + + assert not target.exists() + assert _no_staging_left(target) + + def test_failed_export_preserves_previous_good_model(self, tmp_path, monkeypatch): + target = tmp_path / "model.onnx" + target.write_bytes(b"previous-good") + cfg = ONNXExportConfig() + + def fake_export(model, args, f, **kwargs): + raise ValueError("boom") + + monkeypatch.setattr(torch.onnx, "export", fake_export) + with pytest.raises(RuntimeError): + _exporter()._do_onnx_export(model=object(), sample_input=object(), output_path=str(target), export_cfg=cfg) + + # The valid pre-existing artifact must survive a failed re-export. 
+ assert target.read_bytes() == b"previous-good" + assert _no_staging_left(target) diff --git a/deployment/tests/test_output_comparator.py b/deployment/tests/test_output_comparator.py new file mode 100644 index 000000000..4b25d1c8d --- /dev/null +++ b/deployment/tests/test_output_comparator.py @@ -0,0 +1,54 @@ +"""Unit tests for OutputComparator, focused on the shape-mismatch path.""" + +from __future__ import annotations + +import numpy as np + +from deployment.core.evaluation.output_comparator import OutputComparator + + +class TestOutputComparator: + def test_identical_arrays_pass(self): + a = [np.zeros((2, 3), dtype=np.float32)] + b = [np.zeros((2, 3), dtype=np.float32)] + summary, details = OutputComparator().compare(a, b, tolerance=1e-6) + assert summary.passed + assert summary.max_diff == 0.0 + assert len(details) == 1 + + def test_within_tolerance_passes(self): + a = [np.zeros((4,), dtype=np.float32)] + b = [np.full((4,), 0.05, dtype=np.float32)] + summary, _ = OutputComparator().compare(a, b, tolerance=0.1) + assert summary.passed + assert summary.max_diff <= 0.1 + + def test_exceeds_tolerance_fails(self): + a = [np.zeros((4,), dtype=np.float32)] + b = [np.full((4,), 1.0, dtype=np.float32)] + summary, _ = OutputComparator().compare(a, b, tolerance=0.1) + assert not summary.passed + assert "tolerance" in (summary.reason or "") + + def test_shape_mismatch_fails_with_reason(self): + a = [np.zeros((2, 3), dtype=np.float32)] + b = [np.zeros((2, 4), dtype=np.float32)] + summary, details = OutputComparator().compare(a, b, tolerance=1.0) + assert not summary.passed + assert "shape mismatch" in (summary.reason or "") + # The mismatched tensor is recorded with infinite diffs, not silently dropped. 
+ assert details and details[0].max_diff == float("inf") + + def test_length_mismatch_fails(self): + a = [np.zeros((2,), dtype=np.float32), np.zeros((2,), dtype=np.float32)] + b = [np.zeros((2,), dtype=np.float32)] + summary, _ = OutputComparator().compare(a, b, tolerance=1.0) + assert not summary.passed + assert "length mismatch" in (summary.reason or "") + + def test_named_outputs_label_paths(self): + a = [np.zeros((2,), dtype=np.float32)] + b = [np.ones((2,), dtype=np.float32)] + summary, details = OutputComparator(output_names=["heatmap"]).compare(a, b, tolerance=0.1) + assert not summary.passed + assert "heatmap" in details[0].path diff --git a/projects/CenterPoint/Dockerfile b/projects/CenterPoint/Dockerfile new file mode 100644 index 000000000..7b86c99eb --- /dev/null +++ b/projects/CenterPoint/Dockerfile @@ -0,0 +1,13 @@ +ARG AWML_BASE_IMAGE="autoware-ml:latest" +FROM ${AWML_BASE_IMAGE} +ARG TRT_VERSION=10.8.0.43 + +# Install pip dependencies +RUN python3 -m pip --no-cache-dir install \ + onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ \ + onnxsim \ + pycuda \ + tensorrt-cu12==${TRT_VERSION} + +WORKDIR /workspace +RUN pip install --no-cache-dir -e . diff --git a/projects/CenterPoint/README.md b/projects/CenterPoint/README.md index 6b265e890..839fbd498 100644 --- a/projects/CenterPoint/README.md +++ b/projects/CenterPoint/README.md @@ -41,6 +41,24 @@ docker run -it --rm --gpus all --shm-size=64g --name awml -p 6006:6006 -v $PWD/:/workspace -v $PWD/data:/workspace/data autoware-ml ``` +For ONNX and TensorRT evaluation + +- If you need to use deployment, ONNX runtime, or TensorRT evaluation, please build the docker image first: + +```sh +# Build the base autoware-ml image (if not already built) +DOCKER_BUILDKIT=1 docker build -t autoware-ml . + +# Build the centerpoint-deployment image +docker build -t centerpoint-deployment:latest -f projects/CenterPoint/Dockerfile . 
+``` + +- Run the docker container: + +```sh +docker run -it --rm --gpus all --shm-size=64g --name awml_deployment -p 6006:6006 -v $PWD/:/workspace -v $PWD/data:/workspace/data centerpoint-deployment:latest +``` + ### 2. Train #### 2.1 Environment set up @@ -110,12 +128,21 @@ where `frame-range` represents the range of frames to visualize. ### 5. Deploy -- Make an onnx file for a CenterPoint model. +- Run the deployment pipeline: + - Export ONNX/TensorRT artifacts. + - Verify the exported artifacts. + - (Optionally) run evaluation. + - Update `deployment/projects/centerpoint/config/deploy_config.py` so that the following entries point to your experiment: + - `checkpoint_path` (e.g., `checkpoint_path="work_dirs/centerpoint/t4dataset/second_secfpn_2xb8_121m_base/epoch_50.pth"`). + - `runtime_io.info_file`. + - `export.work_dir`. ```sh -# Deploy for t4dataset -DIR="work_dirs/centerpoint/t4dataset/second_secfpn_2xb8_121m_base/" && -python projects/CenterPoint/scripts/deploy.py projects/CenterPoint/configs/t4dataset/second_secfpn_2xb8_121m_base.py $DIR/epoch_50.pth --replace_onnx_models --device gpu --rot_y_axis_reference +# Deploy for t4dataset (export + verification + evaluation) +python -m deployment.cli.main centerpoint \ + deployment/projects/centerpoint/config/deploy_config.py \ + projects/CenterPoint/configs/t4dataset/second_secfpn_2xb8_121m_base.py \ + --rot-y-axis-reference ``` where `rot_y_axis_reference` can be removed if we would like to use the original counterclockwise x-axis rotation system. 
diff --git a/projects/CenterPoint/configs/t4dataset/Centerpoint/second_secfpn_8xb16_121m_j6gen2_base_amp.py b/projects/CenterPoint/configs/t4dataset/Centerpoint/second_secfpn_8xb16_121m_j6gen2_base_amp.py index 281d133d7..629289937 100644 --- a/projects/CenterPoint/configs/t4dataset/Centerpoint/second_secfpn_8xb16_121m_j6gen2_base_amp.py +++ b/projects/CenterPoint/configs/t4dataset/Centerpoint/second_secfpn_8xb16_121m_j6gen2_base_amp.py @@ -42,7 +42,7 @@ # user setting data_root = "data/t4dataset/" -info_directory_path = "info/user_name/" +info_directory_path = "info/" train_gpu_size = 8 train_batch_size = 16 test_batch_size = 2 diff --git a/projects/CenterPoint/models/__init__.py b/projects/CenterPoint/models/__init__.py index 13d1792cb..c713add79 100644 --- a/projects/CenterPoint/models/__init__.py +++ b/projects/CenterPoint/models/__init__.py @@ -1,13 +1,10 @@ from .backbones.second import SECOND from .dense_heads.centerpoint_head import CenterHead, CustomSeparateHead -from .dense_heads.centerpoint_head_onnx import CenterHeadONNX, SeparateHeadONNX from .detectors.centerpoint import CenterPoint -from .detectors.centerpoint_onnx import CenterPointONNX from .losses.amp_gaussian_focal_loss import AmpGaussianFocalLoss from .necks.second_fpn import SECONDFPN from .task_modules.coders.centerpoint_bbox_coders import CenterPointBBoxCoder from .voxel_encoders.pillar_encoder import BackwardPillarFeatureNet -from .voxel_encoders.pillar_encoder_onnx import BackwardPillarFeatureNetONNX, PillarFeatureNetONNX __all__ = [ "SECOND", @@ -16,11 +13,6 @@ "CenterHead", "CustomSeparateHead", "BackwardPillarFeatureNet", - "PillarFeatureNetONNX", - "BackwardPillarFeatureNetONNX", - "CenterPointONNX", - "CenterHeadONNX", - "SeparateHeadONNX", "CenterPointBBoxCoder", "AmpGaussianFocalLoss", ] diff --git a/projects/CenterPoint/models/detectors/centerpoint_onnx.py b/projects/CenterPoint/models/detectors/centerpoint_onnx.py deleted file mode 100644 index ff568a00d..000000000 --- 
a/projects/CenterPoint/models/detectors/centerpoint_onnx.py +++ /dev/null @@ -1,176 +0,0 @@ -import os -from typing import Callable, Dict, List, Tuple - -import torch -from mmdet3d.models.detectors.centerpoint import CenterPoint -from mmdet3d.registry import MODELS -from mmengine.logging import MMLogger, print_log -from torch import nn - - -class CenterPointHeadONNX(nn.Module): - """Head module for centerpoint with BACKBONE, NECK and BBOX_HEAD""" - - def __init__(self, backbone: nn.Module, neck: nn.Module, bbox_head: nn.Module): - super(CenterPointHeadONNX, self).__init__() - self.backbone: nn.Module = backbone - self.neck: nn.Module = neck - self.bbox_head: nn.Module = bbox_head - self._logger = MMLogger.get_current_instance() - self._logger.info("Running CenterPointHeadONNX!") - - def forward(self, x: torch.Tensor) -> Tuple[List[Dict[str, torch.Tensor]]]: - """ - Note: - torch.onnx.export() doesn't support triple-nested output - - Args: - x (torch.Tensor): (B, C, H, W) - Returns: - tuple[list[dict[str, any]]]: - (num_classes x [num_detect x {'reg', 'height', 'dim', 'rot', 'vel', 'heatmap'}]) - """ - x = self.backbone(x) - if self.neck is not None: - x = self.neck(x) - x = self.bbox_head(x) - - return x - - -@MODELS.register_module() -class CenterPointONNX(CenterPoint): - """onnx support impl of mmdet3d.models.detectors.CenterPoint""" - - def __init__(self, point_channels: int = 5, device: str = "cpu", **kwargs): - super().__init__(**kwargs) - self._point_channels = point_channels - self._device = device - self._torch_device = torch.device("cuda:0") if self._device == "gpu" else torch.device("cpu") - self._logger = MMLogger.get_current_instance() - self._logger.info("Running CenterPointONNX!") - - def _get_random_inputs(self): - """ - Generate random inputs and preprocess it to feed it to onnx. 
- """ - # Input channels - points = [ - torch.rand(1000, self._point_channels).to(self._torch_device), - # torch.rand(1000, self._point_channels).to(self._torch_device), - ] - # We only need lidar pointclouds for CenterPoint. - return {"points": points, "data_samples": None} - - def _extract_random_features(self): - assert self.data_preprocessor is not None and hasattr(self.data_preprocessor, "voxelize") - - # Get inputs - inputs = self._get_random_inputs() - voxel_dict = self.data_preprocessor.voxelize(points=inputs["points"], data_samples=inputs["data_samples"]) - assert self.pts_voxel_encoder is not None and hasattr(self.pts_voxel_encoder, "get_input_features") - input_features = self.pts_voxel_encoder.get_input_features( - voxel_dict["voxels"], voxel_dict["num_points"], voxel_dict["coors"] - ) - return input_features, voxel_dict - - def save_onnx( - self, - save_dir: str, - verbose=False, - onnx_opset_version=13, - ): - """Save onnx model - Args: - batch_dict (dict[str, any]) - save_dir (str): directory path to save onnx models - verbose (bool, optional) - onnx_opset_version (int, optional) - """ - print_log(f"Running onnx_opset_version: {onnx_opset_version}") - # Get features - input_features, voxel_dict = self._extract_random_features() - - # === pts_voxel_encoder === - pth_onnx_pve = os.path.join(save_dir, "pts_voxel_encoder.onnx") - torch.onnx.export( - self.pts_voxel_encoder, - (input_features,), - f=pth_onnx_pve, - input_names=("input_features",), - output_names=("pillar_features",), - dynamic_axes={ - "input_features": {0: "num_voxels", 1: "num_max_points"}, - "pillar_features": {0: "num_voxels"}, - }, - verbose=verbose, - opset_version=onnx_opset_version, - ) - print_log(f"Saved pts_voxel_encoder onnx model: {pth_onnx_pve}") - voxel_features = self.pts_voxel_encoder(input_features) - voxel_features = voxel_features.squeeze(1) - - # Note: pts_middle_encoder isn't exported - coors = voxel_dict["coors"] - batch_size = coors[-1, 0] + 1 - x = 
self.pts_middle_encoder(voxel_features, coors, batch_size) - # x (torch.tensor): (batch_size, num_pillar_features, W, H) - - # === pts_backbone === - assert self.pts_bbox_head is not None and hasattr(self.pts_bbox_head, "output_names") - pts_backbone_neck_head = CenterPointHeadONNX( - self.pts_backbone, - self.pts_neck, - self.pts_bbox_head, - ) - # pts_backbone_neck_head = torch.jit.script(pts_backbone_neck_head) - pth_onnx_backbone_neck_head = os.path.join(save_dir, "pts_backbone_neck_head.onnx") - torch.onnx.export( - pts_backbone_neck_head, - (x,), - f=pth_onnx_backbone_neck_head, - input_names=("spatial_features",), - output_names=tuple(self.pts_bbox_head.output_names), - dynamic_axes={ - name: {0: "batch_size", 2: "H", 3: "W"} - for name in ["spatial_features"] + self.pts_bbox_head.output_names - }, - verbose=verbose, - opset_version=onnx_opset_version, - ) - print_log(f"Saved pts_backbone_neck_head onnx model: {pth_onnx_backbone_neck_head}") - - def save_torchscript( - self, - save_dir: str, - verbose: bool = False, - ): - """Save torchscript model - Args: - batch_dict (dict[str, any]) - save_dir (str): directory path to save onnx models - verbose (bool, optional) - """ - # Get features - input_features, voxel_dict = self._extract_random_features() - - pth_pt_pve = os.path.join(save_dir, "pts_voxel_encoder.pt") - traced_pts_voxel_encoder = torch.jit.trace(self.pts_voxel_encoder, (input_features,)) - traced_pts_voxel_encoder.save(pth_pt_pve) - - voxel_features = traced_pts_voxel_encoder(input_features) - voxel_features = voxel_features.squeeze() - - # Note: pts_middle_encoder isn't exported - coors = voxel_dict["coors"] - batch_size = coors[-1, 0] + 1 - x = self.pts_middle_encoder(voxel_features, coors, batch_size) - - pts_backbone_neck_head = CenterPointHeadONNX( - self.pts_backbone, - self.pts_neck, - self.pts_bbox_head, - ) - pth_pt_head = os.path.join(save_dir, "pts_backbone_neck_head.pt") - traced_pts_backbone_neck_head = 
torch.jit.trace(pts_backbone_neck_head, (x)) - traced_pts_backbone_neck_head.save(pth_pt_head) diff --git a/projects/CenterPoint/runners/deployment_runner.py b/projects/CenterPoint/runners/deployment_runner.py deleted file mode 100644 index bbd703cbb..000000000 --- a/projects/CenterPoint/runners/deployment_runner.py +++ /dev/null @@ -1,103 +0,0 @@ -from pathlib import Path -from typing import Optional, Union - -from mmengine.registry import MODELS, init_default_scope -from torch import nn - -from autoware_ml.detection3d.runners.base_runner import BaseRunner - - -class DeploymentRunner(BaseRunner): - """Runner to run deploment of mmdet3D model to generate ONNX with random inputs.""" - - def __init__( - self, - model_cfg_path: str, - checkpoint_path: str, - work_dir: Path, - rot_y_axis_reference: bool = False, - device: str = "gpu", - replace_onnx_models: bool = False, - default_scope: str = "mmengine", - experiment_name: str = "", - log_level: Union[int, str] = "INFO", - log_file: Optional[str] = None, - onnx_opset_version: int = 13, - ) -> None: - """ - :param model_cfg_path: MMDet3D model config path. - :param checkpoint_path: Checkpoint path to load weights. - :param work_dir: Working directory to save outputs. - :param rot_y_axis_reference: Set True to convert rotation - from x-axis counterclockwiese to y-axis clockwise. - :param device: Working devices, only 'gpu' or 'cpu' supported. - :param replace_onnx_models: Set True to replace model with ONNX, - for example, CenterHead -> CenterHeadONNX. - :param default_scope: Default scope in mmdet3D. - :param experiment_name: Experiment name. - :param log_level: Logging and display log messages above this level. - :param log_file: Logger file. - :param oxx_opset_version: onnx opset version. 
- """ - super(DeploymentRunner, self).__init__( - model_cfg_path=model_cfg_path, - checkpoint_path=checkpoint_path, - work_dir=work_dir, - device=device, - default_scope=default_scope, - experiment_name=experiment_name, - log_level=log_level, - log_file=log_file, - ) - - # We need init deafault scope to mmdet3d to search registries in the mmdet3d scope - init_default_scope("mmdet3d") - - self._rot_y_axis_reference = rot_y_axis_reference - self._replace_onnx_models = replace_onnx_models - self._onnx_opset_version = onnx_opset_version - - def build_model(self) -> nn.Module: - """ - Build a model. Replace the model by ONNX model if replace_onnx_model is set. - :return torch.nn.Module. A torch module. - """ - self._logger.info("===== Building CenterPoint model ====") - model_cfg = self._cfg.get("model") - # Update Model type to ONNX - if self._replace_onnx_models: - self._logger.info("Replacing ONNX models!") - model_cfg.type = "CenterPointONNX" - model_cfg.point_channels = model_cfg.pts_voxel_encoder.in_channels - model_cfg.device = self._device - model_cfg.pts_voxel_encoder.type = ( - "PillarFeatureNetONNX" - if model_cfg.pts_voxel_encoder.type == "PillarFeatureNet" - else "BackwardPillarFeatureNetONNX" - ) - model_cfg.pts_bbox_head.type = "CenterHeadONNX" - model_cfg.pts_bbox_head.separate_head.type = "SeparateHeadONNX" - model_cfg.pts_bbox_head.rot_y_axis_reference = self._rot_y_axis_reference - - if model_cfg.pts_backbone.type == "ConvNeXt_PC": - # Always set with_cp (gradient checkpointing) to False for deployment - model_cfg.pts_backbone.with_cp = False - model = MODELS.build(model_cfg) - model.to(self._torch_device) - - self._logger.info(model) - self._logger.info("===== Built CenterPoint model ====") - return model - - def run(self) -> None: - """Start running the Runner.""" - # Building a model - model = self.build_model() - - # Loading checkpoint to the model - self.load_verify_checkpoint(model=model) - - assert hasattr(model, "save_onnx"), "The model must 
have the function: save_onnx()!" - - # Run and save onnx model! - model.save_onnx(save_dir=self._work_dir, onnx_opset_version=self._onnx_opset_version) diff --git a/projects/CenterPoint/scripts/deploy.py b/projects/CenterPoint/scripts/deploy.py deleted file mode 100644 index 3aea2ee29..000000000 --- a/projects/CenterPoint/scripts/deploy.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Script to export CenterPoint to onnx/torchscript -""" - -import argparse -import logging -import os -from pathlib import Path - -from projects.CenterPoint.runners.deployment_runner import DeploymentRunner - - -def parse_args(): - parser = argparse.ArgumentParser( - description="Export CenterPoint model to backends.", - ) - parser.add_argument( - "model_cfg_path", - help="model config path", - ) - parser.add_argument( - "checkpoint", - help="model checkpoint path", - ) - parser.add_argument( - "--work-dir", - default="", - help="the dir to save logs and models", - ) - parser.add_argument( - "--log-level", - help="set log level", - default="INFO", - choices=list(logging._nameToLevel.keys()), - ) - parser.add_argument("--onnx_opset_version", type=int, default=13, help="onnx opset version") - parser.add_argument( - "--device", - choices=["cpu", "gpu"], - default="gpu", - help="Set running device!", - ) - parser.add_argument( - "--replace_onnx_models", - action="store_true", - help="Set False to disable replacement of model by ONNX model, for example, CenterHead -> CenterHeadONNX", - ) - parser.add_argument( - "--rot_y_axis_reference", - action="store_true", - help="Set True to output rotation in y-axis clockwise in CenterHeadONNX", - ) - args = parser.parse_args() - return args - - -def build_deploy_runner(args) -> DeploymentRunner: - """Build a DeployRunner.""" - model_cfg_path = args.model_cfg_path - checkpoint_path = args.checkpoint - experiment_name = Path(model_cfg_path).stem - work_dir = ( - Path(os.getcwd()) / "work_dirs" / "deployment" / experiment_name if not args.work_dir else 
Path(args.work_dir) - ) - - deployment_runner = DeploymentRunner( - experiment_name=experiment_name, - model_cfg_path=model_cfg_path, - checkpoint_path=checkpoint_path, - work_dir=work_dir, - replace_onnx_models=args.replace_onnx_models, - device=args.device, - rot_y_axis_reference=args.rot_y_axis_reference, - onnx_opset_version=args.onnx_opset_version, - ) - return deployment_runner - - -if __name__ == "__main__": - """Launch a DeployRunner.""" - args = parse_args() - - # Build DeploymentRunner - deployment_runner = build_deploy_runner(args=args) - - # Start running DeploymentRunner - deployment_runner.run()