From 6e0f1e4aa6ebfc0617583c53dbbff22b05365c18 Mon Sep 17 00:00:00 2001 From: YuTeh Shen Date: Tue, 28 Apr 2026 07:16:38 +0000 Subject: [PATCH] feat: Add success rate to benchmark results Log cartpole camera survival success through extras and teach the benchmark scripts to read success-rate TensorBoard tags from both direct and RL-Games episode paths. This makes the --output_path benchmark JSON include a success_rate scalar when tasks emit it. --- scripts/benchmarks/benchmark_rlgames.py | 5 +++++ scripts/benchmarks/benchmark_rsl_rl.py | 5 +++++ scripts/benchmarks/utils.py | 18 ++++++++++++++++++ source/isaaclab_tasks/config/extension.toml | 2 +- source/isaaclab_tasks/docs/CHANGELOG.rst | 10 ++++++++++ .../direct/cartpole/cartpole_camera_env.py | 5 +++++ 6 files changed, 44 insertions(+), 1 deletion(-) diff --git a/scripts/benchmarks/benchmark_rlgames.py b/scripts/benchmarks/benchmark_rlgames.py index f6c000a8fce9..7ef482ad3a6f 100644 --- a/scripts/benchmarks/benchmark_rlgames.py +++ b/scripts/benchmarks/benchmark_rlgames.py @@ -95,11 +95,13 @@ from scripts.benchmarks.utils import ( get_backend_type, get_preset_string, + get_success_rate_log, log_app_start_time, log_convergence, log_python_imports_time, log_rl_policy_episode_lengths, log_rl_policy_rewards, + log_rl_policy_success_rates, log_runtime_step_times, log_scene_creation_time, log_simulation_start_time, @@ -274,6 +276,9 @@ def main( log_runtime_step_times(benchmark, rl_training_times, compute_stats=True) log_rl_policy_rewards(benchmark, log_data["rewards/iter"]) log_rl_policy_episode_lengths(benchmark, log_data["episode_lengths/iter"]) + success_rates = get_success_rate_log(log_data) + if success_rates is not None: + log_rl_policy_success_rates(benchmark, success_rates) log_convergence( benchmark, log_data["rewards/iter"], diff --git a/scripts/benchmarks/benchmark_rsl_rl.py b/scripts/benchmarks/benchmark_rsl_rl.py index 596b267e248a..753f81f0cb51 100644 --- a/scripts/benchmarks/benchmark_rsl_rl.py +++ b/scripts/benchmarks/benchmark_rsl_rl.py @@ -97,11 +97,13 @@ from scripts.benchmarks.utils import ( get_backend_type, get_preset_string, + get_success_rate_log, log_app_start_time, log_convergence, log_python_imports_time, log_rl_policy_episode_lengths, log_rl_policy_rewards, + log_rl_policy_success_rates, log_runtime_step_times, log_scene_creation_time, log_simulation_start_time, @@ -268,6 +270,9 @@ def main( log_runtime_step_times(benchmark, rl_training_times, compute_stats=True) log_rl_policy_rewards(benchmark, log_data["Train/mean_reward"]) log_rl_policy_episode_lengths(benchmark, log_data["Train/mean_episode_length"]) + success_rates = get_success_rate_log(log_data) + if success_rates is not None: + log_rl_policy_success_rates(benchmark, success_rates) log_convergence( benchmark, diff --git a/scripts/benchmarks/utils.py b/scripts/benchmarks/utils.py index 0a9dffd4f701..564f9cd93a1b 100644 --- a/scripts/benchmarks/utils.py +++ b/scripts/benchmarks/utils.py @@ -19,6 +19,7 @@ os.path.dirname(__file__), "..", "..", "source", "isaaclab_tasks", "test", "benchmarking" ) _CONFIGS_YAML = os.path.join(_BENCHMARKING_DIR, "configs.yaml") +SUCCESS_RATE_LOG_TAGS = ("Metrics/success_rate", "Episode/Metrics/success_rate") def get_backend_type(cli_backend: str) -> str: @@ -151,6 +152,23 @@ def log_rl_policy_episode_lengths(benchmark: BaseIsaacLabBenchmark, value: list) benchmark.add_measurement("train", measurement=measurement) +def log_rl_policy_success_rates(benchmark: BaseIsaacLabBenchmark, value: list): + if not value: + return + measurement = ListMeasurement(name="Success Rates", value=value) + benchmark.add_measurement("train", measurement=measurement) + # Log the best observed success rate as a scalar for benchmark JSON backends. + measurement = SingleMeasurement(name="success_rate", value=max(value), unit="float") + benchmark.add_measurement("train", measurement=measurement) + + +def get_success_rate_log(log_data: dict) -> list | None: + for tag in SUCCESS_RATE_LOG_TAGS: + if tag in log_data: + return log_data[tag] + return None + + def check_convergence( rewards: list[float], threshold: float, diff --git a/source/isaaclab_tasks/config/extension.toml b/source/isaaclab_tasks/config/extension.toml index 158c271c5feb..2e7ee00764d7 100644 --- a/source/isaaclab_tasks/config/extension.toml +++ b/source/isaaclab_tasks/config/extension.toml @@ -1,7 +1,7 @@ [package] # Note: Semantic Versioning is used: https://semver.org/ -version = "1.5.29" +version = "1.5.30" # Description title = "Isaac Lab Environments" diff --git a/source/isaaclab_tasks/docs/CHANGELOG.rst b/source/isaaclab_tasks/docs/CHANGELOG.rst index 42a68f5ae361..d35762f85010 100644 --- a/source/isaaclab_tasks/docs/CHANGELOG.rst +++ b/source/isaaclab_tasks/docs/CHANGELOG.rst @@ -1,6 +1,16 @@ Changelog --------- +1.5.30 (2026-04-28) +~~~~~~~~~~~~~~~~~~~ + +Added +^^^^^ + +* Added benchmark extraction for ``Metrics/success_rate`` and survival + success logging for direct cartpole camera environments. + + 1.5.29 (2026-04-27) ~~~~~~~~~~~~~~~~~~~ diff --git a/source/isaaclab_tasks/isaaclab_tasks/direct/cartpole/cartpole_camera_env.py b/source/isaaclab_tasks/isaaclab_tasks/direct/cartpole/cartpole_camera_env.py index 18215b4788df..1544d17e2789 100644 --- a/source/isaaclab_tasks/isaaclab_tasks/direct/cartpole/cartpole_camera_env.py +++ b/source/isaaclab_tasks/isaaclab_tasks/direct/cartpole/cartpole_camera_env.py @@ -151,6 +151,11 @@ def _get_dones(self) -> tuple[torch.Tensor, torch.Tensor]: def _reset_idx(self, env_ids: Sequence[int] | None): if env_ids is None: env_ids = self._cartpole._ALL_INDICES + + # Log survival success rate before resetting. + survived = self.reset_time_outs[env_ids].float() + self.extras.setdefault("log", {})["Metrics/success_rate"] = survived.mean().item() + super()._reset_idx(env_ids) joint_pos = self._cartpole.data.default_joint_pos.torch[env_ids]