Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions isaaclab_arena/evaluation/episode_writer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
# All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import json
from pathlib import Path
from typing import TYPE_CHECKING

from isaaclab_arena.metrics.metrics_logger import metrics_to_plain_python_types

if TYPE_CHECKING:
from isaaclab_arena.evaluation.job_manager import Job


def write_episode_summaries(env, job: Job, output_path: str | Path) -> int:
"""Append one JSONL row per recorded episode for the just-completed job.

Each row has shape::

{
"job_name": "<job.name>",
"episode_idx": <episode index in the recorded dataset>,
"arena_env_args": <full job.arena_env_args_dict>,
"outcomes": <per-episode metric values>
}

Per-episode metric values come from the env's ``MetricsManager`` (the same machinery
that backs ``compute_metrics``), so all HDF5/metric access stays in the metrics layer.

Args:
env: The (possibly gym-wrapped) Arena env that just finished its rollout. Its
``MetricsManager`` provides the per-episode metric values.
job: The Job that ran. Its ``arena_env_args_dict`` is logged verbatim under
``arena_env_args``.
output_path: JSONL file to append to. Created (with parent dirs) if absent.

Returns:
Number of rows written.
"""
unwrapped_env = env.unwrapped
if not hasattr(unwrapped_env.cfg, "metrics") or unwrapped_env.cfg.metrics is None:
return 0

per_episode_metrics = unwrapped_env.metrics_manager.compute_per_episode()
arena_env_args_snapshot = dict(job.arena_env_args_dict)

output_path = Path(output_path)
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "a", encoding="utf-8") as jsonl_output:
for episode_idx, episode_metrics in enumerate(per_episode_metrics):
Comment on lines +47 to +53

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Silent empty arena_env_args when Job constructed without arena_env_args_dict

job.arena_env_args_dict defaults to {} when a Job is constructed directly (i.e., not through Job.from_dict()). In that case the JSONL row is written as "arena_env_args": {} with no error or warning, producing rows that are silently useless to the sensitivity analyzer. At a minimum, adding a guard that logs a warning (and optionally skips writing) when the dict is empty would surface this misconfiguration before it silently corrupts an analysis run.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

summary_row = {
"job_name": job.name,
"episode_idx": episode_idx,
"arena_env_args": arena_env_args_snapshot,
"outcomes": metrics_to_plain_python_types(episode_metrics),
}
jsonl_output.write(json.dumps(summary_row) + "\n")

return len(per_episode_metrics)
14 changes: 14 additions & 0 deletions isaaclab_arena/evaluation/eval_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from typing import TYPE_CHECKING

from isaaclab_arena.cli.isaaclab_arena_cli import get_isaaclab_arena_cli_parser
from isaaclab_arena.evaluation.episode_writer import write_episode_summaries
from isaaclab_arena.evaluation.eval_runner_cli import add_eval_runner_arguments
from isaaclab_arena.evaluation.job_manager import Job, JobManager, Status
from isaaclab_arena.evaluation.policy_runner import get_policy_cls, rollout_policy
Expand Down Expand Up @@ -200,6 +201,15 @@ def main():
# Check if any job requires cameras and enable them if needed before starting simulation
enable_cameras_if_required(eval_jobs_config, args_cli)

# --episode_summary (opt-in): the writer logs the full arena_env_args per episode;
# the analyzer's factors.yaml decides which keys are factors (no eval-side knowledge).
episode_summary_enabled = args_cli.episode_summary is not None
if episode_summary_enabled:
print(
"[INFO] Episode summary recording enabled. Per-episode arena_env_args + outcomes"
f" → {args_cli.episode_summary}"
)

with SimulationAppContext(args_cli):
job_manager = JobManager(eval_jobs_config["jobs"])
metrics_logger = MetricsLogger()
Expand Down Expand Up @@ -250,6 +260,10 @@ def main():
language_instruction=job.language_instruction,
)

if episode_summary_enabled:
rows = write_episode_summaries(env, job, args_cli.episode_summary)
print(f"[INFO] Wrote {rows} episode summaries for job '{job.name}'")
Comment on lines +260 to +262

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Summary write failure silently fails a successful job

write_episode_summaries runs inside the outer try/except block, so any I/O failure (disk full, permission denied, bad JSON serialization) will trigger the except handler at line 273. That handler calls job_manager.complete_job(job, metrics={}, status=Status.FAILED), discarding the already-computed metrics from rollout_policy and marking a successful rollout as FAILED. Since the summary is opt-in and supplemental, its failure should be isolated — otherwise a temporary disk issue poisons your eval results.


job_manager.complete_job(job, metrics=metrics, status=Status.COMPLETED)

# users may not specify metrics for a task, although it's not recommended
Expand Down
9 changes: 9 additions & 0 deletions isaaclab_arena/evaluation/eval_runner_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,12 @@ def add_eval_runner_arguments(parser: argparse.ArgumentParser) -> None:
" set only if a long sweep grows in host memory or gets OOM-killed."
),
)
parser.add_argument(
"--episode_summary",
type=str,
default=None,
help=(
"Append one JSONL row per recorded episode (arena_env_args + outcomes) to"
" this file. Consumed by the sensitivity analyzer. Default unset — no recording."
),
)
6 changes: 6 additions & 0 deletions isaaclab_arena/evaluation/job_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def __init__(
policy_config_dict: dict = None,
status: Status = None,
language_instruction: str = None,
arena_env_args_dict: dict | None = None,
):
"""Initialize a Job instance.

Expand All @@ -42,9 +43,13 @@ def __init__(
status: Job status (defaults to PENDING)
language_instruction: Optional language instruction override for the policy. When set,
takes precedence over the task's own description.
arena_env_args_dict: The original dict form of arena_env_args before conversion to
CLI args list. Preserves typed values (e.g. floats stay floats) for downstream
consumers that need to index by key.
"""
self.name = name
self.arena_env_args = arena_env_args
self.arena_env_args_dict = arena_env_args_dict if arena_env_args_dict is not None else {}
assert num_envs > 0, "num_envs must be greater than 0"
assert not (
num_steps is not None and num_episodes is not None
Expand Down Expand Up @@ -102,6 +107,7 @@ def from_dict(cls, data: dict) -> "Job":
return cls(
name=data["name"],
arena_env_args=cls.convert_args_dict_to_cli_args_list(data["arena_env_args"]),
arena_env_args_dict=data["arena_env_args"],
policy_type=data["policy_type"],
num_envs=num_envs,
num_steps=num_steps,
Expand Down
30 changes: 30 additions & 0 deletions isaaclab_arena/metrics/metrics_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,33 @@ def compute(self) -> dict[str, Any]:
metrics_data[term_name] = term_cfg.compute_metric_func(recorded_metric_data, **term_cfg.params)
metrics_data["num_episodes"] = get_num_episodes(dataset_path)
return metrics_data

def compute_per_episode(self) -> list[dict[str, Any]]:
"""Compute every registered metric separately for each recorded episode.

Where :meth:`compute` reduces across all episodes to one aggregate value per
metric, this returns one ``{metric_name: value}`` dict per episode — each metric's
compute func is fed that single episode's recorded array (a one-element list).

Returns:
A list with one metric dict per episode, in recorded order.
"""
dataset_path = get_metric_recorder_dataset_path(self._env)
num_episodes = get_num_episodes(dataset_path)

# Recorded data arrives grouped by metric (each term -> one array per episode).
# Read it once here, then transpose into one metric dict per episode below.
episode_arrays_by_term = {
term_name: get_recorded_metric_data(dataset_path, term_cfg.recorder_term_name)
for term_name, term_cfg in zip(self._term_names, self._term_cfgs)
}

per_episode_metrics: list[dict[str, Any]] = []
for episode_index in range(num_episodes):
Comment on lines +76 to +87

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Extra HDF5 open + potential IndexError between two independent reads

get_num_episodes opens and closes the HDF5 file once more after episode_arrays_by_term has already been populated. The two reads are independent h5py.File opens, so if the returned list lengths don't match num_episodes (e.g., a term's dataset has fewer demos than the top-level demo count), episode_arrays_by_term[term_name][episode_index] will raise an IndexError. Deriving the episode count directly from the already-loaded lists avoids the redundant open and keeps both measurements consistent.

Suggested change
dataset_path = get_metric_recorder_dataset_path(self._env)
num_episodes = get_num_episodes(dataset_path)
# Recorded data arrives grouped by metric (each term -> one array per episode).
# Read it once here, then transpose into one metric dict per episode below.
episode_arrays_by_term = {
term_name: get_recorded_metric_data(dataset_path, term_cfg.recorder_term_name)
for term_name, term_cfg in zip(self._term_names, self._term_cfgs)
}
per_episode_metrics: list[dict[str, Any]] = []
for episode_index in range(num_episodes):
dataset_path = get_metric_recorder_dataset_path(self._env)
# Recorded data arrives grouped by metric (each term -> one array per episode).
# Read it once here, then transpose into one metric dict per episode below.
episode_arrays_by_term = {
term_name: get_recorded_metric_data(dataset_path, term_cfg.recorder_term_name)
for term_name, term_cfg in zip(self._term_names, self._term_cfgs)
}
# Derive episode count from the already-loaded data to stay consistent with it.
# Fall back to get_num_episodes only when there are no registered terms.
if episode_arrays_by_term:
num_episodes = min(len(v) for v in episode_arrays_by_term.values())
else:
num_episodes = get_num_episodes(dataset_path)
per_episode_metrics: list[dict[str, Any]] = []
for episode_index in range(num_episodes):

episode_metrics: dict[str, Any] = {}
for term_name, term_cfg in zip(self._term_names, self._term_cfgs):
# compute_metric_func reduces a list of per-episode arrays; give it just this one.
episode_array = episode_arrays_by_term[term_name][episode_index]
Comment on lines +87 to +91

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 HDF5 alphabetical key order silently swaps episodes for 10+ episode jobs

get_recorded_metric_data iterates for demo in f["data"], which h5py returns in alphabetical order by default (creation-order tracking is off unless the file was created with track_order=True). For a job with 20 episodes the key order is demo_0, demo_1, demo_10, demo_11, …, demo_19, demo_2, demo_3, …, so episode_arrays_by_term[term][2] contains data from demo_10 (the simulation's 11th episode) rather than demo_2. All nine jobs in light_intensity_sweep_jobs_config.json use num_episodes: 20, so every JSONL row from those jobs will carry a wrong episode_idx. Any downstream use that joins on episode_idx (e.g. matching to video recordings) will silently cross-reference the wrong episode.

The fix is to sort the demo keys numerically before building the lists in get_recorded_metric_data, or to drive the outer loop from the sorted keys in episode_arrays_by_term directly instead of from a separate get_num_episodes call.

episode_metrics[term_name] = term_cfg.compute_metric_func([episode_array], **term_cfg.params)
per_episode_metrics.append(episode_metrics)
return per_episode_metrics
Loading