isaac-sim · cvolkcvolk · Jun 12, 2026 · Jun 12, 2026 · arena-review-bot · Jun 12, 2026
@@ -36,6 +36,7 @@
 from isaaclab_arena.utils.multiprocess import get_local_rank
 from isaaclab_arena.variations import variations_hydra, variations_printing
 from isaaclab_arena.variations.variation_base import BuildTimeVariationBase, RunTimeVariationBase, VariationBase
+from isaaclab_arena.variations.variation_recording import collect_variation_draws, collect_variation_factor_schema
 
 
 class ArenaEnvBuilder:
@@ -98,6 +99,22 @@ def get_variations_catalogue_as_string(self) -> str:
         variations: dict[str, list[VariationBase]] = self.get_all_variations()
         return variations_printing.get_variations_catalogue_as_string(variations, hydra_overrides=self.hydra_overrides)
 
+    def get_enabled_variation_factor_schema(self) -> dict[str, dict]:
+        """Return the sensitivity factor schema for every enabled variation (``<asset>.<variation>``).
+
+        Run-level metadata: which variations were varied and the prior (type + range) each samples
+        from. Logged once into the episode-summary header so the analysis is self-describing.
+        """
+        return collect_variation_factor_schema(self.get_all_variations())
+
+    def get_enabled_variation_draws(self) -> dict[str, object]:
+        """Return ``{<asset>.<variation>: realized_value}`` for this build's enabled variations.
+
+        Build-time variations sample once per build, so the returned draws are fixed for the
+        lifetime of the env this builder produced. Call after the env is built.
+        """
+        return collect_variation_draws(self.get_all_variations())
+
     def _compose_variations_event_cfg(self) -> Any | None:
         """Build a configclass with one :class:`EventTermCfg` per enabled run-time variation.
 

@@ -0,0 +1,107 @@
+# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
+# All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from isaaclab_arena.metrics.metrics_logger import metrics_to_plain_python_types
+
+if TYPE_CHECKING:
+    from isaaclab_arena.evaluation.job_manager import Job
+
+
+# Bumped when the on-disk record shape changes so readers can branch on it.
+SCHEMA_VERSION = 1
+
+
+def write_episode_summaries(env, job: Job, output_path: str | Path) -> int:
+    """Append one JSONL row per recorded episode for the just-completed job.
+
+    The file is self-describing: its first line is a ``"record": "meta"`` header (written once,
+    by whichever job reaches an empty/absent file) carrying the run-level slice identity, the
+    enabled-variation factor schema, and the outcome names. Every later line is an episode::
+
+        {
+          "record": "episode",
+          "job_name": "<job.name>",
+          "episode_idx": <episode index in the recorded dataset>,
+          "arena_env_args": <full job.arena_env_args_dict>,
+          "variation_draws": <realized <asset>.<variation> values this build sampled>,
+          "outcomes": <per-episode metric values>
+        }
+
+    Per-episode metric values come from the env's ``MetricsManager`` (the same machinery that
+    backs ``compute_metrics``), so all HDF5/metric access stays in the metrics layer. The
+    variation factor schema and draws are read off the env where ``load_env`` attached them.
+
+    Args:
+        env: The (possibly gym-wrapped) Arena env that just finished its rollout. Its
+            ``MetricsManager`` provides the per-episode metric values.
+        job: The Job that ran. Its ``arena_env_args_dict`` is logged verbatim under
+            ``arena_env_args``.
+        output_path: JSONL file to append to. Created (with parent dirs) if absent.
+
+    Returns:
+        Number of episode rows written.
+    """
+    unwrapped_env = env.unwrapped
+    if not hasattr(unwrapped_env.cfg, "metrics") or unwrapped_env.cfg.metrics is None:
+        return 0
+
+    per_episode_metrics = unwrapped_env.metrics_manager.compute_per_episode()
+    arena_env_args_snapshot = dict(job.arena_env_args_dict)
+    # Build-time draws are fixed for this env, so the same dict applies to every episode it ran.
+    variation_draws = dict(getattr(unwrapped_env, "arena_variation_draws", {}) or {})
+
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    _write_run_header_if_new(output_path, unwrapped_env, job)
+
+    with open(output_path, "a", encoding="utf-8") as jsonl_output:
+        for episode_idx, episode_metrics in enumerate(per_episode_metrics):
+            summary_row = {
+                "record": "episode",
+                "job_name": job.name,
+                "episode_idx": episode_idx,
+                "arena_env_args": arena_env_args_snapshot,
+                "variation_draws": variation_draws,
+                "outcomes": metrics_to_plain_python_types(episode_metrics),
-                "outcomes": metrics_to_plain_python_types(episode_metrics),
+                "outcomes": {name: sanitize_metric_value(value) for name, value in episode_metrics.items()},
-                "outcomes": metrics_to_plain_python_types(episode_metrics),
+                "outcomes": {name: sanitize_metric_value(value) for name, value in episode_metrics.items()},
+            }
+            jsonl_output.write(json.dumps(summary_row) + "\n")
+
+    return len(per_episode_metrics)
+
+
+def _write_run_header_if_new(output_path: Path, unwrapped_env, job: Job) -> None:
+    """Write the one-time ``"record": "meta"`` header iff ``output_path`` is empty/absent.
+
+    Made idempotent by the empty-file check so it fires exactly once per run — including across
+    the per-chunk subprocesses that append to a shared file (chunks run sequentially, so the
+    first writes the header and the rest see a non-empty file and skip).
+    """
+    if output_path.exists() and output_path.stat().st_size > 0:
+        return
+    header = {
+        "record": "meta",
+        "schema_version": SCHEMA_VERSION,
+        "slice": _slice_identity(job),
+        "factors": dict(getattr(unwrapped_env, "arena_variation_factor_schema", {}) or {}),
+        "outcome_names": list(unwrapped_env.metrics_manager.active_terms),
+    }
+    with open(output_path, "a", encoding="utf-8") as jsonl_output:
+        jsonl_output.write(json.dumps(header) + "\n")
+
+
+def _slice_identity(job: Job) -> dict:
+    """The (policy, task, embodiment) tuple this run analyses — MNPE assumes a single source."""
+    arena_env_args = job.arena_env_args_dict
+    return {
+        "environment": arena_env_args.get("environment"),
+        "embodiment": arena_env_args.get("embodiment"),
+        "policy_type": job.policy_type,
+    }
@@ -19,6 +19,7 @@
 from typing import TYPE_CHECKING
 
 from isaaclab_arena.cli.isaaclab_arena_cli import get_isaaclab_arena_cli_parser
+from isaaclab_arena.evaluation.episode_writer import write_episode_summaries
 from isaaclab_arena.evaluation.eval_runner_cli import add_eval_runner_arguments
 from isaaclab_arena.evaluation.job_manager import Job, JobManager, Status
 from isaaclab_arena.evaluation.policy_runner import get_policy_cls, rollout_policy
@@ -51,6 +52,12 @@ def load_env(
         env_cfg.recorders.dataset_filename = f"dataset_{job_name}"
 
     env = arena_builder.make_registered(env_cfg, render_mode=render_mode)
+
+    # Surface the enabled variations so the episode summary writer can record them: the factor
+    # schema (run-level prior) and this build's realized build-time draws (fixed for the env's life).
+    env.unwrapped.arena_variation_factor_schema = arena_builder.get_enabled_variation_factor_schema()
+    env.unwrapped.arena_variation_draws = arena_builder.get_enabled_variation_draws()
+
     # Don't reset here - rollout_policy() will reset the env. Every reset triggers a new episode, initializing recorder & creating a new hdf5 entry.
     return env
 
@@ -244,6 +251,15 @@ def main():
     # Check if any job requires cameras and enable them if needed before starting simulation
     enable_cameras_if_required(eval_jobs_config, args_cli)
 
+    # --episode_summary (opt-in): the writer logs the full arena_env_args + realized variation
+    # draws per episode; the analyzer's factors / header decide which keys are factors.
+    episode_summary_enabled = args_cli.episode_summary is not None
+    if episode_summary_enabled:
+        print(
+            "[INFO] Episode summary recording enabled. Per-episode arena_env_args + variation_draws"
+            f" + outcomes → {args_cli.episode_summary}"
+        )
+
     with SimulationAppContext(args_cli):
         job_manager = JobManager(eval_jobs_config["jobs"])
         metrics_logger = MetricsLogger()
@@ -307,6 +323,16 @@ def main():
                         language_instruction=job.language_instruction,
                     )
 
+                    if episode_summary_enabled:
+                        # Opt-in instrumentation: a write failure here must not fail an
+                        # otherwise-successful rebuild (which would discard its real metrics).
+                        # Written per rebuild — each re-samples this build's variation draws.
+                        try:
+                            rows = write_episode_summaries(env, job, args_cli.episode_summary)
+                            print(f"[INFO] Wrote {rows} episode summaries for job '{job.name}' (rebuild {rebuild_idx})")
+                        except Exception as summary_error:
+                            print(f"[WARNING] Failed to write episode summaries for job '{job.name}': {summary_error}")
+
                     job_manager.complete_job(job, metrics=metrics, status=Status.COMPLETED)
 
                     # users may not specify metrics for a task, although it's not recommended

@@ -38,3 +38,13 @@ def add_eval_runner_arguments(parser: argparse.ArgumentParser) -> None:
             " set only if a long sweep grows in host memory or gets OOM-killed."
         ),
     )
+    parser.add_argument(
+        "--episode_summary",
+        type=str,
+        default=None,
+        help=(
+            "Opt-in: append one self-describing JSONL row per episode (arena_env_args, realized"
+            " variation_draws, and outcomes) to this path, for sensitivity analysis. The first"
+            " line is a run-level header (slice + factor schema). Unset disables recording."
+        ),
+    )
@@ -31,6 +31,7 @@ def __init__(
         status: Status = None,
         language_instruction: str = None,
         variations: list[str] = None,
+        arena_env_args_dict: dict = None,
     ):
         """Initialize a Job instance.
 
@@ -49,9 +50,12 @@ def __init__(
                 takes precedence over the task's own description.
             variations: Hydra variation override strings (e.g. ``"cracker_box.color.enabled=true"``)
                 applied when composing the environment cfg. Defaults to no overrides.
+            arena_env_args_dict: The original dict form of arena_env_args (before conversion to a
+                CLI list). Logged verbatim per episode by the sensitivity episode-summary writer.
         """
         self.name = name
         self.arena_env_args = arena_env_args
+        self.arena_env_args_dict = arena_env_args_dict if arena_env_args_dict is not None else {}
         self.variations = variations if variations is not None else []
         assert num_envs > 0, "num_envs must be greater than 0"
         assert not (
@@ -114,6 +118,7 @@ def from_dict(cls, data: dict) -> "Job":
         return cls(
             name=data["name"],
             arena_env_args=cls.convert_args_dict_to_cli_args_list(data["arena_env_args"]),
+            arena_env_args_dict=data["arena_env_args"],
             policy_type=data["policy_type"],
             num_envs=num_envs,
             num_steps=num_steps,

@@ -27,7 +27,9 @@ def get_recorded_metric_data(dataset_path: pathlib.Path, recorder_term_name: str
     recorded_metric_data_per_demo: list[np.ndarray] = []
     with h5py.File(dataset_path, "r") as f:
         demos = f["data"]
-        for demo in demos:
+        # h5py iterates group members alphabetically (demo_0, demo_1, demo_10, demo_2, ...),
+        # so sort by the numeric demo index to return episodes in recorded order.
+        for demo in sorted(demos, key=lambda name: int(name.split("_")[-1])):
             recorded_metric_data_per_demo.append(demos[demo][recorder_term_name][:])
     return recorded_metric_data_per_demo
 

@@ -5,7 +5,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 from isaaclab_arena.metrics.metric_data import MetricData, MetricsDataCollection
 from isaaclab_arena.metrics.metric_term_cfg import MetricTermCfg
@@ -68,3 +68,33 @@ def compute(self) -> MetricsDataCollection:
             num_episodes=get_num_episodes(dataset_path), metric_data_entries=metric_data_entries
         )
         return metrics_data_collection
+
+    def compute_per_episode(self) -> list[dict[str, Any]]:
+        """Compute every registered metric separately for each recorded episode.
+
+        Where :meth:`compute` reduces across all episodes to one aggregate value per
+        metric, this returns one ``{metric_name: value}`` dict per episode — each metric's
+        compute func is fed that single episode's recorded array (a one-element list).
+
+        Returns:
+            A list with one metric dict per episode, in recorded order.
+        """
+        dataset_path = get_metric_recorder_dataset_path(self._env)
+        num_episodes = get_num_episodes(dataset_path)
+
+        # Recorded data arrives grouped by metric (each term -> one array per episode).
+        # Read it once here, then transpose into one metric dict per episode below.
+        episode_arrays_by_term = {
+            term_name: get_recorded_metric_data(dataset_path, term_cfg.recorder_term_name)
+            for term_name, term_cfg in zip(self._term_names, self._term_cfgs)
+        }
+
+        per_episode_metrics: list[dict[str, Any]] = []
+        for episode_index in range(num_episodes):
+            episode_metrics: dict[str, Any] = {}
+            for term_name, term_cfg in zip(self._term_names, self._term_cfgs):
+                # compute_metric_func reduces a list of per-episode arrays; give it just this one.
+                episode_array = episode_arrays_by_term[term_name][episode_index]
+                episode_metrics[term_name] = term_cfg.compute_metric_func([episode_array], **term_cfg.params)
+            per_episode_metrics.append(episode_metrics)
+        return per_episode_metrics
@@ -76,5 +76,6 @@ def apply(self) -> None:
         assert self.sampler is not None, "HDRImageVariation: sampler not set."
         # Pass HDR names as the choice sampler's choices.
         hdr_name = self.sampler.sample(num_samples=1, choices=hdr_names)[0]
+        self._record_draw(hdr_name)
         hdr_cls: type[HDRImage] = registry.get_hdr_by_name(hdr_name)
         self._light.add_hdr(hdr_cls())
@@ -48,4 +48,5 @@ def __init__(
     def apply(self) -> None:
         assert self.sampler is not None, "LightIntensityVariation: sampler not set."
         intensity = float(self.sampler.sample(num_samples=1)[0, 0])
+        self._record_draw(intensity)
         self._light.set_intensity(intensity)
@@ -55,13 +55,29 @@ class VariationBase(ABC):
     def __init__(self, cfg: VariationBaseCfg, name: str):
         self.name = name
         self._sampler: SamplerBase | None = None
+        self._last_draw: object | None = None
         self.apply_cfg(cfg)
 
     @property
     def enabled(self) -> bool:
         """Whether this variation is active and should be built into ``events_cfg``."""
         return self.cfg.enabled
 
+    @property
+    def last_draw(self) -> object | None:
+        """The value most recently sampled by this variation, or ``None`` if never sampled.
+
+        Set by subclasses via :meth:`_record_draw` when they realise a sample, so eval recording
+        can log the realized factor value per build/episode. Build-time variations populate it once
+        per env build; run-time variations do not record here yet (their per-reset draws would need
+        a per-env buffer — tracked as a follow-up).
+        """
+        return self._last_draw
+
+    def _record_draw(self, value: object) -> None:
+        """Store ``value`` as the realized sample of this variation (see :attr:`last_draw`)."""
+        self._last_draw = value
+
     def enable(self) -> None:
         """Mark this variation as active."""
         self.cfg.enabled = True