Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 41 additions & 10 deletions isaaclab_arena/analysis/sensitivity/generate_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,16 @@

from isaaclab_arena.analysis.sensitivity.analyzer import SensitivityAnalyzer
from isaaclab_arena.analysis.sensitivity.episode_results_reader import dataset_from_episode_results
from isaaclab_arena.analysis.sensitivity.plotting import plot_marginals
from isaaclab_arena.analysis.sensitivity.plotting import plot_corner, plot_importance, plot_marginals

# Each report plot, mapped to the renderer that draws it. "marginals" writes the given output_path
# directly; the others write a sibling file suffixed with the plot name.
_PLOT_RENDERERS = {
"marginals": plot_marginals,
"importance": plot_importance,
"corner": plot_corner,
}
_DEFAULT_PLOTS = ("marginals", "importance", "corner")


def generate_report(
Expand All @@ -21,26 +30,33 @@ def generate_report(
outcome_names: list[str] | tuple[str, ...] = ("success",),
factor_names: list[str] | tuple[str, ...] | None = None,
observation: list[float] | None = None,
plots: list[str] | tuple[str, ...] = _DEFAULT_PLOTS,
seed: int | None = 0,
) -> Path:
"""Build a sensitivity report from an episode_results.jsonl, fit, and save a figure.
) -> dict[str, Path]:
"""Build a sensitivity report from an episode_results.jsonl, fit, and save the requested figures.

The factor schema is discovered from the recorder's per-episode variation draws. The output
format follows the output_path extension (.png, .pdf, …).
The factor schema is discovered from the recorder's per-episode variation draws, the posterior
is fit and sampled once, and each requested plot is rendered from those shared samples. The
output format follows the output_path extension (.png, .pdf, …).

Args:
episode_results_path: episode_results.jsonl produced by the per-episode recorder.
output_path: Destination figure file (parent dirs created if absent).
output_path: Destination figure file for the marginals plot; the other plots are written
as siblings suffixed with their name (e.g. report_importance.png). Parent dirs created.
outcome_names: Which per-episode outcome(s) to condition on.
factor_names: Which recorded variations to analyze. None analyzes all of them.
observation: Outcome values to condition on, one per outcome name. None conditions on
success (1) for every binary outcome.
plots: Which plots to render, any of "marginals", "importance", "corner".
seed: Seed for torch's global RNG so a report is reproducible. Pass None to leave the
RNG untouched.

Returns:
The resolved output path.
A mapping of each rendered plot name to its written path.
"""
unknown = [name for name in plots if name not in _PLOT_RENDERERS]
assert not unknown, f"Unknown plot(s) {unknown}; choose from {sorted(_PLOT_RENDERERS)}."

# Estimator training (fit) and posterior sampling both draw from torch's global RNG in
# sequence, so seeding once here makes the whole report reproducible.
if seed is not None:
Expand All @@ -54,11 +70,17 @@ def generate_report(
dataset.default_observation() if observation is None else torch.tensor(observation, dtype=torch.float32)
)
samples = analyzer.sample_posterior(observation_tensor)

output_path = Path(output_path)
plot_marginals(samples, dataset, observation_tensor, output_path=str(output_path))
written: dict[str, Path] = {}
for name in plots:
# The marginals plot keeps output_path; the others get a name-suffixed sibling.
plot_path = output_path if name == "marginals" else output_path.with_stem(f"{output_path.stem}_{name}")
_PLOT_RENDERERS[name](samples, dataset, observation_tensor, output_path=str(plot_path))
written[name] = plot_path
plt.close("all")
print(f"[INFO] Wrote report → {output_path}")
return output_path
print(f"[INFO] Wrote report → {', '.join(str(path) for path in written.values())}")
return written


def main():
Expand Down Expand Up @@ -107,6 +129,14 @@ def main():
"Outcomes are binary, so use 1 for success or 0 for failure. Defaults to 1 (success)."
),
)
parser.add_argument(
"--plots",
type=str,
nargs="+",
choices=sorted(_PLOT_RENDERERS),
default=list(_DEFAULT_PLOTS),
help="Which plots to render. Default: marginals importance corner.",
)
parser.add_argument(
"--seed",
type=int,
Expand All @@ -121,6 +151,7 @@ def main():
outcome_names=args.outcome,
factor_names=args.factors,
observation=args.observation,
plots=args.plots,
seed=args.seed,
)

Expand Down
125 changes: 125 additions & 0 deletions isaaclab_arena/analysis/sensitivity/marginals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
# Copyright (c) 2026, The Isaac Lab Arena Project Developers (https://github.com/isaac-sim/IsaacLab-Arena/blob/main/CONTRIBUTORS.md).
# All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0

"""Pure (numpy-only) estimators over posterior samples: marginal densities and factor importance.

These functions take already-drawn posterior samples and return summary arrays. They run no
inference and do no plotting, so the static report and the interactive app share one source of

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docstring justifies the split by sharing "one source of truth" with "the interactive app", but I don't see such an app in the repo. The dedup with plotting.py already justifies the module on its own — could we drop the interactive-app clause (or scope it to the static report) so the rationale matches what's actually here?

truth for what a marginal and an importance score mean.
"""

from __future__ import annotations

import numpy as np
from scipy.stats import gaussian_kde
from typing import TYPE_CHECKING

if TYPE_CHECKING:
import torch

from isaaclab_arena.analysis.sensitivity.dataset import FactorSpec, SensitivityDataset

_CONSTANT_STD = 1e-9
"""Below this sample std a continuous posterior is treated as a point mass (KDE is undefined)."""


def continuous_marginal_density(
factor_samples: np.ndarray, value_range: tuple[float, float], num_grid: int = 200
) -> tuple[np.ndarray, np.ndarray | None]:
"""KDE of a continuous factor's posterior samples on a grid over its swept range.

Args:
factor_samples: 1D posterior draws for one continuous factor, in original units.
value_range: The (low, high) the factor was swept over; the grid spans it.
num_grid: Number of evenly spaced grid points across the range.

Returns:
``(grid, density)`` where density is the KDE evaluated on grid. density is None when the
samples are effectively constant (a point mass, where a KDE is undefined) — callers draw
that as a single line at the sample mean.
"""
range_low, range_high = value_range
grid = np.linspace(range_low, range_high, num_grid)
if float(np.std(factor_samples)) < _CONSTANT_STD:
return grid, None
density = gaussian_kde(factor_samples)(grid)
return grid, density


def categorical_marginal_probs(factor_samples: np.ndarray, num_choices: int) -> np.ndarray:
"""Posterior probability per integer-coded choice of a categorical factor.

sbi returns categorical columns as floats over the integer-code support, so samples are
rounded to the nearest code in [0, num_choices - 1] and tallied into frequencies.

Args:
factor_samples: 1D posterior draws for one categorical factor (float-coded).
num_choices: Number of choices the factor can take.

Returns:
A length-``num_choices`` array of probabilities summing to 1.
"""
codes = np.clip(np.round(factor_samples), 0, num_choices - 1).astype(int)
return np.bincount(codes, minlength=num_choices) / len(codes)


def factor_importance(factor_samples: np.ndarray, factor: FactorSpec) -> float:
"""How far a factor's posterior marginal moved from its uniform prior, in [0, 1].

The score is the total-variation distance between the posterior marginal and the uniform
prior the factor was swept from, normalized so 1 means the posterior collapsed onto a single
value and 0 means it is indistinguishable from the prior (the factor did not affect the
outcome). Continuous and categorical factors share the [0, 1] scale, so a single ranking
compares them directly.

Args:
factor_samples: 1D posterior draws for this factor, in the factor's own units/codes.
factor: The factor spec (its type, range, and choices define the prior to compare against).
"""
if factor.type == "continuous":
assert factor.range is not None, f"Continuous factor {factor.name!r} has no range."
grid, density = continuous_marginal_density(factor_samples, factor.range)
if density is None:
return 1.0 # a point mass is maximally far from the uniform prior
span = factor.range[1] - factor.range[0]
if span <= 0:
return 0.0
# Normalize the KDE over the (truncated) range so it integrates to 1 there, then compare
# it to the uniform prior 1/span. TV = 0.5 * integral|posterior - prior|, already in [0, 1].
area = np.trapz(density, grid)
if area <= 0:
return 0.0
posterior = density / area
return float(0.5 * np.trapz(np.abs(posterior - 1.0 / span), grid))

assert factor.choices is not None, f"Categorical factor {factor.name!r} has no choices."
num_choices = len(factor.choices)
probs = categorical_marginal_probs(factor_samples, num_choices)
total_variation = 0.5 * float(np.sum(np.abs(probs - 1.0 / num_choices)))
# A categorical TV maxes out at 1 - 1/k (all mass on one choice), so rescale to [0, 1] to keep
# it comparable with the continuous score.
ceiling = 1.0 - 1.0 / num_choices
return total_variation / ceiling if ceiling > 0 else 0.0


def factor_importances(samples: torch.Tensor, dataset: SensitivityDataset) -> list[tuple[str, float]]:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we add a unit test for the importance scoring? It's the one new piece here that's pure, deterministic, and CPU-only, yet the normalization is non-obvious: continuous uses raw TV with a point-mass shortcut to 1.0, while categorical rescales by the 1 - 1/k ceiling so both types share the [0,1] scale. test_sensitivity_analysis.py is the natural home — feed factor_importance synthetic samples directly (no fit needed) and assert a collapsed posterior scores ≈1 and a near-uniform one ≈0, for one continuous and one categorical factor. That pins the cross-type comparability the tornado ranking depends on.

"""Importance score for every factor, sorted most- to least-important.

A thin dataset-level wrapper over factor_importance: it slices each factor's column out of the
joint posterior samples and scores it against its prior.

Args:
samples: ``(num_samples, num_factors)`` posterior draws in the dataset's factor layout.
dataset: The dataset, for the factor schema and column layout.

Returns:
``(factor_name, score)`` pairs sorted by descending score; each score is in [0, 1].
"""
sample_array = samples.cpu().numpy()
scored = [
(factor.name, factor_importance(sample_array[:, dataset.factor_columns[factor.name]].squeeze(-1), factor))
for factor in dataset.factors
]
return sorted(scored, key=lambda name_score: name_score[1], reverse=True)
Loading
Loading