amd · asyms · Jun 18, 2026 · Jun 24, 2026
@@ -21,4 +21,5 @@ runs:
         source ${{ inputs.env_name }}/bin/activate
         pip install --upgrade pip
         pip install -r requirements.txt
+        pip install -r requirements_stream.txt
         echo "Prerequisites installed into ${{ inputs.env_name }}"
@@ -10,6 +10,7 @@ __pycache__
 build/*
 **/_build/**
 **/build/**
+/outputs/
 *.exe
 *.csv
 secret_github_token

@@ -164,7 +164,17 @@ def pytest_configure(config):
 
 
 def pytest_collection_modifyitems(config, items):
-    device = aie_utils.DefaultNPURuntime.device().resolve().name
+    # Resolve the active NPU device for device-gating. On a host without an NPU
+    # runtime, aie_utils.DefaultNPURuntime is None; in that case there is no
+    # device to gate against, so skip the supported_devices filtering entirely.
+    # Hermetic, NPU-free tests (no supported_devices marker) are unaffected.
+    runtime = aie_utils.DefaultNPURuntime
+    if runtime is None:
+        return
+    npu_device = runtime.device()
+    if npu_device is None:
+        return
+    device = npu_device.resolve().name
     for item in items:
         marker = item.get_closest_marker("supported_devices")
         if marker and device not in marker.args:

@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Minimal demo: build and run the stream-dse-backed fused SwiGLU-prefill operator.
+
+stream-dse generates one fused MLIR design for the whole SwiGLU-prefill block;
+IRON compiles it to an xclbin and runs it once on the NPU. Requires stream-dse
+(see requirements_stream.txt) and an npu2 device.
+
+    python demos/swiglu_prefill_stream/demo.py
+"""
+
+import time
+
+import torch
+from ml_dtypes import bfloat16
+
+from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor
+from iron.operators.swiglu_prefill_stream.op import SwiGLUPrefillStream
+
+SEQ_LEN, EMBEDDING_DIM, HIDDEN_DIM = 256, 512, 2048
+
+
+def rand_bf16(*shape: int) -> XRTTensor:  # random-normal bfloat16 torch tensor of `shape`, converted via XRTTensor.from_torch
+    return XRTTensor.from_torch(torch.randn(*shape, dtype=torch.bfloat16))
+
+
+def main() -> None:  # build the fused operator, then time one invocation after a warmup call
+    op = SwiGLUPrefillStream(
+        seq_len=SEQ_LEN, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM
+    )
+    op.compile()  # build step; must precede get_callable() in this flow
+    run = op.get_callable()
+
+    x = rand_bf16(SEQ_LEN, EMBEDDING_DIM)
+    w_gate = rand_bf16(EMBEDDING_DIM, HIDDEN_DIM)
+    w_up = rand_bf16(EMBEDDING_DIM, HIDDEN_DIM)
+    w_down = rand_bf16(HIDDEN_DIM, EMBEDDING_DIM)
+    out = XRTTensor((SEQ_LEN * EMBEDDING_DIM,), dtype=bfloat16)  # flat 1-D output buffer, unlike the 2-D inputs above
+
+    run(x, w_gate, w_up, w_down, out)  # warmup
+    start = time.perf_counter()
+    run(x, w_gate, w_up, w_down, out)
+    elapsed_us = (time.perf_counter() - start) * 1e6  # seconds -> microseconds; a single host wall-clock sample around run()
+    print(
+        f"SwiGLU-prefill {SEQ_LEN}x{EMBEDDING_DIM}x{HIDDEN_DIM} ran in {elapsed_us:.1f} us"
+    )
+
+
+if __name__ == "__main__":  # run the demo only when executed as a script, not on import
+    main()
@@ -18,3 +18,4 @@
     PythonGeneratedMLIRArtifact,
     DesignGenerator,
 )
+from .layout import Stride, TiledStride, TiledStridedLayout, tiled_2d
@@ -0,0 +1,107 @@
+# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tiled-strided memory layouts for IRON operators.
+
+A tiled-strided layout describes how a logical multi-dimensional tensor is laid
+out in memory as a hierarchy of tiles, each level carrying its own ``(step,
+bound)`` stride. It is the layout model AIE kernels are written against: a GEMM
+microkernel, for example, reads its ``MxK`` operand as ``mt x kt`` tiles of
+``r x s`` elements, which is exactly a two-level tiled-strided layout.
+
+The types here mirror ``snaxc.ir.tsl`` (``Stride`` -> ``TiledStride`` ->
+``TiledStridedLayout``) so an IRON-authored layout can be handed to stream-dse's
+code generation verbatim via :meth:`TiledStridedLayout.to_snaxc`. They carry no
+stream-dse / snaxc / xdsl dependency themselves -- the snaxc import is lazy and
+confined to ``to_snaxc`` -- so they are usable (and testable) in a plain IRON
+install with no AIE codegen toolchain present.
+
+This is a common primitive: it is meant to be shared across operators as the one
+place a kernel's operand layouts are defined, rather than re-derived per operator
+or hand-copied into stream-dse.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Stride:
+    """One stride level: ``bound`` elements spaced ``step`` apart.
+
+    ``step``/``bound`` may be ``None`` to denote a dynamic (run-time) value,
+    matching snaxc's convention.
+    """
+
+    step: int | None  # spacing between consecutive elements of this level; None = dynamic
+    bound: int | None  # number of elements at this level; None = dynamic
+
+
+@dataclass
+class TiledStride:  # mutable dataclass: __post_init__ assigns to self.strides, which a frozen dataclass would reject
+    """The strides of a single tensor dimension, outermost tile first.
+
+    A simple (untiled) dimension has one stride; one level of tiling has two
+    (the outer tile stride followed by the inner element stride), and so on.
+    """
+
+    strides: tuple[Stride, ...]  # ordered outermost tile level first
+
+    def __post_init__(self) -> None:
+        self.strides = tuple(self.strides)  # accept any iterable (e.g. a list) and normalise it to a tuple
+
+
+@dataclass
+class TiledStridedLayout:  # mutable dataclass: __post_init__ assigns to self.tstrides
+    """A tiled-strided layout: one :class:`TiledStride` per tensor dimension."""
+
+    tstrides: tuple[TiledStride, ...]  # one entry per tensor dimension, in dimension order
+    offset: int = 0  # forwarded unchanged to snaxc as offset=...; units are not stated here - TODO confirm
+
+    def __post_init__(self) -> None:
+        self.tstrides = tuple(self.tstrides)  # accept any iterable and normalise it to a tuple
+
+    def to_snaxc(self):
+        """Return the equivalent ``snaxc.ir.tsl.TiledStridedLayout``.
+
+        The snaxc import is deferred to here so this module stays usable without
+        the AIE codegen toolchain installed. Used to feed IRON-authored layouts
+        into stream-dse code generation.
+        """
+        from snaxc.ir.tsl import (  # function-scope import: fails with ImportError only when to_snaxc() is called without snaxc
+            Stride as SnaxStride,
+            TiledStride as SnaxTiledStride,
+            TiledStridedLayout as SnaxTiledStridedLayout,
+        )
+
+        return SnaxTiledStridedLayout(
+            [
+                SnaxTiledStride([SnaxStride(s.step, s.bound) for s in ts.strides])  # level-by-level copy, order preserved
+                for ts in self.tstrides
+            ],
+            offset=self.offset,
+        )
+
+
+def tiled_2d(rows: int, cols: int, row_unit: int, col_unit: int) -> TiledStridedLayout:
+    """Two-level tiled-strided layout for a ``rows x cols`` tensor.
+
+    The tensor is tiled into ``(rows // row_unit) x (cols // col_unit)`` tiles of
+    ``row_unit x col_unit`` elements, the tiles laid out row-major and each tile
+    stored row-major internally. This reproduces stream-dse's GEMM/elementwise
+    operand layouts (the intrinsic ``row_unit``/``col_unit`` are the kernel's MAC
+    tile dimensions).
+    """
+    rows_t, cols_t = rows // row_unit, cols // col_unit  # tile counts per dimension; NOTE(review): floor division silently drops any remainder - confirm callers pass exact multiples
+    return TiledStridedLayout(
+        (
+            TiledStride(
+                (
+                    Stride(row_unit * col_unit * cols_t, rows_t),  # row dim, outer level: step over one full row of tiles
+                    Stride(col_unit, row_unit),  # row dim, inner level: step over one row inside a tile
+                )
+            ),
+            TiledStride((Stride(row_unit * col_unit, cols_t), Stride(1, col_unit))),  # column dim: step over one whole tile, then unit step inside a tile row
+        )
+    )
@@ -2,7 +2,29 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import numpy as np
-from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor, xrt as _pyxrt
+
+try:
+    # XRT (pyxrt) is only present on a host with the NPU runtime installed.
+    # Import lazily so that pure-MLIR / introspection code paths (and their
+    # tests) can import iron.* without an NPU. XRTSubBuffer below only needs
+    # these symbols when actually instantiated, which requires the NPU anyway.
+    from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor, xrt as _pyxrt
+except ImportError:  # pragma: no cover - exercised only when XRT is absent
+
+    class XRTTensor:  # type: ignore[no-redef]
+        """Placeholder used when XRT/pyxrt is unavailable.
+
+        Instantiating it (i.e. attempting NPU work without XRT) fails loudly;
+        merely importing the module does not.
+        """
+
+        def __init__(self, *args, **kwargs):  # accepts any constructor arguments, then always raises
+            raise ImportError(
+                "XRTTensor requires pyxrt/XRT, which is not installed. "
+                "NPU runtime operations are unavailable in this environment."
+            )
+
+    _pyxrt = None  # keep the module-level name defined; it is None whenever the XRT import failed
 
 
 def get_shim_dma_limit(dev) -> int:

@@ -1,17 +1,48 @@
 # SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from .elementwise_add.op import ElementwiseAdd
-from .elementwise_mul.op import ElementwiseMul
-from .gemm.op import GEMM
-from .gemv.op import GEMV
-from .mha.op import MHA
-from .rms_norm.op import RMSNorm
-from .rope.op import RoPE
-from .silu.op import SiLU
-from .softmax.op import Softmax
-from .swiglu_decode.op import SwiGLUDecode
-from .swiglu_prefill.op import SwiGLUPrefill
-from .transpose.op import Transpose
-from .strided_copy.op import StridedCopy
-from .repeat.op import Repeat
+"""IRON operators.
+
+Operator classes are exposed lazily (PEP 562): they are imported on first
+attribute access rather than eagerly at package import time. This lets the
+package be imported on a host without the NPU runtime (XRT/pyxrt), while ``from
+iron.operators import GEMM`` continues to work unchanged when the runtime is
+available.
+"""
+
+import importlib
+
+# Public operator name -> defining submodule (relative to this package).
+_OPERATORS = {  # each key must equal the class name defined in its submodule: __getattr__ looks it up via getattr(module, name)
+    "ElementwiseAdd": ".elementwise_add.op",
+    "ElementwiseMul": ".elementwise_mul.op",
+    "GEMM": ".gemm.op",
+    "GEMV": ".gemv.op",
+    "MHA": ".mha.op",
+    "RMSNorm": ".rms_norm.op",
+    "RoPE": ".rope.op",
+    "SiLU": ".silu.op",
+    "Softmax": ".softmax.op",
+    "SwiGLUDecode": ".swiglu_decode.op",
+    "SwiGLUPrefill": ".swiglu_prefill.op",
+    "SwiGLUPrefillStream": ".swiglu_prefill_stream.op",
+    "Transpose": ".transpose.op",
+    "StridedCopy": ".strided_copy.op",
+    "Repeat": ".repeat.op",
+}
+
+__all__ = list(_OPERATORS)  # public names, in the dict's insertion order
+
+
+def __getattr__(name: str):  # PEP 562 module-level hook: invoked only when normal attribute lookup on the package fails
+    module_path = _OPERATORS.get(name)
+    if module_path is None:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")  # same exception type as a normal missing attribute, so hasattr() keeps working
+    module = importlib.import_module(module_path, __name__)  # relative import anchored at this package
+    attr = getattr(module, name)
+    globals()[name] = attr  # cache so subsequent access skips __getattr__
+    return attr
+
+
+def __dir__():  # list not-yet-imported operator names alongside the module's existing globals
+    return sorted(set(globals()) | set(_OPERATORS))
@@ -0,0 +1,53 @@
+<!--
+SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+-->
+
+# SwiGLU prefill (stream-dse codegen)
+
+This operator is **fused**: the whole SwiGLU-prefill block (both GEMMs + SiLU +
+elementwise-mul) is emitted as a **single MLIR design generated by
+[`stream-dse`](https://github.com/KULeuven-MICAS/stream)**, then compiled by IRON's normal
+flow into one xclbin. Unlike the other operators, its MLIR is not written by hand — it is
+produced at build time by [`stream_design.py`](./stream_design.py), which calls the installed
+`stream` package (`stream.api.optimize_allocation_co(..., enable_codegen=True)`).
+
+## Enabling stream codegen
+
+`stream-dse` is an **optional, separately-installed** dependency (it is *not* in IRON's
+`requirements.txt`). Install it into the **same environment** as IRON via the extra
+requirements file:
+
+```bash
+pip install -r requirements_stream.txt
+stream-setup-aie          # required: installs stream-dse's AIE codegen deps
+```
+
+Notes:
+- MLIR generation uses the open-source **OR-Tools GSCIP** solver (`backend="ortools_gscip"`),
+  so **no Gurobi license** is required.
+- `stream-setup-aie` is **required**: it installs the AIE codegen packages stream-dse needs
+  that cannot be plain PyPI dependencies (`snax-mlir`/`snaxc`, `xdsl-aie`, `aie-python-extras`),
+  since they are direct git/URL installs. It also installs the `mlir_aie` / `llvm-aie` wheels,
+  but skips those if IRON's `requirements.txt` already provided them.
+- Importing the operator does **not** require `stream-dse` (the launcher is imported lazily);
+  only **building** (`operator.compile()` / running the test) does.
+
+## Build & run
+
+```bash
+# build + run on an NPU2 (Strix) device
+source /opt/xilinx/xrt/setup.sh        # XRT on PATH (provides pyxrt + xclbinutil)
+pytest iron/operators/swiglu_prefill_stream/test.py
+```
+
+The feasible/verified shape is **seq 256 / embedding 512 / hidden 2048**, tiles
+**32 / 32 / 64**, target **npu2**.
+
+## Caveats (stream-dse packaging)
+
+- The hardware-description YAML (`whole_array_strix.yaml` + `hardware/cores/*.yaml`) is
+  resolved from the **installed `stream` package**, where it ships as package data
+  (stream-dse >= 1.13.3); nothing is vendored in this operator.
+- `stream-dse` writes its generated ONNX workload / mapping YAML **into its installed package
+  directory**, so the environment it is installed into must be writable (for example a virtualenv, not a read-only system install).