From 5103ce59cf6af3ab2c8e57168fcfd4fa04b19dae Mon Sep 17 00:00:00 2001
From: asyms <arne.symons@kuleuven.be>
Date: Thu, 18 Jun 2026 15:41:27 +0200
Subject: [PATCH 1/2] Add stream-dse-backed fused SwiGLU-prefill operator

SwiGLUPrefillStream compiles the whole SwiGLU-prefill block (gate/up GEMMs +
SiLU + elementwise-mul + down GEMM) as a single fused MLIR design generated by
stream-dse, producing one xclbin instead of chaining separately-compiled
sub-operators. The design is generated at build time by stream_design.py and
compiled through IRON's normal flow.

The fused design's per-kernel operand layouts (the tiled-strided DMA tiling) are
authored on the IRON side and fed into stream-dse code generation rather than
hand-copied inside stream: iron.common.layout provides a TiledStridedLayout type,
and swiglu_prefill_stream/stream_kernels.py injects IRON's layouts through
optimize_allocation_co(kernels=...) -- the override hook added in stream-dse
1.13.4 -- keeping stream's kernel construction and replacing only
operand_layouts().

stream-dse is an optional dependency (requirements_stream.txt); the operator's
test skips when it is absent. Importing iron.operators no longer requires an NPU
runtime (lazy XRT import), so the package loads on hosts without XRT/pyxrt.
Includes a minimal k=1 demo under demos/swiglu_prefill_stream/.
---
 .gitignore                                    |   1 +
 conftest.py                                   |  12 +-
 demos/swiglu_prefill_stream/demo.py           |  52 ++++++
 iron/common/__init__.py                       |   1 +
 iron/common/layout.py                         | 107 +++++++++++
 iron/common/utils.py                          |  24 ++-
 iron/operators/__init__.py                    |  59 ++++--
 .../operators/swiglu_prefill_stream/README.md |  53 ++++++
 iron/operators/swiglu_prefill_stream/op.py    | 176 ++++++++++++++++++
 .../swiglu_prefill_stream/stream_design.py    | 106 +++++++++++
 .../swiglu_prefill_stream/stream_kernels.py   |  94 ++++++++++
 iron/operators/swiglu_prefill_stream/test.py  | 103 ++++++++++
 requirements_stream.txt                       |  26 +++
 13 files changed, 798 insertions(+), 16 deletions(-)
 create mode 100644 demos/swiglu_prefill_stream/demo.py
 create mode 100644 iron/common/layout.py
 create mode 100644 iron/operators/swiglu_prefill_stream/README.md
 create mode 100644 iron/operators/swiglu_prefill_stream/op.py
 create mode 100644 iron/operators/swiglu_prefill_stream/stream_design.py
 create mode 100644 iron/operators/swiglu_prefill_stream/stream_kernels.py
 create mode 100644 iron/operators/swiglu_prefill_stream/test.py
 create mode 100644 requirements_stream.txt

diff --git a/.gitignore b/.gitignore
index c2e66af8..bdffa86e 100755
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ __pycache__
 build/*
 **/_build/**
 **/build/**
+/outputs/
 *.exe
 *.csv
 secret_github_token
diff --git a/conftest.py b/conftest.py
index 564d8563..6408c503 100644
--- a/conftest.py
+++ b/conftest.py
@@ -164,7 +164,17 @@ def pytest_configure(config):
 
 
 def pytest_collection_modifyitems(config, items):
-    device = aie_utils.DefaultNPURuntime.device().resolve().name
+    # Resolve the active NPU device for device-gating. On a host without an NPU
+    # runtime, aie_utils.DefaultNPURuntime is None; in that case there is no
+    # device to gate against, so skip the supported_devices filtering entirely.
+    # Hermetic, NPU-free tests (no supported_devices marker) are unaffected.
+    runtime = aie_utils.DefaultNPURuntime
+    if runtime is None:
+        return
+    npu_device = runtime.device()
+    if npu_device is None:
+        return
+    device = npu_device.resolve().name
     for item in items:
         marker = item.get_closest_marker("supported_devices")
         if marker and device not in marker.args:
diff --git a/demos/swiglu_prefill_stream/demo.py b/demos/swiglu_prefill_stream/demo.py
new file mode 100644
index 00000000..0796f300
--- /dev/null
+++ b/demos/swiglu_prefill_stream/demo.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Minimal demo: build and run the stream-dse-backed fused SwiGLU-prefill operator.
+
+stream-dse generates one fused MLIR design for the whole SwiGLU-prefill block;
+IRON compiles it to an xclbin and runs it on the NPU (a warmup run, then a timed
+run). Requires stream-dse (see requirements_stream.txt) and an npu2 device.
+
+    python demos/swiglu_prefill_stream/demo.py
+"""
+
+import time
+
+import torch
+from ml_dtypes import bfloat16
+
+from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor
+from iron.operators.swiglu_prefill_stream.op import SwiGLUPrefillStream
+
+SEQ_LEN, EMBEDDING_DIM, HIDDEN_DIM = 256, 512, 2048
+
+
+def rand_bf16(*shape: int) -> XRTTensor:
+    return XRTTensor.from_torch(torch.randn(*shape, dtype=torch.bfloat16))
+
+
+def main() -> None:
+    op = SwiGLUPrefillStream(
+        seq_len=SEQ_LEN, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM
+    )
+    op.compile()
+    run = op.get_callable()
+
+    x = rand_bf16(SEQ_LEN, EMBEDDING_DIM)
+    w_gate = rand_bf16(EMBEDDING_DIM, HIDDEN_DIM)
+    w_up = rand_bf16(EMBEDDING_DIM, HIDDEN_DIM)
+    w_down = rand_bf16(HIDDEN_DIM, EMBEDDING_DIM)
+    out = XRTTensor((SEQ_LEN * EMBEDDING_DIM,), dtype=bfloat16)
+
+    run(x, w_gate, w_up, w_down, out)  # warmup
+    start = time.perf_counter()
+    run(x, w_gate, w_up, w_down, out)
+    elapsed_us = (time.perf_counter() - start) * 1e6
+    print(
+        f"SwiGLU-prefill {SEQ_LEN}x{EMBEDDING_DIM}x{HIDDEN_DIM} ran in {elapsed_us:.1f} us"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/iron/common/__init__.py b/iron/common/__init__.py
index 9f5a8f7a..cb2ff31b 100644
--- a/iron/common/__init__.py
+++ b/iron/common/__init__.py
@@ -18,3 +18,4 @@
     PythonGeneratedMLIRArtifact,
     DesignGenerator,
 )
+from .layout import Stride, TiledStride, TiledStridedLayout, tiled_2d
diff --git a/iron/common/layout.py b/iron/common/layout.py
new file mode 100644
index 00000000..09770524
--- /dev/null
+++ b/iron/common/layout.py
@@ -0,0 +1,107 @@
+# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tiled-strided memory layouts for IRON operators.
+
+A tiled-strided layout describes how a logical multi-dimensional tensor is laid
+out in memory as a hierarchy of tiles, each level carrying its own ``(step,
+bound)`` stride. It is the layout model AIE kernels are written against: a GEMM
+microkernel, for example, reads its ``MxK`` operand as ``mt x kt`` tiles of
+``r x s`` elements, which is exactly a two-level tiled-strided layout.
+
+The types here mirror ``snaxc.ir.tsl`` (``Stride`` -> ``TiledStride`` ->
+``TiledStridedLayout``) so an IRON-authored layout can be handed to stream-dse's
+code generation verbatim via :meth:`TiledStridedLayout.to_snaxc`. They carry no
+stream-dse / snaxc / xdsl dependency themselves -- the snaxc import is lazy and
+confined to ``to_snaxc`` -- so they are usable (and testable) in a plain IRON
+install with no AIE codegen toolchain present.
+
+This is a common primitive: it is meant to be shared across operators as the one
+place a kernel's operand layouts are defined, rather than re-derived per operator
+or hand-copied into stream-dse.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Stride:
+    """One stride level: ``bound`` elements spaced ``step`` apart.
+
+    ``step``/``bound`` may be ``None`` to denote a dynamic (run-time) value,
+    matching snaxc's convention.
+    """
+
+    step: int | None
+    bound: int | None
+
+
+@dataclass
+class TiledStride:
+    """The strides of a single tensor dimension, outermost tile first.
+
+    A simple (untiled) dimension has one stride; one level of tiling has two
+    (the outer tile stride followed by the inner element stride), and so on.
+    """
+
+    strides: tuple[Stride, ...]
+
+    def __post_init__(self) -> None:
+        self.strides = tuple(self.strides)
+
+
+@dataclass
+class TiledStridedLayout:
+    """A tiled-strided layout: one :class:`TiledStride` per tensor dimension."""
+
+    tstrides: tuple[TiledStride, ...]
+    offset: int = 0
+
+    def __post_init__(self) -> None:
+        self.tstrides = tuple(self.tstrides)
+
+    def to_snaxc(self):
+        """Return the equivalent ``snaxc.ir.tsl.TiledStridedLayout``.
+
+        The snaxc import is deferred to here so this module stays usable without
+        the AIE codegen toolchain installed. Used to feed IRON-authored layouts
+        into stream-dse code generation.
+        """
+        from snaxc.ir.tsl import (
+            Stride as SnaxStride,
+            TiledStride as SnaxTiledStride,
+            TiledStridedLayout as SnaxTiledStridedLayout,
+        )
+
+        return SnaxTiledStridedLayout(
+            [
+                SnaxTiledStride([SnaxStride(s.step, s.bound) for s in ts.strides])
+                for ts in self.tstrides
+            ],
+            offset=self.offset,
+        )
+
+
+def tiled_2d(rows: int, cols: int, row_unit: int, col_unit: int) -> TiledStridedLayout:
+    """Two-level tiled-strided layout for a ``rows x cols`` tensor.
+
+    Tiles are ``row_unit x col_unit`` (the kernel's intrinsic MAC tile dims), laid
+    out row-major as ``(rows // row_unit) x (cols // col_unit)`` tiles, each tile
+    stored row-major internally; this reproduces stream-dse's GEMM/elementwise
+    operand layouts. ``rows``/``cols`` are assumed to be exact multiples of the
+    units: the floor division silently drops any remainder (not validated).
+    """
+    rows_t, cols_t = rows // row_unit, cols // col_unit
+    return TiledStridedLayout(
+        (
+            TiledStride(
+                (
+                    Stride(row_unit * col_unit * cols_t, rows_t),
+                    Stride(col_unit, row_unit),
+                )
+            ),
+            TiledStride((Stride(row_unit * col_unit, cols_t), Stride(1, col_unit))),
+        )
+    )
diff --git a/iron/common/utils.py b/iron/common/utils.py
index e8d4a1f6..778acdea 100644
--- a/iron/common/utils.py
+++ b/iron/common/utils.py
@@ -2,7 +2,29 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import numpy as np
-from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor, xrt as _pyxrt
+
+try:
+    # XRT (pyxrt) is only present on a host with the NPU runtime installed.
+    # Import lazily so that pure-MLIR / introspection code paths (and their
+    # tests) can import iron.* without an NPU. XRTSubBuffer below only needs
+    # these symbols when actually instantiated, which requires the NPU anyway.
+    from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor, xrt as _pyxrt
+except ImportError:  # pragma: no cover - exercised only when XRT is absent
+
+    class XRTTensor:  # type: ignore[no-redef]
+        """Placeholder used when XRT/pyxrt is unavailable.
+
+        Instantiating it (i.e. attempting NPU work without XRT) fails loudly;
+        merely importing the module does not.
+        """
+
+        def __init__(self, *args, **kwargs):
+            raise ImportError(
+                "XRTTensor requires pyxrt/XRT, which is not installed. "
+                "NPU runtime operations are unavailable in this environment."
+            )
+
+    _pyxrt = None
 
 
 def get_shim_dma_limit(dev) -> int:
diff --git a/iron/operators/__init__.py b/iron/operators/__init__.py
index 4a6c5604..ce7019e3 100644
--- a/iron/operators/__init__.py
+++ b/iron/operators/__init__.py
@@ -1,17 +1,48 @@
 # SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-from .elementwise_add.op import ElementwiseAdd
-from .elementwise_mul.op import ElementwiseMul
-from .gemm.op import GEMM
-from .gemv.op import GEMV
-from .mha.op import MHA
-from .rms_norm.op import RMSNorm
-from .rope.op import RoPE
-from .silu.op import SiLU
-from .softmax.op import Softmax
-from .swiglu_decode.op import SwiGLUDecode
-from .swiglu_prefill.op import SwiGLUPrefill
-from .transpose.op import Transpose
-from .strided_copy.op import StridedCopy
-from .repeat.op import Repeat
+"""IRON operators.
+
+Operator classes are exposed lazily (PEP 562): they are imported on first
+attribute access rather than eagerly at package import time. This lets the
+package be imported on a host without the NPU runtime (XRT/pyxrt), while ``from
+iron.operators import GEMM`` continues to work unchanged when the runtime is
+available.
+"""
+
+import importlib
+
+# Public operator name -> defining submodule (relative to this package).
+_OPERATORS = {
+    "ElementwiseAdd": ".elementwise_add.op",
+    "ElementwiseMul": ".elementwise_mul.op",
+    "GEMM": ".gemm.op",
+    "GEMV": ".gemv.op",
+    "MHA": ".mha.op",
+    "RMSNorm": ".rms_norm.op",
+    "RoPE": ".rope.op",
+    "SiLU": ".silu.op",
+    "Softmax": ".softmax.op",
+    "SwiGLUDecode": ".swiglu_decode.op",
+    "SwiGLUPrefill": ".swiglu_prefill.op",
+    "SwiGLUPrefillStream": ".swiglu_prefill_stream.op",
+    "Transpose": ".transpose.op",
+    "StridedCopy": ".strided_copy.op",
+    "Repeat": ".repeat.op",
+}
+
+__all__ = list(_OPERATORS)
+
+
+def __getattr__(name: str):
+    module_path = _OPERATORS.get(name)
+    if module_path is None:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+    module = importlib.import_module(module_path, __name__)
+    attr = getattr(module, name)
+    globals()[name] = attr  # cache so subsequent access skips __getattr__
+    return attr
+
+
+def __dir__():
+    return sorted(set(globals()) | set(_OPERATORS))
diff --git a/iron/operators/swiglu_prefill_stream/README.md b/iron/operators/swiglu_prefill_stream/README.md
new file mode 100644
index 00000000..2b7c779c
--- /dev/null
+++ b/iron/operators/swiglu_prefill_stream/README.md
@@ -0,0 +1,53 @@
+<!--
+SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+SPDX-License-Identifier: Apache-2.0
+-->
+
+# SwiGLU prefill (stream-dse codegen)
+
+This operator is **fused**: the whole SwiGLU-prefill block (gate/up GEMMs + SiLU +
+elementwise-mul + down GEMM) is emitted as a **single MLIR design generated by
+[`stream-dse`](https://github.com/KULeuven-MICAS/stream)**, then compiled by IRON's normal
+flow into one xclbin. Unlike the other operators, its MLIR is not written by hand — it is
+produced at build time by [`stream_design.py`](./stream_design.py), which calls the installed
+`stream` package (`stream.api.optimize_allocation_co(..., enable_codegen=True)`).
+
+## Enabling stream codegen
+
+`stream-dse` is an **optional, separately-installed** dependency (it is *not* in IRON's
+`requirements.txt`). Install it into the **same environment** as IRON via the extra
+requirements file:
+
+```bash
+pip install -r requirements_stream.txt
+stream-setup-aie          # required: installs stream-dse's AIE codegen deps
+```
+
+Notes:
+- MLIR generation uses the open-source **OR-Tools GSCIP** solver (`backend="ortools_gscip"`),
+  so **no Gurobi license** is required.
+- `stream-setup-aie` is **required**: it installs the AIE codegen packages stream-dse needs
+  that cannot be plain PyPI dependencies (`snax-mlir`/`snaxc`, `xdsl-aie`, `aie-python-extras`),
+  since they are direct git/URL installs. It also installs the `mlir_aie` / `llvm-aie` wheels,
+  but skips those if IRON's `requirements.txt` already provided them.
+- Importing the operator does **not** require `stream-dse` (the launcher is imported lazily);
+  only **building** (`operator.compile()` / running the test) does.
+
+## Build & run
+
+```bash
+# build + run on an NPU2 (Strix) device
+source /opt/xilinx/xrt/setup.sh        # XRT on PATH (provides pyxrt + xclbinutil)
+pytest iron/operators/swiglu_prefill_stream/test.py
+```
+
+The feasible/verified shape is **seq 256 / embedding 512 / hidden 2048**, tiles
+**32 / 32 / 64**, target **npu2**.
+
+## Caveats (stream-dse packaging)
+
+- The hardware-description YAML (`whole_array_strix.yaml` + `hardware/cores/*.yaml`) is
+  resolved from the **installed `stream` package**, where it ships as package data
+  (stream-dse >= 1.13.3); nothing is vendored in this operator.
+- `stream-dse` writes its generated ONNX workload / mapping YAML **into its installed package
+  directory**, so that environment must be writable.
diff --git a/iron/operators/swiglu_prefill_stream/op.py b/iron/operators/swiglu_prefill_stream/op.py
new file mode 100644
index 00000000..3d1d40ae
--- /dev/null
+++ b/iron/operators/swiglu_prefill_stream/op.py
@@ -0,0 +1,176 @@
+# SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import dataclass, field
+from typing import Any
+
+import aie.utils as aie_utils
+
+from iron.common import (
+    MLIROperator,
+    AIERuntimeArgSpec,
+    KernelObjectArtifact,
+    SourceArtifact,
+    PythonGeneratedMLIRArtifact,
+    DesignGenerator,
+)
+from iron.common.device_utils import get_kernel_dir
+
+
+@dataclass
+class SwiGLUPrefillStream(MLIROperator):
+    """Fused SwiGLU-prefill block whose MLIR is generated by stream-dse.
+
+    Unlike the composite ``SwiGLUPrefill`` (which chains four separately compiled
+    sub-operators and orchestrates them from the host), this operator compiles a
+    single fused MLIR design produced by stream-dse: one xclbin with a single
+    runtime sequence taking ``(input, weights_1, weights_2, weights_3, output)``.
+
+    The MLIR is produced lazily at compile time by ``stream_design.py``, which
+    calls the installed ``stream-dse`` package. Building therefore requires
+    ``stream-dse`` (``pip install stream-dse`` + ``stream-setup-aie``); merely
+    importing this operator does not.
+    """
+
+    seq_len: int
+    embedding_dim: int
+    hidden_dim: int
+    seq_len_tile_size: int = 32
+    embedding_tile_size: int = 32
+    hidden_tile_size: int = 64
+    last_gemm_down: bool = True
+    in_dtype: str = field(default="bf16", repr=False)
+    out_dtype: str = field(default="bf16", repr=False)
+    trace_size: int = field(default=0, repr=False)
+    rows: int = field(default=4, repr=False)
+    num_aie_columns: int = field(default=8, repr=False)
+    backend: str = field(default="ortools_gscip", repr=False)
+    # Weights are runtime data supplied by the caller after construction; they
+    # are not needed to build the design.
+    weights_1: Any = field(default=None, repr=False, compare=False)
+    weights_2: Any = field(default=None, repr=False, compare=False)
+    weights_3: Any = field(default=None, repr=False, compare=False)
+    context: Any = field(default=None, repr=False, compare=False)
+
+    def __post_init__(self):
+        MLIROperator.__init__(self, context=self.context)
+
+    def get_mlir_artifact(self):
+        npu = aie_utils.get_current_device().resolve().name
+        return PythonGeneratedMLIRArtifact(
+            f"{self.name}.mlir",
+            DesignGenerator(
+                self.operator_dir / "stream_design.py",
+                "run_main_aie_codegen_swiglu",
+                (),
+                {
+                    "seq_len": self.seq_len,
+                    "embedding_dim": self.embedding_dim,
+                    "hidden_dim": self.hidden_dim,
+                    "in_dtype": self.in_dtype,
+                    "out_dtype": self.out_dtype,
+                    "trace_size": self.trace_size,
+                    "rows": self.rows,
+                    "cols": self.num_aie_columns,
+                    "npu": npu,
+                    "seq_len_tile_size": self.seq_len_tile_size,
+                    "embedding_tile_size": self.embedding_tile_size,
+                    "hidden_tile_size": self.hidden_tile_size,
+                    "last_gemm_down": self.last_gemm_down,
+                    "backend": self.backend,
+                },
+            ),
+        )
+
+    def _mm_kernel(self, tile_m, tile_k, tile_n):
+        """Compile the shared ``mm.cc`` kernel for one tile configuration.
+
+        stream-dse emits dimension-suffixed kernel symbols (e.g.
+        ``matmul_bf16_bf16_32_32_64``) so the gate/up and down GEMMs -- which use
+        different tile shapes -- can coexist in one fused design. Upstream
+        ``mm.cc`` emits unsuffixed ``matmul_bf16_bf16`` / ``zero_bf16``, so we
+        rename them to match the names the generated MLIR calls.
+        """
+        base_dir = self.context.base_dir
+        kernel_dir = get_kernel_dir()
+        suffix = f"{tile_m}_{tile_k}_{tile_n}"
+        return KernelObjectArtifact(
+            f"mm_{suffix}.o",
+            dependencies=[
+                SourceArtifact(base_dir / "aie_kernels" / kernel_dir / "mm.cc")
+            ],
+            extra_flags=[
+                f"-DDIM_M={tile_m}",
+                f"-DDIM_K={tile_k}",
+                f"-DDIM_N={tile_n}",
+                "-Dbf16_bf16_ONLY",
+            ],
+            rename_symbols={
+                "matmul_bf16_bf16": f"matmul_bf16_bf16_{suffix}",
+                "zero_bf16": f"zero_bf16_{suffix}",
+            },
+        )
+
+    def get_kernel_artifacts(self):
+        base_dir = self.context.base_dir
+        kernel_dir = get_kernel_dir()
+        kernels = [
+            # gate/up projections: input (seq, embedding) @ W (embedding, hidden)
+            self._mm_kernel(
+                self.seq_len_tile_size,
+                self.embedding_tile_size,
+                self.hidden_tile_size,
+            ),
+            KernelObjectArtifact(
+                "silu.o",
+                dependencies=[
+                    SourceArtifact(base_dir / "aie_kernels" / kernel_dir / "silu.cc")
+                ],
+            ),
+            KernelObjectArtifact(
+                "mul.o",
+                dependencies=[
+                    SourceArtifact(base_dir / "aie_kernels" / "generic" / "mul.cc")
+                ],
+            ),
+        ]
+        if self.last_gemm_down:
+            # down projection: intermediate (seq, hidden) @ W (hidden, embedding)
+            kernels.insert(
+                1,
+                self._mm_kernel(
+                    self.seq_len_tile_size,
+                    self.hidden_tile_size,
+                    self.embedding_tile_size,
+                ),
+            )
+        return kernels
+
+    def set_up_artifacts(self):
+        # stream-dse emits a placed/routed design that links a distinct kernel
+        # object per core, so it needs --dynamic-objFifos (both targets) and
+        # --no-unified (separate per-core compilation; xclbin only).
+        xclbin_artifact, insts_artifact = self.get_artifacts(dynamic_obj_fifos=True)
+        xclbin_artifact.extra_flags.append("--no-unified")
+        self.xclbin_artifact = xclbin_artifact
+        self.insts_artifact = insts_artifact
+        self.add_artifacts([xclbin_artifact, insts_artifact])
+
+    def get_arg_spec(self):
+        specs = [
+            AIERuntimeArgSpec("in", (self.seq_len, self.embedding_dim)),  # input
+            AIERuntimeArgSpec(
+                "in", (self.embedding_dim, self.hidden_dim)
+            ),  # weights_1 (gate)
+            AIERuntimeArgSpec(
+                "in", (self.embedding_dim, self.hidden_dim)
+            ),  # weights_2 (up)
+        ]
+        if self.last_gemm_down:
+            specs.append(
+                AIERuntimeArgSpec("in", (self.hidden_dim, self.embedding_dim))
+            )  # weights_3 (down)
+            specs.append(AIERuntimeArgSpec("out", (self.seq_len, self.embedding_dim)))
+        else:
+            specs.append(AIERuntimeArgSpec("out", (self.seq_len, self.hidden_dim)))
+        return specs
diff --git a/iron/operators/swiglu_prefill_stream/stream_design.py b/iron/operators/swiglu_prefill_stream/stream_design.py
new file mode 100644
index 00000000..9ae37f7f
--- /dev/null
+++ b/iron/operators/swiglu_prefill_stream/stream_design.py
@@ -0,0 +1,106 @@
+# SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Stream-dse MLIR generation launcher for the fused SwiGLU-prefill operator.
+
+This is the in-IRON replacement for the previously hardcoded
+``/home/micas/stream_aie/main_swiglu.py`` entry point. It calls the *installed*
+``stream-dse`` package (``pip install stream-dse`` followed by ``stream-setup-aie``)
+to produce a single fused MLIR module for the whole SwiGLU-prefill block, which
+IRON then compiles into an xclbin/insts pair.
+
+The function signature mirrors ``run_main_aie_codegen_swiglu`` from stream-dse's
+``scripts/main_swiglu.py`` reference entry point. Because ``scripts/`` is not
+shipped in the stream-dse wheel, that logic is vendored here; the hardware-
+description YAML is resolved from the installed ``stream`` package, where it ships
+as package data (stream-dse >= 1.13.3).
+
+This module is imported lazily (by ``DesignGenerator`` at compile time), so
+importing the operator does not require ``stream-dse`` to be installed -- only
+building it does.
+"""
+
+import os
+import re
+
+import stream
+from stream.api import optimize_allocation_co
+from stream.inputs.aie.mapping.make_swiglu_mapping import make_swiglu_mapping
+from stream.inputs.aie.workload.make_onnx_swiglu import make_swiglu_workload
+
+from iron.operators.swiglu_prefill_stream.stream_kernels import iron_kernels
+
+# Hardware description for the whole-array Strix (npu2) target, shipped as package
+# data inside the installed stream package (stream-dse >= 1.13.3).
+_ACCELERATOR = os.path.join(
+    os.path.dirname(stream.__file__),
+    "inputs",
+    "aie",
+    "hardware",
+    "whole_array_strix.yaml",
+)
+
+
+def run_main_aie_codegen_swiglu(
+    seq_len,
+    embedding_dim,
+    hidden_dim,
+    in_dtype="bf16",
+    out_dtype="bf16",
+    trace_size=0,
+    rows=4,
+    cols=8,
+    npu="npu2",
+    seq_len_tile_size=32,
+    embedding_tile_size=32,
+    hidden_tile_size=64,
+    last_gemm_down=True,
+    backend="ortools_gscip",
+):
+    """Generate the fused SwiGLU-prefill MLIR module via stream-dse.
+
+    Returns the (xdsl) MLIR module; ``str(module)`` yields the textual MLIR that
+    IRON's ``PythonGeneratedMLIRArtifact`` writes to disk and compiles.
+
+    The default ``ortools_gscip`` backend is the license-free OR-Tools GSCIP
+    solver, so no Gurobi license is required.
+    """
+    workload_path = make_swiglu_workload(
+        seq_len,
+        embedding_dim,
+        hidden_dim,
+        in_dtype,
+        out_dtype,
+        last_gemm_down=last_gemm_down,
+    )
+    mapping_path = make_swiglu_mapping(
+        seq_len,
+        embedding_dim,
+        hidden_dim,
+        last_gemm_down,
+        seq_len_tile_size,
+        embedding_tile_size,
+        hidden_tile_size,
+    )
+
+    hw_name = os.path.splitext(os.path.basename(_ACCELERATOR))[0]
+    wl_name = re.split(r"/|\.", workload_path)[-1]
+    if wl_name == "onnx":
+        wl_name = re.split(r"/|\.", workload_path)[-2]
+    experiment_id = f"{hw_name}-{wl_name}-{rows}_row_{cols}_col"
+
+    ctx = optimize_allocation_co(
+        hardware=_ACCELERATOR,
+        workload=workload_path,
+        mapping=mapping_path,
+        experiment_id=experiment_id,
+        output_path="outputs",
+        skip_if_exists=False,
+        enable_codegen=True,
+        trace_size=trace_size,
+        nb_cols_to_use=cols,
+        npu=npu,
+        backend=backend,
+        kernels=iron_kernels(),  # IRON-authored operand layouts drive the DMA tiling
+    )
+    return ctx.get("module")
diff --git a/iron/operators/swiglu_prefill_stream/stream_kernels.py b/iron/operators/swiglu_prefill_stream/stream_kernels.py
new file mode 100644
index 00000000..17bb57aa
--- /dev/null
+++ b/iron/operators/swiglu_prefill_stream/stream_kernels.py
@@ -0,0 +1,94 @@
+# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""IRON-authored operand layouts for the stream-dse SwiGLU-prefill kernels.
+
+stream-dse selects an AIE kernel per computation node and uses each kernel's
+``operand_layouts()`` to drive the DMA tiling emitted into the design MLIR.
+:func:`iron_kernels` returns the ``optimize_allocation_co(kernels=...)`` override
+that keeps every kernel stream would build but replaces its operand layouts with
+the ones defined here -- the single source of truth -- converted to stream's
+tiled-strided layout via :meth:`iron.common.TiledStridedLayout.to_snaxc`. IRON
+owns the layouts; stream owns construction, symbol names and the MLIR rewrite.
+
+Each override is stream's own kernel, re-typed to a subclass that overrides only
+``operand_layouts()``: the kernel is still built by stream's ``AIEKernels``
+factory (so its constructor signature is inherited, not re-declared here), then
+its ``__class__`` is swapped. The subclasses are module-level so the kernels stay
+picklable (stream stores them on the mapping).
+
+stream / snaxc / xdsl are imported at module load, so this module is only
+importable where the AIE codegen toolchain is installed; it is imported only from
+``stream_design.py``.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Callable
+
+from stream.compiler.kernels.eltwise_mul import EltwiseMulKernel
+from stream.compiler.kernels.gemm import GemmKernel
+from stream.compiler.kernels.silu import SiluKernel
+
+from iron.common import TiledStridedLayout, tiled_2d
+
+# Intrinsic MAC tile dimensions of the aie2p kernels stream-dse targets; the
+# operand layouts below are the contract the generated DMAs and the compiled
+# kernel objects must agree on.
+R, S, T = 4, 8, 8
+
+
+def _gemm_layouts(m: int, k: int, n: int) -> tuple[TiledStridedLayout, ...]:
+    return (tiled_2d(m, k, R, S), tiled_2d(k, n, S, T), tiled_2d(m, n, R, T))
+
+
+def _elementwise_layouts(
+    count: int, tile: tuple[int, int] = (32, 64)
+) -> tuple[TiledStridedLayout, ...]:
+    return (tiled_2d(*tile, R, T),) * count
+
+
+class _IronGemmKernel(GemmKernel):
+    def operand_layouts(self):
+        return [tsl.to_snaxc() for tsl in _gemm_layouts(self.m, self.k, self.n)]
+
+
+class _IronSiluKernel(SiluKernel):
+    def operand_layouts(self):
+        return [tsl.to_snaxc() for tsl in _elementwise_layouts(2)]
+
+
+class _IronEltwiseMulKernel(EltwiseMulKernel):
+    def operand_layouts(self):
+        return [tsl.to_snaxc() for tsl in _elementwise_layouts(3)]
+
+
+# stream AIEKernels name -> IRON subclass overriding operand_layouts().
+_OVERRIDES: dict[str, type] = {
+    "gemm": _IronGemmKernel,
+    "silu": _IronSiluKernel,
+    "eltwise_mul": _IronEltwiseMulKernel,
+}
+
+
+def iron_kernels() -> dict[str, Callable[..., Any]]:
+    """Return the ``optimize_allocation_co(kernels=...)`` override registry.
+
+    Only kernels for which IRON defines layouts are overridden; any other kernel
+    stream needs falls through to its built-in ``AIEKernels`` entry.
+    """
+    from stream.compiler.kernels import AIEKernels
+
+    def override(factory: Callable[..., Any], cls: type) -> Callable[..., Any]:
+        def make(*args: Any, **kwargs: Any) -> Any:
+            kernel = factory(*args, **kwargs)
+            kernel.__class__ = cls
+            return kernel
+
+        return make
+
+    return {
+        name: override(AIEKernels[name], cls)
+        for name, cls in _OVERRIDES.items()
+        if name in AIEKernels
+    }
diff --git a/iron/operators/swiglu_prefill_stream/test.py b/iron/operators/swiglu_prefill_stream/test.py
new file mode 100644
index 00000000..05b953f3
--- /dev/null
+++ b/iron/operators/swiglu_prefill_stream/test.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import time
+import pytest
+
+# The fused design is generated by stream-dse at compile() time. stream-dse is an
+# optional dependency (see requirements_stream.txt) that requirements.txt does
+# not pull in, so skip this whole module when it is unavailable.
+pytest.importorskip(
+    "stream", reason="stream-dse not installed (see requirements_stream.txt)"
+)
+
+from ml_dtypes import bfloat16
+from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor
+from iron.operators.swiglu_prefill_stream.op import SwiGLUPrefillStream
+
+# swiglu_prefill_stream shares the same reference implementation as swiglu_decode:
+# both compute W3 @ (SiLU(W1 @ x) * (W2 @ x)). Prefill operates on a full
+# sequence (M > 1); the stream variant fuses the whole block into one design.
+from iron.operators.swiglu_decode.reference import generate_golden_reference
+from iron.common.test_utils import verify_buffer
+
+
+def get_params():
+    # (seq_len, embedding_dim, hidden_dim, seq_tile, embedding_tile, hidden_tile)
+    # The 256/512/2048 config with 32/32/64 tiling is the MILP-feasible shape on
+    # the whole-array Strix (npu2) target.
+    params_list = [(256, 512, 2048, 32, 32, 64)]
+    return [pytest.param(*p) for p in params_list]
+
+
+@pytest.mark.supported_devices("npu2")
+@pytest.mark.metrics(
+    Latency=r"Latency \(us\): (?P<value>[\d\.]+)",
+    Bandwidth=r"Effective Bandwidth: (?P<value>[\d\.e\+-]+) GB/s",
+)
+@pytest.mark.parametrize(
+    "seq_len,embedding_dim,hidden_dim,seq_tile,embedding_tile,hidden_tile", get_params()
+)
+def test_swiglu_prefill_stream(
+    seq_len,
+    embedding_dim,
+    hidden_dim,
+    seq_tile,
+    embedding_tile,
+    hidden_tile,
+    aie_context,
+):
+    golden_ref = generate_golden_reference(M=seq_len, K=embedding_dim, N=hidden_dim)
+
+    operator = SwiGLUPrefillStream(
+        seq_len=seq_len,
+        embedding_dim=embedding_dim,
+        hidden_dim=hidden_dim,
+        seq_len_tile_size=seq_tile,
+        embedding_tile_size=embedding_tile,
+        hidden_tile_size=hidden_tile,
+        context=aie_context,
+    )
+    # The stream design consumes weights in their natural (K, N) layout, so no
+    # transpose is applied (the composite SwiGLUPrefill transposes; this one does
+    # not).
+    operator.weights_1 = golden_ref["w_gate"]
+    operator.weights_2 = golden_ref["w_up"]
+    operator.weights_3 = golden_ref["w_down"]
+
+    operator.compile()
+    op_func = operator.get_callable()
+
+    input_buf = XRTTensor.from_torch(golden_ref["input"])
+    w1_buf = XRTTensor.from_torch(operator.weights_1)
+    w2_buf = XRTTensor.from_torch(operator.weights_2)
+    w3_buf = XRTTensor.from_torch(operator.weights_3)
+    output_buf = XRTTensor((seq_len * embedding_dim,), dtype=bfloat16)
+
+    # Warmup
+    op_func(input_buf, w1_buf, w2_buf, w3_buf, output_buf)
+
+    start = time.perf_counter()
+    op_func(input_buf, w1_buf, w2_buf, w3_buf, output_buf)
+    elapsed_us = (time.perf_counter() - start) * 1e6
+
+    total_bytes = input_buf.buffer_object().size() + output_buf.buffer_object().size()
+    bandwidth_gbps = total_bytes / (elapsed_us * 1e-6) / 1e9
+    print(f"Latency (us): {elapsed_us:.2f}")
+    print(f"Effective Bandwidth: {bandwidth_gbps:.4f} GB/s")
+
+    # SwiGLU chains several bf16 kernels, so rounding error accumulates; verify
+    # the final output against the float reference with relaxed tolerances and a
+    # small allowance for precision outliers.
+    output = output_buf.to_torch().reshape((seq_len, embedding_dim))
+    errors = verify_buffer(
+        output,
+        "output",
+        golden_ref["output"],
+        rel_tol=0.08,
+        abs_tol=0.7,
+        max_error_rate=0.05,
+    )
+
+    assert not errors, f"Test failed with errors: {errors}"
diff --git a/requirements_stream.txt b/requirements_stream.txt
new file mode 100644
index 00000000..b0a18e34
--- /dev/null
+++ b/requirements_stream.txt
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Optional dependency for the stream-dse-backed fused SwiGLU-prefill operator
+# (iron/operators/swiglu_prefill_stream).
+#
+# It is NOT pulled in by the default requirements.txt; the operator's test
+# skips itself (pytest.importorskip) when stream-dse is absent. Install this file
+# to build/run the operator and its test:
+#
+#     pip install -r requirements_stream.txt
+#     stream-setup-aie   # REQUIRED: installs stream-dse's AIE codegen deps that
+#                        # cannot be PyPI dependencies (snax-mlir/snaxc, xdsl-aie,
+#                        # aie-python-extras); also installs the mlir_aie/llvm-aie
+#                        # wheels, skipping any already provided by requirements.txt.
+#
+# Notes:
+# - stream-dse generates the fused MLIR design at build time (license-free
+#   OR-Tools GSCIP solver; no Gurobi needed) and writes its generated workload/
+#   mapping files into its own installed package directory, so that environment
+#   must be writable.
+# - >=1.13.4 is required: stream_design.py feeds IRON-authored operand layouts
+#   into code generation via optimize_allocation_co(kernels=...), the override
+#   hook added in stream-dse 1.13.4.
+
+stream-dse>=1.13.4

From 96bbfd8927ce24c698fb8a5d4fa961b26908b23c Mon Sep 17 00:00:00 2001
From: asyms <arne.symons@kuleuven.be>
Date: Wed, 24 Jun 2026 09:11:03 +0200
Subject: [PATCH 2/2] add stream requirements to prereqs action

---
 .github/actions/prereqs/action.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/prereqs/action.yaml b/.github/actions/prereqs/action.yaml
index 7363b0fb..f32708c4 100644
--- a/.github/actions/prereqs/action.yaml
+++ b/.github/actions/prereqs/action.yaml
@@ -21,4 +21,5 @@ runs:
         source ${{ inputs.env_name }}/bin/activate
         pip install --upgrade pip
         pip install -r requirements.txt
+        pip install -r requirements_stream.txt
         echo "Prerequisites installed into ${{ inputs.env_name }}"