From 5103ce59cf6af3ab2c8e57168fcfd4fa04b19dae Mon Sep 17 00:00:00 2001 From: asyms Date: Thu, 18 Jun 2026 15:41:27 +0200 Subject: [PATCH 1/2] Add stream-dse-backed fused SwiGLU-prefill operator SwiGLUPrefillStream compiles the whole SwiGLU-prefill block (gate/up GEMMs + SiLU + elementwise-mul + down GEMM) as a single fused MLIR design generated by stream-dse, producing one xclbin instead of chaining separately-compiled sub-operators. The design is generated at build time by stream_design.py and compiled through IRON's normal flow. The fused design's per-kernel operand layouts (the tiled-strided DMA tiling) are authored on the IRON side and fed into stream-dse code generation rather than hand-copied inside stream: iron.common.layout provides a TiledStridedLayout type, and swiglu_prefill_stream/stream_kernels.py injects IRON's layouts through optimize_allocation_co(kernels=...) -- the override hook added in stream-dse 1.13.4 -- keeping stream's kernel construction and replacing only operand_layouts(). stream-dse is an optional dependency (requirements_stream.txt); the operator's test skips when it is absent. Importing iron.operators no longer requires an NPU runtime (lazy XRT import), so the package loads on hosts without XRT/pyxrt. Includes a minimal k=1 demo under demos/swiglu_prefill_stream/. 
--- .gitignore | 1 + conftest.py | 12 +- demos/swiglu_prefill_stream/demo.py | 52 ++++++ iron/common/__init__.py | 1 + iron/common/layout.py | 107 +++++++++++ iron/common/utils.py | 24 ++- iron/operators/__init__.py | 59 ++++-- .../operators/swiglu_prefill_stream/README.md | 53 ++++++ iron/operators/swiglu_prefill_stream/op.py | 176 ++++++++++++++++++ .../swiglu_prefill_stream/stream_design.py | 106 +++++++++++ .../swiglu_prefill_stream/stream_kernels.py | 94 ++++++++++ iron/operators/swiglu_prefill_stream/test.py | 103 ++++++++++ requirements_stream.txt | 26 +++ 13 files changed, 798 insertions(+), 16 deletions(-) create mode 100644 demos/swiglu_prefill_stream/demo.py create mode 100644 iron/common/layout.py create mode 100644 iron/operators/swiglu_prefill_stream/README.md create mode 100644 iron/operators/swiglu_prefill_stream/op.py create mode 100644 iron/operators/swiglu_prefill_stream/stream_design.py create mode 100644 iron/operators/swiglu_prefill_stream/stream_kernels.py create mode 100644 iron/operators/swiglu_prefill_stream/test.py create mode 100644 requirements_stream.txt diff --git a/.gitignore b/.gitignore index c2e66af8..bdffa86e 100755 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ __pycache__ build/* **/_build/** **/build/** +/outputs/ *.exe *.csv secret_github_token diff --git a/conftest.py b/conftest.py index 564d8563..6408c503 100644 --- a/conftest.py +++ b/conftest.py @@ -164,7 +164,17 @@ def pytest_configure(config): def pytest_collection_modifyitems(config, items): - device = aie_utils.DefaultNPURuntime.device().resolve().name + # Resolve the active NPU device for device-gating. On a host without an NPU + # runtime, aie_utils.DefaultNPURuntime is None; in that case there is no + # device to gate against, so skip the supported_devices filtering entirely. + # Hermetic, NPU-free tests (no supported_devices marker) are unaffected. 
+ runtime = aie_utils.DefaultNPURuntime + if runtime is None: + return + npu_device = runtime.device() + if npu_device is None: + return + device = npu_device.resolve().name for item in items: marker = item.get_closest_marker("supported_devices") if marker and device not in marker.args: diff --git a/demos/swiglu_prefill_stream/demo.py b/demos/swiglu_prefill_stream/demo.py new file mode 100644 index 00000000..0796f300 --- /dev/null +++ b/demos/swiglu_prefill_stream/demo.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Minimal demo: build and run the stream-dse-backed fused SwiGLU-prefill operator. + +stream-dse generates one fused MLIR design for the whole SwiGLU-prefill block; +IRON compiles it to an xclbin and runs it once on the NPU. Requires stream-dse +(see requirements_stream.txt) and an npu2 device. + + python demos/swiglu_prefill_stream/demo.py +""" + +import time + +import torch +from ml_dtypes import bfloat16 + +from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor +from iron.operators.swiglu_prefill_stream.op import SwiGLUPrefillStream + +SEQ_LEN, EMBEDDING_DIM, HIDDEN_DIM = 256, 512, 2048 + + +def rand_bf16(*shape: int) -> XRTTensor: + return XRTTensor.from_torch(torch.randn(*shape, dtype=torch.bfloat16)) + + +def main() -> None: + op = SwiGLUPrefillStream( + seq_len=SEQ_LEN, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM + ) + op.compile() + run = op.get_callable() + + x = rand_bf16(SEQ_LEN, EMBEDDING_DIM) + w_gate = rand_bf16(EMBEDDING_DIM, HIDDEN_DIM) + w_up = rand_bf16(EMBEDDING_DIM, HIDDEN_DIM) + w_down = rand_bf16(HIDDEN_DIM, EMBEDDING_DIM) + out = XRTTensor((SEQ_LEN * EMBEDDING_DIM,), dtype=bfloat16) + + run(x, w_gate, w_up, w_down, out) # warmup + start = time.perf_counter() + run(x, w_gate, w_up, w_down, out) + elapsed_us = (time.perf_counter() - start) * 1e6 + print( + f"SwiGLU-prefill 
{SEQ_LEN}x{EMBEDDING_DIM}x{HIDDEN_DIM} ran in {elapsed_us:.1f} us" + ) + + +if __name__ == "__main__": + main() diff --git a/iron/common/__init__.py b/iron/common/__init__.py index 9f5a8f7a..cb2ff31b 100644 --- a/iron/common/__init__.py +++ b/iron/common/__init__.py @@ -18,3 +18,4 @@ PythonGeneratedMLIRArtifact, DesignGenerator, ) +from .layout import Stride, TiledStride, TiledStridedLayout, tiled_2d diff --git a/iron/common/layout.py b/iron/common/layout.py new file mode 100644 index 00000000..09770524 --- /dev/null +++ b/iron/common/layout.py @@ -0,0 +1,107 @@ +# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Tiled-strided memory layouts for IRON operators. + +A tiled-strided layout describes how a logical multi-dimensional tensor is laid +out in memory as a hierarchy of tiles, each level carrying its own ``(step, +bound)`` stride. It is the layout model AIE kernels are written against: a GEMM +microkernel, for example, reads its ``MxK`` operand as ``mt x kt`` tiles of +``r x s`` elements, which is exactly a two-level tiled-strided layout. + +The types here mirror ``snaxc.ir.tsl`` (``Stride`` -> ``TiledStride`` -> +``TiledStridedLayout``) so an IRON-authored layout can be handed to stream-dse's +code generation verbatim via :meth:`TiledStridedLayout.to_snaxc`. They carry no +stream-dse / snaxc / xdsl dependency themselves -- the snaxc import is lazy and +confined to ``to_snaxc`` -- so they are usable (and testable) in a plain IRON +install with no AIE codegen toolchain present. + +This is a common primitive: it is meant to be shared across operators as the one +place a kernel's operand layouts are defined, rather than re-derived per operator +or hand-copied into stream-dse. +""" + +from __future__ import annotations + +from dataclasses import dataclass + + +@dataclass(frozen=True) +class Stride: + """One stride level: ``bound`` elements spaced ``step`` apart. 
+ + ``step``/``bound`` may be ``None`` to denote a dynamic (run-time) value, + matching snaxc's convention. + """ + + step: int | None + bound: int | None + + +@dataclass +class TiledStride: + """The strides of a single tensor dimension, outermost tile first. + + A simple (untiled) dimension has one stride; one level of tiling has two + (the outer tile stride followed by the inner element stride), and so on. + """ + + strides: tuple[Stride, ...] + + def __post_init__(self) -> None: + self.strides = tuple(self.strides) + + +@dataclass +class TiledStridedLayout: + """A tiled-strided layout: one :class:`TiledStride` per tensor dimension.""" + + tstrides: tuple[TiledStride, ...] + offset: int = 0 + + def __post_init__(self) -> None: + self.tstrides = tuple(self.tstrides) + + def to_snaxc(self): + """Return the equivalent ``snaxc.ir.tsl.TiledStridedLayout``. + + The snaxc import is deferred to here so this module stays usable without + the AIE codegen toolchain installed. Used to feed IRON-authored layouts + into stream-dse code generation. + """ + from snaxc.ir.tsl import ( + Stride as SnaxStride, + TiledStride as SnaxTiledStride, + TiledStridedLayout as SnaxTiledStridedLayout, + ) + + return SnaxTiledStridedLayout( + [ + SnaxTiledStride([SnaxStride(s.step, s.bound) for s in ts.strides]) + for ts in self.tstrides + ], + offset=self.offset, + ) + + +def tiled_2d(rows: int, cols: int, row_unit: int, col_unit: int) -> TiledStridedLayout: + """Two-level tiled-strided layout for a ``rows x cols`` tensor. + + The tensor is tiled into ``(rows // row_unit) x (cols // col_unit)`` tiles of + ``row_unit x col_unit`` elements, the tiles laid out row-major and each tile + stored row-major internally. This reproduces stream-dse's GEMM/elementwise + operand layouts (the intrinsic ``row_unit``/``col_unit`` are the kernel's MAC + tile dimensions). 
+ """ + rows_t, cols_t = rows // row_unit, cols // col_unit + return TiledStridedLayout( + ( + TiledStride( + ( + Stride(row_unit * col_unit * cols_t, rows_t), + Stride(col_unit, row_unit), + ) + ), + TiledStride((Stride(row_unit * col_unit, cols_t), Stride(1, col_unit))), + ) + ) diff --git a/iron/common/utils.py b/iron/common/utils.py index e8d4a1f6..778acdea 100644 --- a/iron/common/utils.py +++ b/iron/common/utils.py @@ -2,7 +2,29 @@ # SPDX-License-Identifier: Apache-2.0 import numpy as np -from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor, xrt as _pyxrt + +try: + # XRT (pyxrt) is only present on a host with the NPU runtime installed. + # Import lazily so that pure-MLIR / introspection code paths (and their + # tests) can import iron.* without an NPU. XRTSubBuffer below only needs + # these symbols when actually instantiated, which requires the NPU anyway. + from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor, xrt as _pyxrt +except ImportError: # pragma: no cover - exercised only when XRT is absent + + class XRTTensor: # type: ignore[no-redef] + """Placeholder used when XRT/pyxrt is unavailable. + + Instantiating it (i.e. attempting NPU work without XRT) fails loudly; + merely importing the module does not. + """ + + def __init__(self, *args, **kwargs): + raise ImportError( + "XRTTensor requires pyxrt/XRT, which is not installed. " + "NPU runtime operations are unavailable in this environment." + ) + + _pyxrt = None def get_shim_dma_limit(dev) -> int: diff --git a/iron/operators/__init__.py b/iron/operators/__init__.py index 4a6c5604..ce7019e3 100644 --- a/iron/operators/__init__.py +++ b/iron/operators/__init__.py @@ -1,17 +1,48 @@ # SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from .elementwise_add.op import ElementwiseAdd -from .elementwise_mul.op import ElementwiseMul -from .gemm.op import GEMM -from .gemv.op import GEMV -from .mha.op import MHA -from .rms_norm.op import RMSNorm -from .rope.op import RoPE -from .silu.op import SiLU -from .softmax.op import Softmax -from .swiglu_decode.op import SwiGLUDecode -from .swiglu_prefill.op import SwiGLUPrefill -from .transpose.op import Transpose -from .strided_copy.op import StridedCopy -from .repeat.op import Repeat +"""IRON operators. + +Operator classes are exposed lazily (PEP 562): they are imported on first +attribute access rather than eagerly at package import time. This lets the +package be imported on a host without the NPU runtime (XRT/pyxrt), while ``from +iron.operators import GEMM`` continues to work unchanged when the runtime is +available. +""" + +import importlib + +# Public operator name -> defining submodule (relative to this package). +_OPERATORS = { + "ElementwiseAdd": ".elementwise_add.op", + "ElementwiseMul": ".elementwise_mul.op", + "GEMM": ".gemm.op", + "GEMV": ".gemv.op", + "MHA": ".mha.op", + "RMSNorm": ".rms_norm.op", + "RoPE": ".rope.op", + "SiLU": ".silu.op", + "Softmax": ".softmax.op", + "SwiGLUDecode": ".swiglu_decode.op", + "SwiGLUPrefill": ".swiglu_prefill.op", + "SwiGLUPrefillStream": ".swiglu_prefill_stream.op", + "Transpose": ".transpose.op", + "StridedCopy": ".strided_copy.op", + "Repeat": ".repeat.op", +} + +__all__ = list(_OPERATORS) + + +def __getattr__(name: str): + module_path = _OPERATORS.get(name) + if module_path is None: + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + module = importlib.import_module(module_path, __name__) + attr = getattr(module, name) + globals()[name] = attr # cache so subsequent access skips __getattr__ + return attr + + +def __dir__(): + return sorted(set(globals()) | set(_OPERATORS)) diff --git a/iron/operators/swiglu_prefill_stream/README.md 
b/iron/operators/swiglu_prefill_stream/README.md new file mode 100644 index 00000000..2b7c779c --- /dev/null +++ b/iron/operators/swiglu_prefill_stream/README.md @@ -0,0 +1,53 @@ + +# SwiGLU prefill (stream-dse codegen) + +This operator is **fused**: the whole SwiGLU-prefill block (gate/up GEMMs + SiLU + +elementwise-mul + down GEMM) is emitted as a **single MLIR design generated by +[`stream-dse`](https://github.com/KULeuven-MICAS/stream)**, then compiled by IRON's normal +flow into one xclbin. Unlike the other operators, its MLIR is not written by hand — it is +produced at build time by [`stream_design.py`](./stream_design.py), which calls the installed +`stream` package (`stream.api.optimize_allocation_co(..., enable_codegen=True)`). + +## Enabling stream codegen + +`stream-dse` is an **optional, separately-installed** dependency (it is *not* in IRON's +`requirements.txt`). Install it into the **same environment** as IRON via the extra +requirements file: + +```bash +pip install -r requirements_stream.txt +stream-setup-aie # required: installs stream-dse's AIE codegen deps +``` + +Notes: +- MLIR generation uses the open-source **OR-Tools GSCIP** solver (`backend="ortools_gscip"`), + so **no Gurobi license** is required. +- `stream-setup-aie` is **required**: it installs the AIE codegen packages stream-dse needs + that cannot be plain PyPI dependencies (`snax-mlir`/`snaxc`, `xdsl-aie`, `aie-python-extras`), + since they are direct git/URL installs. It also installs the `mlir_aie` / `llvm-aie` wheels, + but skips those if IRON's `requirements.txt` already provided them. +- Importing the operator does **not** require `stream-dse` (the launcher is imported lazily); + only **building** (`operator.compile()` / running the test) does. 
+ +## Build & run + +```bash +# build + run on an NPU2 (Strix) device +source /opt/xilinx/xrt/setup.sh # XRT on PATH (provides pyxrt + xclbinutil) +pytest iron/operators/swiglu_prefill_stream/test.py +``` + +The feasible/verified shape is **seq 256 / embedding 512 / hidden 2048**, tiles +**32 / 32 / 64**, target **npu2**. + +## Caveats (stream-dse packaging) + +- The hardware-description YAML (`whole_array_strix.yaml` + `hardware/cores/*.yaml`) is + resolved from the **installed `stream` package**, where it ships as package data + (stream-dse >= 1.13.3); nothing is vendored in this operator. +- `stream-dse` writes its generated ONNX workload / mapping YAML **into its installed package + directory**, so that environment must be writable. diff --git a/iron/operators/swiglu_prefill_stream/op.py b/iron/operators/swiglu_prefill_stream/op.py new file mode 100644 index 00000000..3d1d40ae --- /dev/null +++ b/iron/operators/swiglu_prefill_stream/op.py @@ -0,0 +1,176 @@ +# SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass, field +from typing import Any + +import aie.utils as aie_utils + +from iron.common import ( + MLIROperator, + AIERuntimeArgSpec, + KernelObjectArtifact, + SourceArtifact, + PythonGeneratedMLIRArtifact, + DesignGenerator, +) +from iron.common.device_utils import get_kernel_dir + + +@dataclass +class SwiGLUPrefillStream(MLIROperator): + """Fused SwiGLU-prefill block whose MLIR is generated by stream-dse. + + Unlike the composite ``SwiGLUPrefill`` (which chains four separately compiled + sub-operators and orchestrates them from the host), this operator compiles a + single fused MLIR design produced by stream-dse: one xclbin with a single + runtime sequence taking ``(input, weights_1, weights_2, weights_3, output)``. + + The MLIR is produced lazily at compile time by ``stream_design.py``, which + calls the installed ``stream-dse`` package. 
Building therefore requires + ``stream-dse`` (``pip install stream-dse`` + ``stream-setup-aie``); merely + importing this operator does not. + """ + + seq_len: int + embedding_dim: int + hidden_dim: int + seq_len_tile_size: int = 32 + embedding_tile_size: int = 32 + hidden_tile_size: int = 64 + last_gemm_down: bool = True + in_dtype: str = field(default="bf16", repr=False) + out_dtype: str = field(default="bf16", repr=False) + trace_size: int = field(default=0, repr=False) + rows: int = field(default=4, repr=False) + num_aie_columns: int = field(default=8, repr=False) + backend: str = field(default="ortools_gscip", repr=False) + # Weights are runtime data supplied by the caller after construction; they + # are not needed to build the design. + weights_1: Any = field(default=None, repr=False, compare=False) + weights_2: Any = field(default=None, repr=False, compare=False) + weights_3: Any = field(default=None, repr=False, compare=False) + context: Any = field(default=None, repr=False, compare=False) + + def __post_init__(self): + MLIROperator.__init__(self, context=self.context) + + def get_mlir_artifact(self): + npu = aie_utils.get_current_device().resolve().name + return PythonGeneratedMLIRArtifact( + f"{self.name}.mlir", + DesignGenerator( + self.operator_dir / "stream_design.py", + "run_main_aie_codegen_swiglu", + (), + { + "seq_len": self.seq_len, + "embedding_dim": self.embedding_dim, + "hidden_dim": self.hidden_dim, + "in_dtype": self.in_dtype, + "out_dtype": self.out_dtype, + "trace_size": self.trace_size, + "rows": self.rows, + "cols": self.num_aie_columns, + "npu": npu, + "seq_len_tile_size": self.seq_len_tile_size, + "embedding_tile_size": self.embedding_tile_size, + "hidden_tile_size": self.hidden_tile_size, + "last_gemm_down": self.last_gemm_down, + "backend": self.backend, + }, + ), + ) + + def _mm_kernel(self, tile_m, tile_k, tile_n): + """Compile the shared ``mm.cc`` kernel for one tile configuration. 
+ + stream-dse emits dimension-suffixed kernel symbols (e.g. + ``matmul_bf16_bf16_32_32_64``) so the gate/up and down GEMMs -- which use + different tile shapes -- can coexist in one fused design. Upstream + ``mm.cc`` emits unsuffixed ``matmul_bf16_bf16`` / ``zero_bf16``, so we + rename them to match the names the generated MLIR calls. + """ + base_dir = self.context.base_dir + kernel_dir = get_kernel_dir() + suffix = f"{tile_m}_{tile_k}_{tile_n}" + return KernelObjectArtifact( + f"mm_{suffix}.o", + dependencies=[ + SourceArtifact(base_dir / "aie_kernels" / kernel_dir / "mm.cc") + ], + extra_flags=[ + f"-DDIM_M={tile_m}", + f"-DDIM_K={tile_k}", + f"-DDIM_N={tile_n}", + "-Dbf16_bf16_ONLY", + ], + rename_symbols={ + "matmul_bf16_bf16": f"matmul_bf16_bf16_{suffix}", + "zero_bf16": f"zero_bf16_{suffix}", + }, + ) + + def get_kernel_artifacts(self): + base_dir = self.context.base_dir + kernel_dir = get_kernel_dir() + kernels = [ + # gate/up projections: input (seq, embedding) @ W (embedding, hidden) + self._mm_kernel( + self.seq_len_tile_size, + self.embedding_tile_size, + self.hidden_tile_size, + ), + KernelObjectArtifact( + "silu.o", + dependencies=[ + SourceArtifact(base_dir / "aie_kernels" / kernel_dir / "silu.cc") + ], + ), + KernelObjectArtifact( + "mul.o", + dependencies=[ + SourceArtifact(base_dir / "aie_kernels" / "generic" / "mul.cc") + ], + ), + ] + if self.last_gemm_down: + # down projection: intermediate (seq, hidden) @ W (hidden, embedding) + kernels.insert( + 1, + self._mm_kernel( + self.seq_len_tile_size, + self.hidden_tile_size, + self.embedding_tile_size, + ), + ) + return kernels + + def set_up_artifacts(self): + # stream-dse emits a placed/routed design that links a distinct kernel + # object per core, so it needs --dynamic-objFifos (both targets) and + # --no-unified (separate per-core compilation; xclbin only). 
+ xclbin_artifact, insts_artifact = self.get_artifacts(dynamic_obj_fifos=True) + xclbin_artifact.extra_flags.append("--no-unified") + self.xclbin_artifact = xclbin_artifact + self.insts_artifact = insts_artifact + self.add_artifacts([xclbin_artifact, insts_artifact]) + + def get_arg_spec(self): + specs = [ + AIERuntimeArgSpec("in", (self.seq_len, self.embedding_dim)), # input + AIERuntimeArgSpec( + "in", (self.embedding_dim, self.hidden_dim) + ), # weights_1 (gate) + AIERuntimeArgSpec( + "in", (self.embedding_dim, self.hidden_dim) + ), # weights_2 (up) + ] + if self.last_gemm_down: + specs.append( + AIERuntimeArgSpec("in", (self.hidden_dim, self.embedding_dim)) + ) # weights_3 (down) + specs.append(AIERuntimeArgSpec("out", (self.seq_len, self.embedding_dim))) + else: + specs.append(AIERuntimeArgSpec("out", (self.seq_len, self.hidden_dim))) + return specs diff --git a/iron/operators/swiglu_prefill_stream/stream_design.py b/iron/operators/swiglu_prefill_stream/stream_design.py new file mode 100644 index 00000000..9ae37f7f --- /dev/null +++ b/iron/operators/swiglu_prefill_stream/stream_design.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Stream-dse MLIR generation launcher for the fused SwiGLU-prefill operator. + +This is the in-IRON replacement for the previously hardcoded +``/home/micas/stream_aie/main_swiglu.py`` entry point. It calls the *installed* +``stream-dse`` package (``pip install stream-dse`` followed by ``stream-setup-aie``) +to produce a single fused MLIR module for the whole SwiGLU-prefill block, which +IRON then compiles into an xclbin/insts pair. + +The function signature mirrors ``run_main_aie_codegen_swiglu`` from stream-dse's +``scripts/main_swiglu.py`` reference entry point. 
Because ``scripts/`` is not +shipped in the stream-dse wheel, that logic is vendored here; the hardware- +description YAML is resolved from the installed ``stream`` package, where it ships +as package data (stream-dse >= 1.13.3). + +This module is imported lazily (by ``DesignGenerator`` at compile time), so +importing the operator does not require ``stream-dse`` to be installed -- only +building it does. +""" + +import os +import re + +import stream +from stream.api import optimize_allocation_co +from stream.inputs.aie.mapping.make_swiglu_mapping import make_swiglu_mapping +from stream.inputs.aie.workload.make_onnx_swiglu import make_swiglu_workload + +from iron.operators.swiglu_prefill_stream.stream_kernels import iron_kernels + +# Hardware description for the whole-array Strix (npu2) target, shipped as package +# data inside the installed stream package (stream-dse >= 1.13.3). +_ACCELERATOR = os.path.join( + os.path.dirname(stream.__file__), + "inputs", + "aie", + "hardware", + "whole_array_strix.yaml", +) + + +def run_main_aie_codegen_swiglu( + seq_len, + embedding_dim, + hidden_dim, + in_dtype="bf16", + out_dtype="bf16", + trace_size=0, + rows=4, + cols=8, + npu="npu2", + seq_len_tile_size=32, + embedding_tile_size=32, + hidden_tile_size=64, + last_gemm_down=True, + backend="ortools_gscip", +): + """Generate the fused SwiGLU-prefill MLIR module via stream-dse. + + Returns the (xdsl) MLIR module; ``str(module)`` yields the textual MLIR that + IRON's ``PythonGeneratedMLIRArtifact`` writes to disk and compiles. + + The default ``ortools_gscip`` backend is the license-free OR-Tools GSCIP + solver, so no Gurobi license is required. 
+ """ + workload_path = make_swiglu_workload( + seq_len, + embedding_dim, + hidden_dim, + in_dtype, + out_dtype, + last_gemm_down=last_gemm_down, + ) + mapping_path = make_swiglu_mapping( + seq_len, + embedding_dim, + hidden_dim, + last_gemm_down, + seq_len_tile_size, + embedding_tile_size, + hidden_tile_size, + ) + + hw_name = os.path.splitext(os.path.basename(_ACCELERATOR))[0] + wl_name = re.split(r"/|\.", workload_path)[-1] + if wl_name == "onnx": + wl_name = re.split(r"/|\.", workload_path)[-2] + experiment_id = f"{hw_name}-{wl_name}-{rows}_row_{cols}_col" + + ctx = optimize_allocation_co( + hardware=_ACCELERATOR, + workload=workload_path, + mapping=mapping_path, + experiment_id=experiment_id, + output_path="outputs", + skip_if_exists=False, + enable_codegen=True, + trace_size=trace_size, + nb_cols_to_use=cols, + npu=npu, + backend=backend, + kernels=iron_kernels(), # IRON-authored operand layouts drive the DMA tiling + ) + return ctx.get("module") diff --git a/iron/operators/swiglu_prefill_stream/stream_kernels.py b/iron/operators/swiglu_prefill_stream/stream_kernels.py new file mode 100644 index 00000000..17bb57aa --- /dev/null +++ b/iron/operators/swiglu_prefill_stream/stream_kernels.py @@ -0,0 +1,94 @@ +# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""IRON-authored operand layouts for the stream-dse SwiGLU-prefill kernels. + +stream-dse selects an AIE kernel per computation node and uses each kernel's +``operand_layouts()`` to drive the DMA tiling emitted into the design MLIR. +:func:`iron_kernels` returns the ``optimize_allocation_co(kernels=...)`` override +that keeps every kernel stream would build but replaces its operand layouts with +the ones defined here -- the single source of truth -- converted to stream's +tiled-strided layout via :meth:`iron.common.TiledStridedLayout.to_snaxc`. IRON +owns the layouts; stream owns construction, symbol names and the MLIR rewrite. 
+ +Each override is stream's own kernel, re-typed to a subclass that overrides only +``operand_layouts()``: the kernel is still built by stream's ``AIEKernels`` +factory (so its constructor signature is inherited, not re-declared here), then +its ``__class__`` is swapped. The subclasses are module-level so the kernels stay +picklable (stream stores them on the mapping). + +stream / snaxc / xdsl are imported at module load, so this module is only +importable where the AIE codegen toolchain is installed; it is imported only from +``stream_design.py``. +""" + +from __future__ import annotations + +from typing import Any, Callable + +from stream.compiler.kernels.eltwise_mul import EltwiseMulKernel +from stream.compiler.kernels.gemm import GemmKernel +from stream.compiler.kernels.silu import SiluKernel + +from iron.common import TiledStridedLayout, tiled_2d + +# Intrinsic MAC tile dimensions of the aie2p kernels stream-dse targets; the +# operand layouts below are the contract the generated DMAs and the compiled +# kernel objects must agree on. +R, S, T = 4, 8, 8 + + +def _gemm_layouts(m: int, k: int, n: int) -> tuple[TiledStridedLayout, ...]: + return (tiled_2d(m, k, R, S), tiled_2d(k, n, S, T), tiled_2d(m, n, R, T)) + + +def _elementwise_layouts( + count: int, tile: tuple[int, int] = (32, 64) +) -> tuple[TiledStridedLayout, ...]: + return (tiled_2d(*tile, R, T),) * count + + +class _IronGemmKernel(GemmKernel): + def operand_layouts(self): + return [tsl.to_snaxc() for tsl in _gemm_layouts(self.m, self.k, self.n)] + + +class _IronSiluKernel(SiluKernel): + def operand_layouts(self): + return [tsl.to_snaxc() for tsl in _elementwise_layouts(2)] + + +class _IronEltwiseMulKernel(EltwiseMulKernel): + def operand_layouts(self): + return [tsl.to_snaxc() for tsl in _elementwise_layouts(3)] + + +# stream AIEKernels name -> IRON subclass overriding operand_layouts(). 
+_OVERRIDES: dict[str, type] = { + "gemm": _IronGemmKernel, + "silu": _IronSiluKernel, + "eltwise_mul": _IronEltwiseMulKernel, +} + + +def iron_kernels() -> dict[str, Callable[..., Any]]: + """Return the ``optimize_allocation_co(kernels=...)`` override registry. + + Only kernels for which IRON defines layouts are overridden; any other kernel + stream needs falls through to its built-in ``AIEKernels`` entry. + """ + from stream.compiler.kernels import AIEKernels + + def override(factory: Callable[..., Any], cls: type) -> Callable[..., Any]: + def make(*args: Any, **kwargs: Any) -> Any: + kernel = factory(*args, **kwargs) + kernel.__class__ = cls + return kernel + + return make + + return { + name: override(AIEKernels[name], cls) + for name, cls in _OVERRIDES.items() + if name in AIEKernels + } diff --git a/iron/operators/swiglu_prefill_stream/test.py b/iron/operators/swiglu_prefill_stream/test.py new file mode 100644 index 00000000..05b953f3 --- /dev/null +++ b/iron/operators/swiglu_prefill_stream/test.py @@ -0,0 +1,103 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import time +import pytest + +# The fused design is generated by stream-dse at compile() time. stream-dse is an +# optional dependency (see requirements_stream.txt) that the default CI image does +# not install, so skip this whole module when it is unavailable. +pytest.importorskip( + "stream", reason="stream-dse not installed (see requirements_stream.txt)" +) + +from ml_dtypes import bfloat16 +from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor +from iron.operators.swiglu_prefill_stream.op import SwiGLUPrefillStream + +# swiglu_prefill_stream shares the same reference implementation as swiglu_decode: +# both compute W3 @ (SiLU(W1 @ x) * (W2 @ x)). Prefill operates on a full +# sequence (M > 1); the stream variant fuses the whole block into one design. 
+from iron.operators.swiglu_decode.reference import generate_golden_reference +from iron.common.test_utils import verify_buffer + + +def get_params(): + # (seq_len, embedding_dim, hidden_dim, seq_tile, embedding_tile, hidden_tile) + # The 256/512/2048 config with 32/32/64 tiling is the MILP-feasible shape on + # the whole-array Strix (npu2) target. + params_list = [(256, 512, 2048, 32, 32, 64)] + return [pytest.param(*p) for p in params_list] + + +@pytest.mark.supported_devices("npu2") +@pytest.mark.metrics( + Latency=r"Latency \(us\): (?P<value>[\d\.]+)", + Bandwidth=r"Effective Bandwidth: (?P<value>[\d\.e\+-]+) GB/s", +) +@pytest.mark.parametrize( + "seq_len,embedding_dim,hidden_dim,seq_tile,embedding_tile,hidden_tile", get_params() +) +def test_swiglu_prefill_stream( + seq_len, + embedding_dim, + hidden_dim, + seq_tile, + embedding_tile, + hidden_tile, + aie_context, +): + golden_ref = generate_golden_reference(M=seq_len, K=embedding_dim, N=hidden_dim) + + operator = SwiGLUPrefillStream( + seq_len=seq_len, + embedding_dim=embedding_dim, + hidden_dim=hidden_dim, + seq_len_tile_size=seq_tile, + embedding_tile_size=embedding_tile, + hidden_tile_size=hidden_tile, + context=aie_context, + ) + # The stream design consumes weights in their natural (K, N) layout, so no + # transpose is applied (the composite SwiGLUPrefill transposes; this one does + # not). 
+ operator.weights_1 = golden_ref["w_gate"] + operator.weights_2 = golden_ref["w_up"] + operator.weights_3 = golden_ref["w_down"] + + operator.compile() + op_func = operator.get_callable() + + input_buf = XRTTensor.from_torch(golden_ref["input"]) + w1_buf = XRTTensor.from_torch(operator.weights_1) + w2_buf = XRTTensor.from_torch(operator.weights_2) + w3_buf = XRTTensor.from_torch(operator.weights_3) + output_buf = XRTTensor((seq_len * embedding_dim,), dtype=bfloat16) + + # Warmup + op_func(input_buf, w1_buf, w2_buf, w3_buf, output_buf) + + start = time.perf_counter() + op_func(input_buf, w1_buf, w2_buf, w3_buf, output_buf) + elapsed_us = (time.perf_counter() - start) * 1e6 + + total_bytes = input_buf.buffer_object().size() + output_buf.buffer_object().size() + bandwidth_gbps = total_bytes / (elapsed_us * 1e-6) / 1e9 + print(f"Latency (us): {elapsed_us:.2f}") + print(f"Effective Bandwidth: {bandwidth_gbps:.4f} GB/s") + + # SwiGLU chains several bf16 kernels, so rounding error accumulates; verify + # the final output against the float reference with relaxed tolerances and a + # small allowance for precision outliers. + output = output_buf.to_torch().reshape((seq_len, embedding_dim)) + errors = verify_buffer( + output, + "output", + golden_ref["output"], + rel_tol=0.08, + abs_tol=0.7, + max_error_rate=0.05, + ) + + assert not errors, f"Test failed with errors: {errors}" diff --git a/requirements_stream.txt b/requirements_stream.txt new file mode 100644 index 00000000..b0a18e34 --- /dev/null +++ b/requirements_stream.txt @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Optional dependency for the stream-dse-backed fused SwiGLU-prefill operator +# (iron/operators/swiglu_prefill_stream). +# +# It is NOT installed by the default CI (requirements.txt); the operator's test +# skips itself (pytest.importorskip) when stream-dse is absent. 
Install this file +# to build/run the operator and its test: +# +# pip install -r requirements_stream.txt +# stream-setup-aie # REQUIRED: installs stream-dse's AIE codegen deps that +# # cannot be PyPI dependencies (snax-mlir/snaxc, xdsl-aie, +# # aie-python-extras); also installs the mlir_aie/llvm-aie +# # wheels, skipping any already provided by requirements.txt. +# +# Notes: +# - stream-dse generates the fused MLIR design at build time (license-free +# OR-Tools GSCIP solver; no Gurobi needed) and writes its generated workload/ +# mapping files into its own installed package directory, so that environment +# must be writable. +# - >=1.13.4 is required: stream_design.py feeds IRON-authored operand layouts +# into code generation via optimize_allocation_co(kernels=...), the override +# hook added in stream-dse 1.13.4. + +stream-dse>=1.13.4 From 96bbfd8927ce24c698fb8a5d4fa961b26908b23c Mon Sep 17 00:00:00 2001 From: asyms Date: Wed, 24 Jun 2026 09:11:03 +0200 Subject: [PATCH 2/2] add stream requirements to prereqs action --- .github/actions/prereqs/action.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/actions/prereqs/action.yaml b/.github/actions/prereqs/action.yaml index 7363b0fb..f32708c4 100644 --- a/.github/actions/prereqs/action.yaml +++ b/.github/actions/prereqs/action.yaml @@ -21,4 +21,5 @@ runs: source ${{ inputs.env_name }}/bin/activate pip install --upgrade pip pip install -r requirements.txt + pip install -r requirements_stream.txt echo "Prerequisites installed into ${{ inputs.env_name }}"