Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/actions/prereqs/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ runs:
source ${{ inputs.env_name }}/bin/activate
pip install --upgrade pip
pip install -r requirements.txt
pip install -r requirements_stream.txt
echo "Prerequisites installed into ${{ inputs.env_name }}"
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ __pycache__
build/*
**/_build/**
**/build/**
/outputs/
*.exe
*.csv
secret_github_token
Expand Down
12 changes: 11 additions & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,17 @@ def pytest_configure(config):


def pytest_collection_modifyitems(config, items):
device = aie_utils.DefaultNPURuntime.device().resolve().name
# Resolve the active NPU device for device-gating. On a host without an NPU
# runtime, aie_utils.DefaultNPURuntime is None; in that case there is no
# device to gate against, so skip the supported_devices filtering entirely.
# Hermetic, NPU-free tests (no supported_devices marker) are unaffected.
runtime = aie_utils.DefaultNPURuntime
if runtime is None:
return
npu_device = runtime.device()
if npu_device is None:
return
device = npu_device.resolve().name
for item in items:
marker = item.get_closest_marker("supported_devices")
if marker and device not in marker.args:
Expand Down
52 changes: 52 additions & 0 deletions demos/swiglu_prefill_stream/demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Minimal demo: build and run the stream-dse-backed fused SwiGLU-prefill operator.

stream-dse generates one fused MLIR design for the whole SwiGLU-prefill block;
IRON compiles it to an xclbin and runs it once on the NPU. Requires stream-dse
(see requirements_stream.txt) and an npu2 device.

python demos/swiglu_prefill_stream/demo.py
"""

import time

import torch
from ml_dtypes import bfloat16

from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor
from iron.operators.swiglu_prefill_stream.op import SwiGLUPrefillStream

# Problem shape used by this demo: sequence length, embedding width and hidden
# width of the SwiGLU block. main() uses these for both the operator
# configuration and the tensor shapes.
SEQ_LEN, EMBEDDING_DIM, HIDDEN_DIM = 256, 512, 2048


def rand_bf16(*shape: int) -> XRTTensor:
    """Return an ``XRTTensor`` wrapping random bfloat16 data of the given ``shape``.

    The values are drawn with ``torch.randn`` and handed to
    ``XRTTensor.from_torch``.
    """
    samples = torch.randn(*shape, dtype=torch.bfloat16)
    return XRTTensor.from_torch(samples)


def main() -> None:
    """Compile the fused SwiGLU-prefill operator, run it twice and report the timing.

    The first invocation is a warmup; only the second one is timed and printed.
    """
    # Configure and build the operator for the demo shape.
    op = SwiGLUPrefillStream(
        seq_len=SEQ_LEN,
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
    )
    op.compile()
    run = op.get_callable()

    # Random inputs (created in the same order they are passed) and the
    # flat output buffer the callable is given as its last argument.
    x = rand_bf16(SEQ_LEN, EMBEDDING_DIM)
    w_gate = rand_bf16(EMBEDDING_DIM, HIDDEN_DIM)
    w_up = rand_bf16(EMBEDDING_DIM, HIDDEN_DIM)
    w_down = rand_bf16(HIDDEN_DIM, EMBEDDING_DIM)
    out = XRTTensor((SEQ_LEN * EMBEDDING_DIM,), dtype=bfloat16)
    call_args = (x, w_gate, w_up, w_down, out)

    run(*call_args)  # warmup

    t_begin = time.perf_counter()
    run(*call_args)
    t_end = time.perf_counter()
    elapsed_us = (t_end - t_begin) * 1e6

    print(
        f"SwiGLU-prefill {SEQ_LEN}x{EMBEDDING_DIM}x{HIDDEN_DIM} ran in {elapsed_us:.1f} us"
    )


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions iron/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@
PythonGeneratedMLIRArtifact,
DesignGenerator,
)
from .layout import Stride, TiledStride, TiledStridedLayout, tiled_2d
107 changes: 107 additions & 0 deletions iron/common/layout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Tiled-strided memory layouts for IRON operators.

A tiled-strided layout describes how a logical multi-dimensional tensor is laid
out in memory as a hierarchy of tiles, each level carrying its own ``(step,
bound)`` stride. It is the layout model AIE kernels are written against: a GEMM
microkernel, for example, reads its ``MxK`` operand as ``mt x kt`` tiles of
``r x s`` elements, which is exactly a two-level tiled-strided layout.

The types here mirror ``snaxc.ir.tsl`` (``Stride`` -> ``TiledStride`` ->
``TiledStridedLayout``) so an IRON-authored layout can be handed to stream-dse's
code generation verbatim via :meth:`TiledStridedLayout.to_snaxc`. They carry no
stream-dse / snaxc / xdsl dependency themselves -- the snaxc import is lazy and
confined to ``to_snaxc`` -- so they are usable (and testable) in a plain IRON
install with no AIE codegen toolchain present.

This is a common primitive: it is meant to be shared across operators as the one
place a kernel's operand layouts are defined, rather than re-derived per operator
or hand-copied into stream-dse.
"""

from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class Stride:
    """A single stride level of a tiled-strided layout.

    One level describes ``bound`` elements that lie ``step`` apart. Either
    field may be ``None`` to mark a dynamic value that is only known at run
    time; this follows snaxc's convention.
    """

    # Spacing between consecutive elements of this level (``None`` = dynamic).
    step: int | None
    # Number of elements in this level (``None`` = dynamic).
    bound: int | None


@dataclass
class TiledStride:
    """All stride levels belonging to one tensor dimension.

    The levels are ordered from the outermost tile inwards: an untiled
    dimension carries a single stride, a dimension with one level of tiling
    carries two (outer tile stride, then inner element stride), and so forth.
    """

    strides: tuple[Stride, ...]

    def __post_init__(self) -> None:
        # Callers may pass any iterable (list, generator, ...); store a tuple.
        as_tuple = tuple(self.strides)
        self.strides = as_tuple


@dataclass
class TiledStridedLayout:
    """Layout of a whole tensor: one :class:`TiledStride` per dimension plus an ``offset``."""

    tstrides: tuple[TiledStride, ...]
    offset: int = 0

    def __post_init__(self) -> None:
        # Callers may pass any iterable of per-dimension strides; store a tuple.
        per_dimension = tuple(self.tstrides)
        self.tstrides = per_dimension

    def to_snaxc(self):
        """Convert this layout into a ``snaxc.ir.tsl.TiledStridedLayout``.

        snaxc is imported inside the method, so the surrounding module can be
        imported without the AIE codegen toolchain; only calling this method
        needs snaxc. The result is what gets handed to stream-dse code
        generation for IRON-authored layouts.
        """
        from snaxc.ir.tsl import (
            Stride as SnaxStride,
            TiledStride as SnaxTiledStride,
            TiledStridedLayout as SnaxTiledStridedLayout,
        )

        # Rebuild the hierarchy level by level using the snaxc types.
        snax_dims = []
        for dim in self.tstrides:
            snax_levels = [SnaxStride(level.step, level.bound) for level in dim.strides]
            snax_dims.append(SnaxTiledStride(snax_levels))
        return SnaxTiledStridedLayout(snax_dims, offset=self.offset)


def tiled_2d(rows: int, cols: int, row_unit: int, col_unit: int) -> TiledStridedLayout:
    """Two-level tiled-strided layout for a ``rows x cols`` tensor.

    The tensor is tiled into ``(rows // row_unit) x (cols // col_unit)`` tiles of
    ``row_unit x col_unit`` elements, the tiles laid out row-major and each tile
    stored row-major internally. This reproduces stream-dse's GEMM/elementwise
    operand layouts (the intrinsic ``row_unit``/``col_unit`` are the kernel's MAC
    tile dimensions).

    Args:
        rows: Number of rows of the logical tensor.
        cols: Number of columns of the logical tensor.
        row_unit: Rows per tile; must be positive and divide ``rows``.
        col_unit: Columns per tile; must be positive and divide ``cols``.

    Returns:
        A layout with two dimensions (rows, then columns), each holding an
        outer tile stride followed by an inner element stride.

    Raises:
        ValueError: If a unit is not positive, or if ``rows``/``cols`` is not a
            whole multiple of its unit. Without this check the floor division
            below would silently describe a smaller tensor than requested.
    """
    if row_unit <= 0 or col_unit <= 0:
        raise ValueError(
            f"row_unit and col_unit must be positive, got {row_unit} and {col_unit}"
        )
    if rows % row_unit or cols % col_unit:
        raise ValueError(
            f"tensor shape {rows}x{cols} is not a whole number of "
            f"{row_unit}x{col_unit} tiles"
        )

    tile_rows = rows // row_unit
    tile_cols = cols // col_unit
    tile_elems = row_unit * col_unit

    # Row dimension: stepping to the next tile row skips a full row of tiles;
    # stepping to the next row inside a tile skips one tile row (col_unit).
    row_dim = TiledStride(
        (
            Stride(tile_elems * tile_cols, tile_rows),
            Stride(col_unit, row_unit),
        )
    )
    # Column dimension: stepping to the next tile skips one whole tile;
    # elements inside a tile row are contiguous.
    col_dim = TiledStride((Stride(tile_elems, tile_cols), Stride(1, col_unit)))
    return TiledStridedLayout((row_dim, col_dim))
24 changes: 23 additions & 1 deletion iron/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,29 @@
# SPDX-License-Identifier: Apache-2.0

import numpy as np
from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor, xrt as _pyxrt

try:
    # pyxrt ships with the NPU runtime (XRT) and is missing on hosts without
    # it. Guarding the import keeps iron.* importable for pure-MLIR /
    # introspection code paths (and their tests) on such hosts. XRTSubBuffer
    # below only needs these symbols when actually instantiated, which
    # requires the NPU anyway.
    from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor, xrt as _pyxrt
except ImportError:  # pragma: no cover - exercised only when XRT is absent
    _pyxrt = None

    class XRTTensor:  # type: ignore[no-redef]
        """Stand-in for the real ``XRTTensor`` when XRT/pyxrt cannot be imported.

        Importing this module succeeds regardless; constructing the stand-in
        (i.e. attempting NPU work without XRT) raises ``ImportError``.
        """

        def __init__(self, *args, **kwargs):
            message = (
                "XRTTensor requires pyxrt/XRT, which is not installed. "
                "NPU runtime operations are unavailable in this environment."
            )
            raise ImportError(message)


def get_shim_dma_limit(dev) -> int:
Expand Down
59 changes: 45 additions & 14 deletions iron/operators/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,48 @@
# SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from .elementwise_add.op import ElementwiseAdd
from .elementwise_mul.op import ElementwiseMul
from .gemm.op import GEMM
from .gemv.op import GEMV
from .mha.op import MHA
from .rms_norm.op import RMSNorm
from .rope.op import RoPE
from .silu.op import SiLU
from .softmax.op import Softmax
from .swiglu_decode.op import SwiGLUDecode
from .swiglu_prefill.op import SwiGLUPrefill
from .transpose.op import Transpose
from .strided_copy.op import StridedCopy
from .repeat.op import Repeat
"""IRON operators.

Operator classes are exposed lazily (PEP 562): they are imported on first
attribute access rather than eagerly at package import time. This lets the
package be imported on a host without the NPU runtime (XRT/pyxrt), while ``from
iron.operators import GEMM`` continues to work unchanged when the runtime is
available.
"""

import importlib

# Public operator name -> defining submodule (relative to this package).
# The module-level __getattr__ looks names up in this table and imports the
# submodule on first access; __dir__ advertises the same names. Each key must
# match the attribute name defined inside its submodule.
_OPERATORS = {
    "ElementwiseAdd": ".elementwise_add.op",
    "ElementwiseMul": ".elementwise_mul.op",
    "GEMM": ".gemm.op",
    "GEMV": ".gemv.op",
    "MHA": ".mha.op",
    "RMSNorm": ".rms_norm.op",
    "RoPE": ".rope.op",
    "SiLU": ".silu.op",
    "Softmax": ".softmax.op",
    "SwiGLUDecode": ".swiglu_decode.op",
    "SwiGLUPrefill": ".swiglu_prefill.op",
    "SwiGLUPrefillStream": ".swiglu_prefill_stream.op",
    "Transpose": ".transpose.op",
    "StridedCopy": ".strided_copy.op",
    "Repeat": ".repeat.op",
}

# The public API is exactly the lazily exposed operator names, in table order.
__all__ = list(_OPERATORS)


def __getattr__(name: str):
    """Resolve a public operator class on first attribute access (PEP 562).

    Names listed in ``_OPERATORS`` are imported from their defining submodule
    and cached in the module globals; any other name raises ``AttributeError``.
    """
    if name not in _OPERATORS:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    submodule = importlib.import_module(_OPERATORS[name], __name__)
    operator_cls = getattr(submodule, name)
    # Store the resolved class on the module so later lookups bypass this hook.
    globals()[name] = operator_cls
    return operator_cls


def __dir__():
    """Return the sorted module globals merged with every lazily exposed operator name."""
    names = set(globals())
    names.update(_OPERATORS)
    return sorted(names)
53 changes: 53 additions & 0 deletions iron/operators/swiglu_prefill_stream/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<!--
SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
SPDX-License-Identifier: Apache-2.0
-->

# SwiGLU prefill (stream-dse codegen)

This operator is **fused**: the whole SwiGLU-prefill block (both GEMMs + SiLU +
elementwise-mul) is emitted as a **single MLIR design generated by
[`stream-dse`](https://github.com/KULeuven-MICAS/stream)**, then compiled by IRON's normal
flow into one xclbin. Unlike the other operators, its MLIR is not written by hand — it is
produced at build time by [`stream_design.py`](./stream_design.py), which calls the installed
`stream` package (`stream.api.optimize_allocation_co(..., enable_codegen=True)`).

## Enabling stream codegen

`stream-dse` is an **optional, separately-installed** dependency (it is *not* in IRON's
`requirements.txt`). Install it into the **same environment** as IRON via the extra
requirements file:

```bash
pip install -r requirements_stream.txt
stream-setup-aie # required: installs stream-dse's AIE codegen deps
```

Notes:
- MLIR generation uses the open-source **OR-Tools GSCIP** solver (`backend="ortools_gscip"`),
so **no Gurobi license** is required.
- `stream-setup-aie` is **required**: it installs the AIE codegen packages stream-dse needs
that cannot be plain PyPI dependencies (`snax-mlir`/`snaxc`, `xdsl-aie`, `aie-python-extras`),
since they are direct git/URL installs. It also installs the `mlir_aie` / `llvm-aie` wheels,
but skips those if IRON's `requirements.txt` already provided them.
- Importing the operator does **not** require `stream-dse` (the launcher is imported lazily);
only **building** (`operator.compile()` / running the test) does.

## Build & run

```bash
# build + run on an NPU2 (Strix) device
source /opt/xilinx/xrt/setup.sh # XRT on PATH (provides pyxrt + xclbinutil)
pytest iron/operators/swiglu_prefill_stream/test.py
```

The feasible/verified shape is **seq 256 / embedding 512 / hidden 2048**, tiles
**32 / 32 / 64**, target **npu2**.

## Caveats (stream-dse packaging)

- The hardware-description YAML (`whole_array_strix.yaml` + `hardware/cores/*.yaml`) is
resolved from the **installed `stream` package**, where it ships as package data
(stream-dse >= 1.13.3); nothing is vendored in this operator.
- `stream-dse` writes its generated ONNX workload / mapping YAML **into its installed package
directory**, so the environment in which `stream-dse` is installed must be writable.
Loading