Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ __pycache__
build/*
**/_build/**
**/build/**
/outputs/
*.exe
*.csv
secret_github_token
Expand Down
12 changes: 11 additions & 1 deletion conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,17 @@ def pytest_configure(config):


def pytest_collection_modifyitems(config, items):
device = aie_utils.DefaultNPURuntime.device().resolve().name
# Resolve the active NPU device for device-gating. On a host without an NPU
# runtime, aie_utils.DefaultNPURuntime is None; in that case there is no
# device to gate against, so skip the supported_devices filtering entirely.
# Hermetic, NPU-free tests (no supported_devices marker) are unaffected.
runtime = aie_utils.DefaultNPURuntime
if runtime is None:
return
npu_device = runtime.device()
if npu_device is None:
return
device = npu_device.resolve().name
for item in items:
marker = item.get_closest_marker("supported_devices")
if marker and device not in marker.args:
Expand Down
52 changes: 52 additions & 0 deletions demos/swiglu_prefill_stream/demo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Minimal demo: build and run the stream-dse-backed fused SwiGLU-prefill operator.

stream-dse generates one fused MLIR design for the whole SwiGLU-prefill block;
IRON compiles it to an xclbin and runs it once on the NPU. Requires stream-dse
(see requirements_stream.txt) and an npu2 device.

python demos/swiglu_prefill_stream/demo.py
"""

import time

import torch
from ml_dtypes import bfloat16

from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor
from iron.operators.swiglu_prefill_stream.op import SwiGLUPrefillStream

SEQ_LEN, EMBEDDING_DIM, HIDDEN_DIM = 256, 512, 2048


def rand_bf16(*shape: int) -> XRTTensor:
    """Return an ``XRTTensor`` wrapping a random bfloat16 torch tensor of ``shape``.

    The values are drawn with ``torch.randn`` and handed to
    ``XRTTensor.from_torch`` unchanged.
    """
    host_tensor = torch.randn(*shape, dtype=torch.bfloat16)
    return XRTTensor.from_torch(host_tensor)


def main() -> None:
    """Compile the SwiGLU-prefill stream operator, run it twice, and print the second run's time.

    The first invocation is a warmup and is excluded from the measurement; the
    second is timed with ``time.perf_counter`` and reported in microseconds.
    """
    operator = SwiGLUPrefillStream(
        seq_len=SEQ_LEN, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM
    )
    operator.compile()
    kernel = operator.get_callable()

    # Random inputs are created in the order x, w_gate, w_up, w_down so the
    # torch RNG stream is consumed in a fixed sequence.
    activations = rand_bf16(SEQ_LEN, EMBEDDING_DIM)
    gate_weights = rand_bf16(EMBEDDING_DIM, HIDDEN_DIM)
    up_weights = rand_bf16(EMBEDDING_DIM, HIDDEN_DIM)
    down_weights = rand_bf16(HIDDEN_DIM, EMBEDDING_DIM)
    # Output buffer is allocated flat: SEQ_LEN * EMBEDDING_DIM elements.
    result = XRTTensor((SEQ_LEN * EMBEDDING_DIM,), dtype=bfloat16)
    call_args = (activations, gate_weights, up_weights, down_weights, result)

    kernel(*call_args)  # warmup
    begin = time.perf_counter()
    kernel(*call_args)
    finish = time.perf_counter()
    elapsed_us = (finish - begin) * 1e6
    print(
        f"SwiGLU-prefill {SEQ_LEN}x{EMBEDDING_DIM}x{HIDDEN_DIM} ran in {elapsed_us:.1f} us"
    )


if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions iron/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@
PythonGeneratedMLIRArtifact,
DesignGenerator,
)
from .layout import Stride, TiledStride, TiledStridedLayout, tiled_2d
107 changes: 107 additions & 0 deletions iron/common/layout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# SPDX-FileCopyrightText: Copyright (C) 2026 KU Leuven (MICAS). All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""Tiled-strided memory layouts for IRON operators.

A tiled-strided layout describes how a logical multi-dimensional tensor is laid
out in memory as a hierarchy of tiles, each level carrying its own ``(step,
bound)`` stride. It is the layout model AIE kernels are written against: a GEMM
microkernel, for example, reads its ``MxK`` operand as ``mt x kt`` tiles of
``r x s`` elements, which is exactly a two-level tiled-strided layout.

The types here mirror ``snaxc.ir.tsl`` (``Stride`` -> ``TiledStride`` ->
``TiledStridedLayout``) so an IRON-authored layout can be handed to stream-dse's
code generation verbatim via :meth:`TiledStridedLayout.to_snaxc`. They carry no
stream-dse / snaxc / xdsl dependency themselves -- the snaxc import is lazy and
confined to ``to_snaxc`` -- so they are usable (and testable) in a plain IRON
install with no AIE codegen toolchain present.

This is a common primitive: it is meant to be shared across operators as the one
place a kernel's operand layouts are defined, rather than re-derived per operator
or hand-copied into stream-dse.
"""

from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class Stride:
    """A single stride level of a tiled-strided layout.

    A level is a pair ``(step, bound)``: ``bound`` elements, each ``step``
    apart. Either field may be ``None`` to denote a dynamic (run-time) value,
    matching snaxc's convention. Instances are immutable.
    """

    # Distance between consecutive elements of this level (None = dynamic).
    step: int | None
    # Number of elements in this level (None = dynamic).
    bound: int | None


@dataclass
class TiledStride:
    """All stride levels of one tensor dimension, ordered outermost tile first.

    An untiled dimension carries a single stride. Each extra level of tiling
    adds one more: the outer tile stride comes first and the innermost element
    stride comes last.
    """

    strides: tuple[Stride, ...]

    def __post_init__(self) -> None:
        # Accept any iterable (e.g. a list) and store it as a tuple.
        normalized = tuple(self.strides)
        self.strides = normalized


@dataclass
class TiledStridedLayout:
    """A tiled-strided layout made of one :class:`TiledStride` per tensor dimension.

    ``offset`` defaults to 0 and is forwarded unchanged to snaxc by
    :meth:`to_snaxc`.
    """

    tstrides: tuple[TiledStride, ...]
    offset: int = 0

    def __post_init__(self) -> None:
        # Accept any iterable of per-dimension strides and store it as a tuple.
        normalized = tuple(self.tstrides)
        self.tstrides = normalized

    def to_snaxc(self):
        """Build the equivalent ``snaxc.ir.tsl.TiledStridedLayout``.

        ``snaxc`` is imported inside this method, so importing this module does
        not require it; calling this method does. Each stride level is
        converted to a snaxc ``Stride`` with the same ``step`` and ``bound``,
        and ``offset`` is passed through.
        """
        from snaxc.ir.tsl import (
            Stride as SnaxStride,
            TiledStride as SnaxTiledStride,
            TiledStridedLayout as SnaxTiledStridedLayout,
        )

        snax_dims = []
        for dim in self.tstrides:
            snax_levels = [SnaxStride(level.step, level.bound) for level in dim.strides]
            snax_dims.append(SnaxTiledStride(snax_levels))
        return SnaxTiledStridedLayout(snax_dims, offset=self.offset)


def tiled_2d(rows: int, cols: int, row_unit: int, col_unit: int) -> TiledStridedLayout:
    """Two-level tiled-strided layout for a ``rows x cols`` tensor.

    The tensor is tiled into ``(rows // row_unit) x (cols // col_unit)`` tiles of
    ``row_unit x col_unit`` elements, the tiles laid out row-major and each tile
    stored row-major internally. This reproduces stream-dse's GEMM/elementwise
    operand layouts (the intrinsic ``row_unit``/``col_unit`` are the kernel's MAC
    tile dimensions).

    Args:
        rows: Number of rows of the logical tensor; must be a multiple of
            ``row_unit``.
        cols: Number of columns of the logical tensor; must be a multiple of
            ``col_unit``.
        row_unit: Rows per tile; must be positive.
        col_unit: Columns per tile; must be positive.

    Returns:
        A layout with two dimensions (rows, then columns), each carrying an
        outer tile stride followed by an inner element stride.

    Raises:
        ValueError: If a tile unit is not positive, or if ``rows``/``cols`` is
            not an exact multiple of its tile unit.
    """
    if row_unit <= 0 or col_unit <= 0:
        raise ValueError(
            f"tile units must be positive, got row_unit={row_unit}, col_unit={col_unit}"
        )
    # Floor division would silently drop the trailing partial tile and describe
    # a smaller tensor than the one requested, so reject such shapes up front.
    if rows % row_unit or cols % col_unit:
        raise ValueError(
            f"tensor shape {rows}x{cols} is not a multiple of the "
            f"{row_unit}x{col_unit} tile"
        )

    rows_t, cols_t = rows // row_unit, cols // col_unit
    tile_elems = row_unit * col_unit
    # Rows: step over a full row of tiles, then over one row inside a tile.
    row_dim = TiledStride(
        (
            Stride(tile_elems * cols_t, rows_t),
            Stride(col_unit, row_unit),
        )
    )
    # Columns: step over one tile, then over one element inside a tile row.
    col_dim = TiledStride((Stride(tile_elems, cols_t), Stride(1, col_unit)))
    return TiledStridedLayout((row_dim, col_dim))
24 changes: 23 additions & 1 deletion iron/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,29 @@
# SPDX-License-Identifier: Apache-2.0

import numpy as np
from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor, xrt as _pyxrt

try:
    # pyxrt only exists on hosts with the NPU runtime (XRT) installed. The
    # import is guarded so that pure-MLIR / introspection code paths (and their
    # tests) can import iron.* without an NPU. XRTSubBuffer below only needs
    # these symbols when actually instantiated, which requires the NPU anyway.
    from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor, xrt as _pyxrt
except ImportError:  # pragma: no cover - exercised only when XRT is absent
    _pyxrt = None

    class XRTTensor:  # type: ignore[no-redef]
        """Stand-in for the real ``XRTTensor`` when XRT/pyxrt cannot be imported.

        Importing this module succeeds; constructing the class raises
        ``ImportError`` so that attempted NPU work without XRT fails loudly.
        """

        def __init__(self, *_args, **_kwargs):
            message = (
                "XRTTensor requires pyxrt/XRT, which is not installed. "
                "NPU runtime operations are unavailable in this environment."
            )
            raise ImportError(message)


def get_shim_dma_limit(dev) -> int:
Expand Down
59 changes: 45 additions & 14 deletions iron/operators/__init__.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,48 @@
# SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from .elementwise_add.op import ElementwiseAdd
from .elementwise_mul.op import ElementwiseMul
from .gemm.op import GEMM
from .gemv.op import GEMV
from .mha.op import MHA
from .rms_norm.op import RMSNorm
from .rope.op import RoPE
from .silu.op import SiLU
from .softmax.op import Softmax
from .swiglu_decode.op import SwiGLUDecode
from .swiglu_prefill.op import SwiGLUPrefill
from .transpose.op import Transpose
from .strided_copy.op import StridedCopy
from .repeat.op import Repeat
"""IRON operators.

Operator classes are exposed lazily (PEP 562): they are imported on first
attribute access rather than eagerly at package import time. This lets the
package be imported on a host without the NPU runtime (XRT/pyxrt), while ``from
iron.operators import GEMM`` continues to work unchanged when the runtime is
available.
"""

import importlib

# Public operator name -> defining submodule (relative to this package).
_OPERATORS = {
"ElementwiseAdd": ".elementwise_add.op",
"ElementwiseMul": ".elementwise_mul.op",
"GEMM": ".gemm.op",
"GEMV": ".gemv.op",
"MHA": ".mha.op",
"RMSNorm": ".rms_norm.op",
"RoPE": ".rope.op",
"SiLU": ".silu.op",
"Softmax": ".softmax.op",
"SwiGLUDecode": ".swiglu_decode.op",
"SwiGLUPrefill": ".swiglu_prefill.op",
"SwiGLUPrefillStream": ".swiglu_prefill_stream.op",
"Transpose": ".transpose.op",
"StridedCopy": ".strided_copy.op",
"Repeat": ".repeat.op",
}

__all__ = list(_OPERATORS)


def __getattr__(name: str):
    """Resolve a public operator class lazily on first attribute access (PEP 562).

    Names listed in ``_OPERATORS`` are imported from their defining submodule
    and then stored in this module's globals, so later lookups bypass this hook.

    Raises:
        AttributeError: If ``name`` is not a known operator.
    """
    if name not in _OPERATORS:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    # The stored paths are relative, so resolve them against this package.
    defining_module = importlib.import_module(_OPERATORS[name], package=__name__)
    operator_cls = getattr(defining_module, name)
    globals()[name] = operator_cls  # cache so subsequent access skips __getattr__
    return operator_cls


def __dir__():
    """Return the sorted module globals plus every lazily-exposed operator name."""
    names = set(globals())
    names.update(_OPERATORS)
    return sorted(names)
53 changes: 53 additions & 0 deletions iron/operators/swiglu_prefill_stream/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
<!--
SPDX-FileCopyrightText: Copyright (C) 2026 Advanced Micro Devices, Inc. All rights reserved.
SPDX-License-Identifier: Apache-2.0
-->

# SwiGLU prefill (stream-dse codegen)

This operator is **fused**: the whole SwiGLU-prefill block (the gate, up and down GEMMs + SiLU +
elementwise-mul) is emitted as a **single MLIR design generated by
[`stream-dse`](https://github.com/KULeuven-MICAS/stream)**, then compiled by IRON's normal
flow into one xclbin. Unlike the other operators, its MLIR is not written by hand — it is
produced at build time by [`stream_design.py`](./stream_design.py), which calls the installed
`stream` package (`stream.api.optimize_allocation_co(..., enable_codegen=True)`).

## Enabling stream codegen

`stream-dse` is an **optional, separately-installed** dependency (it is *not* in IRON's
`requirements.txt`). Install it into the **same environment** as IRON via the extra
requirements file:

```bash
pip install -r requirements_stream.txt
stream-setup-aie # required: installs stream-dse's AIE codegen deps
```

Notes:
- MLIR generation uses the open-source **OR-Tools GSCIP** solver (`backend="ortools_gscip"`),
so **no Gurobi license** is required.
- `stream-setup-aie` is **required**: it installs the AIE codegen packages stream-dse needs
that cannot be plain PyPI dependencies (`snax-mlir`/`snaxc`, `xdsl-aie`, `aie-python-extras`),
since they are direct git/URL installs. It also installs the `mlir_aie` / `llvm-aie` wheels,
but skips those if IRON's `requirements.txt` already provided them.
- Importing the operator does **not** require `stream-dse` (the launcher is imported lazily);
only **building** (`operator.compile()` / running the test) does.

## Build & run

```bash
# build + run on an NPU2 (Strix) device
source /opt/xilinx/xrt/setup.sh # XRT on PATH (provides pyxrt + xclbinutil)
pytest iron/operators/swiglu_prefill_stream/test.py
```

The feasible/verified shape is **seq 256 / embedding 512 / hidden 2048**, tiles
**32 / 32 / 64**, target **npu2**.

## Caveats (stream-dse packaging)

- The hardware-description YAML (`whole_array_strix.yaml` + `hardware/cores/*.yaml`) is
resolved from the **installed `stream` package**, where it ships as package data
(stream-dse >= 1.13.3); nothing is vendored in this operator.
- `stream-dse` writes its generated ONNX workload / mapping YAML **into its installed package
directory**, so that environment must be writable.
Loading
Loading