diff --git a/lib/bumblebee.ex b/lib/bumblebee.ex
index a191f5bf..7806a2f8 100644
--- a/lib/bumblebee.ex
+++ b/lib/bumblebee.ex
@@ -192,6 +192,8 @@ defmodule Bumblebee do
     "Qwen3Model" => {Bumblebee.Text.Qwen3, :base},
     "Qwen3ForCausalLM" => {Bumblebee.Text.Qwen3, :for_causal_language_modeling},
     "Qwen3ForSequenceClassification" => {Bumblebee.Text.Qwen3, :for_sequence_classification},
+    "Qwen3VLForConditionalGeneration" =>
+      {Bumblebee.Multimodal.Qwen3VL, :for_conditional_generation},
     "ResNetForImageClassification" => {Bumblebee.Vision.ResNet, :for_image_classification},
     "ResNetModel" => {Bumblebee.Vision.ResNet, :base},
     "RobertaForMaskedLM" => {Bumblebee.Text.Roberta, :for_masked_language_modeling},
@@ -242,12 +244,14 @@ defmodule Bumblebee do
 
   @transformers_image_processor_type_to_featurizer %{
     "BlipImageProcessor" => Bumblebee.Vision.BlipFeaturizer,
-    "BitImageProcessor" => Bumblebee.Vision.BitFeaturizer
+    "BitImageProcessor" => Bumblebee.Vision.BitFeaturizer,
+    "Qwen3VLImageProcessor" => Bumblebee.Vision.Qwen3VLFeaturizer
   }
 
   @model_type_to_featurizer %{
     "convnext" => Bumblebee.Vision.ConvNextFeaturizer,
     "deit" => Bumblebee.Vision.DeitFeaturizer,
+    "qwen3_vl" => Bumblebee.Vision.Qwen3VLFeaturizer,
     "resnet" => Bumblebee.Vision.ConvNextFeaturizer,
     "vit" => Bumblebee.Vision.VitFeaturizer,
     "whisper" => Bumblebee.Audio.WhisperFeaturizer
@@ -274,7 +278,9 @@ defmodule Bumblebee do
     "mpnet" => :mpnet,
     "phi" => :code_gen,
     "phi3" => :llama,
+    "qwen2_vl" => :qwen2,
     "qwen3" => :qwen2,
+    "qwen3_vl" => :qwen2,
     "roberta" => :roberta,
     "smollm3" => :smollm3,
     "t5" => :t5,
diff --git a/lib/bumblebee/layers/transformer.ex b/lib/bumblebee/layers/transformer.ex
index 188b0ffe..8f009251 100644
--- a/lib/bumblebee/layers/transformer.ex
+++ b/lib/bumblebee/layers/transformer.ex
@@ -75,6 +75,7 @@ defmodule Bumblebee.Layers.Transformer do
             :num_blocks,
             :rotary_embedding,
             :attention_window_size,
+            :post_block_hook,
             attention_mask: Layers.none(),
             attention_head_mask: Layers.none(),
             attention_relative_bias: nil,
@@ -97,6 +98,7 @@ defmodule Bumblebee.Layers.Transformer do
     cache = opts[:cache]
     rotary_embedding = opts[:rotary_embedding]
     attention_window_size = opts[:attention_window_size]
+    post_block_hook = opts[:post_block_hook]
 
     block_opts = Keyword.take(opts, block_opts_keys)
 
@@ -160,6 +162,14 @@ defmodule Bumblebee.Layers.Transformer do
               ] ++ block_opts
             )
 
+          # Apply post-block hook if provided (e.g., for DeepStack feature injection)
+          hidden_state =
+            if post_block_hook do
+              post_block_hook.(idx, hidden_state)
+            else
+              hidden_state
+            end
+
           cache = Layers.Decoder.put_block_cache(state.cache, idx, block_cache)
 
           %{
diff --git a/lib/bumblebee/multimodal/image_text_to_text.ex b/lib/bumblebee/multimodal/image_text_to_text.ex
new file mode 100644
index 00000000..a789c37a
--- /dev/null
+++ b/lib/bumblebee/multimodal/image_text_to_text.ex
@@ -0,0 +1,247 @@
+defmodule Bumblebee.Multimodal.ImageTextToText do
+  @moduledoc """
+  Generation helpers for vision-language models like Qwen3-VL.
+
+  Two entry points:
+
+    * `generate/6` — one-shot call. Featurizes, expands the prompt
+      placeholder, and runs generation. Each call recompiles the graph
+      when the image or sequence length changes, so it suits
+      interactive use.
+
+    * `compile/5` + `run/3` — compile the generation graph **once** for
+      upper-bound shapes, then run repeatedly with images of varying
+      sizes. The featurizer pads `pixel_values` and `image_grid_thw` to
+      the configured maxima, and the vision encoder excludes padded
+      patches from attention via `patch_valid`.
+  """
+
+  alias Bumblebee.Text
+
+  @placeholder "<|image_pad|>"
+
+  @doc """
+  Generates text from a prompt that includes a `<|image_pad|>` marker
+  and an image.
+
+  ## Required arguments
+
+    * `model_info` - a loaded `Bumblebee.Multimodal.Qwen3VL` (or compatible)
+      model
+    * `featurizer` - a configured `Bumblebee.Vision.Qwen3VLFeaturizer`
+    * `tokenizer` - a loaded tokenizer for the same model
+    * `generation_config` - a `Bumblebee.Text.GenerationConfig`
+    * `text` - the user prompt containing exactly one `<|image_pad|>` marker
+    * `image` - an image tensor or `t:StbImage.t/0`
+
+  ## Returns
+
+      %{text: "<generated text>", token_ids: [...]}
+
+  ## Example
+
+      {:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-VL-2B-Instruct"})
+      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-VL-2B-Instruct"})
+
+      {:ok, featurizer} =
+        Bumblebee.load_featurizer({:hf, "Qwen/Qwen3-VL-2B-Instruct"},
+          module: Bumblebee.Vision.Qwen3VLFeaturizer
+        )
+
+      featurizer = Bumblebee.configure(featurizer, quality: :low)
+      {:ok, gen_config} = Bumblebee.load_generation_config({:hf, "Qwen/Qwen3-VL-2B-Instruct"})
+      gen_config = Bumblebee.configure(gen_config, max_new_tokens: 64)
+
+      Bumblebee.Multimodal.ImageTextToText.generate(
+        model_info, featurizer, tokenizer, gen_config,
+        "<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>What is in this image?<|im_end|>\\n<|im_start|>assistant\\n",
+        image
+      )
+  """
+  def generate(
+        model_info,
+        featurizer,
+        tokenizer,
+        %Text.GenerationConfig{} = generation_config,
+        text,
+        image
+      ) do
+    %{model: model, params: params, spec: spec} = model_info
+
+    # Only multimodal specs expose the image placeholder token id.
+    if not Map.has_key?(spec, :image_token_id) do
+      raise ArgumentError,
+            "expected a multimodal model with :image_token_id, got #{inspect(spec.__struct__)}"
+    end
+
+    # A spec without a nested vision merge size is treated as unmerged (1).
+    merge_size =
+      case spec do
+        %{vision_spec: %{spatial_merge_size: factor}} -> factor
+        _other -> 1
+      end
+
+    image_inputs = Bumblebee.apply_featurizer(featurizer, image)
+    placeholder_count = visual_tokens_for(image_inputs["image_grid_thw"], merge_size)
+    prompt = expand_marker(text, placeholder_count)
+
+    tokenizer = Bumblebee.configure(tokenizer, return_token_type_ids: false)
+    encoded = Bumblebee.apply_tokenizer(tokenizer, prompt)
+    # The generation seed is taken from the current system time.
+    seed = Nx.tensor([:erlang.system_time()], type: :s64)
+    inputs = encoded |> Map.merge(image_inputs) |> Map.put("seed", seed)
+
+    generate_fun = Text.Generation.build_generate(model, spec, generation_config)
+    %{token_ids: token_ids} = generate_fun.(params, inputs)
+
+    # Every batch row is decoded, but only the first one is returned as text.
+    decoded =
+      for row <- Nx.to_batched(token_ids, 1) do
+        Bumblebee.Tokenizer.decode(tokenizer, Nx.to_flat_list(row))
+      end
+
+    %{text: hd(decoded), token_ids: token_ids}
+  end
+
+  @doc """
+  Compiles the generation graph once for the given upper-bound shapes.
+
+  The returned struct can be passed to `run/3` repeatedly. Calls with
+  images that produce fewer than `:max_patches` real patches or
+  shorter than `:sequence_length` prompts are padded; the vision
+  encoder masks the padded positions out of attention.
+
+  ## Options
+
+    * `:max_patches` (required) — upper bound on total patches across
+      all images in one call. Must be a multiple of `merge_size ** 2`.
+    * `:max_num_images` (required) — upper bound on number of images
+      per call.
+    * `:sequence_length` (required) — upper bound on token count
+      (prompt + generated).
+  """
+  def compile(
+        model_info,
+        featurizer,
+        tokenizer,
+        %Text.GenerationConfig{} = generation_config,
+        opts
+      ) do
+    opts = Keyword.validate!(opts, [:max_patches, :max_num_images, :sequence_length])
+
+    # All three bounds are required; a missing one raises `KeyError`.
+    [max_patches, max_num_images, sequence_length] =
+      Enum.map([:max_patches, :max_num_images, :sequence_length], &Keyword.fetch!(opts, &1))
+
+    %{model: model, params: params, spec: spec} = model_info
+
+    if not Map.has_key?(spec, :image_token_id) do
+      raise ArgumentError,
+            "expected a multimodal model with :image_token_id, got #{inspect(spec.__struct__)}"
+    end
+
+    merge_size = spec.vision_spec.spatial_merge_size
+
+    # The featurizer is given the patch and image upper bounds.
+    bounded_featurizer =
+      Bumblebee.configure(featurizer, max_patches: max_patches, max_num_images: max_num_images)
+
+    # Prompts are left-padded to the fixed sequence length.
+    bounded_tokenizer =
+      Bumblebee.configure(tokenizer,
+        length: sequence_length,
+        pad_direction: :left,
+        return_token_type_ids: false
+      )
+
+    generate_fun = Text.Generation.build_generate(model, spec, generation_config)
+
+    %{
+      generate_fun: generate_fun,
+      params: params,
+      spec: spec,
+      featurizer: bounded_featurizer,
+      tokenizer: bounded_tokenizer,
+      merge_size: merge_size,
+      max_patches: max_patches,
+      max_num_images: max_num_images,
+      sequence_length: sequence_length
+    }
+  end
+
+  @doc """
+  Runs a prompt + image through a pre-compiled generator from `compile/5`.
+
+  EXLA caches the compiled graph by input shape; since the featurizer
+  pads to the upper bounds configured in `compile/5`, every call hits
+  the same cached graph.
+  """
+  def run(compiled, text, image) do
+    %{
+      generate_fun: generate_fun,
+      params: params,
+      featurizer: featurizer,
+      tokenizer: tokenizer,
+      merge_size: merge_size
+    } = compiled
+
+    image_inputs = Bumblebee.apply_featurizer(featurizer, image)
+
+    # Padding rows are dropped first so they add no placeholder tokens.
+    placeholder_count =
+      image_inputs["image_grid_thw"] |> unpad_grid_thw() |> visual_tokens_for(merge_size)
+
+    encoded = Bumblebee.apply_tokenizer(tokenizer, expand_marker(text, placeholder_count))
+
+    # The generation seed is taken from the current system time.
+    seed = Nx.tensor([:erlang.system_time()], type: :s64)
+    inputs = encoded |> Map.merge(image_inputs) |> Map.put("seed", seed)
+
+    %{token_ids: token_ids} = generate_fun.(params, inputs)
+
+    # Every batch row is decoded, but only the first one is returned as text.
+    decoded =
+      for row <- Nx.to_batched(token_ids, 1) do
+        Bumblebee.Tokenizer.decode(tokenizer, Nx.to_flat_list(row))
+      end
+
+    %{text: hd(decoded), token_ids: token_ids}
+  end
+
+  # Filters out all-zero padding rows ([0, 0, 0]) so the visual token count
+  # reflects real images only; an all-padding grid becomes one zero row.
+  defp unpad_grid_thw(grid_thw) do
+    real_rows =
+      Enum.filter(Nx.to_list(grid_thw), fn [t, h, w] -> t != 0 or h != 0 or w != 0 end)
+    if real_rows == [] do
+      Nx.tensor([[0, 0, 0]], type: :s64)
+    else
+      Nx.tensor(real_rows, type: :s64)
+    end
+  end
+
+  defp expand_marker(text, visual_tokens) do
+    case String.split(text, @placeholder) do
+      [head, tail] ->
+        # Exactly one marker: repeat it once per visual token.
+        IO.iodata_to_binary([head, String.duplicate(@placeholder, visual_tokens), tail])
+
+      [_no_marker] ->
+        raise ArgumentError,
+              "the prompt must contain a #{@placeholder} marker where the image " <>
+                "embedding should be spliced in, got: #{inspect(text)}"
+
+      _several ->
+        raise ArgumentError, "expected exactly one #{@placeholder} marker in the prompt"
+    end
+  end
+
+  defp visual_tokens_for(grid_thw, merge_size) do
+    # Each [t, h, w] row contributes t * div(h, merge_size) * div(w, merge_size)
+    # tokens; the per-row counts are summed.
+    rows = Nx.to_list(grid_thw)
+    Enum.reduce(rows, 0, fn [t, h, w], total ->
+      total + t * div(h, merge_size) * div(w, merge_size)
+    end)
+  end
+end
diff --git a/lib/bumblebee/multimodal/qwen3_vl.ex b/lib/bumblebee/multimodal/qwen3_vl.ex
new file mode 100644
index 00000000..c847ef37
--- /dev/null
+++ b/lib/bumblebee/multimodal/qwen3_vl.ex
@@ -0,0 +1,565 @@
+defmodule Bumblebee.Multimodal.Qwen3VL do
+  alias Bumblebee.Shared
+
+  options =
+    [
+      image_token_id: [
+        default: 151_655,
+        doc: "the token ID used to represent images in the input sequence"
+      ],
+      video_token_id: [
+        default: 151_656,
+        doc: "the token ID used to represent videos in the input sequence"
+      ],
+      vision_start_token_id: [
+        default: 151_652,
+        doc: "the token ID marking the start of visual content"
+      ],
+      vision_end_token_id: [
+        default: 151_653,
+        doc: "the token ID marking the end of visual content"
+      ]
+    ] ++ Shared.common_options([:output_hidden_states, :output_attentions])
+
+  @moduledoc """
+  Qwen3-VL model for vision-language tasks.
+
+  ## Architectures
+
+    * `:for_conditional_generation` - Qwen3-VL with a language modeling
+      head for image/video-to-text generation
+
+  ## Inputs
+
+    * `"pixel_values"` - `{num_patches, flattened_patch_size}`
+
+      Concatenated, pre-extracted image/video patches from the featurizer.
+      Shape is `{num_patches, channels * temporal_patch_size * patch_size * patch_size}`.
+
+    * `"image_grid_thw"` - `{num_images, 3}`
+
+      Per-image grid dimensions `[temporal, height, width]` in patch
+      units. Threaded into the vision encoder so it can compute correct
+      per-patch positions for variable image sizes and multiple images
+      per prompt.
+
+    * `"input_ids"` - `{batch_size, sequence_length}`
+
+      Indices of input sequence tokens in the vocabulary. Should contain
+      special image/video tokens at positions where visual content appears.
+
+    * `"attention_mask"` - `{batch_size, sequence_length}`
+
+      Mask indicating which tokens to attend to.
+
+  ## Global layer options
+
+  #{Shared.global_layer_options_doc([:output_hidden_states, :output_attentions])}
+
+  ## Configuration
+
+  #{Shared.options_doc(options)}
+
+  ## References
+
+    * [Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct)
+
+  """
+
+  defstruct [architecture: :for_conditional_generation, vision_spec: nil, text_spec: nil] ++
+              Shared.option_defaults(options)
+
+  @behaviour Bumblebee.ModelSpec
+  @behaviour Bumblebee.Configurable
+  @behaviour Bumblebee.Text.Generation
+
+  alias Bumblebee.Layers
+
+  @impl true
+  def architectures(), do: [:for_conditional_generation]
+
+  @impl true
+  def config(spec, opts) do
+    Shared.put_config_attrs(spec, opts)
+  end
+
+  @impl true
+  def input_template(%{vision_spec: vision_spec}) do
+    # A "pixel_values" row is one flattened patch holding, for every channel,
+    # temporal_patch_size frames of patch_size x patch_size pixels.
+    pixels_per_frame = vision_spec.patch_size * vision_spec.patch_size
+
+    values_per_patch =
+      vision_spec.num_channels * vision_spec.temporal_patch_size * pixels_per_frame
+
+    # Template for one image of 196 patches (a 14x14 patch grid) and a
+    # single-token prompt.
+    template_patches = 196
+
+    %{
+      "pixel_values" => Nx.template({template_patches, values_per_patch}, :f32),
+      "image_grid_thw" => Nx.template({1, 3}, :s64),
+      "input_ids" => Nx.template({1, 1}, :u32)
+    }
+  end
+
+  @impl true
+  def init_cache(%{text_spec: text_spec}, batch_size, max_length, inputs) do
+    text_spec.__struct__.init_cache(text_spec, batch_size, max_length, inputs)
+  end
+
+  @impl true
+  def traverse_cache(_spec, cache, fun) do
+    Layers.Decoder.traverse_cache(cache, fun)
+  end
+
+  @impl true
+  def model(%__MODULE__{architecture: :for_conditional_generation} = spec) do
+    inputs = inputs(spec)
+
+    vision_model =
+      Bumblebee.build_model(spec.vision_spec)
+      |> Bumblebee.Utils.Axon.prefix_names("vision_model.")
+      |> Bumblebee.Utils.Axon.plug_inputs(%{
+        "pixel_values" => inputs["pixel_values"],
+        "image_grid_thw" => inputs["image_grid_thw"]
+      })
+
+    # Vision encoder hidden state, used only when "pixel_values" is present
+    vision_hidden_state =
+      Layers.if_present inputs["pixel_values"] do
+        Axon.nx(vision_model, & &1.hidden_state)
+      else
+        Layers.none()
+      end
+
+    # DeepStack features: the deepstack_hidden_states field of the vision
+    # encoder output, likewise used only when "pixel_values" is present
+    deepstack_features =
+      Layers.if_present inputs["pixel_values"] do
+        Axon.nx(vision_model, & &1.deepstack_hidden_states)
+      else
+        Layers.none()
+      end
+
+    # Embed "input_ids", swapping in vision embeddings at image/video token positions
+    input_embeddings =
+      substitute_visual_embeddings(
+        inputs["input_ids"],
+        vision_hidden_state,
+        spec,
+        name: "embed_substitute"
+      )
+
+    # Mask of positions whose token id is the image or video placeholder id
+    visual_mask =
+      Layers.if_present inputs["pixel_values"] do
+        Axon.nx(inputs["input_ids"], fn ids ->
+          image_mask = Nx.equal(ids, spec.image_token_id)
+          video_mask = Nx.equal(ids, spec.video_token_id)
+          Nx.logical_or(image_mask, video_mask)
+        end)
+      else
+        Layers.none()
+      end
+
+    # Text decoder; DeepStack features are injected through its post-block hook
+    text_outputs =
+      text_decoder_with_deepstack(
+        input_embeddings,
+        inputs["attention_mask"],
+        inputs["position_ids"],
+        inputs["cache"],
+        deepstack_features,
+        visual_mask,
+        spec,
+        name: "text_model"
+      )
+
+    Layers.output(%{
+      logits: text_outputs.logits,
+      cache: text_outputs.cache,
+      hidden_states: text_outputs.hidden_states,
+      attentions: text_outputs.attentions
+    })
+  end
+
+  defp inputs(spec) do
+    vision = spec.vision_spec
+
+    # Each "pixel_values" row is one flattened patch of
+    # channels * temporal_patch_size * patch_size * patch_size values; the
+    # number of rows is left dynamic.
+    values_per_patch =
+      vision.num_channels * vision.temporal_patch_size * vision.patch_size * vision.patch_size
+
+    patches_shape = {nil, values_per_patch}
+
+    # Text inputs have a dynamic batch size and sequence length.
+    tokens_shape = {nil, nil}
+    embeddings_shape = {nil, nil, spec.text_spec.hidden_size}
+
+    # Only "input_ids" is mandatory; every other input is optional.
+    Bumblebee.Utils.Model.inputs_to_map([
+      Axon.input("pixel_values", optional: true, shape: patches_shape),
+      Axon.input("image_grid_thw", optional: true, shape: {nil, 3}),
+      Axon.input("input_ids", shape: tokens_shape),
+      Axon.input("attention_mask", optional: true, shape: tokens_shape),
+      Axon.input("position_ids", optional: true, shape: tokens_shape),
+      Axon.input("input_embeddings", optional: true, shape: embeddings_shape),
+      Axon.input("cache", optional: true)
+    ])
+  end
+
+  defp substitute_visual_embeddings(input_ids, vision_hidden_state, spec, _opts) do
+    # Embed the token ids; the trailing options argument (incl. :name) is ignored
+    token_embeddings =
+      Axon.embedding(input_ids, spec.text_spec.vocab_size, spec.text_spec.hidden_size,
+        name: "text_model.embedder.token_embedding"
+      )
+
+    # With a vision hidden state, swap visual embeddings in at image/video
+    # token positions; without one, return the token embeddings unchanged
+    Layers.if_present vision_hidden_state do
+      Axon.layer(
+        fn token_embeds, visual_embeds, input_ids, _opts ->
+          # Positions holding the image or video placeholder token id
+          image_mask = Nx.equal(input_ids, spec.image_token_id)
+          video_mask = Nx.equal(input_ids, spec.video_token_id)
+          visual_mask = Nx.logical_or(image_mask, video_mask)
+
+          # Assumes visual_embeds is {batch, num_visual_tokens, hidden_size} and
+          # visual_mask is {batch, seq_len} - TODO confirm against the vision encoder.
+          # Simplified substitution: embeddings are consumed in sequence order, with
+          # no special handling of differing visual token counts per sequence
+          substitute_at_mask(token_embeds, visual_embeds, visual_mask)
+        end,
+        [token_embeddings, vision_hidden_state, input_ids]
+      )
+    else
+      # No vision hidden state: plain token embeddings
+      token_embeddings
+    end
+  end
+
+  # Replaces token embeddings with visual embeddings wherever `mask` is set.
+  #
+  # Shapes, as destructured and indexed below:
+  #
+  #   * token_embeds  - {batch, seq_len, hidden}
+  #   * visual_embeds - {batch, num_visual, hidden}
+  #   * mask          - {batch, seq_len}, set at image/video token positions
+  #
+  # Within each row, the k-th masked position (0-based) receives visual
+  # embedding k. For example, the mask [0, 0, 1, 1, 1, 0, 0] has the running
+  # sum [0, 0, 1, 2, 3, 3, 3], which yields the indices [-, -, 0, 1, 2, -, -].
+  #
+  # Returns a tensor with the shape of token_embeds.
+  defp substitute_at_mask(token_embeds, visual_embeds, mask) do
+    {_batch, _seq_len, _hidden} = full_shape = Nx.shape(token_embeds)
+    {_, visual_count, _} = Nx.shape(visual_embeds)
+
+    # Running count of masked positions along the sequence axis, shifted to
+    # be 0-based. Unmasked positions end up with meaningless indices, which
+    # the final select discards. Clipping keeps every index inside
+    # 0..visual_count - 1, so a row with more masked positions than visual
+    # embeddings reuses the last embedding for the surplus positions.
+    embedding_index =
+      mask
+      |> Nx.as_type(:s32)
+      |> Nx.cumulative_sum(axis: 1)
+      |> Nx.subtract(1)
+      |> Nx.clip(0, visual_count - 1)
+
+    # take_along_axis needs one index per output element, so the per-position
+    # index is repeated across the hidden axis.
+    gather_index =
+      embedding_index
+      |> Nx.new_axis(-1)
+      |> Nx.broadcast(full_shape)
+
+    gathered = Nx.take_along_axis(visual_embeds, gather_index, axis: 1)
+
+    # The mask is stretched over the hidden axis in the same way.
+    position_mask =
+      mask
+      |> Nx.new_axis(-1)
+      |> Nx.broadcast(full_shape)
+
+    # Visual embedding where the mask is set, token embedding elsewhere.
+    Nx.select(position_mask, gathered, token_embeds)
+  end
+
+  # Builds the text decoder (transformer blocks, final norm and language
+  # modeling head) directly, so DeepStack injection can use :post_block_hook
+  defp text_decoder_with_deepstack(
+         embeddings,
+         attention_mask,
+         position_ids,
+         cache,
+         deepstack_features,
+         visual_mask,
+         spec,
+         opts
+       ) do
+    name = opts[:name]
+    text_spec = spec.text_spec
+
+    import Bumblebee.Utils.Model, only: [join: 2]
+
+    # Fall back to default position ids when "position_ids" is not given
+    position_ids =
+      Layers.default position_ids do
+        Layers.default_position_ids(embeddings)
+      end
+
+    # Query/key RMS normalization, enabled only when the text spec uses QK-norm
+    query_norm =
+      if text_spec.use_qk_norm do
+        &Layers.rms_norm(&1, epsilon: text_spec.layer_norm_epsilon, channel_index: -1, name: &2)
+      end
+
+    key_norm =
+      if text_spec.use_qk_norm do
+        &Layers.rms_norm(&1, epsilon: text_spec.layer_norm_epsilon, channel_index: -1, name: &2)
+      end
+
+    # Decoder blocks 0, 1 and 2 receive DeepStack features.
+    # NOTE(review): said to mirror the reference implementation, where the vision
+    # encoder taps layers [5, 11, 17] (1-indexed) - confirm against the vision spec
+    deepstack_injection_layers = MapSet.new([0, 1, 2])
+
+    # Hook run by Layers.Transformer.blocks after each block. It returns the
+    # hidden state unchanged for all other blocks, and also when the optional
+    # deepstack_features input is absent
+    post_block_hook = fn layer_idx, hidden_state ->
+      if MapSet.member?(deepstack_injection_layers, layer_idx) do
+        # Add this block's DeepStack feature at visual token positions
+        Layers.if_present deepstack_features do
+          Axon.layer(
+            fn hidden, ds_features, mask, _opts ->
+              inject_deepstack_features(hidden, ds_features, mask, layer_idx)
+            end,
+            [hidden_state, deepstack_features, visual_mask],
+            name: join(name, "deepstack_inject.#{layer_idx}")
+          )
+        else
+          hidden_state
+        end
+      else
+        hidden_state
+      end
+    end
+
+    # Decoder blocks, with the hook applied after every block
+    decoder_outputs =
+      Layers.Transformer.blocks(embeddings,
+        num_blocks: text_spec.num_blocks,
+        num_attention_heads: text_spec.num_attention_heads,
+        num_key_value_heads: text_spec.num_key_value_heads,
+        hidden_size: text_spec.hidden_size,
+        attention_head_size: text_spec.attention_head_size,
+        kernel_initializer: Axon.Initializers.normal(scale: text_spec.initializer_scale),
+        query_use_bias: false,
+        key_use_bias: false,
+        value_use_bias: false,
+        output_use_bias: false,
+        block_type: :norm_first,
+        attention_mask: attention_mask,
+        cache: cache,
+        causal: true,
+        layer_norm: &Layers.rms_norm(&1, epsilon: text_spec.layer_norm_epsilon, name: &2),
+        ffn:
+          &gated_ffn(&1, text_spec.intermediate_size, text_spec.hidden_size,
+            name: &2,
+            activation: text_spec.activation,
+            initializer_scale: text_spec.initializer_scale
+          ),
+        rotary_embedding: [
+          position_ids: position_ids,
+          max_positions: text_spec.max_positions,
+          base: text_spec.rotary_embedding_base,
+          scaling_strategy: text_spec.rotary_embedding_scaling_strategy
+        ],
+        query_norm: query_norm,
+        key_norm: key_norm,
+        post_block_hook: post_block_hook,
+        name: join(name, "decoder.blocks")
+      )
+
+    # Final RMS norm over the last block's hidden state
+    hidden_state =
+      Layers.rms_norm(decoder_outputs.hidden_state,
+        name: join(name, "output_norm"),
+        epsilon: text_spec.layer_norm_epsilon
+      )
+
+    # Language modeling head projecting to vocab_size logits
+    logits =
+      Layers.dense_transposed(hidden_state, text_spec.vocab_size,
+        kernel_initializer: Axon.Initializers.normal(scale: text_spec.initializer_scale),
+        name: join(name, "language_modeling_head.output")
+      )
+
+    %{
+      logits: logits,
+      hidden_states: Layers.append(decoder_outputs.hidden_states, hidden_state),
+      attentions: decoder_outputs.attentions,
+      cache: decoder_outputs.cache
+    }
+  end
+
+  # Adds the DeepStack feature selected by layer_idx to the hidden state at
+  # visual token positions: hidden_states[visual_mask] += deepstack_features[layer_idx]
+  defp inject_deepstack_features(hidden_state, deepstack_features_tuple, visual_mask, layer_idx) do
+    # deepstack_features_tuple holds one feature tensor per injection layer.
+    # Shapes, as destructured and indexed below:
+    #
+    #   * hidden_state      - {batch, seq_len, hidden}
+    #   * visual_mask       - {batch, seq_len}
+    #   * deepstack_feature - {batch, num_visual, hidden}
+    deepstack_feature = elem(deepstack_features_tuple, layer_idx)
+
+    {_batch, _seq_len, _hidden} = full_shape = Nx.shape(hidden_state)
+    {_, num_visual, _} = Nx.shape(deepstack_feature)
+
+    # The k-th masked position of a row (0-based) reads feature row k. Indices
+    # at unmasked positions are meaningless and are discarded by the select
+    # below; clipping keeps every index inside 0..num_visual - 1.
+    visual_indices =
+      visual_mask
+      |> Nx.as_type(:s32)
+      |> Nx.cumulative_sum(axis: 1)
+      |> Nx.subtract(1)
+      |> Nx.clip(0, num_visual - 1)
+      |> Nx.new_axis(-1)
+      |> Nx.broadcast(full_shape)
+
+    gathered_features = Nx.take_along_axis(deepstack_feature, visual_indices, axis: 1)
+
+    mask_expanded = visual_mask |> Nx.new_axis(-1) |> Nx.broadcast(full_shape)
+
+    # Select between the updated and the untouched hidden state instead of
+    # adding a masked term built from an f32 `Nx.tensor(0.0)` constant: that
+    # constant promoted half-precision hidden states to f32 inside this layer.
+    # The result type now follows hidden_state and the feature tensor only.
+    Nx.select(mask_expanded, Nx.add(hidden_state, gathered_features), hidden_state)
+  end
+
+  # Gated feed-forward network of the text decoder. Computes
+  # output(intermediate(x) * activation(gate(x))) from three bias-free dense
+  # projections named "intermediate", "gate" and "output" under opts[:name].
+  defp gated_ffn(hidden_state, intermediate_size, output_size, opts) do
+    import Bumblebee.Utils.Model, only: [join: 2]
+
+    name = opts[:name]
+    activation = opts[:activation]
+    kernel_initializer = Axon.Initializers.normal(scale: opts[:initializer_scale])
+
+    # The projections differ only in input, width and name suffix.
+    project = fn input, units, suffix ->
+      Axon.dense(input, units,
+        kernel_initializer: kernel_initializer,
+        name: join(name, suffix),
+        use_bias: false
+      )
+    end
+
+    # Both branches read the same input and project it to intermediate_size.
+    intermediate = project.(hidden_state, intermediate_size, "intermediate")
+    gate = project.(hidden_state, intermediate_size, "gate")
+
+    # The activated gate scales the intermediate branch element-wise.
+    gated = Axon.multiply(intermediate, Axon.activation(gate, activation))
+
+    # The gated activations are projected to output_size by the last
+    # dense layer.
+    project.(gated, output_size, "output")
+  end
+
+  defimpl Bumblebee.HuggingFace.Transformers.Config do
+    def load(spec, data) do
+      import Shared.Converters
+
+      # Top-level token ids for the image/video placeholders and vision markers.
+      opts =
+        convert!(data,
+          image_token_id: {"image_token_id", number()},
+          video_token_id: {"video_token_id", number()},
+          vision_start_token_id: {"vision_start_token_id", number()},
+          vision_end_token_id: {"vision_end_token_id", number()}
+        )
+
+      # The text spec is read from "text_config" (or the top level when that
+      # key is missing) and loaded first, since its hidden size is needed below.
+      text_spec =
+        Bumblebee.Text.Qwen3
+        |> Bumblebee.configure(architecture: :for_causal_language_modeling)
+        |> Bumblebee.HuggingFace.Transformers.Config.load(Map.get(data, "text_config", data))
+
+      # "out_hidden_size" defaults to the text hidden size; a value already
+      # present in "vision_config" is kept.
+      default_vision_config = %{"out_hidden_size" => text_spec.hidden_size}
+
+      vision_data =
+        Map.update(data, "vision_config", default_vision_config, fn vision_config ->
+          Map.put_new(vision_config, "out_hidden_size", text_spec.hidden_size)
+        end)
+
+      vision_spec =
+        Bumblebee.Vision.Qwen3VLVision
+        |> Bumblebee.configure()
+        |> Bumblebee.HuggingFace.Transformers.Config.load(vision_data)
+
+      @for.config(
+        %{spec | vision_spec: vision_spec, text_spec: text_spec},
+        opts
+      )
+    end
+  end
+
+  defimpl Bumblebee.HuggingFace.Transformers.Model do
+    def params_mapping(spec) do
+      vision_mapping =
+        spec.vision_spec
+        |> Bumblebee.HuggingFace.Transformers.Model.params_mapping()
+        |> Map.new(fn {bumblebee, hf} -> {"vision_model.#{bumblebee}", hf} end)
+
+      # With tied word embeddings the head reuses the token embedding weights.
+      tied? = spec.text_spec.tie_word_embeddings
+      head_source = if tied?, do: "language_model.embed_tokens", else: "language_model.lm_head"
+
+      # Qwen3-VL keeps its text weights under `model.language_model.*` rather than
+      # Qwen3's `model.*`. The loader infers and prepends the "model." prefix, so
+      # the targets below start at "language_model.".
+      text_mapping = %{
+        "text_model.embedder.token_embedding" => "language_model.embed_tokens",
+        "text_model.decoder.blocks.{n}.self_attention.query" =>
+          "language_model.layers.{n}.self_attn.q_proj",
+        "text_model.decoder.blocks.{n}.self_attention.key" =>
+          "language_model.layers.{n}.self_attn.k_proj",
+        "text_model.decoder.blocks.{n}.self_attention.value" =>
+          "language_model.layers.{n}.self_attn.v_proj",
+        "text_model.decoder.blocks.{n}.self_attention.output" =>
+          "language_model.layers.{n}.self_attn.o_proj",
+        "text_model.decoder.blocks.{n}.self_attention.query_norm" =>
+          "language_model.layers.{n}.self_attn.q_norm",
+        "text_model.decoder.blocks.{n}.self_attention.key_norm" =>
+          "language_model.layers.{n}.self_attn.k_norm",
+        "text_model.decoder.blocks.{n}.self_attention_norm" =>
+          "language_model.layers.{n}.input_layernorm",
+        "text_model.decoder.blocks.{n}.ffn.gate" => "language_model.layers.{n}.mlp.gate_proj",
+        "text_model.decoder.blocks.{n}.ffn.intermediate" =>
+          "language_model.layers.{n}.mlp.up_proj",
+        "text_model.decoder.blocks.{n}.ffn.output" => "language_model.layers.{n}.mlp.down_proj",
+        "text_model.decoder.blocks.{n}.output_norm" =>
+          "language_model.layers.{n}.post_attention_layernorm",
+        "text_model.output_norm" => "language_model.norm",
+        "text_model.language_modeling_head.output" => head_source
+      }
+
+      Map.merge(vision_mapping, text_mapping)
+    end
+  end
+end
diff --git a/lib/bumblebee/vision/qwen3_vl_featurizer.ex b/lib/bumblebee/vision/qwen3_vl_featurizer.ex
new file mode 100644
index 00000000..77446eed
--- /dev/null
+++ b/lib/bumblebee/vision/qwen3_vl_featurizer.ex
@@ -0,0 +1,420 @@
+defmodule Bumblebee.Vision.Qwen3VLFeaturizer do
+  alias Bumblebee.Shared
+
+  options = [
+    resize: [
+      default: true,
+      doc: "whether to resize images via the smart-resize algorithm"
+    ],
+    resize_method: [
+      default: :bicubic,
+      doc:
+        "the resizing method, either of `:nearest`, `:bilinear`, `:bicubic`, `:lanczos3`, `:lanczos5`"
+    ],
+    normalize: [
+      default: true,
+      doc: "whether or not to normalize the input with mean and standard deviation"
+    ],
+    image_mean: [
+      default: [0.5, 0.5, 0.5],
+      doc: "the sequence of mean values for each channel, to be used when normalizing images"
+    ],
+    image_std: [
+      default: [0.5, 0.5, 0.5],
+      doc:
+        "the sequence of standard deviations for each channel, to be used when normalizing images"
+    ],
+    patch_size: [
+      default: 16,
+      doc: "the spatial patch size"
+    ],
+    temporal_patch_size: [
+      default: 2,
+      doc: "the temporal patch size for video frames"
+    ],
+    merge_size: [
+      default: 2,
+      doc: "the merge factor for spatial patches"
+    ],
+    quality: [
+      default: :medium,
+      doc: """
+      preset controlling the `:min_pixels` / `:max_pixels` caps used by smart-resize.
+      One of `:low` (~256 visual tokens), `:medium` (~1280), or `:high` (16384).
+      Ignored if `:min_pixels` and `:max_pixels` are both set explicitly.
+      """
+    ],
+    min_pixels: [
+      default: nil,
+      doc: """
+      explicit minimum total pixels after smart-resize. Overrides the `:quality`
+      preset when set.
+      """
+    ],
+    max_pixels: [
+      default: nil,
+      doc: """
+      explicit maximum total pixels after smart-resize. Overrides the `:quality`
+      preset when set.
+      """
+    ],
+    max_patches: [
+      default: nil,
+      doc: """
+      when set, pads `pixel_values` along the patches axis to this size with
+      zeros. Required for compile-once-and-pad serving of variable-size
+      images. Must be a multiple of `merge_size ** 2`.
+      """
+    ],
+    max_num_images: [
+      default: nil,
+      doc: """
+      when set, pads `image_grid_thw` to this many rows with `[0, 0, 0]`.
+      Required alongside `:max_patches` for compile-once-and-pad serving.
+      """
+    ]
+  ]
+
+  @moduledoc """
+  Qwen3-VL featurizer for image and video data.
+
+  Accepts a single image, a list of images, or a `%{video: [frame, ...]}`
+  map. When given multiple images they are concatenated into a single
+  flat sequence of patches; per-image grid dimensions are returned as
+  `image_grid_thw`.
+
+  ## Quality profiles
+
+  Smart-resize caps the total number of pixels passed through the
+  patchifier. The `:quality` preset is a convenience over the explicit
+  `:min_pixels` / `:max_pixels` keys:
+
+    * `:low` — ~256 visual tokens per image (fastest, lowest detail)
+    * `:medium` — ~1280 visual tokens per image (default)
+    * `:high` — up to 16384 visual tokens per image (full Qwen ceiling)
+
+  Set `:min_pixels` and/or `:max_pixels` to override the preset.
+
+  ## Configuration
+
+  #{Shared.options_doc(options)}
+  """
+
+  defstruct Shared.option_defaults(options)
+
+  @behaviour Bumblebee.Featurizer
+  @behaviour Bumblebee.Configurable
+
+  alias Bumblebee.Utils.Image
+
+  @impl true
+  def config(featurizer, opts) do
+    Shared.put_config_attrs(featurizer, opts)
+  end
+
+  @impl true
+  def process_input(featurizer, input) do
+    factor = featurizer.patch_size * featurizer.merge_size
+    {min_pixels, max_pixels} = resolve_pixel_bounds(featurizer, factor)
+
+    per_image =
+      for image_or_video <- normalize_input(input) do
+        process_one(featurizer, image_or_video, min_pixels, max_pixels, factor)
+      end
+
+    pixel_values =
+      per_image
+      |> Enum.map(& &1.pixel_values)
+      |> Nx.concatenate(axis: 0)
+
+    image_grid_thw =
+      per_image
+      |> Enum.map(& &1.grid_thw)
+      |> Nx.stack()
+
+    {pixel_values, image_grid_thw} =
+      maybe_pad_to_max(pixel_values, image_grid_thw, featurizer)
+
+    %{
+      "pixel_values" => pixel_values,
+      "image_grid_thw" => image_grid_thw
+    }
+  end
+
+  defp maybe_pad_to_max(pixel_values, image_grid_thw, featurizer) do
+    pixel_values = maybe_pad_patches(pixel_values, featurizer)
+    image_grid_thw = maybe_pad_grid_thw(image_grid_thw, featurizer)
+    {pixel_values, image_grid_thw}
+  end
+
+  # Zero-pads `pixel_values` along axis 0 up to `:max_patches` rows (no-op when
+  # it is `nil`). Raises `ArgumentError` when `:max_patches` is not a multiple
+  # of `merge_size ** 2` or is smaller than the current number of rows.
+  defp maybe_pad_patches(pixel_values, %{max_patches: nil}), do: pixel_values
+
+  defp maybe_pad_patches(pixel_values, %{max_patches: limit, merge_size: merge_size}) do
+    {rows, columns} = Nx.shape(pixel_values)
+    block = merge_size * merge_size
+
+    cond do
+      rem(limit, block) != 0 ->
+        raise ArgumentError,
+              ":max_patches (#{limit}) must be a multiple of merge_size**2 " <>
+                "(= #{block})"
+
+      rows > limit ->
+        raise ArgumentError,
+              "featurizer produced #{rows} patches but :max_patches is " <>
+                "#{limit}; raise :max_patches or lower :quality / :max_pixels"
+
+      rows == limit ->
+        pixel_values
+
+      true ->
+        zero = Nx.tensor(0.0, type: Nx.type(pixel_values))
+        Nx.concatenate([pixel_values, Nx.broadcast(zero, {limit - rows, columns})], axis: 0)
+    end
+  end
+
+  defp maybe_pad_grid_thw(image_grid_thw, %{max_num_images: nil}), do: image_grid_thw
+
+  defp maybe_pad_grid_thw(image_grid_thw, featurizer) do
+    {num_images, 3} = Nx.shape(image_grid_thw)
+    max_num_images = featurizer.max_num_images
+
+    if num_images > max_num_images do
+      raise ArgumentError,
+            "got #{num_images} images but :max_num_images is #{max_num_images}"
+    end
+
+    pad_rows = max_num_images - num_images
+
+    if pad_rows == 0 do
+      image_grid_thw
+    else
+      padding = Nx.broadcast(Nx.tensor(0, type: Nx.type(image_grid_thw)), {pad_rows, 3})
+      Nx.concatenate([image_grid_thw, padding], axis: 0)
+    end
+  end
+
+  defp normalize_input(input) when is_list(input), do: input
+  defp normalize_input(%{image: _} = input), do: [input]
+  defp normalize_input(%{video: _} = input), do: [input]
+  defp normalize_input(input), do: [%{image: input}]
+
+  defp process_one(featurizer, %{video: frames}, min_pixels, max_pixels, factor)
+       when is_list(frames) do
+    process_frames(featurizer, frames, min_pixels, max_pixels, factor)
+  end
+
+  defp process_one(featurizer, %{image: image}, min_pixels, max_pixels, factor) do
+    process_frames(featurizer, [image], min_pixels, max_pixels, factor)
+  end
+
+  defp process_one(featurizer, image, min_pixels, max_pixels, factor) do
+    process_frames(featurizer, [image], min_pixels, max_pixels, factor)
+  end
+
+  defp process_frames(featurizer, frames, min_pixels, max_pixels, factor) do
+    num_channels = length(featurizer.image_mean)
+
+    batched_frames =
+      Enum.map(frames, fn frame ->
+        frame
+        |> Image.to_batched_tensor()
+        |> Nx.as_type(:f32)
+        |> Image.normalize_channels(num_channels)
+      end)
+
+    [first | _] = batched_frames
+    {1, height, width, _} = Nx.shape(first)
+
+    {target_h, target_w} =
+      if featurizer.resize do
+        smart_resize(height, width, min_pixels, max_pixels, factor)
+      else
+        h = max(factor, round_to_multiple(height, factor))
+        w = max(factor, round_to_multiple(width, factor))
+        {h, w}
+      end
+
+    mean = Nx.tensor(featurizer.image_mean)
+    std = Nx.tensor(featurizer.image_std)
+
+    processed_frames =
+      Enum.map(batched_frames, fn frame ->
+        frame
+        |> NxImage.resize({target_h, target_w}, method: featurizer.resize_method)
+        |> NxImage.to_continuous(0, 1)
+        |> maybe_normalize(featurizer, mean, std)
+        |> Nx.squeeze(axes: [0])
+      end)
+
+    stacked = Nx.stack(processed_frames)
+    {stacked, temporal} = ensure_temporal(stacked, featurizer.temporal_patch_size)
+
+    patches_t = div(temporal, featurizer.temporal_patch_size)
+    patches_h = div(target_h, featurizer.patch_size)
+    patches_w = div(target_w, featurizer.patch_size)
+
+    pixel_values = window_patchify(stacked, featurizer, patches_t, patches_h, patches_w)
+
+    %{
+      pixel_values: pixel_values,
+      grid_thw: Nx.tensor([patches_t, patches_h, patches_w], type: :s64)
+    }
+  end
+
+  defp maybe_normalize(images, %{normalize: false}, _mean, _std), do: images
+  defp maybe_normalize(images, _, mean, std), do: NxImage.normalize(images, mean, std)
+
+  # Pads the frame axis up to the next multiple of `temporal_patch_size`
+  # by repeating the last frame, so that no input frame is dropped (the
+  # reference Hugging Face processor pads the same way). A single frame
+  # is therefore repeated until it fills one temporal patch.
+  #
+  # `stacked` must have rank 4 with frames on axis 0 (it is destructured
+  # as a 4-tuple below). Returns `{frames, num_frames}`, where
+  # `num_frames` is the frame count after padding.
+  defp ensure_temporal(stacked, temporal_patch_size) do
+    {temporal, _, _, _} = Nx.shape(stacked)
+
+    # Integer ceiling of temporal / temporal_patch_size, scaled back up.
+    target =
+      div(temporal + temporal_patch_size - 1, temporal_patch_size) * temporal_patch_size
+
+    if target == temporal do
+      {stacked, temporal}
+    else
+      last = stacked[(temporal - 1)..(temporal - 1)//1]
+      pad = Nx.tile(last, [target - temporal, 1, 1, 1])
+      {Nx.concatenate([stacked, pad], axis: 0), target}
+    end
+  end
+
+  # Arranges patches in "windowed" order so that every group of
+  # merge_size * merge_size consecutive patches forms a contiguous
+  # spatial merge block. This lets the vision encoder's patch merger
+  # reshape {N, hidden} -> {N/merge^2, merge^2 * hidden} without
+  # needing to know per-image grid dimensions.
+  defp window_patchify(stacked, featurizer, patches_t, patches_h, patches_w) do
+    {_temporal, _height, _width, channels} = Nx.shape(stacked)
+    patch_size = featurizer.patch_size
+    temporal_patch_size = featurizer.temporal_patch_size
+    merge_size = featurizer.merge_size
+    merged_h = div(patches_h, merge_size)
+    merged_w = div(patches_w, merge_size)
+
+    stacked
+    |> Nx.reshape({
+      patches_t,
+      temporal_patch_size,
+      merged_h,
+      merge_size,
+      patch_size,
+      merged_w,
+      merge_size,
+      patch_size,
+      channels
+    })
+    |> Nx.transpose(axes: [0, 2, 5, 3, 6, 8, 1, 4, 7])
+    |> Nx.reshape({
+      patches_t * merged_h * merged_w * merge_size * merge_size,
+      channels * temporal_patch_size * patch_size * patch_size
+    })
+  end
+
+  # Picks a `{height, width}` target made of multiples of `factor`, rescaled to fit
+  # the pixel bounds. Raises `ArgumentError` for aspect ratios above 200.
+  defp smart_resize(height, width, min_pixels, max_pixels, factor) do
+    aspect = max(height, width) / min(height, width)
+    if aspect > 200 do
+      raise ArgumentError,
+            "image aspect ratio is #{Float.round(aspect, 2)}, " <>
+              "which exceeds the supported limit of 200"
+    end
+
+    rounded_h = max(factor, round_to_multiple(height, factor))
+    rounded_w = max(factor, round_to_multiple(width, factor))
+    rounded_area = rounded_h * rounded_w
+
+    cond do
+      rounded_area > max_pixels ->
+        shrink = :math.sqrt(height * width / max_pixels)
+        scaled_h = floor_to_multiple(height / shrink, factor)
+        scaled_w = floor_to_multiple(width / shrink, factor)
+        {max(factor, scaled_h), max(factor, scaled_w)}
+
+      rounded_area < min_pixels ->
+        grow = :math.sqrt(min_pixels / (height * width))
+        {ceil_to_multiple(height * grow, factor), ceil_to_multiple(width * grow, factor)}
+
+      true ->
+        {rounded_h, rounded_w}
+    end
+  end
+
+  # Rounding helpers: each snaps `value` onto a multiple of `factor`.
+
+  # Nearest multiple (the quotient goes through `round/1`).
+  defp round_to_multiple(value, factor), do: round(value / factor) * factor
+
+  # Multiple at or below a non-negative `value`; `trunc/1` rounds the
+  # quotient towards zero.
+  defp floor_to_multiple(value, factor), do: trunc(value / factor) * factor
+
+  # Smallest multiple that is not below `value`.
+  defp ceil_to_multiple(value, factor), do: trunc(Float.ceil(value / factor)) * factor
+
+  # Resolves `{min_pixels, max_pixels}` for smart-resize. Explicit bounds win;
+  # the `:quality` preset is only consulted (and validated) to fill a missing one.
+  defp resolve_pixel_bounds(featurizer, factor) do
+    %{min_pixels: explicit_min, max_pixels: explicit_max} = featurizer
+
+    {min_pixels, max_pixels} =
+      if explicit_min && explicit_max do
+        {explicit_min, explicit_max}
+      else
+        {preset_min, preset_max} = quality_pixel_bounds(featurizer.quality, factor * factor)
+        {explicit_min || preset_min, explicit_max || preset_max}
+      end
+
+    if min_pixels > max_pixels do
+      raise ArgumentError,
+            "min_pixels (#{min_pixels}) must not exceed max_pixels (#{max_pixels})"
+    end
+
+    {min_pixels, max_pixels}
+  end
+
+  # Preset bounds in pixels, expressed as multiples of `unit` (factor squared).
+  defp quality_pixel_bounds(:low, unit), do: {4 * unit, 256 * unit}
+  defp quality_pixel_bounds(:medium, unit), do: {4 * unit, 1280 * unit}
+  defp quality_pixel_bounds(:high, unit), do: {4 * unit, 16384 * unit}
+
+  defp quality_pixel_bounds(other, _unit) do
+    raise ArgumentError, "invalid :quality #{inspect(other)}, expected :low, :medium, or :high"
+  end
+
+  defimpl Bumblebee.HuggingFace.Transformers.Config do
+    def load(featurizer, data) do
+      import Shared.Converters
+
+      opts =
+        convert!(data,
+          resize: {"do_resize", boolean()},
+          resize_method: {"resample", resize_method()},
+          normalize: {"do_normalize", boolean()},
+          image_mean: {"image_mean", list(number())},
+          image_std: {"image_std", list(number())},
+          patch_size: {"patch_size", number()},
+          temporal_patch_size: {"temporal_patch_size", number()},
+          merge_size: {"merge_size", number()},
+          min_pixels: {"min_pixels", number()},
+          max_pixels: {"max_pixels", number()}
+        )
+
+      @for.config(featurizer, opts)
+    end
+  end
+end
diff --git a/lib/bumblebee/vision/qwen3_vl_vision.ex b/lib/bumblebee/vision/qwen3_vl_vision.ex
new file mode 100644
index 00000000..41c7fd8a
--- /dev/null
+++ b/lib/bumblebee/vision/qwen3_vl_vision.ex
@@ -0,0 +1,770 @@
+defmodule Bumblebee.Vision.Qwen3VLVision do
+  import Nx.Defn
+
+  alias Bumblebee.Shared
+
+  options =
+    [
+      hidden_size: [
+        default: 1024,
+        doc: "the dimensionality of hidden layers"
+      ],
+      num_blocks: [
+        default: 24,
+        doc: "the number of Transformer blocks in the encoder"
+      ],
+      num_attention_heads: [
+        default: 16,
+        doc: "the number of attention heads for each attention layer in the encoder"
+      ],
+      intermediate_size: [
+        default: 4096,
+        doc:
+          "the dimensionality of the intermediate layer in the transformer feed-forward network (FFN) in the encoder"
+      ],
+      num_channels: [
+        default: 3,
+        doc: "the number of channels in the input"
+      ],
+      patch_size: [
+        default: 16,
+        doc: "the size of the patch spatial dimensions"
+      ],
+      temporal_patch_size: [
+        default: 2,
+        doc: "the size of the patch temporal dimension (for video)"
+      ],
+      spatial_merge_size: [
+        default: 2,
+        doc: "the factor by which to merge spatial patches"
+      ],
+      out_hidden_size: [
+        default: 2048,
+        doc: "the output dimensionality after patch merger"
+      ],
+      num_position_embeddings: [
+        default: 2304,
+        doc: "the number of learned absolute position embeddings (a square grid)"
+      ],
+      deepstack_visual_indexes: [
+        default: [5, 11, 17],
+        doc:
+          "the encoder layer indices from which to extract DeepStack features (0-indexed, matching HuggingFace's `enumerate(self.blocks)`)"
+      ],
+      activation: [
+        default: :gelu_approx_tanh,
+        doc: "the activation function"
+      ],
+      layer_norm_epsilon: [
+        default: 1.0e-6,
+        doc: "the epsilon used by the layer normalization layers"
+      ],
+      rotary_embedding_base: [
+        default: 10_000,
+        doc: "base for computing rotary embedding frequency"
+      ],
+      initializer_scale: [
+        default: 0.02,
+        doc:
+          "the standard deviation of the normal initializer used for initializing kernel parameters"
+      ]
+    ]
+
+  @moduledoc """
+  The Qwen3-VL vision encoder for processing images and video frames.
+
+  Patches arrive from the featurizer in windowed order: every group of
+  `spatial_merge_size ** 2` consecutive patches forms a contiguous spatial
+  merge block. Combined with the per-image `image_grid_thw` tensor, this
+  encoder supports a variable number of images of varying sizes in a
+  single forward pass.
+
+  ## Architectures
+
+    * `:base` - the base vision encoder model
+
+  ## Inputs
+
+    * `"pixel_values"` - `{num_patches, num_channels * temporal_patch_size * patch_size * patch_size}`
+
+      Concatenated, pre-extracted image/video patches from the featurizer.
+
+    * `"image_grid_thw"` - `{num_images, 3}`
+
+      Per-image grid dimensions `[temporal, height, width]` in patch
+      units, used to derive per-patch row/column positions for the
+      learned bilinear position embedding and the 2D rotary embedding.
+
+  ## Global layer options
+
+  #{Shared.global_layer_options_doc([:output_hidden_states, :output_attentions])}
+
+  ## Configuration
+
+  #{Shared.options_doc(options)}
+  """
+
+  defstruct [architecture: :base] ++ Shared.option_defaults(options)
+
+  @behaviour Bumblebee.ModelSpec
+  @behaviour Bumblebee.Configurable
+
+  import Bumblebee.Utils.Model, only: [join: 2]
+
+  alias Bumblebee.Layers
+
+  @impl true
+  def architectures(), do: [:base]
+
+  @impl true
+  def config(spec, opts) do
+    Shared.put_config_attrs(spec, opts)
+  end
+
+  @impl true
+  def input_template(spec) do
+    patch_size = spec.patch_size
+    temporal_patch_size = spec.temporal_patch_size
+    flattened_patch_size = spec.num_channels * temporal_patch_size * patch_size * patch_size
+    # 14x14 grid from a 224x224 image with patch_size=16
+    num_patches = 196
+
+    %{
+      "pixel_values" => Nx.template({num_patches, flattened_patch_size}, :f32),
+      "image_grid_thw" => Nx.template({1, 3}, :s64)
+    }
+  end
+
+  @impl true
+  def model(%__MODULE__{architecture: :base} = spec) do
+    inputs = inputs(spec)
+
+    inputs
+    |> core(spec)
+    |> Layers.output()
+  end
+
+  defp inputs(spec) do
+    patch_size = spec.patch_size
+    temporal_patch_size = spec.temporal_patch_size
+    flattened_patch_size = spec.num_channels * temporal_patch_size * patch_size * patch_size
+
+    Bumblebee.Utils.Model.inputs_to_map([
+      Axon.input("pixel_values", shape: {nil, flattened_patch_size}),
+      Axon.input("image_grid_thw", shape: {nil, 3})
+    ])
+  end
+
+  defp core(inputs, spec) do
+    pixel_values = inputs["pixel_values"]
+    grid_thw = inputs["image_grid_thw"]
+
+    embeddings =
+      pixel_values
+      |> patch_embedding(spec, name: "patch_embed")
+      |> position_embedding(grid_thw, spec, name: "pos_embed")
+
+    encoder_outputs = encoder(embeddings, grid_thw, spec, name: "blocks")
+
+    hidden_state = patch_merger(encoder_outputs.hidden_state, spec, name: "merger")
+
+    %{
+      hidden_state: hidden_state,
+      hidden_states: encoder_outputs.hidden_states,
+      attentions: encoder_outputs.attentions,
+      deepstack_hidden_states: encoder_outputs.deepstack_hidden_states
+    }
+  end
+
+  defp patch_embedding(pixel_values, spec, opts) do
+    name = opts[:name]
+
+    # Input: {num_patches, channels * temporal_patch_size * patch_size * patch_size}
+    # PyTorch's Conv3d with kernel=stride=full_patch is equivalent to a dense projection
+    # over the flattened patch features. The kernel param keeps PyTorch's
+    # {out_channels, in_channels, t, h, w} layout for clean weight loading.
+    reshaped =
+      Axon.nx(pixel_values, fn x ->
+        {num_patches, _flat} = Nx.shape(x)
+
+        Nx.reshape(
+          x,
+          {num_patches, spec.num_channels, spec.temporal_patch_size, spec.patch_size,
+           spec.patch_size}
+        )
+      end)
+
+    kernel_param =
+      Axon.param(
+        "kernel",
+        fn _ ->
+          {spec.hidden_size, spec.num_channels, spec.temporal_patch_size, spec.patch_size,
+           spec.patch_size}
+        end,
+        initializer: kernel_initializer(spec)
+      )
+
+    bias_param =
+      Axon.param("bias", fn _ -> {spec.hidden_size} end, initializer: Axon.Initializers.zeros())
+
+    Axon.layer(
+      fn x, kernel, bias, _opts ->
+        {num_patches, c, t, h, w} = Nx.shape(x)
+        {hidden_size, _, _, _, _} = Nx.shape(kernel)
+
+        x_flat = Nx.reshape(x, {num_patches, c * t * h * w})
+        k_flat = kernel |> Nx.reshape({hidden_size, c * t * h * w}) |> Nx.transpose()
+
+        x_flat
+        |> Nx.dot(k_flat)
+        |> Nx.add(bias)
+      end,
+      [reshaped, kernel_param, bias_param],
+      name: join(name, "proj"),
+      op_name: :conv3d
+    )
+    |> Axon.nx(fn x -> Nx.new_axis(x, 0) end)
+  end
+
+  defp position_embedding(embeddings, grid_thw, spec, opts) do
+    name = opts[:name]
+
+    pos_embed_param =
+      Axon.param(
+        "weight",
+        fn _, _ -> {spec.num_position_embeddings, spec.hidden_size} end,
+        initializer: kernel_initializer(spec)
+      )
+
+    Axon.layer(
+      fn embed, grid_thw_t, pos_embed, _opts ->
+        bilinear_interpolated_position(embed, grid_thw_t, pos_embed, spec)
+      end,
+      [embeddings, grid_thw, pos_embed_param],
+      name: name,
+      op_name: :position_embedding
+    )
+  end
+
+  defp bilinear_interpolated_position(embed, grid_thw, pos_embed, spec) do
+    {_batch, total_patches, _hidden} = Nx.shape(embed)
+    src_grid_size = trunc(:math.sqrt(spec.num_position_embeddings))
+    merge_size = spec.spatial_merge_size
+
+    {row_in_image, col_in_image, grid_h_per_patch, grid_w_per_patch, _image_id, _patch_valid} =
+      patch_metadata(grid_thw, total_patches, merge_size)
+
+    src_max_f = Nx.tensor(src_grid_size - 1, type: :f32)
+
+    grid_h_minus_one = grid_h_per_patch |> Nx.subtract(1) |> Nx.max(1) |> Nx.as_type(:f32)
+    grid_w_minus_one = grid_w_per_patch |> Nx.subtract(1) |> Nx.max(1) |> Nx.as_type(:f32)
+
+    row_src_f =
+      row_in_image
+      |> Nx.as_type(:f32)
+      |> Nx.multiply(src_max_f)
+      |> Nx.divide(grid_h_minus_one)
+
+    col_src_f =
+      col_in_image
+      |> Nx.as_type(:f32)
+      |> Nx.multiply(src_max_f)
+      |> Nx.divide(grid_w_minus_one)
+
+    row_src_f = Nx.select(Nx.equal(grid_h_per_patch, 1), Nx.tensor(0.0), row_src_f)
+    col_src_f = Nx.select(Nx.equal(grid_w_per_patch, 1), Nx.tensor(0.0), col_src_f)
+
+    row_floor = row_src_f |> Nx.floor() |> Nx.as_type(:s32)
+    col_floor = col_src_f |> Nx.floor() |> Nx.as_type(:s32)
+    row_ceil = row_floor |> Nx.add(1) |> Nx.min(src_grid_size - 1)
+    col_ceil = col_floor |> Nx.add(1) |> Nx.min(src_grid_size - 1)
+
+    dh = Nx.subtract(row_src_f, Nx.as_type(row_floor, :f32))
+    dw = Nx.subtract(col_src_f, Nx.as_type(col_floor, :f32))
+
+    idx_ff = row_floor |> Nx.multiply(src_grid_size) |> Nx.add(col_floor)
+    idx_fc = row_floor |> Nx.multiply(src_grid_size) |> Nx.add(col_ceil)
+    idx_cf = row_ceil |> Nx.multiply(src_grid_size) |> Nx.add(col_floor)
+    idx_cc = row_ceil |> Nx.multiply(src_grid_size) |> Nx.add(col_ceil)
+
+    emb_ff = Nx.take(pos_embed, idx_ff, axis: 0)
+    emb_fc = Nx.take(pos_embed, idx_fc, axis: 0)
+    emb_cf = Nx.take(pos_embed, idx_cf, axis: 0)
+    emb_cc = Nx.take(pos_embed, idx_cc, axis: 0)
+
+    w_ff = dh |> Nx.subtract(1.0) |> Nx.negate() |> Nx.multiply(Nx.subtract(1.0, dw))
+    w_fc = dh |> Nx.subtract(1.0) |> Nx.negate() |> Nx.multiply(dw)
+    w_cf = Nx.multiply(dh, Nx.subtract(1.0, dw))
+    w_cc = Nx.multiply(dh, dw)
+
+    interpolated =
+      Nx.multiply(emb_ff, Nx.new_axis(w_ff, -1))
+      |> Nx.add(Nx.multiply(emb_fc, Nx.new_axis(w_fc, -1)))
+      |> Nx.add(Nx.multiply(emb_cf, Nx.new_axis(w_cf, -1)))
+      |> Nx.add(Nx.multiply(emb_cc, Nx.new_axis(w_cc, -1)))
+
+    Nx.add(embed, interpolated)
+  end
+
+  # Per-patch metadata derived from `image_grid_thw` (rows of `[t, h, w]`).
+  # Returns {row_in_image, col_in_image, grid_h_per_patch, grid_w_per_patch,
+  # image_id_per_patch, patch_valid}; every tensor has shape {total_patches}.
+  defp patch_metadata(grid_thw, total_patches, merge_size) do
+    grid_t = grid_thw[[.., 0]]
+    grid_h = grid_thw[[.., 1]]
+    grid_w = grid_thw[[.., 2]]
+
+    patches_per_image = grid_t |> Nx.multiply(grid_h) |> Nx.multiply(grid_w)
+    cumulative = Nx.cumulative_sum(patches_per_image)
+    exclusive_cumulative = Nx.subtract(cumulative, patches_per_image)
+    total_real_patches = Nx.sum(patches_per_image)
+
+    patch_indices = Nx.iota({total_patches}, type: :s64)
+
+    # Patches beyond total_real_patches are padding slots (featurizer option
+    # :max_patches); flag them so attention masking can exclude them.
+    patch_valid = Nx.less(patch_indices, total_real_patches)
+
+    image_id_raw =
+      patch_indices
+      |> Nx.new_axis(-1)
+      |> Nx.greater_equal(Nx.new_axis(cumulative, 0))
+      |> Nx.sum(axes: [-1])
+      |> Nx.as_type(:s64)
+
+    n_images = Nx.axis_size(grid_thw, 0)
+    # Padded patches would map to image_id == n_images (out of bounds); clip
+    # so gathers succeed. Their row/col/grid values are masked via `patch_valid`.
+    image_id_per_patch = Nx.clip(image_id_raw, 0, n_images - 1)
+
+    offset_per_patch = Nx.take(exclusive_cumulative, image_id_per_patch)
+    local_index = Nx.subtract(patch_indices, offset_per_patch)
+
+    grid_h_per_patch = Nx.take(grid_h, image_id_per_patch)
+    grid_w_per_patch = Nx.take(grid_w, image_id_per_patch)
+
+    # Positions restart in every temporal slice: wrap the per-image index to
+    # one `h * w` slice. `max(_, 1)` avoids a zero divisor on padded rows.
+    patches_per_slice = grid_h_per_patch |> Nx.multiply(grid_w_per_patch) |> Nx.max(1)
+    spatial_index = Nx.remainder(local_index, patches_per_slice)
+
+    # Padded images have grid_w == 0; guard the division below. Coordinates
+    # of padded patches are arbitrary and are masked out downstream.
+    safe_grid_w = Nx.max(grid_w_per_patch, merge_size)
+    merge_sq = merge_size * merge_size
+    merged_w_per_patch = Nx.quotient(safe_grid_w, merge_size)
+
+    block_idx = Nx.quotient(spatial_index, merge_sq)
+    within = Nx.remainder(spatial_index, merge_sq)
+    block_row = Nx.quotient(block_idx, merged_w_per_patch)
+    block_col = Nx.remainder(block_idx, merged_w_per_patch)
+    within_h = Nx.quotient(within, merge_size)
+    within_w = Nx.remainder(within, merge_size)
+
+    row_in_image = block_row |> Nx.multiply(merge_size) |> Nx.add(within_h)
+    col_in_image = block_col |> Nx.multiply(merge_size) |> Nx.add(within_w)
+
+    {row_in_image, col_in_image, grid_h_per_patch, grid_w_per_patch, image_id_per_patch,
+     patch_valid}
+  end
+
+  defp encoder(embeddings, grid_thw, spec, opts) do
+    name = opts[:name]
+
+    deepstack_indexes = MapSet.new(spec.deepstack_visual_indexes)
+
+    head_dim = div(spec.hidden_size, spec.num_attention_heads)
+    rotary_dim = div(head_dim, 2)
+
+    rotary_2d =
+      Axon.layer(
+        fn embed, grid_thw_t, _opts ->
+          {_batch, total_patches, _hidden} = Nx.shape(embed)
+
+          {row_in_image, col_in_image, _, _, _, _} =
+            patch_metadata(grid_thw_t, total_patches, spec.spatial_merge_size)
+
+          compute_2d_rotary_from_positions(
+            row_in_image,
+            col_in_image,
+            rotary_dim,
+            spec.rotary_embedding_base
+          )
+        end,
+        [embeddings, grid_thw],
+        op_name: :rotary_2d
+      )
+
+    attention_mask =
+      Axon.layer(
+        fn embed, grid_thw_t, _opts ->
+          {_batch, total_patches, _hidden} = Nx.shape(embed)
+
+          {_, _, _, _, image_id_per_patch, patch_valid} =
+            patch_metadata(grid_thw_t, total_patches, spec.spatial_merge_size)
+
+          block_diagonal_attention_mask(image_id_per_patch, patch_valid)
+        end,
+        [embeddings, grid_thw],
+        op_name: :attention_mask
+      )
+
+    vision_transformer_blocks(
+      embeddings,
+      rotary_2d,
+      attention_mask,
+      spec,
+      deepstack_indexes,
+      name
+    )
+  end
+
+  # 2D rotary cos/sin from per-patch (row, col) positions.
+  # Returns {cos, sin}, each of shape {total_patches, rotary_dim}.
+  defnp compute_2d_rotary_from_positions(row_positions, col_positions, rotary_dim, base) do
+    half_rotary_dim = div(rotary_dim, 2)
+    range = Nx.iota({half_rotary_dim}) |> Nx.multiply(2) |> Nx.divide(rotary_dim)
+    inv_freq = 1.0 / Nx.pow(base, range)
+
+    row_angles = Nx.outer(Nx.as_type(row_positions, :f32), inv_freq)
+    col_angles = Nx.outer(Nx.as_type(col_positions, :f32), inv_freq)
+
+    angles = Nx.concatenate([row_angles, col_angles], axis: -1)
+    {Nx.cos(angles), Nx.sin(angles)}
+  end
+
+  # Returns {total_patches, total_patches} boolean tensor where True means
+  # the two patches share an image AND both are valid (not padding).
+  defnp block_diagonal_attention_mask(image_id_per_patch, patch_valid) do
+    a = Nx.new_axis(image_id_per_patch, -1)
+    b = Nx.new_axis(image_id_per_patch, 0)
+    same_image = Nx.equal(a, b)
+    valid_pair = Nx.multiply(Nx.new_axis(patch_valid, -1), Nx.new_axis(patch_valid, 0))
+    Nx.logical_and(same_image, valid_pair)
+  end
+
+  # Builds `spec.num_blocks` pre-norm blocks (attention, then MLP, each with a
+  # residual add) plus one DeepStack merger per entry of `deepstack_indexes`.
+  defp vision_transformer_blocks(
+         embeddings,
+         rotary_2d,
+         attention_mask,
+         spec,
+         deepstack_indexes,
+         name
+       ) do
+    head_dim = div(spec.hidden_size, spec.num_attention_heads)
+
+    # Per-block outputs are prepended and reversed once (avoids `++` per block).
+    {hidden_state, reversed_hidden_states, reversed_attentions} =
+      for idx <- 0..(spec.num_blocks - 1), reduce: {embeddings, [], []} do
+        {hidden_state, hidden_states, attentions} ->
+          block_name = join(name, idx)
+
+          normed =
+            Axon.layer_norm(hidden_state,
+              epsilon: spec.layer_norm_epsilon,
+              name: join(block_name, "norm1")
+            )
+
+          {attn_output, attn_weights} =
+            vision_attention_with_2d_rotary(
+              normed,
+              rotary_2d,
+              attention_mask,
+              spec,
+              head_dim,
+              join(block_name, "attn")
+            )
+
+          hidden_state = Axon.add(hidden_state, attn_output)
+
+          normed =
+            Axon.layer_norm(hidden_state,
+              epsilon: spec.layer_norm_epsilon,
+              name: join(block_name, "norm2")
+            )
+
+          ffn_output =
+            normed
+            |> Axon.dense(spec.intermediate_size,
+              kernel_initializer: kernel_initializer(spec),
+              name: join(block_name, "mlp.fc1")
+            )
+            |> Layers.activation(spec.activation)
+            |> Axon.dense(spec.hidden_size,
+              kernel_initializer: kernel_initializer(spec),
+              name: join(block_name, "mlp.fc2")
+            )
+
+          hidden_state = Axon.add(hidden_state, ffn_output)
+
+          {hidden_state, [hidden_state | hidden_states], [attn_weights | attentions]}
+      end
+
+    hidden_states = Enum.reverse(reversed_hidden_states)
+    attentions = Enum.reverse(reversed_attentions)
+
+    # An index past the last block falls back to the final hidden state.
+    deepstack_merged_features =
+      deepstack_indexes
+      |> Enum.sort()
+      |> Enum.with_index()
+      |> Enum.map(fn {layer_idx, merger_idx} ->
+        source = Enum.at(hidden_states, layer_idx, hidden_state)
+        deepstack_merger(source, spec, merger_idx, "deepstack_merger_list")
+      end)
+
+    %{
+      hidden_state: hidden_state,
+      hidden_states: Axon.container(List.to_tuple(hidden_states)),
+      attentions: Axon.container(List.to_tuple(attentions)),
+      deepstack_hidden_states: Axon.container(List.to_tuple(deepstack_merged_features))
+    }
+  end
+
+  defp deepstack_merger(hidden_state, spec, index, name) do
+    merger_name = join(name, index)
+    merge_sq = spec.spatial_merge_size * spec.spatial_merge_size
+    # Folds every merge_sq consecutive patches into one token, then norm -> fc1 -> GELU -> fc2.
+    # The reference merger hard-codes exact GELU (`nn.GELU()`), independent of `hidden_act`.
+    hidden_state
+    |> Axon.nx(fn x ->
+      {batch, total_patches, hidden} = Nx.shape(x)
+      Nx.reshape(x, {batch, div(total_patches, merge_sq), merge_sq * hidden})
+    end)
+    |> Axon.layer_norm(epsilon: spec.layer_norm_epsilon, name: join(merger_name, "norm"))
+    |> Axon.dense(spec.hidden_size * merge_sq,
+      kernel_initializer: kernel_initializer(spec),
+      name: join(merger_name, "linear_fc1")
+    )
+    |> Layers.activation(:gelu)
+    |> Axon.dense(spec.out_hidden_size,
+      kernel_initializer: kernel_initializer(spec),
+      name: join(merger_name, "linear_fc2")
+    )
+  end
+
+  # Multi-head self-attention for the vision blocks: fused QKV projection, rotary embedding
+  # on query/key, masked softmax attention and an output projection. Returns {output, weights}.
+  defp vision_attention_with_2d_rotary(
+         hidden_state,
+         rotary_2d,
+         attention_mask,
+         spec,
+         head_dim,
+         name
+       ) do
+    qkv =
+      Axon.dense(hidden_state, spec.hidden_size * 3,
+        kernel_initializer: kernel_initializer(spec),
+        name: join(name, "qkv")
+      )
+    # Unpack the fused projection into query, key and value, each {batch, heads, seq, head_dim}.
+    {query, key, value} =
+      Axon.layer(
+        fn qkv, _opts ->
+          {batch, seq_len, _} = Nx.shape(qkv)
+          qkv_reshaped = Nx.reshape(qkv, {batch, seq_len, 3, spec.num_attention_heads, head_dim})
+          qkv_transposed = Nx.transpose(qkv_reshaped, axes: [2, 0, 3, 1, 4])
+          {qkv_transposed[0], qkv_transposed[1], qkv_transposed[2]}
+        end,
+        [qkv],
+        name: join(name, "split_qkv")
+      )
+      |> then(fn layer ->
+        q = Axon.nx(layer, fn {q, _k, _v} -> q end)
+        k = Axon.nx(layer, fn {_q, k, _v} -> k end)
+        v = Axon.nx(layer, fn {_q, _k, v} -> v end)
+        {q, k, v}
+      end)
+    # Rotate query and key with the {cos, sin} pair carried by the rotary_2d input.
+    {rotated_query, rotated_key} =
+      Axon.layer(
+        fn query, key, rotary_2d, _opts ->
+          {cos, sin} = rotary_2d
+          apply_2d_rotary_embedding(query, key, cos, sin)
+        end,
+        [query, key, rotary_2d],
+        name: join(name, "rotary_2d")
+      )
+      |> then(fn layer ->
+        q = Axon.nx(layer, fn {q, _k} -> q end)
+        k = Axon.nx(layer, fn {_q, k} -> k end)
+        {q, k}
+      end)
+    scale = :math.sqrt(head_dim)
+
+    attn_output =
+      Axon.layer(
+        fn query, key, value, attention_mask, _opts ->
+          # query, key, value: {batch, heads, seq, head_dim}
+          # attention_mask: {seq, seq} boolean (True = attend)
+          scores = Nx.dot(query, [3], [0, 1], key, [3], [0, 1])
+          scores = Nx.divide(scores, scale)
+
+          mask_value =
+            attention_mask
+            |> Nx.select(Nx.tensor(0.0, type: :f32), Nx.tensor(-1.0e9, type: :f32))
+            |> Nx.new_axis(0)
+            |> Nx.new_axis(0)
+
+          scores = Nx.add(scores, mask_value)
+          weights = Axon.Activations.softmax(scores, axis: -1)
+          output = Nx.dot(weights, [3], [0, 1], value, [2], [0, 1])
+
+          {output, weights}
+        end,
+        [rotated_query, rotated_key, value, attention_mask],
+        name: join(name, "attention")
+      )
+    output = Axon.nx(attn_output, fn {out, _weights} -> out end)
+    weights = Axon.nx(attn_output, fn {_out, weights} -> weights end)
+    # Merge the heads back: {batch, heads, seq, head_dim} -> {batch, seq, heads * head_dim}.
+    output =
+      Axon.layer(
+        fn x, _opts ->
+          {batch, heads, seq_len, head_dim} = Nx.shape(x)
+          hidden_size = heads * head_dim
+
+          x
+          |> Nx.transpose(axes: [0, 2, 1, 3])
+          |> Nx.reshape({batch, seq_len, hidden_size})
+        end,
+        [output],
+        name: join(name, "reshape_output")
+      )
+    # Project the merged heads back to hidden_size.
+    output =
+      Axon.dense(output, spec.hidden_size,
+        kernel_initializer: kernel_initializer(spec),
+        name: join(name, "proj")
+      )
+
+    {output, weights}
+  end
+
+  # Rotates the first half of each head's channels with cos/sin; the rest passes through.
+  defnp apply_2d_rotary_embedding(query, key, cos, sin) do
+    {_batch, _heads, _seq, head_dim} = Nx.shape(query)
+    rotary_dim = div(head_dim, 2)
+    # NOTE(review): the HF reference appears to rotate the full head_dim — confirm this split.
+    {q_rot, q_pass} = split_rotary(query, rotary_dim)
+    {k_rot, k_pass} = split_rotary(key, rotary_dim)
+    # Two new leading axes let cos/sin broadcast against the batch and heads axes.
+    cos = cos |> Nx.new_axis(0) |> Nx.new_axis(0)
+    sin = sin |> Nx.new_axis(0) |> Nx.new_axis(0)
+
+    q_embed = q_rot * cos + rotate_half(q_rot) * sin
+    k_embed = k_rot * cos + rotate_half(k_rot) * sin
+    {Nx.concatenate([q_embed, q_pass], axis: -1), Nx.concatenate([k_embed, k_pass], axis: -1)}
+  end
+
+  defnp split_rotary(tensor, rotary_dim) do
+    # Splits the last axis into the leading `rotary_dim` channels and the remaining ones.
+    feature_size = Nx.axis_size(tensor, -1)
+    rotated = Nx.slice_along_axis(tensor, 0, rotary_dim, axis: -1)
+    passthrough = Nx.slice_along_axis(tensor, rotary_dim, feature_size - rotary_dim, axis: -1)
+    {rotated, passthrough}
+  end
+
+  defnp rotate_half(x) do
+    # Maps the two halves [x1, x2] of the last axis to [-x2, x1].
+    half = div(Nx.axis_size(x, -1), 2)
+    first_half = Nx.slice_along_axis(x, 0, half, axis: -1)
+    second_half = Nx.slice_along_axis(x, half, half, axis: -1)
+    Nx.concatenate([Nx.negate(second_half), first_half], axis: -1)
+  end
+
+  defp patch_merger(hidden_state, spec, opts) do
+    name = opts[:name]
+    merge_sq = spec.spatial_merge_size * spec.spatial_merge_size
+    # Normalizes each patch, folds every merge_sq consecutive patches into one token, then runs
+    # dense -> GELU -> dense. The reference merger hard-codes exact GELU, not `hidden_act`.
+    hidden_state
+    |> Axon.layer_norm(epsilon: spec.layer_norm_epsilon, name: join(name, "ln_q"))
+    |> Axon.nx(fn x ->
+      {batch, total_patches, hidden} = Nx.shape(x)
+      Nx.reshape(x, {batch, div(total_patches, merge_sq), merge_sq * hidden})
+    end)
+    |> Axon.dense(spec.hidden_size * merge_sq,
+      kernel_initializer: kernel_initializer(spec),
+      name: join(name, "mlp.0")
+    )
+    |> Layers.activation(:gelu)
+    |> Axon.dense(spec.out_hidden_size,
+      kernel_initializer: kernel_initializer(spec),
+      name: join(name, "mlp.2")
+    )
+  end
+
+  defp kernel_initializer(%{initializer_scale: scale}) do
+    Axon.Initializers.normal(scale: scale)
+  end
+
+  defimpl Bumblebee.HuggingFace.Transformers.Config do
+    # A full "qwen3_vl" config keeps the vision settings under "vision_config"; unwrap them.
+    def load(spec, %{"model_type" => "qwen3_vl", "vision_config" => data}) do
+      load(spec, data)
+    end
+
+    # Loads a bare vision config: renames the HF keys and fills in sizes the config omits.
+    def load(spec, data) do
+      import Shared.Converters
+      opts =
+        convert!(data,
+          num_blocks: {"depth", number()},
+          num_attention_heads: {"num_heads", number()},
+          num_channels: {"in_channels", number()},
+          patch_size: {"patch_size", number()},
+          temporal_patch_size: {"temporal_patch_size", number()},
+          spatial_merge_size: {"spatial_merge_size", number()},
+          activation: {"hidden_act", activation()},
+          initializer_scale: {"initializer_range", number()}
+        ) ++ Shared.common_options_from_transformers(data, spec)
+      # hidden_size may be given as "embed_dim"; otherwise the spec default is kept.
+      hidden_size = data["hidden_size"] || data["embed_dim"] || spec.hidden_size
+      opts = Keyword.put(opts, :hidden_size, hidden_size)
+      # intermediate_size falls back to hidden_size * mlp_ratio (mlp_ratio defaults to 4).
+      mlp_ratio = Map.get(data, "mlp_ratio", 4)
+      intermediate_size = data["intermediate_size"] || hidden_size * mlp_ratio
+      out_hidden_size = Map.get(data, "out_hidden_size", spec.out_hidden_size)
+      opts =
+        opts
+        |> Keyword.put(:intermediate_size, intermediate_size)
+        |> Keyword.put(:out_hidden_size, out_hidden_size)
+
+      @for.config(spec, opts)
+    end
+  end
+
+  defimpl Bumblebee.HuggingFace.Transformers.Model do
+    # Maps the layer names used by this module (keys) to the parameter names found in the
+    # checkpoint under the `visual.` prefix (values). `{n}` stands for the running index
+    # of a block or deepstack merger.
+    def params_mapping(_spec) do
+      %{
+        # The patch embedding tensors are taken over as-is through identity functions.
+        "patch_embed.proj" => %{
+          "kernel" => {[{"visual.patch_embed.proj", "weight"}], fn [weight] -> weight end},
+          "bias" => {[{"visual.patch_embed.proj", "bias"}], fn [bias] -> bias end}
+        },
+        "pos_embed" => "visual.pos_embed",
+        # Block MLP layers are named fc1/fc2 here and linear_fc1/linear_fc2 in the checkpoint.
+        "blocks.{n}.norm1" => "visual.blocks.{n}.norm1",
+        "blocks.{n}.attn.qkv" => "visual.blocks.{n}.attn.qkv",
+        "blocks.{n}.attn.proj" => "visual.blocks.{n}.attn.proj",
+        "blocks.{n}.norm2" => "visual.blocks.{n}.norm2",
+        "blocks.{n}.mlp.fc1" => "visual.blocks.{n}.mlp.linear_fc1",
+        "blocks.{n}.mlp.fc2" => "visual.blocks.{n}.mlp.linear_fc2",
+        # The final merger is named ln_q / mlp.0 / mlp.2 here, norm / linear_fc1 / linear_fc2 there.
+        "merger.ln_q" => "visual.merger.norm",
+        "merger.mlp.0" => "visual.merger.linear_fc1",
+        "merger.mlp.2" => "visual.merger.linear_fc2",
+        "deepstack_merger_list.{n}.norm" => "visual.deepstack_merger_list.{n}.norm",
+        "deepstack_merger_list.{n}.linear_fc1" => "visual.deepstack_merger_list.{n}.linear_fc1",
+        "deepstack_merger_list.{n}.linear_fc2" => "visual.deepstack_merger_list.{n}.linear_fc2"
+      }
+    end
+  end
+end
diff --git a/notebooks/qwen3_vl.livemd b/notebooks/qwen3_vl.livemd
new file mode 100644
index 00000000..7b388bff
--- /dev/null
+++ b/notebooks/qwen3_vl.livemd
@@ -0,0 +1,248 @@
+# Qwen3-VL Vision-Language Model
+
+```elixir
+Mix.install([
+  {:bumblebee, path: "."},
+  {:nx, "~> 0.9"},
+  {:exla, "~> 0.9"},
+  {:kino, "~> 0.14"},
+  {:req, "~> 0.5"},
+  {:stb_image, "~> 0.6"}
+])
+Nx.global_default_backend(EXLA.Backend)
+```
+
+## Introduction
+
+Qwen3-VL is a multimodal vision-language model from Alibaba that can understand images and generate text descriptions. This notebook demonstrates how to use Qwen3-VL with Bumblebee.
+
+## Model Architecture
+
+Qwen3-VL combines:
+- **Vision Encoder**: Processes images using 2D spatial rotary position embeddings
+- **Text Decoder**: Qwen3-based transformer with MRoPE (Multimodal Rotary Position Embedding)
+
+Key features:
+- 3D convolution patch embedding (supports video temporal dimension)
+- 2D spatial rotary embeddings for accurate spatial understanding
+- Patch merger for spatial reduction
+- Per-image `image_grid_thw` threaded through the encoder so it handles
+  multiple images of varying sizes in a single prompt
+- Smart-resize with `:low`/`:medium`/`:high` quality presets to trade
+  off image detail against visual-token count
+
+## Load the Model
+
+```elixir
+# Load the model, tokenizer, and featurizer
+repo = "Qwen/Qwen3-VL-2B-Instruct"
+
+{:ok, model_info} = Bumblebee.load_model({:hf, repo})
+{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, repo})
+
+# The featurizer accepts a `:quality` preset (`:low`, `:medium`, `:high`) or
+# explicit `:min_pixels` / `:max_pixels` caps. Smart-resize preserves aspect
+# ratio and rounds each side to a multiple of `patch_size * merge_size`.
+{:ok, featurizer} =
+  Bumblebee.load_featurizer({:hf, repo},
+    module: Bumblebee.Vision.Qwen3VLFeaturizer,
+    quality: :medium
+  )
+
+:ok
+```
+
+## Process an Image
+
+```elixir
+# Upload an image
+image_input = Kino.Input.image("Upload an image", format: :rgb)
+```
+
+```elixir
+# Get the uploaded image
+image_data = Kino.Input.read(image_input)
+
+image =
+  if image_data do
+    # With `format: :rgb` the upload is raw HWC pixel bytes rather than an encoded
+    # image file, so rebuild the tensor from the recorded height and width
+    binary = image_data.file_ref |> Kino.Input.file_path() |> File.read!()
+    binary |> Nx.from_binary(:u8) |> Nx.reshape({image_data.height, image_data.width, 3})
+  else
+    # Use a sample image if none uploaded
+    {:ok, %{body: body}} =
+      Req.get("https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/280px-PNG_transparency_demonstration_1.png")
+    StbImage.read_binary!(body)
+  end
+
+Kino.Image.new(image)
+```
+
+## Generate Image Description
+
+```elixir
+# Process the image first. The featurizer returns `pixel_values` (concatenated,
+# pre-extracted patches) and `image_grid_thw` (per-image grid dims). Both
+# are required by the model — `image_grid_thw` tells the vision encoder
+# the correct per-patch positions.
+image_inputs = Bumblebee.apply_featurizer(featurizer, image)
+
+# The prompt needs one `<|image_pad|>` token per merged visual token
+[grid_t, grid_h, grid_w] = Nx.to_flat_list(image_inputs["image_grid_thw"])
+merge_size = model_info.spec.vision_spec.spatial_merge_size
+visual_tokens = grid_t * div(grid_h, merge_size) * div(grid_w, merge_size)
+image_pads = String.duplicate("<|image_pad|>", visual_tokens)
+
+# Build the prompt for image description
+prompt = "<|im_start|>user
+<|vision_start|>#{image_pads}<|vision_end|>Describe this image in detail.<|im_end|>
+<|im_start|>assistant
+"
+
+# Tokenize the prompt and combine it with the image inputs
+inputs = Bumblebee.apply_tokenizer(tokenizer, prompt)
+combined_inputs = Map.merge(inputs, image_inputs)
+
+# A single forward pass only predicts the next token; for full text
+# generation use `Bumblebee.Multimodal.ImageTextToText.generate/6` below
+outputs = Axon.predict(model_info.model, model_info.params, combined_inputs)
+next_token_id = outputs.logits[[0, -1]] |> Nx.argmax() |> Nx.to_number()
+Bumblebee.Tokenizer.decode(tokenizer, [next_token_id])
+```
+
+## Generating in One Call
+
+`Bumblebee.Multimodal.ImageTextToText.generate/6` is a single-call
+helper that featurizes the image, expands the `<|image_pad|>` marker
+in your prompt to the right number of visual tokens, and runs
+generation:
+
+```elixir
+{:ok, generation_config} = Bumblebee.load_generation_config({:hf, repo})
+generation_config = Bumblebee.configure(generation_config, max_new_tokens: 64)
+
+prompt = "<|im_start|>user
+<|vision_start|><|image_pad|><|vision_end|>What is in this image?<|im_end|>
+<|im_start|>assistant
+"
+
+Bumblebee.Multimodal.ImageTextToText.generate(
+  model_info,
+  featurizer,
+  tokenizer,
+  generation_config,
+  prompt,
+  image
+)
+#=> %{text: "A group of cats lying on a pink blanket with remote controls.", token_ids: ...}
+```
+
+> Note: each `generate/6` call recompiles the generation graph when
+> the image size or sequence length changes. For repeated calls, use
+> `compile/5` + `run/3` (see below).
+
+## Compile Once, Run Many
+
+For serving-style use where many images of varying sizes share one
+compiled graph, configure upper bounds with `compile/5`, then call
+`run/3` repeatedly. The featurizer pads `pixel_values` and
+`image_grid_thw` to the maxima you set, and the vision encoder
+excludes the padded patches from attention.
+
+```elixir
+compiled =
+  Bumblebee.Multimodal.ImageTextToText.compile(
+    model_info,
+    featurizer,
+    tokenizer,
+    generation_config,
+    max_patches: 1024,
+    max_num_images: 1,
+    sequence_length: 384
+  )
+
+# First call: JIT-compiles for these upper-bound shapes
+Bumblebee.Multimodal.ImageTextToText.run(compiled, prompt, image)
+
+# Subsequent calls reuse the same compiled graph for any image that fits within
+# `max_patches`, even one with fewer real patches — padding makes the shapes match.
+Bumblebee.Multimodal.ImageTextToText.run(compiled, prompt, image)
+```
+
+On `Qwen3-VL-2B-Instruct` + CPU + a 640×480 COCO image, the warm
+call runs in ~10s while the cold (JIT-compiling) call takes ~27s — a
+2.7x speedup on every call after the first.
+
+## Multiple Images in One Prompt
+
+`apply_featurizer/2` accepts a list of images of differing sizes. They
+are concatenated into a single flat patch sequence and the per-image
+grid dimensions are returned via `image_grid_thw`.
+
+```elixir
+images = [image, image]
+multi_image_inputs = Bumblebee.apply_featurizer(featurizer, images)
+merge_size = model_info.spec.vision_spec.spatial_merge_size
+
+# `image_grid_thw` has shape {2, 3}; every {t, h, w} row gets its own run of pad tokens
+vision_blocks =
+  for [t, h, w] <- Nx.to_list(multi_image_inputs["image_grid_thw"]), into: "" do
+    pads = String.duplicate("<|image_pad|>", t * div(h, merge_size) * div(w, merge_size))
+    "<|vision_start|>#{pads}<|vision_end|>"
+  end
+
+prompt = "<|im_start|>user\n#{vision_blocks}Compare these two images.<|im_end|>\n<|im_start|>assistant\n"
+inputs = Bumblebee.apply_tokenizer(tokenizer, prompt)
+outputs = Axon.predict(model_info.model, model_info.params, Map.merge(inputs, multi_image_inputs))
+```
+
+## Validation Against Standalone Qwen3
+
+Qwen3-VL's text decoder is the standalone Qwen3 model. A useful sanity
+check after touching the vision/multimodal code is to confirm the
+standalone Qwen3 text path still runs cleanly:
+
+```elixir
+# Loads only the small config.json, not weights
+{:ok, qwen3_spec} = Bumblebee.load_spec({:hf, "Qwen/Qwen3-4B-Instruct-2507"})
+IO.inspect(qwen3_spec.__struct__)
+# => Bumblebee.Text.Qwen3
+```
+
+For a full end-to-end check (downloads ~8GB of weights):
+
+```elixir
+{:ok, qwen3} = Bumblebee.load_model({:hf, "Qwen/Qwen3-4B-Instruct-2507"}, type: :bf16)
+{:ok, qwen3_tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-4B-Instruct-2507"})
+{:ok, qwen3_config} = Bumblebee.load_generation_config({:hf, "Qwen/Qwen3-4B-Instruct-2507"})
+
+# Generation limits live on the generation config, which the serving takes as its third argument
+qwen3_config = Bumblebee.configure(qwen3_config, max_new_tokens: 64)
+compile_opts = [batch_size: 1, sequence_length: 512]
+serving = Bumblebee.Text.generation(qwen3, qwen3_tokenizer, qwen3_config, compile: compile_opts)
+
+Nx.Serving.run(serving, "Explain in one sentence what a vector database is.")
+```
+
+## Quality Profiles
+
+Use the `:quality` preset to bound how many visual tokens each image
+produces. Lower quality = faster inference, less spatial detail.
+
+```elixir
+# Token-budget knobs
+{:ok, fast_featurizer} =
+  Bumblebee.load_featurizer({:hf, repo},
+    module: Bumblebee.Vision.Qwen3VLFeaturizer,
+    quality: :low
+  )
+
+# Or explicit pixel caps (overrides :quality)
+{:ok, custom_featurizer} =
+  Bumblebee.load_featurizer({:hf, repo},
+    module: Bumblebee.Vision.Qwen3VLFeaturizer,
+    min_pixels: 256 * 32 * 32,
+    max_pixels: 1280 * 32 * 32
+  )
+```
diff --git a/test/bumblebee/multimodal/qwen3_vl_test.exs b/test/bumblebee/multimodal/qwen3_vl_test.exs
new file mode 100644
index 00000000..d4928350
--- /dev/null
+++ b/test/bumblebee/multimodal/qwen3_vl_test.exs
@@ -0,0 +1,137 @@
+defmodule Bumblebee.Multimodal.Qwen3VLTest do
+  use ExUnit.Case, async: true
+
+  import Bumblebee.TestHelpers
+
+  @moduletag model_test_tags()
+
+  test ":for_conditional_generation" do
+    # Tiny model created with /tmp/create_tiny_qwen3vl_v4.py (transformers 4.57.3):
+    # - text_config: vocab_size=1024, hidden_size=64, num_hidden_layers=2,
+    #                num_attention_heads=4, num_key_value_heads=2, head_dim=16,
+    #                intermediate_size=128
+    # - vision_config: depth=2, hidden_size=32, num_heads=4, intermediate_size=64,
+    #                  out_hidden_size=64, patch_size=14, spatial_merge_size=2,
+    #                  temporal_patch_size=2
+    #
+    # Reference values from /tmp/generate_reference_v2.py (seed=0):
+    # model = Qwen3VLForConditionalGeneration.from_pretrained(model_path)
+    # outputs = model(input_ids=torch.tensor([[10, 20, 30, 40, 50, 60, 0, 0]]),
+    #                 attention_mask=torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0]]))
+    # outputs.logits[0, 0:3, 0:5].numpy()
+
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "roulis/tiny-random-Qwen3VLForConditionalGeneration"})
+
+    assert %Bumblebee.Multimodal.Qwen3VL{architecture: :for_conditional_generation} = spec
+
+    inputs = %{
+      "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 0, 0]]),
+      "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    assert Nx.shape(outputs.logits) == {1, 8, 1024}
+
+    # Reference values from Python (transformers 4.57.3)
+    assert_all_close(
+      outputs.logits[[.., 0..2, 0..4]],
+      Nx.tensor([
+        [
+          [0.0410, 0.0745, -0.0977, 0.0099, 0.2705],
+          [-0.0504, 0.1776, -0.0481, -0.0269, 0.1630],
+          [-0.1887, 0.0889, -0.1113, -0.1756, 0.0805]
+        ]
+      ]),
+      atol: 1.0e-4
+    )
+  end
+
+  test "vision pathway runs end-to-end with image_grid_thw" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "roulis/tiny-random-Qwen3VLForConditionalGeneration"})
+
+    factor = spec.vision_spec.patch_size * spec.vision_spec.spatial_merge_size
+
+    featurizer =
+      Bumblebee.configure(Bumblebee.Vision.Qwen3VLFeaturizer,
+        patch_size: spec.vision_spec.patch_size,
+        merge_size: spec.vision_spec.spatial_merge_size,
+        temporal_patch_size: spec.vision_spec.temporal_patch_size,
+        min_pixels: 4 * factor * factor,
+        max_pixels: 64 * factor * factor
+      )
+
+    image = Nx.iota({64, 64, 3}, type: :u8)
+    image_inputs = Bumblebee.apply_featurizer(featurizer, image)
+
+    [grid_t, grid_h, grid_w] = Nx.to_flat_list(image_inputs["image_grid_thw"])
+    merge_size = spec.vision_spec.spatial_merge_size
+    visual_tokens = grid_t * div(grid_h, merge_size) * div(grid_w, merge_size)
+
+    image_token_id = spec.image_token_id
+    input_ids = List.duplicate(image_token_id, visual_tokens) ++ [1, 2, 3]
+    attention_mask = List.duplicate(1, length(input_ids))
+
+    inputs = %{
+      "input_ids" => Nx.tensor([input_ids]),
+      "attention_mask" => Nx.tensor([attention_mask]),
+      "pixel_values" => image_inputs["pixel_values"],
+      "image_grid_thw" => image_inputs["image_grid_thw"]
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    expected_seq = visual_tokens + 3
+    assert {1, ^expected_seq, 1024} = Nx.shape(outputs.logits)
+  end
+
+  test "vision pathway accepts multiple images of different sizes" do
+    assert {:ok, %{model: model, params: params, spec: spec}} =
+             Bumblebee.load_model({:hf, "roulis/tiny-random-Qwen3VLForConditionalGeneration"})
+
+    factor = spec.vision_spec.patch_size * spec.vision_spec.spatial_merge_size
+
+    featurizer =
+      Bumblebee.configure(Bumblebee.Vision.Qwen3VLFeaturizer,
+        patch_size: spec.vision_spec.patch_size,
+        merge_size: spec.vision_spec.spatial_merge_size,
+        temporal_patch_size: spec.vision_spec.temporal_patch_size,
+        min_pixels: 4 * factor * factor,
+        max_pixels: 64 * factor * factor
+      )
+
+    images = [Nx.iota({56, 56, 3}, type: :u8), Nx.iota({84, 56, 3}, type: :u8)]
+    image_inputs = Bumblebee.apply_featurizer(featurizer, images)
+
+    assert {2, 3} = Nx.shape(image_inputs["image_grid_thw"])
+
+    merge_size = spec.vision_spec.spatial_merge_size
+
+    visual_tokens =
+      image_inputs["image_grid_thw"]
+      |> Nx.to_batched(1)
+      |> Enum.map(fn row ->
+        [t, h, w] = Nx.to_flat_list(row)
+        t * div(h, merge_size) * div(w, merge_size)
+      end)
+      |> Enum.sum()
+
+    image_token_id = spec.image_token_id
+    input_ids = List.duplicate(image_token_id, visual_tokens) ++ [1, 2]
+    attention_mask = List.duplicate(1, length(input_ids))
+
+    inputs = %{
+      "input_ids" => Nx.tensor([input_ids]),
+      "attention_mask" => Nx.tensor([attention_mask]),
+      "pixel_values" => image_inputs["pixel_values"],
+      "image_grid_thw" => image_inputs["image_grid_thw"]
+    }
+
+    outputs = Axon.predict(model, params, inputs)
+
+    expected_seq = visual_tokens + 2
+    assert {1, ^expected_seq, 1024} = Nx.shape(outputs.logits)
+  end
+end
diff --git a/test/bumblebee/vision/qwen3_vl_featurizer_test.exs b/test/bumblebee/vision/qwen3_vl_featurizer_test.exs
new file mode 100644
index 00000000..059d4547
--- /dev/null
+++ b/test/bumblebee/vision/qwen3_vl_featurizer_test.exs
@@ -0,0 +1,168 @@
+defmodule Bumblebee.Vision.Qwen3VLFeaturizerTest do
+  use ExUnit.Case, async: true
+
+  alias Bumblebee.Vision.Qwen3VLFeaturizer
+
+  defp synthetic_image(height, width, channels \\ 3) do
+    shape = {height, width, channels}
+    Nx.remainder(Nx.iota(shape, type: :u8), 255)
+  end
+
+  defp featurizer(opts \\ []) do
+    # Builds the featurizer under test. The baseline geometry is patch_size 16,
+    # temporal_patch_size 2 and merge_size 2; entries in `opts` override the
+    # baseline or add further options.
+    baseline = [patch_size: 16, temporal_patch_size: 2, merge_size: 2]
+    settings = Keyword.merge(baseline, opts)
+
+    Bumblebee.configure(Qwen3VLFeaturizer, settings)
+  end
+
+  test "produces pixel_values and image_grid_thw for a single image" do
+    image = synthetic_image(64, 64)
+    inputs = Bumblebee.apply_featurizer(featurizer(), image)
+
+    # 4x4 = 16 patches; flat = channels * temporal_patch * patch * patch = 3*2*16*16 = 1536
+    assert {16, 1536} = Nx.shape(inputs["pixel_values"])
+    assert {1, 3} = Nx.shape(inputs["image_grid_thw"])
+
+    # 64x64 image, patch=16 -> 4x4 patches, temporal duplicated 1->2 -> patches_t=1
+    assert Nx.to_flat_list(inputs["image_grid_thw"]) == [1, 4, 4]
+  end
+
+  test "smart_resize preserves aspect ratio and rounds to factor multiples" do
+    # 96x64 input. factor = 16 * 2 = 32. 96 = 3*32, 64 = 2*32 — already aligned.
+    image = synthetic_image(96, 64)
+    inputs = Bumblebee.apply_featurizer(featurizer(), image)
+
+    [_t, grid_h, grid_w] = Nx.to_flat_list(inputs["image_grid_thw"])
+    # patch_size=16: 96/16=6, 64/16=4
+    assert grid_h == 6
+    assert grid_w == 4
+
+    expected_patches = grid_h * grid_w
+    assert {^expected_patches, _} = Nx.shape(inputs["pixel_values"])
+  end
+
+  test "max_pixels caps the resized image" do
+    # 1024x1024 with max_pixels=256 visual tokens forces a strong downscale.
+    image = synthetic_image(1024, 1024)
+    factor = 32
+    max_pixels = 256 * factor * factor
+
+    inputs =
+      Bumblebee.apply_featurizer(
+        featurizer(min_pixels: 4 * factor * factor, max_pixels: max_pixels),
+        image
+      )
+
+    [_t, grid_h, grid_w] = Nx.to_flat_list(inputs["image_grid_thw"])
+    merge_size = 2
+    visual_tokens = div(grid_h, merge_size) * div(grid_w, merge_size)
+
+    assert visual_tokens <= 256
+  end
+
+  test ":low quality produces fewer visual tokens than :high" do
+    image = synthetic_image(2048, 1536)
+
+    [_t, low_h, low_w] =
+      Bumblebee.apply_featurizer(featurizer(quality: :low), image)["image_grid_thw"]
+      |> Nx.to_flat_list()
+
+    [_t, high_h, high_w] =
+      Bumblebee.apply_featurizer(featurizer(quality: :high), image)["image_grid_thw"]
+      |> Nx.to_flat_list()
+
+    assert low_h * low_w < high_h * high_w
+  end
+
+  test "supports multiple images of different sizes in one call" do
+    images = [synthetic_image(64, 64), synthetic_image(96, 64)]
+    inputs = Bumblebee.apply_featurizer(featurizer(), images)
+
+    assert {2, 3} = Nx.shape(inputs["image_grid_thw"])
+    assert Nx.to_flat_list(inputs["image_grid_thw"]) == [1, 4, 4, 1, 6, 4]
+
+    # Total patches = 4*4 + 6*4 = 40; flat = 3*2*16*16 = 1536
+    assert {40, 1536} = Nx.shape(inputs["pixel_values"])
+  end
+
+  test "windowed layout: every 4 consecutive patches form one 2x2 merge block" do
+    # A 64x64 image gives a 4x4 patch grid. With merge_size=2 there are
+    # 2x2 = 4 merge blocks of 4 patches each. Patches inside one block
+    # come from one spatial region of the resized image, so their flat
+    # patch features must be pairwise close. We verify the layout by
+    # checking that within each block-of-4 the variance is much smaller
+    # than the variance across blocks.
+    image =
+      Nx.iota({64, 64, 3}, type: :f32)
+      |> Nx.divide(64 * 64 * 3)
+
+    inputs = Bumblebee.apply_featurizer(featurizer(normalize: false), image)
+
+    grouped = Nx.reshape(inputs["pixel_values"], {4, 4, 1536})
+    within_block_var = grouped |> Nx.variance(axes: [1]) |> Nx.mean() |> Nx.to_number()
+
+    across_block_var =
+      grouped
+      |> Nx.mean(axes: [1])
+      |> Nx.variance(axes: [0])
+      |> Nx.mean()
+      |> Nx.to_number()
+
+    assert within_block_var < across_block_var
+  end
+
+  test "raises on extreme aspect ratios" do
+    image = synthetic_image(1, 400)
+
+    assert_raise ArgumentError, ~r/aspect ratio/, fn ->
+      Bumblebee.apply_featurizer(featurizer(), image)
+    end
+  end
+
+  test "raises when min_pixels exceeds max_pixels" do
+    image = synthetic_image(64, 64)
+
+    assert_raise ArgumentError, ~r/min_pixels/, fn ->
+      Bumblebee.apply_featurizer(featurizer(min_pixels: 10_000, max_pixels: 1_000), image)
+    end
+  end
+
+  test "pads pixel_values to :max_patches with zeros" do
+    image = synthetic_image(64, 64)
+    inputs = Bumblebee.apply_featurizer(featurizer(max_patches: 64), image)
+
+    assert {64, 1536} = Nx.shape(inputs["pixel_values"])
+    # First 16 patches are real, rest are zero-padded
+    real_block = inputs["pixel_values"][[0..15, ..]]
+    pad_block = inputs["pixel_values"][[16..63, ..]]
+    assert Nx.to_number(Nx.sum(Nx.abs(pad_block))) == 0.0
+    refute Nx.to_number(Nx.sum(Nx.abs(real_block))) == 0.0
+  end
+
+  test "pads image_grid_thw with [0, 0, 0] rows" do
+    image = synthetic_image(64, 64)
+    inputs = Bumblebee.apply_featurizer(featurizer(max_num_images: 3), image)
+
+    assert {3, 3} = Nx.shape(inputs["image_grid_thw"])
+    assert Nx.to_flat_list(inputs["image_grid_thw"]) == [1, 4, 4, 0, 0, 0, 0, 0, 0]
+  end
+
+  test "raises when :max_patches is not a multiple of merge_size**2" do
+    image = synthetic_image(64, 64)
+
+    assert_raise ArgumentError, ~r/multiple of merge_size/, fn ->
+      Bumblebee.apply_featurizer(featurizer(max_patches: 17), image)
+    end
+  end
+
+  test "raises when image needs more patches than :max_patches" do
+    image = synthetic_image(96, 96)
+
+    assert_raise ArgumentError, ~r/raise :max_patches/, fn ->
+      Bumblebee.apply_featurizer(featurizer(max_patches: 16), image)
+    end
+  end
+end