diff --git a/lib/bumblebee.ex b/lib/bumblebee.ex index a191f5bf..7806a2f8 100644 --- a/lib/bumblebee.ex +++ b/lib/bumblebee.ex @@ -192,6 +192,8 @@ defmodule Bumblebee do "Qwen3Model" => {Bumblebee.Text.Qwen3, :base}, "Qwen3ForCausalLM" => {Bumblebee.Text.Qwen3, :for_causal_language_modeling}, "Qwen3ForSequenceClassification" => {Bumblebee.Text.Qwen3, :for_sequence_classification}, + "Qwen3VLForConditionalGeneration" => + {Bumblebee.Multimodal.Qwen3VL, :for_conditional_generation}, "ResNetForImageClassification" => {Bumblebee.Vision.ResNet, :for_image_classification}, "ResNetModel" => {Bumblebee.Vision.ResNet, :base}, "RobertaForMaskedLM" => {Bumblebee.Text.Roberta, :for_masked_language_modeling}, @@ -242,12 +244,14 @@ defmodule Bumblebee do @transformers_image_processor_type_to_featurizer %{ "BlipImageProcessor" => Bumblebee.Vision.BlipFeaturizer, - "BitImageProcessor" => Bumblebee.Vision.BitFeaturizer + "BitImageProcessor" => Bumblebee.Vision.BitFeaturizer, + "Qwen3VLImageProcessor" => Bumblebee.Vision.Qwen3VLFeaturizer } @model_type_to_featurizer %{ "convnext" => Bumblebee.Vision.ConvNextFeaturizer, "deit" => Bumblebee.Vision.DeitFeaturizer, + "qwen3_vl" => Bumblebee.Vision.Qwen3VLFeaturizer, "resnet" => Bumblebee.Vision.ConvNextFeaturizer, "vit" => Bumblebee.Vision.VitFeaturizer, "whisper" => Bumblebee.Audio.WhisperFeaturizer @@ -274,7 +278,9 @@ defmodule Bumblebee do "mpnet" => :mpnet, "phi" => :code_gen, "phi3" => :llama, + "qwen2_vl" => :qwen2, "qwen3" => :qwen2, + "qwen3_vl" => :qwen2, "roberta" => :roberta, "smollm3" => :smollm3, "t5" => :t5, diff --git a/lib/bumblebee/layers/transformer.ex b/lib/bumblebee/layers/transformer.ex index 188b0ffe..8f009251 100644 --- a/lib/bumblebee/layers/transformer.ex +++ b/lib/bumblebee/layers/transformer.ex @@ -75,6 +75,7 @@ defmodule Bumblebee.Layers.Transformer do :num_blocks, :rotary_embedding, :attention_window_size, + :post_block_hook, attention_mask: Layers.none(), attention_head_mask: Layers.none(), 
defmodule Bumblebee.Multimodal.ImageTextToText do
  @moduledoc """
  Generation helpers for vision-language models like Qwen3-VL.

  Two entry points:

    * `generate/6` — one-shot call. Featurizes, expands the prompt
      placeholder, and runs generation. Each call recompiles the graph
      when the image or sequence length changes, so it suits
      interactive use.

    * `compile/5` + `run/3` — compile the generation graph **once** for
      upper-bound shapes, then run repeatedly with images of varying
      sizes. The featurizer pads `pixel_values` and `image_grid_thw` to
      the configured maxima, and the vision encoder excludes padded
      patches from attention via `patch_valid`.
  """

  alias Bumblebee.Text

  @placeholder "<|image_pad|>"

  @doc """
  Generates text from a prompt that includes a `<|image_pad|>` marker
  and an image.

  ## Required arguments

    * `model_info` - a loaded `Bumblebee.Multimodal.Qwen3VL` (or compatible)
      model
    * `featurizer` - a configured `Bumblebee.Vision.Qwen3VLFeaturizer`
    * `tokenizer` - a loaded tokenizer for the same model
    * `generation_config` - a `Bumblebee.Text.GenerationConfig`
    * `text` - the user prompt containing exactly one `<|image_pad|>` marker
    * `image` - an image tensor or `t:StbImage.t/0`

  ## Returns

      %{text: "", token_ids: [...]}

  `:text` is the decoded first sequence of the batch and `:token_ids`
  is the raw tensor returned by the generation function.

  Raises `ArgumentError` when the model spec has no `:image_token_id`
  or when the prompt does not contain exactly one marker.

  ## Example

      {:ok, model_info} = Bumblebee.load_model({:hf, "Qwen/Qwen3-VL-2B-Instruct"})
      {:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-VL-2B-Instruct"})

      {:ok, featurizer} =
        Bumblebee.load_featurizer({:hf, "Qwen/Qwen3-VL-2B-Instruct"},
          module: Bumblebee.Vision.Qwen3VLFeaturizer
        )

      featurizer = Bumblebee.configure(featurizer, quality: :low)
      {:ok, gen_config} = Bumblebee.load_generation_config({:hf, "Qwen/Qwen3-VL-2B-Instruct"})
      gen_config = Bumblebee.configure(gen_config, max_new_tokens: 64)

      Bumblebee.Multimodal.ImageTextToText.generate(
        model_info, featurizer, tokenizer, gen_config,
        "<|im_start|>user\\n<|vision_start|><|image_pad|><|vision_end|>What is in this image?<|im_end|>\\n<|im_start|>assistant\\n",
        image
      )
  """
  def generate(
        model_info,
        featurizer,
        tokenizer,
        %Text.GenerationConfig{} = generation_config,
        text,
        image
      ) do
    %{model: model, params: params, spec: spec} = model_info
    ensure_multimodal_spec!(spec)

    image_inputs = Bumblebee.apply_featurizer(featurizer, image)

    visual_tokens =
      visual_tokens_for(image_inputs["image_grid_thw"], spatial_merge_size(spec))

    expanded_text = expand_marker(text, visual_tokens)

    tokenizer = Bumblebee.configure(tokenizer, return_token_type_ids: false)
    text_inputs = Bumblebee.apply_tokenizer(tokenizer, expanded_text)

    generate_fun = Text.Generation.build_generate(model, spec, generation_config)
    generate_and_decode(generate_fun, params, tokenizer, text_inputs, image_inputs)
  end

  @doc """
  Compiles the generation graph once for the given upper-bound shapes.

  The returned map can be passed to `run/3` repeatedly. Calls with
  images that produce fewer than `:max_patches` real patches or
  prompts shorter than `:sequence_length` are padded; the vision
  encoder masks the padded positions out of attention.

  ## Options

    * `:max_patches` (required) — upper bound on total patches across
      all images in one call. Must be a multiple of `merge_size ** 2`;
      this is checked here so a bad value fails before the first `run/3`.
    * `:max_num_images` (required) — upper bound on number of images
      per call.
    * `:sequence_length` (required) — fixed prompt length in tokens.
      The tokenizer left-pads the prompt (after image placeholder
      expansion) to this length. The number of generated tokens is
      governed by `generation_config`, not by this option.

  Raises `ArgumentError` on unknown options, on a spec without
  `:image_token_id`, or on a `:max_patches` that is not a multiple of
  `merge_size ** 2`.
  """
  def compile(
        model_info,
        featurizer,
        tokenizer,
        %Text.GenerationConfig{} = generation_config,
        opts
      ) do
    opts = Keyword.validate!(opts, [:max_patches, :max_num_images, :sequence_length])
    max_patches = Keyword.fetch!(opts, :max_patches)
    max_num_images = Keyword.fetch!(opts, :max_num_images)
    sequence_length = Keyword.fetch!(opts, :sequence_length)

    %{model: model, params: params, spec: spec} = model_info
    ensure_multimodal_spec!(spec)

    merge_size = spatial_merge_size(spec)
    merge_area = merge_size * merge_size

    # The featurizer enforces the same constraint when it pads; failing here
    # surfaces the misconfiguration at setup time instead of on the first run.
    if rem(max_patches, merge_area) != 0 do
      raise ArgumentError,
            ":max_patches (#{max_patches}) must be a multiple of merge_size**2 " <>
              "(= #{merge_area})"
    end

    featurizer =
      Bumblebee.configure(featurizer,
        max_patches: max_patches,
        max_num_images: max_num_images
      )

    tokenizer =
      Bumblebee.configure(tokenizer,
        length: sequence_length,
        pad_direction: :left,
        return_token_type_ids: false
      )

    generate_fun = Text.Generation.build_generate(model, spec, generation_config)

    %{
      generate_fun: generate_fun,
      params: params,
      spec: spec,
      featurizer: featurizer,
      tokenizer: tokenizer,
      merge_size: merge_size,
      max_patches: max_patches,
      max_num_images: max_num_images,
      sequence_length: sequence_length
    }
  end

  @doc """
  Runs a prompt + image through a pre-compiled generator from `compile/5`.

  EXLA caches the compiled graph by input shape; since the featurizer
  pads to the upper bounds configured in `compile/5`, every call hits
  the same cached graph.

  Raises `ArgumentError` when the expanded prompt no longer contains
  all image placeholders after tokenization, which happens when the
  prompt is longer than `:sequence_length` and gets truncated. Only the
  image placeholders are checked; truncation of plain text that leaves
  the placeholders intact is not detected.
  """
  def run(compiled, text, image) do
    %{
      generate_fun: generate_fun,
      params: params,
      spec: spec,
      featurizer: featurizer,
      tokenizer: tokenizer,
      merge_size: merge_size
    } = compiled

    image_inputs = Bumblebee.apply_featurizer(featurizer, image)

    # Padding rows in image_grid_thw are [0, 0, 0] and therefore contribute
    # zero tokens, so the padded grid can be used directly.
    visual_tokens = visual_tokens_for(image_inputs["image_grid_thw"], merge_size)
    expanded_text = expand_marker(text, visual_tokens)

    text_inputs = Bumblebee.apply_tokenizer(tokenizer, expanded_text)
    ensure_image_tokens_intact!(text_inputs["input_ids"], spec.image_token_id, visual_tokens)

    generate_and_decode(generate_fun, params, tokenizer, text_inputs, image_inputs)
  end

  # Shared tail of generate/6 and run/3: merges inputs, seeds the sampler,
  # runs generation and decodes the first sequence of the batch.
  defp generate_and_decode(generate_fun, params, tokenizer, text_inputs, image_inputs) do
    inputs =
      text_inputs
      |> Map.merge(image_inputs)
      |> Map.put("seed", Nx.tensor([:erlang.system_time()], type: :s64))

    %{token_ids: token_ids} = generate_fun.(params, inputs)

    decoded = Bumblebee.Tokenizer.decode(tokenizer, Nx.to_flat_list(token_ids[0]))

    %{text: decoded, token_ids: token_ids}
  end

  defp ensure_multimodal_spec!(spec) do
    if not Map.has_key?(spec, :image_token_id) do
      raise ArgumentError,
            "expected a multimodal model with :image_token_id, got #{inspect(spec.__struct__)}"
    end

    :ok
  end

  # Fixed-length tokenization truncates silently. If that drops image
  # placeholders, visual embeddings would be spliced into the wrong
  # positions, so fail loudly instead.
  defp ensure_image_tokens_intact!(input_ids, image_token_id, expected) do
    found =
      input_ids
      |> Nx.equal(image_token_id)
      |> Nx.sum()
      |> Nx.to_number()

    if found != expected do
      raise ArgumentError,
            "expected #{expected} image tokens in the tokenized prompt, found #{found}; " <>
              "the prompt was likely truncated, raise :sequence_length or lower the " <>
              "featurizer :quality / :max_pixels"
    end

    :ok
  end

  # Specs without a vision sub-spec fall back to a merge size of 1.
  defp spatial_merge_size(%{vision_spec: %{spatial_merge_size: merge_size}}), do: merge_size
  defp spatial_merge_size(_spec), do: 1

  # Replaces the single placeholder marker with `visual_tokens` copies of it.
  defp expand_marker(text, visual_tokens) do
    case String.split(text, @placeholder) do
      [_only] ->
        raise ArgumentError,
              "the prompt must contain a #{@placeholder} marker where the image " <>
                "embedding should be spliced in, got: #{inspect(text)}"

      [prefix, suffix] ->
        prefix <> String.duplicate(@placeholder, visual_tokens) <> suffix

      _multiple ->
        raise ArgumentError,
              "expected exactly one #{@placeholder} marker in the prompt"
    end
  end

  # Total number of visual tokens across all grid rows: each [t, h, w] row
  # yields t * (h / merge_size) * (w / merge_size) tokens.
  defp visual_tokens_for(grid_thw, merge_size) do
    grid_thw
    |> Nx.to_list()
    |> Enum.map(fn [t, h, w] ->
      t * div(h, merge_size) * div(w, merge_size)
    end)
    |> Enum.sum()
  end
end
defmodule Bumblebee.Multimodal.Qwen3VL do
  alias Bumblebee.Shared

  options =
    [
      image_token_id: [
        default: 151_655,
        doc: "the token ID used to represent images in the input sequence"
      ],
      video_token_id: [
        default: 151_656,
        doc: "the token ID used to represent videos in the input sequence"
      ],
      vision_start_token_id: [
        default: 151_652,
        doc: "the token ID marking the start of visual content"
      ],
      vision_end_token_id: [
        default: 151_653,
        doc: "the token ID marking the end of visual content"
      ]
    ] ++ Shared.common_options([:output_hidden_states, :output_attentions])

  @moduledoc """
  Qwen3-VL model for vision-language tasks.

  ## Architectures

    * `:for_conditional_generation` - Qwen3-VL with a language modeling
      head for image/video-to-text generation

  ## Inputs

    * `"pixel_values"` - `{num_patches, flattened_patch_size}`

      Concatenated, pre-extracted image/video patches from the featurizer.
      Shape is `{num_patches, channels * temporal_patch_size * patch_size * patch_size}`.

    * `"image_grid_thw"` - `{num_images, 3}`

      Per-image grid dimensions `[temporal, height, width]` in patch
      units. Threaded into the vision encoder so it can compute correct
      per-patch positions for variable image sizes and multiple images
      per prompt.

    * `"input_ids"` - `{batch_size, sequence_length}`

      Indices of input sequence tokens in the vocabulary. Should contain
      special image/video tokens at positions where visual content appears.

    * `"attention_mask"` - `{batch_size, sequence_length}`

      Mask indicating which tokens to attend to.

  ## Global layer options

  #{Shared.global_layer_options_doc([:output_hidden_states, :output_attentions])}

  ## Configuration

  #{Shared.options_doc(options)}

  ## References

    * [Qwen3-VL](https://huggingface.co/Qwen/Qwen3-VL-2B-Instruct)

  """

  defstruct [architecture: :for_conditional_generation, vision_spec: nil, text_spec: nil] ++
              Shared.option_defaults(options)

  @behaviour Bumblebee.ModelSpec
  @behaviour Bumblebee.Configurable
  @behaviour Bumblebee.Text.Generation

  import Bumblebee.Utils.Model, only: [join: 2]

  alias Bumblebee.Layers

  # Decoder blocks that receive DeepStack features. Feature `i` of the vision
  # encoder output is added after decoder block `i`.
  @deepstack_injection_layers MapSet.new([0, 1, 2])

  @impl true
  def architectures(), do: [:for_conditional_generation]

  @impl true
  def config(spec, opts) do
    Shared.put_config_attrs(spec, opts)
  end

  @impl true
  def input_template(%{vision_spec: vision_spec}) do
    # Use 196 patches as template (14x14 grid from 224x224 image)
    num_patches = 196

    %{
      "pixel_values" => Nx.template({num_patches, flattened_patch_size(vision_spec)}, :f32),
      "image_grid_thw" => Nx.template({1, 3}, :s64),
      "input_ids" => Nx.template({1, 1}, :u32)
    }
  end

  @impl true
  def init_cache(%{text_spec: text_spec}, batch_size, max_length, inputs) do
    text_spec.__struct__.init_cache(text_spec, batch_size, max_length, inputs)
  end

  @impl true
  def traverse_cache(_spec, cache, fun) do
    Layers.Decoder.traverse_cache(cache, fun)
  end

  @impl true
  def model(%__MODULE__{architecture: :for_conditional_generation} = spec) do
    inputs = inputs(spec)

    vision_model =
      Bumblebee.build_model(spec.vision_spec)
      |> Bumblebee.Utils.Axon.prefix_names("vision_model.")
      |> Bumblebee.Utils.Axon.plug_inputs(%{
        "pixel_values" => inputs["pixel_values"],
        "image_grid_thw" => inputs["image_grid_thw"]
      })

    # Final vision embeddings; absent when no pixel_values are given
    # (e.g. text-only prompts or cached decoding steps).
    vision_hidden_state =
      Layers.if_present inputs["pixel_values"] do
        Axon.nx(vision_model, & &1.hidden_state)
      else
        Layers.none()
      end

    # DeepStack features: hidden states from intermediate vision layers,
    # already passed through their mergers by the vision encoder.
    deepstack_features =
      Layers.if_present inputs["pixel_values"] do
        Axon.nx(vision_model, & &1.deepstack_hidden_states)
      else
        Layers.none()
      end

    input_embeddings =
      substitute_visual_embeddings(
        inputs["input_ids"],
        vision_hidden_state,
        spec,
        name: "embed_substitute"
      )

    # Marks image/video token positions for DeepStack injection
    visual_mask =
      Layers.if_present inputs["pixel_values"] do
        Axon.nx(inputs["input_ids"], &visual_token_mask(&1, spec))
      else
        Layers.none()
      end

    text_outputs =
      text_decoder_with_deepstack(
        input_embeddings,
        inputs["attention_mask"],
        inputs["position_ids"],
        inputs["cache"],
        deepstack_features,
        visual_mask,
        spec,
        name: "text_model"
      )

    Layers.output(%{
      logits: text_outputs.logits,
      cache: text_outputs.cache,
      hidden_states: text_outputs.hidden_states,
      attentions: text_outputs.attentions
    })
  end

  defp inputs(spec) do
    # Vision inputs are pre-extracted patches from the featurizer
    vision_shape = {nil, flattened_patch_size(spec.vision_spec)}

    text_shape = {nil, nil}
    hidden_shape = {nil, nil, spec.text_spec.hidden_size}

    Bumblebee.Utils.Model.inputs_to_map([
      Axon.input("pixel_values", optional: true, shape: vision_shape),
      Axon.input("image_grid_thw", optional: true, shape: {nil, 3}),
      Axon.input("input_ids", shape: text_shape),
      Axon.input("attention_mask", optional: true, shape: text_shape),
      Axon.input("position_ids", optional: true, shape: text_shape),
      Axon.input("input_embeddings", optional: true, shape: hidden_shape),
      Axon.input("cache", optional: true)
    ])
  end

  # Size of one flattened patch:
  # channels * temporal_patch_size * patch_size * patch_size
  defp flattened_patch_size(vision_spec) do
    patch_size = vision_spec.patch_size

    vision_spec.num_channels * vision_spec.temporal_patch_size * patch_size * patch_size
  end

  # True at positions holding an image or video placeholder token.
  defp visual_token_mask(input_ids, spec) do
    image_mask = Nx.equal(input_ids, spec.image_token_id)
    video_mask = Nx.equal(input_ids, spec.video_token_id)
    Nx.logical_or(image_mask, video_mask)
  end

  # Embeds input_ids and, when vision embeddings are present, replaces the
  # embeddings at image/video token positions with the vision embeddings.
  defp substitute_visual_embeddings(input_ids, vision_hidden_state, spec, _opts) do
    token_embeddings =
      Axon.embedding(input_ids, spec.text_spec.vocab_size, spec.text_spec.hidden_size,
        name: "text_model.embedder.token_embedding"
      )

    Layers.if_present vision_hidden_state do
      Axon.layer(
        fn token_embeds, visual_embeds, input_ids, _opts ->
          # This is a simplified substitution - a full implementation would
          # need to handle variable numbers of visual tokens per sequence
          substitute_at_mask(token_embeds, visual_embeds, visual_token_mask(input_ids, spec))
        end,
        [token_embeddings, vision_hidden_state, input_ids]
      )
    else
      token_embeddings
    end
  end

  # Substitute visual embeddings at positions where mask is true.
  #
  #   token_embeds:  {batch, seq_len, hidden}
  #   visual_embeds: {batch, num_visual, hidden}
  #   mask:          {batch, seq_len}
  defp substitute_at_mask(token_embeds, visual_embeds, mask) do
    shape = Nx.shape(token_embeds)
    visual_gathered = gather_visual_rows(visual_embeds, mask, shape)

    Nx.select(broadcast_mask(mask, shape), visual_gathered, token_embeds)
  end

  # For every sequence position, picks the visual row that belongs there.
  #
  # A cumulative sum over the mask numbers the visual positions:
  #
  #   mask:   [0, 0, 1, 1, 1, 0, 0]
  #   cumsum: [0, 0, 1, 2, 3, 3, 3]
  #
  # Subtracting 1 gives the 0-based row index at visual positions. Indices at
  # non-visual positions are meaningless and are clamped into range; callers
  # must mask them out.
  defp gather_visual_rows(visual, mask, {_batch, _seq_len, _hidden} = target_shape) do
    {_, num_visual, _} = Nx.shape(visual)

    indices =
      mask
      |> Nx.as_type(:s32)
      |> Nx.cumulative_sum(axis: 1)
      |> Nx.subtract(1)
      |> Nx.clip(0, num_visual - 1)
      |> Nx.new_axis(-1)
      |> Nx.broadcast(target_shape)

    Nx.take_along_axis(visual, indices, axis: 1)
  end

  # {batch, seq_len} -> {batch, seq_len, hidden}
  defp broadcast_mask(mask, shape) do
    mask
    |> Nx.new_axis(-1)
    |> Nx.broadcast(shape)
  end

  # Builds the text decoder directly (rather than reusing the Qwen3 model) so
  # that DeepStack features can be injected through :post_block_hook.
  defp text_decoder_with_deepstack(
         embeddings,
         attention_mask,
         position_ids,
         cache,
         deepstack_features,
         visual_mask,
         spec,
         opts
       ) do
    name = opts[:name]
    text_spec = spec.text_spec

    position_ids =
      Layers.default position_ids do
        Layers.default_position_ids(embeddings)
      end

    # Query and key share the same normalization builder; nil disables it
    qk_norm =
      if text_spec.use_qk_norm do
        &Layers.rms_norm(&1, epsilon: text_spec.layer_norm_epsilon, channel_index: -1, name: &2)
      end

    # The hook is always installed, but only injects at the DeepStack layers
    # and only when deepstack_features are present.
    post_block_hook = fn layer_idx, hidden_state ->
      if MapSet.member?(@deepstack_injection_layers, layer_idx) do
        Layers.if_present deepstack_features do
          Axon.layer(
            fn hidden, ds_features, mask, _opts ->
              inject_deepstack_features(hidden, ds_features, mask, layer_idx)
            end,
            [hidden_state, deepstack_features, visual_mask],
            name: join(name, "deepstack_inject.#{layer_idx}")
          )
        else
          hidden_state
        end
      else
        hidden_state
      end
    end

    decoder_outputs =
      Layers.Transformer.blocks(embeddings,
        num_blocks: text_spec.num_blocks,
        num_attention_heads: text_spec.num_attention_heads,
        num_key_value_heads: text_spec.num_key_value_heads,
        hidden_size: text_spec.hidden_size,
        attention_head_size: text_spec.attention_head_size,
        kernel_initializer: Axon.Initializers.normal(scale: text_spec.initializer_scale),
        query_use_bias: false,
        key_use_bias: false,
        value_use_bias: false,
        output_use_bias: false,
        block_type: :norm_first,
        attention_mask: attention_mask,
        cache: cache,
        causal: true,
        layer_norm: &Layers.rms_norm(&1, epsilon: text_spec.layer_norm_epsilon, name: &2),
        ffn:
          &gated_ffn(&1, text_spec.intermediate_size, text_spec.hidden_size,
            name: &2,
            activation: text_spec.activation,
            initializer_scale: text_spec.initializer_scale
          ),
        rotary_embedding: [
          position_ids: position_ids,
          max_positions: text_spec.max_positions,
          base: text_spec.rotary_embedding_base,
          scaling_strategy: text_spec.rotary_embedding_scaling_strategy
        ],
        query_norm: qk_norm,
        key_norm: qk_norm,
        post_block_hook: post_block_hook,
        name: join(name, "decoder.blocks")
      )

    hidden_state =
      Layers.rms_norm(decoder_outputs.hidden_state,
        name: join(name, "output_norm"),
        epsilon: text_spec.layer_norm_epsilon
      )

    logits =
      Layers.dense_transposed(hidden_state, text_spec.vocab_size,
        kernel_initializer: Axon.Initializers.normal(scale: text_spec.initializer_scale),
        name: join(name, "language_modeling_head.output")
      )

    %{
      logits: logits,
      hidden_states: Layers.append(decoder_outputs.hidden_states, hidden_state),
      attentions: decoder_outputs.attentions,
      cache: decoder_outputs.cache
    }
  end

  # Inject DeepStack features at visual token positions:
  #
  #   hidden_states[visual_mask] += deepstack_features[layer_idx]
  #
  #   hidden_state:             {batch, seq_len, hidden}
  #   deepstack_features_tuple: tuple of {batch, num_visual, hidden} tensors
  #   visual_mask:              {batch, seq_len}
  #
  # When the vision encoder provides fewer features than there are injection
  # layers, the extra layers pass the hidden state through unchanged.
  defp inject_deepstack_features(hidden_state, deepstack_features_tuple, visual_mask, layer_idx) do
    if layer_idx < tuple_size(deepstack_features_tuple) do
      shape = Nx.shape(hidden_state)
      type = Nx.type(hidden_state)

      # Keep the addend in the hidden state's type. An f32 zero here would
      # promote a lower-precision hidden state to f32.
      gathered_features =
        deepstack_features_tuple
        |> elem(layer_idx)
        |> gather_visual_rows(visual_mask, shape)
        |> Nx.as_type(type)

      addition =
        Nx.select(broadcast_mask(visual_mask, shape), gathered_features, Nx.tensor(0, type: type))

      Nx.add(hidden_state, addition)
    else
      hidden_state
    end
  end

  # Gated FFN for the Qwen3 text decoder:
  # output(intermediate(x) * activation(gate(x))), all without bias.
  defp gated_ffn(hidden_state, intermediate_size, output_size, opts) do
    name = opts[:name]
    activation = opts[:activation]
    initializer_scale = opts[:initializer_scale]
    kernel_initializer = Axon.Initializers.normal(scale: initializer_scale)

    intermediate =
      Axon.dense(hidden_state, intermediate_size,
        kernel_initializer: kernel_initializer,
        name: join(name, "intermediate"),
        use_bias: false
      )

    gate =
      Axon.dense(hidden_state, intermediate_size,
        kernel_initializer: kernel_initializer,
        name: join(name, "gate"),
        use_bias: false
      )

    hidden_state = Axon.multiply(intermediate, Axon.activation(gate, activation))

    Axon.dense(hidden_state, output_size,
      kernel_initializer: kernel_initializer,
      name: join(name, "output"),
      use_bias: false
    )
  end

  defimpl Bumblebee.HuggingFace.Transformers.Config do
    def load(spec, data) do
      import Shared.Converters

      opts =
        convert!(data,
          image_token_id: {"image_token_id", number()},
          video_token_id: {"video_token_id", number()},
          vision_start_token_id: {"vision_start_token_id", number()},
          vision_end_token_id: {"vision_end_token_id", number()}
        )

      # Load the text spec first, its hidden_size is needed for the vision spec.
      # Falls back to the top-level data when there is no "text_config" key.
      text_data = Map.get(data, "text_config", data)

      text_spec =
        Bumblebee.configure(Bumblebee.Text.Qwen3,
          architecture: :for_causal_language_modeling
        )
        |> Bumblebee.HuggingFace.Transformers.Config.load(text_data)

      # Default the vision output size to the text hidden size when the
      # vision config does not set "out_hidden_size" itself
      vision_data =
        data
        |> Map.put_new("vision_config", %{})
        |> update_in(["vision_config"], fn vc ->
          Map.put_new(vc, "out_hidden_size", text_spec.hidden_size)
        end)

      vision_spec =
        Bumblebee.configure(Bumblebee.Vision.Qwen3VLVision)
        |> Bumblebee.HuggingFace.Transformers.Config.load(vision_data)

      @for.config(
        %{spec | vision_spec: vision_spec, text_spec: text_spec},
        opts
      )
    end
  end

  defimpl Bumblebee.HuggingFace.Transformers.Model do
    def params_mapping(spec) do
      vision_mapping =
        Bumblebee.HuggingFace.Transformers.Model.params_mapping(spec.vision_spec)
        |> Enum.map(fn {bumblebee, hf} -> {"vision_model.#{bumblebee}", hf} end)
        |> Map.new()

      # Qwen3-VL text model uses `model.language_model.*` paths instead of Qwen3's `model.*`
      # The loader infers a "model." prefix from PyTorch state, so we use "language_model.*"
      # paths (the loader will prepend "model." automatically)
      text_mapping = %{
        "text_model.embedder.token_embedding" => "language_model.embed_tokens",
        "text_model.decoder.blocks.{n}.self_attention.query" =>
          "language_model.layers.{n}.self_attn.q_proj",
        "text_model.decoder.blocks.{n}.self_attention.key" =>
          "language_model.layers.{n}.self_attn.k_proj",
        "text_model.decoder.blocks.{n}.self_attention.value" =>
          "language_model.layers.{n}.self_attn.v_proj",
        "text_model.decoder.blocks.{n}.self_attention.output" =>
          "language_model.layers.{n}.self_attn.o_proj",
        "text_model.decoder.blocks.{n}.self_attention.query_norm" =>
          "language_model.layers.{n}.self_attn.q_norm",
        "text_model.decoder.blocks.{n}.self_attention.key_norm" =>
          "language_model.layers.{n}.self_attn.k_norm",
        "text_model.decoder.blocks.{n}.self_attention_norm" =>
          "language_model.layers.{n}.input_layernorm",
        "text_model.decoder.blocks.{n}.ffn.gate" => "language_model.layers.{n}.mlp.gate_proj",
        "text_model.decoder.blocks.{n}.ffn.intermediate" =>
          "language_model.layers.{n}.mlp.up_proj",
        "text_model.decoder.blocks.{n}.ffn.output" => "language_model.layers.{n}.mlp.down_proj",
        "text_model.decoder.blocks.{n}.output_norm" =>
          "language_model.layers.{n}.post_attention_layernorm",
        "text_model.output_norm" => "language_model.norm",
        # NOTE(review): confirm the untied head path against the checkpoint's
        # state dict; only the tied case reuses the embedding weights.
        "text_model.language_modeling_head.output" =>
          if(spec.text_spec.tie_word_embeddings,
            do: "language_model.embed_tokens",
            else: "language_model.lm_head"
          )
      }

      Map.merge(vision_mapping, text_mapping)
    end
  end
end
resizing method, either of `:nearest`, `:bilinear`, `:bicubic`, `:lanczos3`, `:lanczos5`" + ], + normalize: [ + default: true, + doc: "whether or not to normalize the input with mean and standard deviation" + ], + image_mean: [ + default: [0.5, 0.5, 0.5], + doc: "the sequence of mean values for each channel, to be used when normalizing images" + ], + image_std: [ + default: [0.5, 0.5, 0.5], + doc: + "the sequence of standard deviations for each channel, to be used when normalizing images" + ], + patch_size: [ + default: 16, + doc: "the spatial patch size" + ], + temporal_patch_size: [ + default: 2, + doc: "the temporal patch size for video frames" + ], + merge_size: [ + default: 2, + doc: "the merge factor for spatial patches" + ], + quality: [ + default: :medium, + doc: """ + preset controlling the `:min_pixels` / `:max_pixels` caps used by smart-resize. + One of `:low` (~256 visual tokens), `:medium` (~1280), or `:high` (16384). + Ignored if `:min_pixels` and `:max_pixels` are both set explicitly. + """ + ], + min_pixels: [ + default: nil, + doc: """ + explicit minimum total pixels after smart-resize. Overrides the `:quality` + preset when set. + """ + ], + max_pixels: [ + default: nil, + doc: """ + explicit maximum total pixels after smart-resize. Overrides the `:quality` + preset when set. + """ + ], + max_patches: [ + default: nil, + doc: """ + when set, pads `pixel_values` along the patches axis to this size with + zeros. Required for compile-once-and-pad serving of variable-size + images. Must be a multiple of `merge_size ** 2`. + """ + ], + max_num_images: [ + default: nil, + doc: """ + when set, pads `image_grid_thw` to this many rows with `[0, 0, 0]`. + Required alongside `:max_patches` for compile-once-and-pad serving. + """ + ] + ] + + @moduledoc """ + Qwen3-VL featurizer for image and video data. + + Accepts a single image, a list of images, or a `%{video: [frame, ...]}` + map. 
When given multiple images they are concatenated into a single + flat sequence of patches; per-image grid dimensions are returned as + `image_grid_thw`. + + ## Quality profiles + + Smart-resize caps the total number of pixels passed through the + patchifier. The `:quality` preset is a convenience over the explicit + `:min_pixels` / `:max_pixels` keys: + + * `:low` — ~256 visual tokens per image (fastest, lowest detail) + * `:medium` — ~1280 visual tokens per image (default) + * `:high` — up to 16384 visual tokens per image (full Qwen ceiling) + + Set `:min_pixels` and/or `:max_pixels` to override the preset. + + ## Configuration + + #{Shared.options_doc(options)} + """ + + defstruct Shared.option_defaults(options) + + @behaviour Bumblebee.Featurizer + @behaviour Bumblebee.Configurable + + alias Bumblebee.Utils.Image + + @impl true + def config(featurizer, opts) do + Shared.put_config_attrs(featurizer, opts) + end + + @impl true + def process_input(featurizer, input) do + factor = featurizer.patch_size * featurizer.merge_size + {min_pixels, max_pixels} = resolve_pixel_bounds(featurizer, factor) + + per_image = + for image_or_video <- normalize_input(input) do + process_one(featurizer, image_or_video, min_pixels, max_pixels, factor) + end + + pixel_values = + per_image + |> Enum.map(& &1.pixel_values) + |> Nx.concatenate(axis: 0) + + image_grid_thw = + per_image + |> Enum.map(& &1.grid_thw) + |> Nx.stack() + + {pixel_values, image_grid_thw} = + maybe_pad_to_max(pixel_values, image_grid_thw, featurizer) + + %{ + "pixel_values" => pixel_values, + "image_grid_thw" => image_grid_thw + } + end + + defp maybe_pad_to_max(pixel_values, image_grid_thw, featurizer) do + pixel_values = maybe_pad_patches(pixel_values, featurizer) + image_grid_thw = maybe_pad_grid_thw(image_grid_thw, featurizer) + {pixel_values, image_grid_thw} + end + + defp maybe_pad_patches(pixel_values, %{max_patches: nil}), do: pixel_values + + defp maybe_pad_patches(pixel_values, featurizer) do + 
{num_patches, flat} = Nx.shape(pixel_values) + max_patches = featurizer.max_patches + merge_sq = featurizer.merge_size * featurizer.merge_size + + unless rem(max_patches, merge_sq) == 0 do + raise ArgumentError, + ":max_patches (#{max_patches}) must be a multiple of merge_size**2 " <> + "(= #{merge_sq})" + end + + if num_patches > max_patches do + raise ArgumentError, + "featurizer produced #{num_patches} patches but :max_patches is " <> + "#{max_patches}; raise :max_patches or lower :quality / :max_pixels" + end + + pad_rows = max_patches - num_patches + + if pad_rows == 0 do + pixel_values + else + padding = Nx.broadcast(Nx.tensor(0.0, type: Nx.type(pixel_values)), {pad_rows, flat}) + Nx.concatenate([pixel_values, padding], axis: 0) + end + end + + defp maybe_pad_grid_thw(image_grid_thw, %{max_num_images: nil}), do: image_grid_thw + + defp maybe_pad_grid_thw(image_grid_thw, featurizer) do + {num_images, 3} = Nx.shape(image_grid_thw) + max_num_images = featurizer.max_num_images + + if num_images > max_num_images do + raise ArgumentError, + "got #{num_images} images but :max_num_images is #{max_num_images}" + end + + pad_rows = max_num_images - num_images + + if pad_rows == 0 do + image_grid_thw + else + padding = Nx.broadcast(Nx.tensor(0, type: Nx.type(image_grid_thw)), {pad_rows, 3}) + Nx.concatenate([image_grid_thw, padding], axis: 0) + end + end + + defp normalize_input(input) when is_list(input), do: input + defp normalize_input(%{image: _} = input), do: [input] + defp normalize_input(%{video: _} = input), do: [input] + defp normalize_input(input), do: [%{image: input}] + + defp process_one(featurizer, %{video: frames}, min_pixels, max_pixels, factor) + when is_list(frames) do + process_frames(featurizer, frames, min_pixels, max_pixels, factor) + end + + defp process_one(featurizer, %{image: image}, min_pixels, max_pixels, factor) do + process_frames(featurizer, [image], min_pixels, max_pixels, factor) + end + + defp process_one(featurizer, image, min_pixels, 
max_pixels, factor) do + process_frames(featurizer, [image], min_pixels, max_pixels, factor) + end + + defp process_frames(featurizer, frames, min_pixels, max_pixels, factor) do + num_channels = length(featurizer.image_mean) + + batched_frames = + Enum.map(frames, fn frame -> + frame + |> Image.to_batched_tensor() + |> Nx.as_type(:f32) + |> Image.normalize_channels(num_channels) + end) + + [first | _] = batched_frames + {1, height, width, _} = Nx.shape(first) + + {target_h, target_w} = + if featurizer.resize do + smart_resize(height, width, min_pixels, max_pixels, factor) + else + h = max(factor, round_to_multiple(height, factor)) + w = max(factor, round_to_multiple(width, factor)) + {h, w} + end + + mean = Nx.tensor(featurizer.image_mean) + std = Nx.tensor(featurizer.image_std) + + processed_frames = + Enum.map(batched_frames, fn frame -> + frame + |> NxImage.resize({target_h, target_w}, method: featurizer.resize_method) + |> NxImage.to_continuous(0, 1) + |> maybe_normalize(featurizer, mean, std) + |> Nx.squeeze(axes: [0]) + end) + + stacked = Nx.stack(processed_frames) + {stacked, temporal} = ensure_temporal(stacked, featurizer.temporal_patch_size) + + patches_t = div(temporal, featurizer.temporal_patch_size) + patches_h = div(target_h, featurizer.patch_size) + patches_w = div(target_w, featurizer.patch_size) + + pixel_values = window_patchify(stacked, featurizer, patches_t, patches_h, patches_w) + + %{ + pixel_values: pixel_values, + grid_thw: Nx.tensor([patches_t, patches_h, patches_w], type: :s64) + } + end + + defp maybe_normalize(images, %{normalize: false}, _mean, _std), do: images + defp maybe_normalize(images, _, mean, std), do: NxImage.normalize(images, mean, std) + + defp ensure_temporal(stacked, temporal_patch_size) do + {temporal, _, _, _} = Nx.shape(stacked) + + target = + if temporal < temporal_patch_size do + temporal_patch_size + else + div(temporal, temporal_patch_size) * temporal_patch_size + end + + cond do + target == temporal -> + {stacked, 
temporal} + + target > temporal -> + last = stacked[(temporal - 1)..(temporal - 1)//1] + pad = Nx.tile(last, [target - temporal, 1, 1, 1]) + {Nx.concatenate([stacked, pad], axis: 0), target} + + target < temporal -> + {Nx.slice_along_axis(stacked, 0, target, axis: 0), target} + end + end + + # Arranges patches in "windowed" order so that every group of + # merge_size * merge_size consecutive patches forms a contiguous + # spatial merge block. This lets the vision encoder's patch merger + # reshape {N, hidden} -> {N/merge^2, merge^2 * hidden} without + # needing to know per-image grid dimensions. + defp window_patchify(stacked, featurizer, patches_t, patches_h, patches_w) do + {_temporal, _height, _width, channels} = Nx.shape(stacked) + patch_size = featurizer.patch_size + temporal_patch_size = featurizer.temporal_patch_size + merge_size = featurizer.merge_size + merged_h = div(patches_h, merge_size) + merged_w = div(patches_w, merge_size) + + stacked + |> Nx.reshape({ + patches_t, + temporal_patch_size, + merged_h, + merge_size, + patch_size, + merged_w, + merge_size, + patch_size, + channels + }) + |> Nx.transpose(axes: [0, 2, 5, 3, 6, 8, 1, 4, 7]) + |> Nx.reshape({ + patches_t * merged_h * merged_w * merge_size * merge_size, + channels * temporal_patch_size * patch_size * patch_size + }) + end + + defp smart_resize(height, width, min_pixels, max_pixels, factor) do + ratio = max(height, width) / min(height, width) + + if ratio > 200 do + raise ArgumentError, + "image aspect ratio is #{Float.round(ratio, 2)}, " <> + "which exceeds the supported limit of 200" + end + + h_bar = max(factor, round_to_multiple(height, factor)) + w_bar = max(factor, round_to_multiple(width, factor)) + + cond do + h_bar * w_bar > max_pixels -> + beta = :math.sqrt(height * width / max_pixels) + h2 = floor_to_multiple(height / beta, factor) + w2 = floor_to_multiple(width / beta, factor) + {max(factor, h2), max(factor, w2)} + + h_bar * w_bar < min_pixels -> + beta = :math.sqrt(min_pixels / 
(height * width)) + h2 = ceil_to_multiple(height * beta, factor) + w2 = ceil_to_multiple(width * beta, factor) + {h2, w2} + + true -> + {h_bar, w_bar} + end + end + + defp round_to_multiple(value, factor) do + round(value / factor) * factor + end + + defp floor_to_multiple(value, factor) do + trunc(value / factor) * factor + end + + defp ceil_to_multiple(value, factor) do + trunc(Float.ceil(value / factor)) * factor + end + + defp resolve_pixel_bounds(featurizer, factor) do + f2 = factor * factor + + {default_min, default_max} = + case featurizer.quality do + :low -> + {4 * f2, 256 * f2} + + :medium -> + {4 * f2, 1280 * f2} + + :high -> + {4 * f2, 16384 * f2} + + other -> + raise ArgumentError, + "invalid :quality #{inspect(other)}, expected :low, :medium, or :high" + end + + min_pixels = featurizer.min_pixels || default_min + max_pixels = featurizer.max_pixels || default_max + + if min_pixels > max_pixels do + raise ArgumentError, + "min_pixels (#{min_pixels}) must not exceed max_pixels (#{max_pixels})" + end + + {min_pixels, max_pixels} + end + + defimpl Bumblebee.HuggingFace.Transformers.Config do + def load(featurizer, data) do + import Shared.Converters + + opts = + convert!(data, + resize: {"do_resize", boolean()}, + resize_method: {"resample", resize_method()}, + normalize: {"do_normalize", boolean()}, + image_mean: {"image_mean", list(number())}, + image_std: {"image_std", list(number())}, + patch_size: {"patch_size", number()}, + temporal_patch_size: {"temporal_patch_size", number()}, + merge_size: {"merge_size", number()}, + min_pixels: {"min_pixels", number()}, + max_pixels: {"max_pixels", number()} + ) + + @for.config(featurizer, opts) + end + end +end diff --git a/lib/bumblebee/vision/qwen3_vl_vision.ex b/lib/bumblebee/vision/qwen3_vl_vision.ex new file mode 100644 index 00000000..41c7fd8a --- /dev/null +++ b/lib/bumblebee/vision/qwen3_vl_vision.ex @@ -0,0 +1,770 @@ +defmodule Bumblebee.Vision.Qwen3VLVision do + import Nx.Defn + + alias 
Bumblebee.Shared + + options = + [ + hidden_size: [ + default: 1024, + doc: "the dimensionality of hidden layers" + ], + num_blocks: [ + default: 24, + doc: "the number of Transformer blocks in the encoder" + ], + num_attention_heads: [ + default: 16, + doc: "the number of attention heads for each attention layer in the encoder" + ], + intermediate_size: [ + default: 4096, + doc: + "the dimensionality of the intermediate layer in the transformer feed-forward network (FFN) in the encoder" + ], + num_channels: [ + default: 3, + doc: "the number of channels in the input" + ], + patch_size: [ + default: 16, + doc: "the size of the patch spatial dimensions" + ], + temporal_patch_size: [ + default: 2, + doc: "the size of the patch temporal dimension (for video)" + ], + spatial_merge_size: [ + default: 2, + doc: "the factor by which to merge spatial patches" + ], + out_hidden_size: [ + default: 2048, + doc: "the output dimensionality after patch merger" + ], + num_position_embeddings: [ + default: 2304, + doc: "the number of learned absolute position embeddings (a square grid)" + ], + deepstack_visual_indexes: [ + default: [5, 11, 17], + doc: + "the encoder layer indices from which to extract DeepStack features (0-indexed, matching HuggingFace's `enumerate(self.blocks)`)" + ], + activation: [ + default: :gelu_approx_tanh, + doc: "the activation function" + ], + layer_norm_epsilon: [ + default: 1.0e-6, + doc: "the epsilon used by the layer normalization layers" + ], + rotary_embedding_base: [ + default: 10_000, + doc: "base for computing rotary embedding frequency" + ], + initializer_scale: [ + default: 0.02, + doc: + "the standard deviation of the normal initializer used for initializing kernel parameters" + ] + ] + + @moduledoc """ + The Qwen3-VL vision encoder for processing images and video frames. + + Patches arrive from the featurizer in windowed order: every group of + `spatial_merge_size ** 2` consecutive patches forms a contiguous spatial + merge block. 
Combined with the per-image `image_grid_thw` tensor, this + encoder supports a variable number of images of varying sizes in a + single forward pass. + + ## Architectures + + * `:base` - the base vision encoder model + + ## Inputs + + * `"pixel_values"` - `{num_patches, num_channels * temporal_patch_size * patch_size * patch_size}` + + Concatenated, pre-extracted image/video patches from the featurizer. + + * `"image_grid_thw"` - `{num_images, 3}` + + Per-image grid dimensions `[temporal, height, width]` in patch + units, used to derive per-patch row/column positions for the + learned bilinear position embedding and the 2D rotary embedding. + + ## Global layer options + + #{Shared.global_layer_options_doc([:output_hidden_states, :output_attentions])} + + ## Configuration + + #{Shared.options_doc(options)} + """ + + defstruct [architecture: :base] ++ Shared.option_defaults(options) + + @behaviour Bumblebee.ModelSpec + @behaviour Bumblebee.Configurable + + import Bumblebee.Utils.Model, only: [join: 2] + + alias Bumblebee.Layers + + @impl true + def architectures(), do: [:base] + + @impl true + def config(spec, opts) do + Shared.put_config_attrs(spec, opts) + end + + @impl true + def input_template(spec) do + patch_size = spec.patch_size + temporal_patch_size = spec.temporal_patch_size + flattened_patch_size = spec.num_channels * temporal_patch_size * patch_size * patch_size + # 14x14 grid from a 224x224 image with patch_size=16 + num_patches = 196 + + %{ + "pixel_values" => Nx.template({num_patches, flattened_patch_size}, :f32), + "image_grid_thw" => Nx.template({1, 3}, :s64) + } + end + + @impl true + def model(%__MODULE__{architecture: :base} = spec) do + inputs = inputs(spec) + + inputs + |> core(spec) + |> Layers.output() + end + + defp inputs(spec) do + patch_size = spec.patch_size + temporal_patch_size = spec.temporal_patch_size + flattened_patch_size = spec.num_channels * temporal_patch_size * patch_size * patch_size + + Bumblebee.Utils.Model.inputs_to_map([ + 
Axon.input("pixel_values", shape: {nil, flattened_patch_size}), + Axon.input("image_grid_thw", shape: {nil, 3}) + ]) + end + + defp core(inputs, spec) do + pixel_values = inputs["pixel_values"] + grid_thw = inputs["image_grid_thw"] + + embeddings = + pixel_values + |> patch_embedding(spec, name: "patch_embed") + |> position_embedding(grid_thw, spec, name: "pos_embed") + + encoder_outputs = encoder(embeddings, grid_thw, spec, name: "blocks") + + hidden_state = patch_merger(encoder_outputs.hidden_state, spec, name: "merger") + + %{ + hidden_state: hidden_state, + hidden_states: encoder_outputs.hidden_states, + attentions: encoder_outputs.attentions, + deepstack_hidden_states: encoder_outputs.deepstack_hidden_states + } + end + + defp patch_embedding(pixel_values, spec, opts) do + name = opts[:name] + + # Input: {num_patches, channels * temporal_patch_size * patch_size * patch_size} + # PyTorch's Conv3d with kernel=stride=full_patch is equivalent to a dense projection + # over the flattened patch features. The kernel param keeps PyTorch's + # {out_channels, in_channels, t, h, w} layout for clean weight loading. 
+ reshaped = + Axon.nx(pixel_values, fn x -> + {num_patches, _flat} = Nx.shape(x) + + Nx.reshape( + x, + {num_patches, spec.num_channels, spec.temporal_patch_size, spec.patch_size, + spec.patch_size} + ) + end) + + kernel_param = + Axon.param( + "kernel", + fn _ -> + {spec.hidden_size, spec.num_channels, spec.temporal_patch_size, spec.patch_size, + spec.patch_size} + end, + initializer: kernel_initializer(spec) + ) + + bias_param = + Axon.param("bias", fn _ -> {spec.hidden_size} end, initializer: Axon.Initializers.zeros()) + + Axon.layer( + fn x, kernel, bias, _opts -> + {num_patches, c, t, h, w} = Nx.shape(x) + {hidden_size, _, _, _, _} = Nx.shape(kernel) + + x_flat = Nx.reshape(x, {num_patches, c * t * h * w}) + k_flat = kernel |> Nx.reshape({hidden_size, c * t * h * w}) |> Nx.transpose() + + x_flat + |> Nx.dot(k_flat) + |> Nx.add(bias) + end, + [reshaped, kernel_param, bias_param], + name: join(name, "proj"), + op_name: :conv3d + ) + |> Axon.nx(fn x -> Nx.new_axis(x, 0) end) + end + + defp position_embedding(embeddings, grid_thw, spec, opts) do + name = opts[:name] + + pos_embed_param = + Axon.param( + "weight", + fn _, _ -> {spec.num_position_embeddings, spec.hidden_size} end, + initializer: kernel_initializer(spec) + ) + + Axon.layer( + fn embed, grid_thw_t, pos_embed, _opts -> + bilinear_interpolated_position(embed, grid_thw_t, pos_embed, spec) + end, + [embeddings, grid_thw, pos_embed_param], + name: name, + op_name: :position_embedding + ) + end + + defp bilinear_interpolated_position(embed, grid_thw, pos_embed, spec) do + {_batch, total_patches, _hidden} = Nx.shape(embed) + src_grid_size = trunc(:math.sqrt(spec.num_position_embeddings)) + merge_size = spec.spatial_merge_size + + {row_in_image, col_in_image, grid_h_per_patch, grid_w_per_patch, _image_id, _patch_valid} = + patch_metadata(grid_thw, total_patches, merge_size) + + src_max_f = Nx.tensor(src_grid_size - 1, type: :f32) + + grid_h_minus_one = grid_h_per_patch |> Nx.subtract(1) |> Nx.max(1) |> 
Nx.as_type(:f32) + grid_w_minus_one = grid_w_per_patch |> Nx.subtract(1) |> Nx.max(1) |> Nx.as_type(:f32) + + row_src_f = + row_in_image + |> Nx.as_type(:f32) + |> Nx.multiply(src_max_f) + |> Nx.divide(grid_h_minus_one) + + col_src_f = + col_in_image + |> Nx.as_type(:f32) + |> Nx.multiply(src_max_f) + |> Nx.divide(grid_w_minus_one) + + row_src_f = Nx.select(Nx.equal(grid_h_per_patch, 1), Nx.tensor(0.0), row_src_f) + col_src_f = Nx.select(Nx.equal(grid_w_per_patch, 1), Nx.tensor(0.0), col_src_f) + + row_floor = row_src_f |> Nx.floor() |> Nx.as_type(:s32) + col_floor = col_src_f |> Nx.floor() |> Nx.as_type(:s32) + row_ceil = row_floor |> Nx.add(1) |> Nx.min(src_grid_size - 1) + col_ceil = col_floor |> Nx.add(1) |> Nx.min(src_grid_size - 1) + + dh = Nx.subtract(row_src_f, Nx.as_type(row_floor, :f32)) + dw = Nx.subtract(col_src_f, Nx.as_type(col_floor, :f32)) + + idx_ff = row_floor |> Nx.multiply(src_grid_size) |> Nx.add(col_floor) + idx_fc = row_floor |> Nx.multiply(src_grid_size) |> Nx.add(col_ceil) + idx_cf = row_ceil |> Nx.multiply(src_grid_size) |> Nx.add(col_floor) + idx_cc = row_ceil |> Nx.multiply(src_grid_size) |> Nx.add(col_ceil) + + emb_ff = Nx.take(pos_embed, idx_ff, axis: 0) + emb_fc = Nx.take(pos_embed, idx_fc, axis: 0) + emb_cf = Nx.take(pos_embed, idx_cf, axis: 0) + emb_cc = Nx.take(pos_embed, idx_cc, axis: 0) + + w_ff = dh |> Nx.subtract(1.0) |> Nx.negate() |> Nx.multiply(Nx.subtract(1.0, dw)) + w_fc = dh |> Nx.subtract(1.0) |> Nx.negate() |> Nx.multiply(dw) + w_cf = Nx.multiply(dh, Nx.subtract(1.0, dw)) + w_cc = Nx.multiply(dh, dw) + + interpolated = + Nx.multiply(emb_ff, Nx.new_axis(w_ff, -1)) + |> Nx.add(Nx.multiply(emb_fc, Nx.new_axis(w_fc, -1))) + |> Nx.add(Nx.multiply(emb_cf, Nx.new_axis(w_cf, -1))) + |> Nx.add(Nx.multiply(emb_cc, Nx.new_axis(w_cc, -1))) + + Nx.add(embed, interpolated) + end + + # Per-patch metadata derived from image_grid_thw. + # Returns {row_in_image, col_in_image, grid_h_per_patch, grid_w_per_patch, image_id_per_patch}. 
+ # All tensors have shape {total_patches}. + defp patch_metadata(grid_thw, total_patches, merge_size) do + grid_t = grid_thw[[.., 0]] + grid_h = grid_thw[[.., 1]] + grid_w = grid_thw[[.., 2]] + + patches_per_image = grid_t |> Nx.multiply(grid_h) |> Nx.multiply(grid_w) + + cumulative = Nx.cumulative_sum(patches_per_image) + exclusive_cumulative = Nx.subtract(cumulative, patches_per_image) + total_real_patches = Nx.sum(patches_per_image) + + patch_indices = Nx.iota({total_patches}, type: :s64) + + # Patches beyond total_real_patches are padding slots (when the + # featurizer was configured with :max_patches). Mark them invalid so + # downstream attention masking can exclude them entirely. + patch_valid = Nx.less(patch_indices, total_real_patches) + + image_id_raw = + patch_indices + |> Nx.new_axis(-1) + |> Nx.greater_equal(Nx.new_axis(cumulative, 0)) + |> Nx.sum(axes: [-1]) + |> Nx.as_type(:s64) + + n_images = Nx.axis_size(grid_thw, 0) + # Padded patches map to image_id == n_images (out of bounds). Clip so + # gather operations succeed. Their derived row/col/grid values are + # garbage but get masked out via `patch_valid` in the attention step. + image_id_per_patch = Nx.clip(image_id_raw, 0, n_images - 1) + + offset_per_patch = Nx.take(exclusive_cumulative, image_id_per_patch) + local_index = Nx.subtract(patch_indices, offset_per_patch) + + grid_h_per_patch = Nx.take(grid_h, image_id_per_patch) + grid_w_per_patch = Nx.take(grid_w, image_id_per_patch) + + # Padded images have grid_w == 0; guard the divisions so we don't + # divide by zero. The resulting coordinates for padded patches are + # arbitrary and are masked out downstream. 
+ safe_grid_w = Nx.max(grid_w_per_patch, merge_size) + + merge_sq = merge_size * merge_size + merged_w_per_patch = Nx.quotient(safe_grid_w, merge_size) + + block_idx = Nx.quotient(local_index, merge_sq) + within = Nx.remainder(local_index, merge_sq) + block_row = Nx.quotient(block_idx, merged_w_per_patch) + block_col = Nx.remainder(block_idx, merged_w_per_patch) + within_h = Nx.quotient(within, merge_size) + within_w = Nx.remainder(within, merge_size) + + row_in_image = block_row |> Nx.multiply(merge_size) |> Nx.add(within_h) + col_in_image = block_col |> Nx.multiply(merge_size) |> Nx.add(within_w) + + {row_in_image, col_in_image, grid_h_per_patch, grid_w_per_patch, image_id_per_patch, + patch_valid} + end + + defp encoder(embeddings, grid_thw, spec, opts) do + name = opts[:name] + + deepstack_indexes = MapSet.new(spec.deepstack_visual_indexes) + + head_dim = div(spec.hidden_size, spec.num_attention_heads) + rotary_dim = div(head_dim, 2) + + rotary_2d = + Axon.layer( + fn embed, grid_thw_t, _opts -> + {_batch, total_patches, _hidden} = Nx.shape(embed) + + {row_in_image, col_in_image, _, _, _, _} = + patch_metadata(grid_thw_t, total_patches, spec.spatial_merge_size) + + compute_2d_rotary_from_positions( + row_in_image, + col_in_image, + rotary_dim, + spec.rotary_embedding_base + ) + end, + [embeddings, grid_thw], + op_name: :rotary_2d + ) + + attention_mask = + Axon.layer( + fn embed, grid_thw_t, _opts -> + {_batch, total_patches, _hidden} = Nx.shape(embed) + + {_, _, _, _, image_id_per_patch, patch_valid} = + patch_metadata(grid_thw_t, total_patches, spec.spatial_merge_size) + + block_diagonal_attention_mask(image_id_per_patch, patch_valid) + end, + [embeddings, grid_thw], + op_name: :attention_mask + ) + + vision_transformer_blocks( + embeddings, + rotary_2d, + attention_mask, + spec, + deepstack_indexes, + name + ) + end + + # 2D rotary cos/sin from per-patch (row, col) positions. + # Returns {cos, sin}, each of shape {total_patches, rotary_dim}. 
+ defnp compute_2d_rotary_from_positions(row_positions, col_positions, rotary_dim, base) do + half_rotary_dim = div(rotary_dim, 2) + range = Nx.iota({half_rotary_dim}) |> Nx.multiply(2) |> Nx.divide(rotary_dim) + inv_freq = 1.0 / Nx.pow(base, range) + + row_angles = Nx.outer(Nx.as_type(row_positions, :f32), inv_freq) + col_angles = Nx.outer(Nx.as_type(col_positions, :f32), inv_freq) + + angles = Nx.concatenate([row_angles, col_angles], axis: -1) + {Nx.cos(angles), Nx.sin(angles)} + end + + # Returns {total_patches, total_patches} boolean tensor where True means + # the two patches share an image AND both are valid (not padding). + defnp block_diagonal_attention_mask(image_id_per_patch, patch_valid) do + a = Nx.new_axis(image_id_per_patch, -1) + b = Nx.new_axis(image_id_per_patch, 0) + same_image = Nx.equal(a, b) + valid_pair = Nx.multiply(Nx.new_axis(patch_valid, -1), Nx.new_axis(patch_valid, 0)) + Nx.logical_and(same_image, valid_pair) + end + + defp vision_transformer_blocks( + embeddings, + rotary_2d, + attention_mask, + spec, + deepstack_indexes, + name + ) do + head_dim = div(spec.hidden_size, spec.num_attention_heads) + + {hidden_state, hidden_states, attentions} = + Enum.reduce(0..(spec.num_blocks - 1), {embeddings, [], []}, fn idx, + {hidden_state, hidden_states, + attentions} -> + block_name = join(name, idx) + + normed = + Axon.layer_norm(hidden_state, + epsilon: spec.layer_norm_epsilon, + name: join(block_name, "norm1") + ) + + {attn_output, attn_weights} = + vision_attention_with_2d_rotary( + normed, + rotary_2d, + attention_mask, + spec, + head_dim, + join(block_name, "attn") + ) + + hidden_state = Axon.add(hidden_state, attn_output) + + normed = + Axon.layer_norm(hidden_state, + epsilon: spec.layer_norm_epsilon, + name: join(block_name, "norm2") + ) + + ffn_output = + normed + |> Axon.dense(spec.intermediate_size, + kernel_initializer: kernel_initializer(spec), + name: join(block_name, "mlp.fc1") + ) + |> Layers.activation(spec.activation) + |> 
Axon.dense(spec.hidden_size, + kernel_initializer: kernel_initializer(spec), + name: join(block_name, "mlp.fc2") + ) + + hidden_state = Axon.add(hidden_state, ffn_output) + + {hidden_state, hidden_states ++ [hidden_state], attentions ++ [attn_weights]} + end) + + deepstack_merged_features = + deepstack_indexes + |> Enum.sort() + |> Enum.with_index() + |> Enum.map(fn {layer_idx, merger_idx} -> + hidden_state_at_layer = + if layer_idx < length(hidden_states) do + Enum.at(hidden_states, layer_idx) + else + List.last(hidden_states) + end + + deepstack_merger(hidden_state_at_layer, spec, merger_idx, "deepstack_merger_list") + end) + + %{ + hidden_state: hidden_state, + hidden_states: Axon.container(List.to_tuple(hidden_states)), + attentions: Axon.container(List.to_tuple(attentions)), + deepstack_hidden_states: Axon.container(List.to_tuple(deepstack_merged_features)) + } + end + + defp deepstack_merger(hidden_state, spec, index, name) do + merger_name = join(name, index) + merge_sq = spec.spatial_merge_size * spec.spatial_merge_size + mlp_input_size = spec.hidden_size * merge_sq + + hidden_state + |> Axon.nx(fn x -> + {batch, total_patches, hidden} = Nx.shape(x) + Nx.reshape(x, {batch, div(total_patches, merge_sq), merge_sq * hidden}) + end) + |> Axon.layer_norm(epsilon: spec.layer_norm_epsilon, name: join(merger_name, "norm")) + |> Axon.dense(mlp_input_size, + kernel_initializer: kernel_initializer(spec), + name: join(merger_name, "linear_fc1") + ) + |> Layers.activation(spec.activation) + |> Axon.dense(spec.out_hidden_size, + kernel_initializer: kernel_initializer(spec), + name: join(merger_name, "linear_fc2") + ) + end + + defp vision_attention_with_2d_rotary( + hidden_state, + rotary_2d, + attention_mask, + spec, + head_dim, + name + ) do + qkv = + Axon.dense(hidden_state, spec.hidden_size * 3, + kernel_initializer: kernel_initializer(spec), + name: join(name, "qkv") + ) + + {query, key, value} = + Axon.layer( + fn qkv, _opts -> + {batch, seq_len, _} = Nx.shape(qkv) 
+ qkv_reshaped = Nx.reshape(qkv, {batch, seq_len, 3, spec.num_attention_heads, head_dim}) + qkv_transposed = Nx.transpose(qkv_reshaped, axes: [2, 0, 3, 1, 4]) + {qkv_transposed[0], qkv_transposed[1], qkv_transposed[2]} + end, + [qkv], + name: join(name, "split_qkv") + ) + |> then(fn layer -> + q = Axon.nx(layer, fn {q, _k, _v} -> q end) + k = Axon.nx(layer, fn {_q, k, _v} -> k end) + v = Axon.nx(layer, fn {_q, _k, v} -> v end) + {q, k, v} + end) + + {rotated_query, rotated_key} = + Axon.layer( + fn query, key, rotary_2d, _opts -> + {cos, sin} = rotary_2d + apply_2d_rotary_embedding(query, key, cos, sin) + end, + [query, key, rotary_2d], + name: join(name, "rotary_2d") + ) + |> then(fn layer -> + q = Axon.nx(layer, fn {q, _k} -> q end) + k = Axon.nx(layer, fn {_q, k} -> k end) + {q, k} + end) + + scale = :math.sqrt(head_dim) + + attn_output = + Axon.layer( + fn query, key, value, attention_mask, _opts -> + # query, key, value: {batch, heads, seq, head_dim} + # attention_mask: {seq, seq} boolean (True = attend) + scores = Nx.dot(query, [3], [0, 1], key, [3], [0, 1]) + scores = Nx.divide(scores, scale) + + mask_value = + attention_mask + |> Nx.select(Nx.tensor(0.0, type: :f32), Nx.tensor(-1.0e9, type: :f32)) + |> Nx.new_axis(0) + |> Nx.new_axis(0) + + scores = Nx.add(scores, mask_value) + weights = Axon.Activations.softmax(scores, axis: -1) + output = Nx.dot(weights, [3], [0, 1], value, [2], [0, 1]) + + {output, weights} + end, + [rotated_query, rotated_key, value, attention_mask], + name: join(name, "attention") + ) + + output = Axon.nx(attn_output, fn {out, _weights} -> out end) + weights = Axon.nx(attn_output, fn {_out, weights} -> weights end) + + output = + Axon.layer( + fn x, _opts -> + {batch, heads, seq_len, head_dim} = Nx.shape(x) + hidden_size = heads * head_dim + + x + |> Nx.transpose(axes: [0, 2, 1, 3]) + |> Nx.reshape({batch, seq_len, hidden_size}) + end, + [output], + name: join(name, "reshape_output") + ) + + output = + Axon.dense(output, 
spec.hidden_size, + kernel_initializer: kernel_initializer(spec), + name: join(name, "proj") + ) + + {output, weights} + end + + defnp apply_2d_rotary_embedding(query, key, cos, sin) do + {_batch, _heads, _seq, head_dim} = Nx.shape(query) + rotary_dim = div(head_dim, 2) + + {q_rot, q_pass} = split_rotary(query, rotary_dim) + {k_rot, k_pass} = split_rotary(key, rotary_dim) + + cos = cos |> Nx.new_axis(0) |> Nx.new_axis(0) + sin = sin |> Nx.new_axis(0) |> Nx.new_axis(0) + + q_embed = q_rot * cos + rotate_half(q_rot) * sin + k_embed = k_rot * cos + rotate_half(k_rot) * sin + + {Nx.concatenate([q_embed, q_pass], axis: -1), Nx.concatenate([k_embed, k_pass], axis: -1)} + end + + defnp split_rotary(tensor, rotary_dim) do + {batch, heads, seq, head_dim} = Nx.shape(tensor) + pass_dim = head_dim - rotary_dim + rotary_part = Nx.slice(tensor, [0, 0, 0, 0], [batch, heads, seq, rotary_dim]) + pass_part = Nx.slice(tensor, [0, 0, 0, rotary_dim], [batch, heads, seq, pass_dim]) + {rotary_part, pass_part} + end + + defnp rotate_half(x) do + {batch, heads, seq, dim} = Nx.shape(x) + half_dim = div(dim, 2) + x1 = Nx.slice(x, [0, 0, 0, 0], [batch, heads, seq, half_dim]) + x2 = Nx.slice(x, [0, 0, 0, half_dim], [batch, heads, seq, half_dim]) + Nx.concatenate([Nx.negate(x2), x1], axis: -1) + end + + defp patch_merger(hidden_state, spec, opts) do + name = opts[:name] + merge_sq = spec.spatial_merge_size * spec.spatial_merge_size + mlp_input_size = spec.hidden_size * merge_sq + + hidden_state + |> Axon.layer_norm(epsilon: spec.layer_norm_epsilon, name: join(name, "ln_q")) + |> Axon.nx(fn x -> + {batch, total_patches, hidden} = Nx.shape(x) + Nx.reshape(x, {batch, div(total_patches, merge_sq), merge_sq * hidden}) + end) + |> Axon.dense(mlp_input_size, + kernel_initializer: kernel_initializer(spec), + name: join(name, "mlp.0") + ) + |> Layers.activation(spec.activation) + |> Axon.dense(spec.out_hidden_size, + kernel_initializer: kernel_initializer(spec), + name: join(name, "mlp.2") + ) + end + + 
defp kernel_initializer(spec) do + Axon.Initializers.normal(scale: spec.initializer_scale) + end + + defimpl Bumblebee.HuggingFace.Transformers.Config do + def load(spec, %{"model_type" => "qwen3_vl", "vision_config" => data}) do + load(spec, data) + end + + def load(spec, data) do + import Shared.Converters + + opts = + convert!(data, + num_blocks: {"depth", number()}, + num_attention_heads: {"num_heads", number()}, + num_channels: {"in_channels", number()}, + patch_size: {"patch_size", number()}, + temporal_patch_size: {"temporal_patch_size", number()}, + spatial_merge_size: {"spatial_merge_size", number()}, + activation: {"hidden_act", activation()}, + initializer_scale: {"initializer_range", number()} + ) ++ Shared.common_options_from_transformers(data, spec) + + hidden_size = data["hidden_size"] || data["embed_dim"] || spec.hidden_size + opts = Keyword.put(opts, :hidden_size, hidden_size) + + mlp_ratio = Map.get(data, "mlp_ratio", 4) + intermediate_size = data["intermediate_size"] || hidden_size * mlp_ratio + out_hidden_size = Map.get(data, "out_hidden_size", spec.out_hidden_size) + + opts = + opts + |> Keyword.put(:intermediate_size, intermediate_size) + |> Keyword.put(:out_hidden_size, out_hidden_size) + + @for.config(spec, opts) + end + end + + defimpl Bumblebee.HuggingFace.Transformers.Model do + def params_mapping(_spec) do + %{ + "patch_embed.proj" => %{ + "kernel" => { + [{"visual.patch_embed.proj", "weight"}], + fn [kernel] -> kernel end + }, + "bias" => { + [{"visual.patch_embed.proj", "bias"}], + fn [bias] -> bias end + } + }, + "pos_embed" => "visual.pos_embed", + "blocks.{n}.norm1" => "visual.blocks.{n}.norm1", + "blocks.{n}.attn.qkv" => "visual.blocks.{n}.attn.qkv", + "blocks.{n}.attn.proj" => "visual.blocks.{n}.attn.proj", + "blocks.{n}.norm2" => "visual.blocks.{n}.norm2", + "blocks.{n}.mlp.fc1" => "visual.blocks.{n}.mlp.linear_fc1", + "blocks.{n}.mlp.fc2" => "visual.blocks.{n}.mlp.linear_fc2", + "merger.ln_q" => "visual.merger.norm", + 
"merger.mlp.0" => "visual.merger.linear_fc1", + "merger.mlp.2" => "visual.merger.linear_fc2", + "deepstack_merger_list.{n}.norm" => "visual.deepstack_merger_list.{n}.norm", + "deepstack_merger_list.{n}.linear_fc1" => "visual.deepstack_merger_list.{n}.linear_fc1", + "deepstack_merger_list.{n}.linear_fc2" => "visual.deepstack_merger_list.{n}.linear_fc2" + } + end + end +end diff --git a/notebooks/qwen3_vl.livemd b/notebooks/qwen3_vl.livemd new file mode 100644 index 00000000..7b388bff --- /dev/null +++ b/notebooks/qwen3_vl.livemd @@ -0,0 +1,248 @@ +# Qwen3-VL Vision-Language Model + +```elixir +Mix.install([ + {:bumblebee, path: "."}, + {:nx, "~> 0.9"}, + {:exla, "~> 0.9"}, + {:kino, "~> 0.14"}, + {:stb_image, "~> 0.6"} +]) + +Nx.global_default_backend(EXLA.Backend) +``` + +## Introduction + +Qwen3-VL is a multimodal vision-language model from Alibaba that can understand images and generate text descriptions. This notebook demonstrates how to use Qwen3-VL with Bumblebee. + +## Model Architecture + +Qwen3-VL combines: +- **Vision Encoder**: Processes images using 2D spatial rotary position embeddings +- **Text Decoder**: Qwen3-based transformer with MRoPE (Multi-axis Rotary Position Embedding) + +Key features: +- 3D convolution patch embedding (supports video temporal dimension) +- 2D spatial rotary embeddings for accurate spatial understanding +- Patch merger for spatial reduction +- Per-image `image_grid_thw` threaded through the encoder so it handles + multiple images of varying sizes in a single prompt +- Smart-resize with `:low`/`:medium`/`:high` quality presets to trade + off image detail against visual-token count + +## Load the Model + +```elixir +# Load the model, tokenizer, and featurizer +repo = "Qwen/Qwen3-VL-2B-Instruct" + +{:ok, model_info} = Bumblebee.load_model({:hf, repo}) +{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, repo}) + +# The featurizer accepts a `:quality` preset (`:low`, `:medium`, `:high`) or +# explicit `:min_pixels` / `:max_pixels` 
caps. Smart-resize preserves aspect +# ratio and rounds each side to a multiple of `patch_size * merge_size`. +{:ok, featurizer} = + Bumblebee.load_featurizer({:hf, repo}, + module: Bumblebee.Vision.Qwen3VLFeaturizer, + quality: :medium + ) + +:ok +``` + +## Process an Image + +```elixir +# Upload an image +image_input = Kino.Input.image("Upload an image", format: :rgb) +``` + +```elixir +# Get the uploaded image +image_data = Kino.Input.read(image_input) + +image = + if image_data do + # `format: :rgb` uploads are raw HWC pixel bytes rather than an encoded image file, + # so wrap them with their known shape instead of decoding. + path = Kino.Input.file_path(image_data.file_ref) + StbImage.new(File.read!(path), {image_data.height, image_data.width, 3}) + else + # Use a sample image if none uploaded + {:ok, %{body: body}} = + Req.get("https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/280px-PNG_transparency_demonstration_1.png") + StbImage.read_binary!(body) + end + +Kino.Image.new(image) +``` + +## Generate Image Description + +```elixir +# Build the prompt for image description +prompt = "<|im_start|>user +<|vision_start|><|image_pad|><|vision_end|>Describe this image in detail.<|im_end|> +<|im_start|>assistant +" + +# Tokenize the prompt +inputs = Bumblebee.apply_tokenizer(tokenizer, prompt) + +# Process the image. The featurizer returns `pixel_values` (concatenated, +# pre-extracted patches) and `image_grid_thw` (per-image grid dims). Both +# are required by the model — `image_grid_thw` tells the vision encoder +# the correct per-patch positions.
+image_inputs = Bumblebee.apply_featurizer(featurizer, image) + +# Combine inputs +combined_inputs = Map.merge(inputs, image_inputs) + +# Run inference +outputs = Axon.predict(model_info.model, model_info.params, combined_inputs) + +# Decode the most likely next token at every position of this single forward pass. +# This is not autoregressive generation; for that, see "Generating in One Call" below. +logits = outputs.logits +predicted_ids = Nx.argmax(logits, axis: -1) + +Bumblebee.Tokenizer.decode(tokenizer, predicted_ids) +``` + +## Generating in One Call + +`Bumblebee.Multimodal.ImageTextToText.generate/6` is a single-call +helper that featurizes the image, expands the `<|image_pad|>` marker +in your prompt to the right number of visual tokens, and runs +generation: + +```elixir +{:ok, generation_config} = Bumblebee.load_generation_config({:hf, repo}) +generation_config = Bumblebee.configure(generation_config, max_new_tokens: 64) + +prompt = "<|im_start|>user +<|vision_start|><|image_pad|><|vision_end|>What is in this image?<|im_end|> +<|im_start|>assistant +" + +Bumblebee.Multimodal.ImageTextToText.generate( + model_info, + featurizer, + tokenizer, + generation_config, + prompt, + image +) +#=> %{text: "A group of cats lying on a pink blanket with remote controls.", token_ids: ...} +``` + +> Note: each `generate/6` call recompiles the generation graph when +> the image size or sequence length changes. For repeated calls, use +> `compile/5` + `run/3` (see below). + +## Compile Once, Run Many + +For serving-style use where many images of varying sizes share one +compiled graph, configure upper bounds with `compile/5`, then call +`run/3` repeatedly. The featurizer pads `pixel_values` and +`image_grid_thw` to the maxima you set, and the vision encoder +excludes the padded patches from attention.
+ +```elixir +compiled = + Bumblebee.Multimodal.ImageTextToText.compile( + model_info, + featurizer, + tokenizer, + generation_config, + max_patches: 1024, + max_num_images: 1, + sequence_length: 384 + ) + +# First call: JIT-compiles for these upper-bound shapes +Bumblebee.Multimodal.ImageTextToText.run(compiled, prompt, image) + +# Subsequent calls reuse the same compiled graph, even if the new image produces +# fewer real patches — padding makes the shapes match. `another_image` is a placeholder for any second image you have loaded. +Bumblebee.Multimodal.ImageTextToText.run(compiled, prompt, another_image) +``` + +On `Qwen3-VL-2B-Instruct` + CPU + a 640×480 COCO image, the warm +call runs in ~10s while the cold (JIT-compiling) call takes ~27s — a +2.7x speedup on every call after the first, so the time saved grows with each repeated call. + +## Multiple Images in One Prompt + +`apply_featurizer/2` accepts a list of images of differing sizes. They +are concatenated into a single flat patch sequence and the per-image +grid dimensions are returned via `image_grid_thw`. + +```elixir +images = [image, image] + +multi_image_inputs = Bumblebee.apply_featurizer(featurizer, images) +# multi_image_inputs["image_grid_thw"] has shape {2, 3} + +prompt = "<|im_start|>user +<|vision_start|><|image_pad|><|vision_end|><|vision_start|><|image_pad|><|vision_end|>Compare these two images.<|im_end|> +<|im_start|>assistant +" + +inputs = Bumblebee.apply_tokenizer(tokenizer, prompt) +combined_inputs = Map.merge(inputs, multi_image_inputs) + +outputs = Axon.predict(model_info.model, model_info.params, combined_inputs) +``` + +## Validation Against Standalone Qwen3 + +Qwen3-VL's text decoder is the standalone Qwen3 model.
A useful sanity +check after touching the vision/multimodal code is to confirm the +standalone Qwen3 text path still runs cleanly: + +```elixir +# Loads only the small config.json, not weights +{:ok, qwen3_spec} = Bumblebee.load_spec({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) +IO.inspect(qwen3_spec.__struct__) +# => Bumblebee.Text.Qwen3 +``` + +For a full end-to-end check (downloads ~8GB of weights): + +```elixir +{:ok, qwen3} = Bumblebee.load_model({:hf, "Qwen/Qwen3-4B-Instruct-2507"}, type: :bf16) +{:ok, qwen3_tokenizer} = Bumblebee.load_tokenizer({:hf, "Qwen/Qwen3-4B-Instruct-2507"}) + +serving = + Bumblebee.Text.generation(qwen3, qwen3_tokenizer, + max_new_tokens: 64, + compile: [batch_size: 1, sequence_length: 512] + ) + +Nx.Serving.run(serving, "Explain in one sentence what a vector database is.") +``` + +## Quality Profiles + +Use the `:quality` preset to bound how many visual tokens each image +produces. Lower quality = faster inference, less spatial detail. + +```elixir +# Token-budget knobs +{:ok, fast_featurizer} = + Bumblebee.load_featurizer({:hf, repo}, + module: Bumblebee.Vision.Qwen3VLFeaturizer, + quality: :low + ) + +# Or explicit pixel caps (overrides :quality) +{:ok, custom_featurizer} = + Bumblebee.load_featurizer({:hf, repo}, + module: Bumblebee.Vision.Qwen3VLFeaturizer, + min_pixels: 256 * 32 * 32, + max_pixels: 1280 * 32 * 32 + ) +``` diff --git a/test/bumblebee/multimodal/qwen3_vl_test.exs b/test/bumblebee/multimodal/qwen3_vl_test.exs new file mode 100644 index 00000000..d4928350 --- /dev/null +++ b/test/bumblebee/multimodal/qwen3_vl_test.exs @@ -0,0 +1,137 @@ +defmodule Bumblebee.Multimodal.Qwen3VLTest do + use ExUnit.Case, async: true + + import Bumblebee.TestHelpers + + @moduletag model_test_tags() + + test ":for_conditional_generation" do + # Tiny checkpoint generated with a local scratch script, /tmp/create_tiny_qwen3vl_v4.py (transformers 4.57.3): + # - text_config: vocab_size=1024, hidden_size=64, num_hidden_layers=2, + # num_attention_heads=4, num_key_value_heads=2,
head_dim=16, + # intermediate_size=128 + # - vision_config: depth=2, hidden_size=32, num_heads=4, intermediate_size=64, + # out_hidden_size=64, patch_size=14, spatial_merge_size=2, + # temporal_patch_size=2 + # + # Reference logits produced by a local scratch script, /tmp/generate_reference_v2.py (seed=0): + # model = Qwen3VLForConditionalGeneration.from_pretrained(model_path) + # outputs = model(input_ids=torch.tensor([[10, 20, 30, 40, 50, 60, 0, 0]]), + # attention_mask=torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])) + # outputs.logits[0, 0:3, 0:5].numpy() + + assert {:ok, %{model: model, params: params, spec: spec}} = + Bumblebee.load_model({:hf, "roulis/tiny-random-Qwen3VLForConditionalGeneration"}) + + assert %Bumblebee.Multimodal.Qwen3VL{architecture: :for_conditional_generation} = spec + + inputs = %{ + "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 0, 0]]), + "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 0, 0]]) + } + + outputs = Axon.predict(model, params, inputs) + + assert Nx.shape(outputs.logits) == {1, 8, 1024} + + # Text-only reference slice (first 3 positions x first 5 vocab entries) from Python (transformers 4.57.3) + assert_all_close( + outputs.logits[[.., 0..2, 0..4]], + Nx.tensor([ + [ + [0.0410, 0.0745, -0.0977, 0.0099, 0.2705], + [-0.0504, 0.1776, -0.0481, -0.0269, 0.1630], + [-0.1887, 0.0889, -0.1113, -0.1756, 0.0805] + ] + ]), + atol: 1.0e-4 + ) + end + + test "vision pathway runs end-to-end with image_grid_thw" do + assert {:ok, %{model: model, params: params, spec: spec}} = + Bumblebee.load_model({:hf, "roulis/tiny-random-Qwen3VLForConditionalGeneration"}) + + factor = spec.vision_spec.patch_size * spec.vision_spec.spatial_merge_size + + featurizer = + Bumblebee.configure(Bumblebee.Vision.Qwen3VLFeaturizer, + patch_size: spec.vision_spec.patch_size, + merge_size: spec.vision_spec.spatial_merge_size, + temporal_patch_size: spec.vision_spec.temporal_patch_size, + min_pixels: 4 * factor * factor, + max_pixels: 64 * factor * factor + ) + + image = Nx.iota({64, 64, 3}, type: :u8) + image_inputs =
Bumblebee.apply_featurizer(featurizer, image) + + [grid_t, grid_h, grid_w] = Nx.to_flat_list(image_inputs["image_grid_thw"]) + merge_size = spec.vision_spec.spatial_merge_size + visual_tokens = grid_t * div(grid_h, merge_size) * div(grid_w, merge_size) + + image_token_id = spec.image_token_id + input_ids = List.duplicate(image_token_id, visual_tokens) ++ [1, 2, 3] + attention_mask = List.duplicate(1, length(input_ids)) + + inputs = %{ + "input_ids" => Nx.tensor([input_ids]), + "attention_mask" => Nx.tensor([attention_mask]), + "pixel_values" => image_inputs["pixel_values"], + "image_grid_thw" => image_inputs["image_grid_thw"] + } + + outputs = Axon.predict(model, params, inputs) + + expected_seq = visual_tokens + 3 + assert {1, ^expected_seq, 1024} = Nx.shape(outputs.logits) + end + + test "vision pathway accepts multiple images of different sizes" do + assert {:ok, %{model: model, params: params, spec: spec}} = + Bumblebee.load_model({:hf, "roulis/tiny-random-Qwen3VLForConditionalGeneration"}) + + factor = spec.vision_spec.patch_size * spec.vision_spec.spatial_merge_size + + featurizer = + Bumblebee.configure(Bumblebee.Vision.Qwen3VLFeaturizer, + patch_size: spec.vision_spec.patch_size, + merge_size: spec.vision_spec.spatial_merge_size, + temporal_patch_size: spec.vision_spec.temporal_patch_size, + min_pixels: 4 * factor * factor, + max_pixels: 64 * factor * factor + ) + + images = [Nx.iota({56, 56, 3}, type: :u8), Nx.iota({84, 56, 3}, type: :u8)] + image_inputs = Bumblebee.apply_featurizer(featurizer, images) + + assert {2, 3} = Nx.shape(image_inputs["image_grid_thw"]) + + merge_size = spec.vision_spec.spatial_merge_size + + visual_tokens = + image_inputs["image_grid_thw"] + |> Nx.to_batched(1) + |> Enum.map(fn row -> + [t, h, w] = Nx.to_flat_list(row) + t * div(h, merge_size) * div(w, merge_size) + end) + |> Enum.sum() + + image_token_id = spec.image_token_id + input_ids = List.duplicate(image_token_id, visual_tokens) ++ [1, 2] + attention_mask =
List.duplicate(1, length(input_ids)) + + inputs = %{ + "input_ids" => Nx.tensor([input_ids]), + "attention_mask" => Nx.tensor([attention_mask]), + "pixel_values" => image_inputs["pixel_values"], + "image_grid_thw" => image_inputs["image_grid_thw"] + } + + outputs = Axon.predict(model, params, inputs) + + expected_seq = visual_tokens + 2 + assert {1, ^expected_seq, 1024} = Nx.shape(outputs.logits) + end +end diff --git a/test/bumblebee/vision/qwen3_vl_featurizer_test.exs b/test/bumblebee/vision/qwen3_vl_featurizer_test.exs new file mode 100644 index 00000000..059d4547 --- /dev/null +++ b/test/bumblebee/vision/qwen3_vl_featurizer_test.exs @@ -0,0 +1,168 @@ +defmodule Bumblebee.Vision.Qwen3VLFeaturizerTest do + use ExUnit.Case, async: true + + alias Bumblebee.Vision.Qwen3VLFeaturizer + + defp synthetic_image(height, width, channels \\ 3) do + Nx.iota({height, width, channels}, type: :u8) + |> Nx.remainder(255) + end + + defp featurizer(opts \\ []) do + defaults = [ + patch_size: 16, + temporal_patch_size: 2, + merge_size: 2 + ] + + Bumblebee.configure(Qwen3VLFeaturizer, Keyword.merge(defaults, opts)) + end + + test "produces pixel_values and image_grid_thw for a single image" do + image = synthetic_image(64, 64) + inputs = Bumblebee.apply_featurizer(featurizer(), image) + + # 4x4 = 16 patches; flat = channels * temporal_patch * patch * patch = 3*2*16*16 = 1536 + assert {16, 1536} = Nx.shape(inputs["pixel_values"]) + assert {1, 3} = Nx.shape(inputs["image_grid_thw"]) + + # image_grid_thw is [t, h, w] in patches: 64/16 = 4 per side; temporal duplicated 1->2 -> patches_t=1 + assert Nx.to_flat_list(inputs["image_grid_thw"]) == [1, 4, 4] + end + + test "smart_resize preserves aspect ratio and rounds to factor multiples" do + # 96x64 input. factor = 16 * 2 = 32. 96 = 3*32, 64 = 2*32 — already aligned.
+ image = synthetic_image(96, 64) + inputs = Bumblebee.apply_featurizer(featurizer(), image) + + [_t, grid_h, grid_w] = Nx.to_flat_list(inputs["image_grid_thw"]) + # patch_size=16: 96/16=6, 64/16=4 + assert grid_h == 6 + assert grid_w == 4 + + expected_patches = grid_h * grid_w + assert {^expected_patches, _} = Nx.shape(inputs["pixel_values"]) + end + + test "max_pixels caps the resized image" do + # 1024x1024 input; max_pixels = 256 * factor^2 (a budget of 256 visual tokens) forces a strong downscale. + image = synthetic_image(1024, 1024) + factor = 32 + max_pixels = 256 * factor * factor + + inputs = + Bumblebee.apply_featurizer( + featurizer(min_pixels: 4 * factor * factor, max_pixels: max_pixels), + image + ) + + [_t, grid_h, grid_w] = Nx.to_flat_list(inputs["image_grid_thw"]) + merge_size = 2 + visual_tokens = div(grid_h, merge_size) * div(grid_w, merge_size) + + assert visual_tokens <= 256 + end + + test ":low quality produces fewer visual tokens than :high" do + image = synthetic_image(2048, 1536) + + [_t, low_h, low_w] = + Bumblebee.apply_featurizer(featurizer(quality: :low), image)["image_grid_thw"] + |> Nx.to_flat_list() + + [_t, high_h, high_w] = + Bumblebee.apply_featurizer(featurizer(quality: :high), image)["image_grid_thw"] + |> Nx.to_flat_list() + + assert low_h * low_w < high_h * high_w + end + + test "supports multiple images of different sizes in one call" do + images = [synthetic_image(64, 64), synthetic_image(96, 64)] + inputs = Bumblebee.apply_featurizer(featurizer(), images) + + assert {2, 3} = Nx.shape(inputs["image_grid_thw"]) + assert Nx.to_flat_list(inputs["image_grid_thw"]) == [1, 4, 4, 1, 6, 4] + + # Total patches = 4*4 + 6*4 = 40; flat = 3*2*16*16 = 1536 + assert {40, 1536} = Nx.shape(inputs["pixel_values"]) + end + + test "windowed layout: every 4 consecutive patches form one 2x2 merge block" do + # A 64x64 image gives a 4x4 patch grid. With merge_size=2 there are + # 2x2 = 4 merge blocks of 4 patches each.
Patches inside one block + # come from one spatial region of the resized image, so their flat + # patch features should be close. We verify the layout by + # checking that the mean variance within each block-of-4 is smaller + # than the variance of the block means across blocks. + image = + Nx.iota({64, 64, 3}, type: :f32) + |> Nx.divide(64 * 64 * 3) + + inputs = Bumblebee.apply_featurizer(featurizer(normalize: false), image) + + grouped = Nx.reshape(inputs["pixel_values"], {4, 4, 1536}) + within_block_var = grouped |> Nx.variance(axes: [1]) |> Nx.mean() |> Nx.to_number() + + across_block_var = + grouped + |> Nx.mean(axes: [1]) + |> Nx.variance(axes: [0]) + |> Nx.mean() + |> Nx.to_number() + + assert within_block_var < across_block_var + end + + test "raises on extreme aspect ratios" do + image = synthetic_image(1, 400) + + assert_raise ArgumentError, ~r/aspect ratio/, fn -> + Bumblebee.apply_featurizer(featurizer(), image) + end + end + + test "raises when min_pixels exceeds max_pixels" do + image = synthetic_image(64, 64) + + assert_raise ArgumentError, ~r/min_pixels/, fn -> + Bumblebee.apply_featurizer(featurizer(min_pixels: 10_000, max_pixels: 1_000), image) + end + end + + test "pads pixel_values to :max_patches with zeros" do + image = synthetic_image(64, 64) + inputs = Bumblebee.apply_featurizer(featurizer(max_patches: 64), image) + + assert {64, 1536} = Nx.shape(inputs["pixel_values"]) + # First 16 patches are real, rest are zero-padded + real_block = inputs["pixel_values"][[0..15, ..]] + pad_block = inputs["pixel_values"][[16..63, ..]] + assert Nx.to_number(Nx.sum(Nx.abs(pad_block))) == 0.0 + refute Nx.to_number(Nx.sum(Nx.abs(real_block))) == 0.0 + end + + test "pads image_grid_thw with [0, 0, 0] rows" do + image = synthetic_image(64, 64) + inputs = Bumblebee.apply_featurizer(featurizer(max_num_images: 3), image) + + assert {3, 3} = Nx.shape(inputs["image_grid_thw"]) + assert Nx.to_flat_list(inputs["image_grid_thw"]) == [1, 4, 4, 0, 0, 0, 0, 0, 0] + end + + test
"raises when :max_patches is not a multiple of merge_size**2" do + image = synthetic_image(64, 64) + + assert_raise ArgumentError, ~r/multiple of merge_size/, fn -> + Bumblebee.apply_featurizer(featurizer(max_patches: 17), image) + end + end + + test "raises when image needs more patches than :max_patches" do + image = synthetic_image(96, 96) + + assert_raise ArgumentError, ~r/raise :max_patches/, fn -> + Bumblebee.apply_featurizer(featurizer(max_patches: 16), image) + end + end +end