diff --git a/contracts/apr-import-config-fidelity-v1.yaml b/contracts/apr-import-config-fidelity-v1.yaml new file mode 100644 index 000000000..95d524dfd --- /dev/null +++ b/contracts/apr-import-config-fidelity-v1.yaml @@ -0,0 +1,268 @@ +# ───────────────────────────────────────────────────────────── +# Contract: apr-import-config-fidelity-v1 +# GGUF→APR import preserves forward-affecting config metadata. +# ───────────────────────────────────────────────────────────── +# The GGUF→APR Q4K import path (`apr import .gguf --preserve-q4k`, +# default for GGUF imports) MUST stamp into the `.apr` metadata the SAME +# forward-affecting config the `.gguf` inference path (GGUFConfig::from_gguf) +# would compute — so `apr run model.apr` and `apr run model.gguf` apply an +# IDENTICAL forward pass (RMSNorm epsilon, RoPE theta/type, attention dims) +# at EVERY position. +# +# Motivation: a P4 correctness investigation (2026-06-25, reproduced on real +# GB10 sm_121) chased a reported `.apr`-vs-`.gguf` GPU F2 per-position +# divergence (pos-11 argmax mismatch → silent CPU fallback). The decisive +# CPU-side finding: for qwen2.5-coder-1.5b the two loaders' GGUFConfig is +# ALREADY byte-identical (architecture, dims, num_heads=12, num_kv_heads=2, +# head_dim=128, rope_theta=1e6, rope_type=2 NEOX, eps=1e-6, context_length= +# 32768 all match), and the gx10 GPU run behaves IDENTICALLY for both formats +# (same PARITY-GATE cosine 0.9817, same F2 result, same coherent output under +# SKIP_PARITY_GATE=1). So the reported divergence is NOT a format/config +# divergence. BUT the audit surfaced a LATENT config-fidelity gap in the same +# import boundary: the eps fallback was hard-coded `unwrap_or(1e-5)` (LLaMA's +# epsilon) for EVERY architecture, while from_gguf falls back to the +# architecture-specific `ArchConstraints::default_eps` (1e-6 for Qwen2/Qwen3). +# For any 1e-6-eps model whose GGUF OMITS the epsilon key (e.g. 
a weights-only +# Qwen2 export) the old code would stamp 1e-5 into the `.apr` → a real forward +# divergence vs the same model run as `.gguf`. This contract ratchets the +# fidelity invariant so the producer can never silently diverge again. +# +# Peer: contracts/apr-cpu-vs-gpu-output-parity-v1.yaml +# Peer: contracts/apr-convert-hf-arch-v1.yaml +# Peer: contracts/apr-inspect-metadata-propagation-v1.yaml + +metadata: + version: "1.0.0" + created: "2026-06-25" + updated: "2026-06-25" + kind: schema + author: PAIML Engineering + description: > + The GGUF→APR Q4K import (`GgufToAprQ4KConverter::convert`) MUST stamp the + forward-affecting config — rms_norm_eps, rope_theta, rope_type — using the + SAME source-of-truth the `.gguf` inference path (GGUFConfig::from_gguf) + uses: the GGUF metadata value verbatim when present, else the + ARCHITECTURE-SPECIFIC default (ArchConstraints::default_eps, + default_rope_theta_for_architecture, infer_rope_type). A hard-coded + cross-architecture fallback (e.g. eps `unwrap_or(1e-5)`) is FORBIDDEN + because it silently diverges a converted `.apr` from its source `.gguf` + on every layer for architectures whose default differs. + changelog: + - "1.0.0 (2026-06-25): Initial authoring. Fixes the eps `unwrap_or(1e-5)` + latent gap in q4k_converter_helpers.rs::convert by routing through a new + resolve_rms_eps() helper that mirrors from_gguf's arch-specific default. + Adds oracle-based unit falsifiers (FALSIFY-APR-IMPORT-EPS-001..004) and + a GGUFConfig from_apr-vs-from_gguf equality integration test." 
+ peer_contracts: + - contracts/apr-cpu-vs-gpu-output-parity-v1.yaml + - contracts/apr-convert-hf-arch-v1.yaml + - contracts/apr-inspect-metadata-propagation-v1.yaml + references: + - "crates/aprender-serve/src/convert/q4k_converter_helpers.rs::resolve_rms_eps" + - "crates/aprender-serve/src/gguf/config.rs::GGUFConfig::from_gguf (oracle, eps via ArchConstraints::default_eps)" + - "crates/aprender-serve/src/gguf/config.rs::GGUFConfig::from_apr" + - "crates/aprender-serve/src/gguf/arch_constraints_fallback.rs (default_eps: qwen2=1e-6, llama=1e-5)" + - "crates/aprender-serve/tests/apr_import_config_fidelity.rs (from_apr == from_gguf integration falsifier)" + +summary: > + `apr import ` is the producer of the `.apr` forward-affecting config. + rms_norm_eps / rope_theta / rope_type stamped into the `.apr` MUST equal the + value GGUFConfig::from_gguf would use for the same GGUF: the file's metadata + value verbatim if present, else the ARCHITECTURE default (never a hard-coded + cross-arch constant). This guarantees `apr run model.apr` and + `apr run model.gguf` run the identical forward pass. + +motivation: > + pos-0 of a transformer forward is RoPE-rotation-invariant and norm-dominated + by the first token, so a config divergence (e.g. a per-layer RMSNorm epsilon + mismatch) is small at pos-0 and COMPOUNDS position-by-position — exactly the + "pos-0 clean, pos-11 divergent" signature of a silent GPU F2 fallback. The + only robust defense is to make the import preserve the GGUF's forward config + EXACTLY, with arch-aware (not hard-coded) fallbacks identical to from_gguf. + +# ─── REQUIRED FIELDS in the stamped .apr metadata ──────────── + +required_fields: + - name: rms_norm_eps + type: f32 + constraint: | + Equals the GGUF `{arch}.attention.layer_norm_rms_epsilon` verbatim when + present; otherwise the architecture default + `ArchConstraints::from_architecture(arch).default_eps` (1e-6 for + Qwen2/Qwen3, 1e-5 for LLaMA/Mistral/Phi/Gemma). 
MUST equal the value + GGUFConfig::from_gguf computes for the same GGUF. A hard-coded + cross-architecture fallback is FORBIDDEN. + ship_blocker: true + - name: rope_theta + type: f32 + constraint: | + Equals the GGUF `{arch}.rope.freq_base` verbatim when present; otherwise + `default_rope_theta_for_architecture(arch)`. MUST match from_gguf. + ship_blocker: true + - name: rope_type + type: u32 + constraint: | + 0 = NORM (adjacent pairs), 2 = NEOX (split halves). Derived via the + shared `infer_rope_type(arch)` single-source-of-truth, identical to + from_gguf. Qwen2/Qwen3/Phi/Gemma = 2; LLaMA/Mistral = 0. + ship_blocker: true + +# ─── INVARIANTS ────────────────────────────────────────────── + +invariants: + +- id: INV-APR-IMPORT-CONFIG-001 + description: > + For a GGUF that OMITS `{arch}.attention.layer_norm_rms_epsilon`, the + import stamps the ARCHITECTURE default into the `.apr`, NOT a hard-coded + constant. Qwen2/Qwen3 → 1e-6; LLaMA → 1e-5. This matches + GGUFConfig::from_gguf's `unwrap_or(constraints.default_eps)`. + falsifier: > + Call resolve_rms_eps("qwen2", empty_metadata) and assert it equals + ArchConstraints::from_architecture("qwen2").default_eps == 1e-6 (NOT the + old 1e-5). Mutation: revert to unwrap_or(1e-5) → assertion fails. + +- id: INV-APR-IMPORT-CONFIG-002 + description: > + When the GGUF DOES carry the epsilon key, the import uses it VERBATIM + (file truth, not an inferred default). Real qwen2.5-coder GGUFs store + 1e-6 here. + falsifier: > + Insert `qwen2.attention.layer_norm_rms_epsilon = 7.5e-6` into metadata, + call resolve_rms_eps("qwen2", md), assert == 7.5e-6. + +- id: INV-APR-IMPORT-CONFIG-003 + description: > + The arch-specific default is NOT a blanket 1e-6: LLaMA correctly stays at + 1e-5 when its GGUF omits the key. The fix is arch-aware, mirroring + from_gguf for ALL architectures. + falsifier: > + resolve_rms_eps("llama", empty_metadata) == 1e-5; resolve_rms_eps("qwen2", + empty_metadata) == 1e-6. 
+ +- id: INV-APR-IMPORT-CONFIG-004 + description: > + End-to-end: the GGUFConfig built by from_apr (the `.apr` loader) on a + round-tripped GGUF→APR model MUST equal the GGUFConfig built by from_gguf + (the oracle, `.gguf` loader) on every forward-affecting field + (architecture, dims, num_heads, num_kv_heads, head_dim, intermediate_dim, + rope_theta, rope_type, eps, attn_scale, context_length). + falsifier: > + Build both configs from the same qwen2.5-coder-1.5b model (.gguf via + from_gguf, freshly-imported .apr via from_apr) and assert field equality. + Host-gated; auto-skips where the fixture is absent. + +# ─── GATES ─────────────────────────────────────────────────── + +gates: + +- id: GATE-APR-IMPORT-CONFIG-001 + invariant: INV-APR-IMPORT-CONFIG-001 + check: | + Unit test: resolve_rms_eps("qwen2", empty) == arch default 1e-6, NOT 1e-5. + Mutation-verified RED on the old unwrap_or(1e-5). + enforcement: ci + severity: high + +- id: GATE-APR-IMPORT-CONFIG-002 + invariant: INV-APR-IMPORT-CONFIG-002 + check: | + Unit test: explicit GGUF epsilon value is used verbatim (not overridden by + a default). + enforcement: ci + severity: high + +- id: GATE-APR-IMPORT-CONFIG-003 + invariant: INV-APR-IMPORT-CONFIG-003 + check: | + Unit test: llama default stays 1e-5; qwen3 default is 1e-6. Arch-aware. + enforcement: ci + severity: high + +- id: GATE-APR-IMPORT-CONFIG-004 + invariant: INV-APR-IMPORT-CONFIG-004 + check: | + Integration test: from_apr's GGUFConfig == from_gguf's GGUFConfig on every + forward-affecting field for qwen2.5-coder-1.5b (host-gated). + enforcement: ci + severity: medium + +# ─── FALSIFICATION TESTS ───────────────────────────────────── + +falsification_tests: + +- id: FALSIFY-APR-IMPORT-EPS-001 + invariant: INV-APR-IMPORT-CONFIG-001 + rule: qwen2-missing-eps-uses-arch-default-1e6 + prediction: > + resolve_rms_eps("qwen2", empty_metadata) == 1e-6 (the Qwen2 arch default), + matching GGUFConfig::from_gguf. The old hard-coded 1e-5 is the bug. 
+ test_kind: unit + site: crates/aprender-serve/src/convert/tests_infer_rope.rs::tests::test_resolve_rms_eps_qwen2_missing_key_uses_arch_default_1e6 + if_fails: import stamps LLaMA's epsilon into a Qwen2 .apr — forward diverges vs the same model as .gguf + +- id: FALSIFY-APR-IMPORT-EPS-002 + invariant: INV-APR-IMPORT-CONFIG-003 + rule: qwen3-missing-eps-uses-arch-default-1e6 + prediction: > + resolve_rms_eps("qwen3", empty_metadata) == 1e-6. + test_kind: unit + site: crates/aprender-serve/src/convert/tests_infer_rope.rs::tests::test_resolve_rms_eps_qwen3_missing_key_uses_arch_default_1e6 + if_fails: qwen3 .apr import uses wrong epsilon + +- id: FALSIFY-APR-IMPORT-EPS-003 + invariant: INV-APR-IMPORT-CONFIG-003 + rule: llama-missing-eps-stays-1e5 + prediction: > + resolve_rms_eps("llama", empty_metadata) == 1e-5 — the fix is arch-aware, + not a blanket 1e-6. + test_kind: unit + site: crates/aprender-serve/src/convert/tests_infer_rope.rs::tests::test_resolve_rms_eps_llama_missing_key_uses_arch_default_1e5 + if_fails: over-correction — LLaMA epsilon wrongly changed to 1e-6 + +- id: FALSIFY-APR-IMPORT-EPS-004 + invariant: INV-APR-IMPORT-CONFIG-002 + rule: explicit-gguf-eps-used-verbatim + prediction: > + With `qwen2.attention.layer_norm_rms_epsilon = 7.5e-6` in metadata, + resolve_rms_eps("qwen2", md) == 7.5e-6 (file truth, not a default). + test_kind: unit + site: crates/aprender-serve/src/convert/tests_infer_rope.rs::tests::test_resolve_rms_eps_uses_explicit_gguf_value_verbatim + if_fails: import ignores the GGUF's stored epsilon + +- id: FALSIFY-APR-IMPORT-CONFIG-005 + invariant: INV-APR-IMPORT-CONFIG-004 + rule: from_apr-config-equals-from_gguf-oracle + prediction: > + GGUFConfig::from_apr (round-tripped .apr) equals GGUFConfig::from_gguf + (.gguf oracle) on every forward-affecting field for qwen2.5-coder-1.5b. 
+ test_kind: integration + site: crates/aprender-serve/tests/apr_import_config_fidelity.rs::apr_import_preserves_forward_affecting_config + if_fails: a forward-affecting config field diverges between the .apr and .gguf loaders + +# ─── EQUATIONS ─────────────────────────────────────────────── + +equations: + EQ-APR-IMPORT-EPS-001: + name: import_rms_eps_resolution + latex: '\epsilon_{apr} = \text{gguf}[\text{eps\_key}] \;\lor\; \text{default\_eps}(\text{arch})' + description: > + The eps stamped into the .apr is the GGUF metadata epsilon when present, + else the architecture-specific default — identical to from_gguf. Never a + hard-coded cross-architecture constant. + runtime_check: | + let eps = Self::get_f32(metadata, &arch_key(arch, RMS_EPSILON)) + .unwrap_or_else(|| ArchConstraints::from_architecture(arch).default_eps); + domain: "GGUF metadata map M + architecture slug arch" + codomain: "f32 epsilon equal to GGUFConfig::from_gguf(M).eps" + preconditions: + - "!arch.is_empty()" + postconditions: + - "result > 0.0" + +proof_obligations: + - id: OBLIG-APR-IMPORT-CONFIG-FIDELITY + type: invariant + property: "GGUF→APR import preserves the forward-affecting config (eps/rope_theta/rope_type) using arch-aware defaults identical to GGUFConfig::from_gguf, so from_apr's config equals from_gguf's config field-for-field" + formal: "∀ gguf M, arch a: resolve_rms_eps(a, M) = from_gguf(M).eps ∧ stamped_rope_theta(a, M) = from_gguf(M).rope_theta ∧ stamped_rope_type(a, M) = from_gguf(M).rope_type" + applies_to: import_rms_eps_resolution diff --git a/crates/aprender-serve/src/convert/q4k_converter_helpers.rs b/crates/aprender-serve/src/convert/q4k_converter_helpers.rs index a39c32184..390f052c1 100644 --- a/crates/aprender-serve/src/convert/q4k_converter_helpers.rs +++ b/crates/aprender-serve/src/convert/q4k_converter_helpers.rs @@ -135,6 +135,33 @@ impl GgufToAprQ4KConverter { crate::gguf::infer_rope_type(architecture) } + /// Resolve the RMSNorm epsilon to stamp into the 
`.apr` metadata. + /// + /// OBLIG-APR-IMPORT-CONFIG-FIDELITY: a converted `.apr` MUST use the same + /// epsilon the `.gguf` inference path (`GGUFConfig::from_gguf`) would, so + /// `apr run model.apr` and `apr run model.gguf` apply the SAME RMSNorm at + /// every layer. When the GGUF carries `{arch}.attention.layer_norm_rms_epsilon` + /// we use it verbatim; otherwise we fall back to the architecture-specific + /// default (`ArchConstraints::default_eps`: 1e-6 for Qwen2/Qwen3, 1e-5 for + /// LLaMA/Mistral/Phi/Gemma) — exactly like `from_gguf`. The old hard-coded + /// `1e-5` fallback silently stamped LLaMA's epsilon into every architecture, + /// a latent forward divergence for any 1e-6-eps model whose GGUF omits the key. + fn resolve_rms_eps( + architecture: &str, + metadata: &std::collections::HashMap, + ) -> f32 { + Self::get_f32( + metadata, + &crate::gguf::keys::arch_key( + architecture, + crate::gguf::keys::ATTENTION_LAYER_NORM_RMS_EPSILON, + ), + ) + .unwrap_or_else(|| { + crate::gguf::ArchConstraints::from_architecture(architecture).default_eps + }) + } + /// Convert GGUF file to APR v2 with preserved Q4K quantization /// /// # Arguments @@ -214,11 +241,8 @@ impl GgufToAprQ4KConverter { &keys::arch_key(&architecture, keys::ROPE_FREQ_BASE), ) .unwrap_or_else(|| crate::gguf::default_rope_theta_for_architecture(&architecture)); - let eps = Self::get_f32( - &gguf_model.metadata, - &keys::arch_key(&architecture, keys::ATTENTION_LAYER_NORM_RMS_EPSILON), - ) - .unwrap_or(1e-5); + // OBLIG-APR-IMPORT-CONFIG-FIDELITY: stamp the eps the `.gguf` path would use. 
+ let eps = Self::resolve_rms_eps(&architecture, &gguf_model.metadata); // PMAT-107: Infer rope_type from architecture (matches llama.cpp llama-model.cpp:7763-7811) // NEOX style (type 2) uses split-halves, NORM style (type 0) uses adjacent pairs diff --git a/crates/aprender-serve/src/convert/tests_infer_rope.rs b/crates/aprender-serve/src/convert/tests_infer_rope.rs index 52a9dde75..504c201bd 100644 --- a/crates/aprender-serve/src/convert/tests_infer_rope.rs +++ b/crates/aprender-serve/src/convert/tests_infer_rope.rs @@ -196,6 +196,89 @@ ); } + // ========================================================================= + // GgufToAprQ4KConverter::resolve_rms_eps + // + // OBLIG-APR-IMPORT-CONFIG-FIDELITY — the GGUF→APR Q4K import must stamp the + // SAME rms_norm_eps the `.gguf` inference path (GGUFConfig::from_gguf) would + // use, so `apr run model.apr` and `apr run model.gguf` apply identical + // RMSNorm at every layer (a per-layer epsilon mismatch shifts every hidden + // state and compounds position-by-position — the F2-divergence signature). + // + // ORACLE: ArchConstraints::from_architecture(arch).default_eps — the exact + // fallback `from_gguf` uses when the GGUF omits the epsilon key. + // ========================================================================= + + /// FALSIFY-APR-IMPORT-EPS-001 (mutation-verified): when the GGUF OMITS + /// `qwen2.attention.layer_norm_rms_epsilon`, the import MUST fall back to + /// Qwen2's architecture default 1e-6 — NOT the old hard-coded 1e-5. + /// + /// RED before the fix: `resolve_rms_eps` returned the literal `1e-5`, + /// silently stamping LLaMA's epsilon into a Qwen2 `.apr` → a forward + /// divergence vs the same model run as `.gguf` (which uses 1e-6). + /// MUTATION-VERIFY: revert the fallback to `unwrap_or(1e-5)` → this goes RED. 
+ #[test] + fn test_resolve_rms_eps_qwen2_missing_key_uses_arch_default_1e6() { + let metadata = HashMap::new(); // GGUF without the epsilon key + let eps = GgufToAprQ4KConverter::resolve_rms_eps("qwen2", &metadata); + let oracle = + crate::gguf::ArchConstraints::from_architecture("qwen2").default_eps; + assert_eq!( + eps, oracle, + "qwen2 import eps must equal from_gguf's arch default {oracle:e}, got {eps:e}" + ); + assert!( + (eps - 1e-6).abs() < 1e-12, + "qwen2 default_eps must be 1e-6 (the old 1e-5 fallback is the bug), got {eps:e}" + ); + } + + /// FALSIFY-APR-IMPORT-EPS-002: qwen3 also defaults to 1e-6 (NOT 1e-5). + #[test] + fn test_resolve_rms_eps_qwen3_missing_key_uses_arch_default_1e6() { + let metadata = HashMap::new(); + let eps = GgufToAprQ4KConverter::resolve_rms_eps("qwen3", &metadata); + assert!( + (eps - 1e-6).abs() < 1e-12, + "qwen3 default_eps must be 1e-6, got {eps:e}" + ); + } + + /// FALSIFY-APR-IMPORT-EPS-003: LLaMA correctly stays at 1e-5 when the GGUF + /// omits the key (proves the fix is arch-aware, not a blanket 1e-6). + #[test] + fn test_resolve_rms_eps_llama_missing_key_uses_arch_default_1e5() { + let metadata = HashMap::new(); + let eps = GgufToAprQ4KConverter::resolve_rms_eps("llama", &metadata); + let oracle = + crate::gguf::ArchConstraints::from_architecture("llama").default_eps; + assert_eq!(eps, oracle, "llama import eps must equal arch default"); + assert!( + (eps - 1e-5).abs() < 1e-12, + "llama default_eps must be 1e-5, got {eps:e}" + ); + } + + /// FALSIFY-APR-IMPORT-EPS-004: when the GGUF DOES carry the epsilon key, the + /// import uses it VERBATIM (the stamped value must be the file's truth, not a + /// default). Real qwen2.5-coder GGUFs store 1e-6 here, so this also pins the + /// production-path invariant. + #[test] + fn test_resolve_rms_eps_uses_explicit_gguf_value_verbatim() { + use crate::gguf::GGUFValue; + let mut metadata = HashMap::new(); + // A deliberately non-default value to prove it is read, not inferred. 
+ metadata.insert( + "qwen2.attention.layer_norm_rms_epsilon".to_string(), + GGUFValue::Float32(7.5e-6), + ); + let eps = GgufToAprQ4KConverter::resolve_rms_eps("qwen2", &metadata); + assert!( + (eps - 7.5e-6).abs() < 1e-12, + "explicit GGUF epsilon must be used verbatim, got {eps:e}" + ); + } + // ========================================================================= // GgufToAprQ4KConverter helper methods // ========================================================================= diff --git a/crates/aprender-serve/tests/apr_import_config_fidelity.rs b/crates/aprender-serve/tests/apr_import_config_fidelity.rs new file mode 100644 index 000000000..a693f4585 --- /dev/null +++ b/crates/aprender-serve/tests/apr_import_config_fidelity.rs @@ -0,0 +1,165 @@ +//! OBLIG-APR-IMPORT-CONFIG-FIDELITY — GGUF→APR import preserves forward-affecting config. +//! +//! TRIGGERING REPORT (PMAT class, GB10): a converted `.apr` +//! qwen2.5-coder-1.5b model was reported to FAIL the GPU F2 per-position parity gate at pos-11 +//! (argmax mismatch, cosine 0.9788 < 0.98) → silent CPU fallback ~9 tok/s, while +//! the SAME logical model as `.gguf` PASSED (min cosine 0.9972) → GPU 113 tok/s. +//! The follow-up audit recorded in contracts/apr-import-config-fidelity-v1.yaml +//! found both loaders' configs already identical for this model, so this test +//! guards the import boundary against config drift; it does not explain that report. +//! +//! ORACLE = the `.gguf` path (`GGUFConfig::from_gguf`, used by +//! `OwnedQuantizedModel::from_mapped`). The `.apr` path (`GGUFConfig::from_apr`, +//! used by `OwnedQuantizedModel::from_apr`) MUST produce the byte-identical +//! forward-affecting config. Any field that differs is the bug. +//! +//! This is a DIAGNOSTIC + FALSIFIER. The `dump_apr_vs_gguf_config` test prints every +//! field for both paths (run with `-- --nocapture`). The +//! `apr_import_preserves_forward_affecting_config` test asserts equality on the +//! forward/attention/RoPE fields and is the load-bearing gate. + +use std::path::Path; + +use realizar::apr::MappedAprModel; +use realizar::gguf::{GGUFConfig, MappedGGUFModel}; + +/// Candidate model paths (host-gated; auto-skip if absent).
+const GGUF_CANDIDATES: &[&str] = &[ + "/home/noah/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf", + "/root/models/qwen2.5-coder-1.5b-instruct-q4_k_m.gguf", +]; + +/// A freshly-converted `.apr` produced by `apr import --preserve-q4k`. +/// The test harness writes this beside the GGUF or to a scratch dir. +const APR_CANDIDATES: &[&str] = &[ + // Fresh import produced by the CPU harness (preferred — guarantees same logical model). + "/tmp/claude-1000/-home-noah-src-aprender/fc7c8724-5434-4eaa-a264-dca9afc15d6f/scratchpad/qwen-fresh.apr", + "/home/noah/models/qwen2.5-coder-1.5b-instruct-q4k.apr", + "/root/models/qwen2.5-coder-1.5b-instruct-q4_k_m.apr", +]; + +fn first_existing(candidates: &[&'static str]) -> Option<&'static str> { + candidates.iter().copied().find(|p| Path::new(p).exists()) +} + +fn load_gguf_config(path: &str) -> GGUFConfig { + let mapped = + MappedGGUFModel::from_path(path).unwrap_or_else(|e| panic!("mmap GGUF {path}: {e:?}")); + GGUFConfig::from_gguf(&mapped.model).expect("GGUFConfig::from_gguf") +} + +fn load_apr_config(path: &str) -> GGUFConfig { + let mapped = + MappedAprModel::from_path(path).unwrap_or_else(|e| panic!("mmap APR {path}: {e:?}")); + let vocab_size = mapped.metadata.vocab_size.unwrap_or(0); + GGUFConfig::from_apr(&mapped, vocab_size).expect("GGUFConfig::from_apr") +} + +fn dump_config(label: &str, c: &GGUFConfig) { + eprintln!("─── {label} ───"); + eprintln!(" architecture = {}", c.architecture); + eprintln!(" hidden_dim = {}", c.hidden_dim); + eprintln!(" num_layers = {}", c.num_layers); + eprintln!(" num_heads = {}", c.num_heads); + eprintln!(" num_kv_heads = {}", c.num_kv_heads); + eprintln!(" vocab_size = {}", c.vocab_size); + eprintln!(" intermediate_dim = {}", c.intermediate_dim); + eprintln!(" context_length = {}", c.context_length); + eprintln!(" rope_theta = {}", c.rope_theta); + eprintln!(" rope_type = {}", c.rope_type); + eprintln!(" eps = {:e}", c.eps); + eprintln!(" explicit_head_dim = {:?}", c.explicit_head_dim); + 
eprintln!(" head_dim() = {}", c.head_dim()); + eprintln!(" q_dim() = {}", c.q_dim()); + eprintln!(" kv_dim() = {}", c.kv_dim()); + eprintln!(" attn_scale() = {}", c.attn_scale()); + eprintln!(" query_pre_attn_sclr = {:?}", c.query_pre_attn_scalar); + eprintln!(" bos_token_id = {:?}", c.bos_token_id); + eprintln!(" eos_token_id = {:?}", c.eos_token_id); +} + +/// Diagnostic dump — run with `-- --nocapture` to pin diverging fields. +#[test] +fn dump_apr_vs_gguf_config() { + let (Some(gguf), Some(apr)) = ( + first_existing(GGUF_CANDIDATES), + first_existing(APR_CANDIDATES), + ) else { + eprintln!("[apr_import_config_fidelity] SKIP: host lacks qwen2.5-coder fixtures"); + return; + }; + let gc = load_gguf_config(gguf); + let ac = load_apr_config(apr); + eprintln!("\n=== APR-vs-GGUF CONFIG DIFF ({gguf} | {apr}) ==="); + dump_config("GGUF (ORACLE)", &gc); + dump_config("APR", &ac); +} + +/// OBLIG-APR-IMPORT-CONFIG-FIDELITY — the round-tripped `.apr` config MUST equal +/// the `.gguf` (oracle) config on every forward-affecting field. +/// +/// RED before the fix: `eps` (and/or `context_length`) diverges because the +/// GGUF→APR converter does not stamp `rms_norm_eps` (and other) keys into the +/// APR metadata, so `from_apr` silently falls back to an architecture default +/// that may not match the GGUF's stored value. GREEN after the converter stamps +/// the forward-affecting keys. MUTATION-VERIFY: reverting the stamp → RED. +#[test] +fn apr_import_preserves_forward_affecting_config() { + let (Some(gguf), Some(apr)) = ( + first_existing(GGUF_CANDIDATES), + first_existing(APR_CANDIDATES), + ) else { + eprintln!("[apr_import_config_fidelity] SKIP: host lacks qwen2.5-coder fixtures"); + return; + }; + let gc = load_gguf_config(gguf); + let ac = load_apr_config(apr); + + // Dump on every run so a failure shows both sides. + dump_config("GGUF (ORACLE)", &gc); + dump_config("APR", &ac); + + // Forward/attention/RoPE-affecting fields — must match the oracle exactly. 
+ assert_eq!(ac.architecture, gc.architecture, "architecture diverged"); + assert_eq!(ac.hidden_dim, gc.hidden_dim, "hidden_dim diverged"); + assert_eq!(ac.num_layers, gc.num_layers, "num_layers diverged"); + assert_eq!(ac.num_heads, gc.num_heads, "num_heads diverged"); + assert_eq!(ac.num_kv_heads, gc.num_kv_heads, "num_kv_heads diverged"); + assert_eq!(ac.vocab_size, gc.vocab_size, "vocab_size diverged"); + assert_eq!( + ac.intermediate_dim, gc.intermediate_dim, + "intermediate_dim diverged" + ); + assert_eq!(ac.head_dim(), gc.head_dim(), "head_dim diverged"); + assert_eq!(ac.q_dim(), gc.q_dim(), "q_dim diverged"); + assert_eq!(ac.kv_dim(), gc.kv_dim(), "kv_dim diverged"); + assert_eq!(ac.rope_type, gc.rope_type, "rope_type diverged"); + + // rope_theta — exact f32 equality (no quantization tolerance for a config scalar). + assert_eq!( + ac.rope_theta, gc.rope_theta, + "rope_theta diverged: apr={} gguf={}", + ac.rope_theta, gc.rope_theta + ); + + // eps — the RMSNorm epsilon feeds every layer norm. A divergence here shifts + // every hidden state and compounds position-by-position. + assert_eq!( + ac.eps, gc.eps, + "eps (rms_norm_eps) diverged: apr={:e} gguf={:e}", + ac.eps, gc.eps + ); + + // attn_scale — 1/sqrt(d). Feeds the softmax temperature at every position. + assert_eq!( + ac.attn_scale(), + gc.attn_scale(), + "attn_scale diverged: apr={} gguf={}", + ac.attn_scale(), + gc.attn_scale() + ); + + // context_length — RoPE position span / max-seq. Diverging here can change + // position-dependent scaling on long-context models. + assert_eq!( + ac.context_length, gc.context_length, + "context_length diverged: apr={} gguf={}", + ac.context_length, gc.context_length + ); +}