From 02411d1188aad4637bff54f8732cc9d416ce66bd Mon Sep 17 00:00:00 2001 From: John Ericson Date: Sat, 6 Jun 2026 15:50:26 -0400 Subject: [PATCH 1/2] nar: switch `NarRestorer` to fully async I/O Callers no longer pay the `spawn_blocking` + `SyncIoBridge` penalty per NAR entry. The `Sink`-based API is replaced with a simpler `restore` method that consumes a stream of `NarEvent`s directly. Co-authored-by: Amaan Qureshi --- Cargo.lock | 1 - harmonia-file-nar/src/archive/restorer.rs | 193 ++++++++-------------- harmonia-protocol/src/lib.rs | 2 +- harmonia-store-derivation/Cargo.toml | 1 - harmonia-store-derivation/src/lib.rs | 2 +- harmonia-store-path/Cargo.toml | 2 +- 6 files changed, 74 insertions(+), 127 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2dab091..db1116d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1250,7 +1250,6 @@ dependencies = [ "rstest", "serde", "serde_json", - "tempfile", "thiserror", "zerocopy", ] diff --git a/harmonia-file-nar/src/archive/restorer.rs b/harmonia-file-nar/src/archive/restorer.rs index 7e8df637..62b526fd 100644 --- a/harmonia-file-nar/src/archive/restorer.rs +++ b/harmonia-file-nar/src/archive/restorer.rs @@ -1,24 +1,14 @@ use std::collections::HashMap; use std::fmt; -use std::fs::{OpenOptions, create_dir}; -use std::future::Future; -use std::io::{self, BufRead as _, Write as _}; -#[cfg(unix)] -use std::os::unix::fs::{OpenOptionsExt as _, symlink}; +use std::io; use std::path::{Path, PathBuf}; -use std::pin::pin; -use std::task::{Poll, ready}; use bstr::ByteSlice as _; use bytes::Bytes; use derive_more::Display; use futures_core::Stream; -use futures_sink::Sink; -use pin_project_lite::pin_project; use thiserror::Error; -use tokio::io::AsyncBufRead; -use tokio::task::{JoinHandle, spawn_blocking}; -use tokio_util::io::SyncIoBridge; +use tokio::io::{AsyncBufRead, AsyncBufReadExt as _, AsyncWriteExt as _}; use tracing::{debug, trace}; use super::{CASE_HACK_SUFFIX, NarEvent}; @@ -33,8 +23,6 @@ pub enum NarWriteOperation { CreateFile, #[display("path contains invalid UTF-8")] PathUTF8, - #[display("Could not join state")] - JoinError, } #[derive(Error, Debug)] @@ -72,16 +60,11 @@ impl NarWriteError { } } -pin_project! { - pub struct NarRestorer { - root: PathBuf, - path: PathBuf, - #[pin] - state: Option>>, - use_case_hack: bool, - entries: Entries, - dir_stack: Vec, - } +pub struct NarRestorer { + path: PathBuf, + use_case_hack: bool, + entries: Entries, + dir_stack: Vec, } impl NarRestorer { @@ -99,66 +82,24 @@ impl NarRestorer { { let path = path.into(); Self { - root: path.clone(), path, - state: None, use_case_hack, entries: Default::default(), dir_stack: Default::default(), } } -} - -fn join_name(path: &Path, name: &[u8]) -> Result { - if name.is_empty() { - Ok(path.to_owned()) - } else { - let name_os = name.to_os_str().map_err(|err| { - let lossy = name.to_os_str_lossy(); - let path = path.join(lossy); - NarWriteError::path_utf8_error(path, err) - })?; - Ok(path.join(name_os)) - } -} - -impl Sink> for NarRestorer -where - R: AsyncBufRead + Send + 'static, -{ - type Error = NarWriteError; - - fn poll_ready( - self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - let mut this = self.project(); - if let Some(state) = this.state.as_mut().as_pin_mut() { - ready!(state.poll(cx)).map_err(|_| { - NarWriteError::new( - NarWriteOperation::JoinError, - this.root.clone(), - io::Error::other("background task failed"), - ) - })??; - } - this.state.set(None); - Poll::Ready(Ok(())) - } - fn start_send( - mut self: std::pin::Pin<&mut Self>, - item: NarEvent, - ) -> Result<(), Self::Error> { - if self.state.is_some() { - panic!("Sending when not ready!"); - } - match item { + /// Process a single NAR event, writing to the filesystem. + async fn process_event(&mut self, event: NarEvent) -> Result<(), NarWriteError> + where + R: AsyncBufRead + Unpin, + { + match event { NarEvent::File { name, executable, size: _, - reader, + mut reader, } => { let name = if self.use_case_hack { self.entries.hack_name(name) @@ -167,7 +108,7 @@ where }; let path = join_name(&self.path, &name)?; - let mut options = OpenOptions::new(); + let mut options = tokio::fs::OpenOptions::new(); options.write(true); options.create_new(true); #[cfg(unix)] @@ -178,32 +119,28 @@ where options.mode(0o666); } } - let handle = spawn_blocking(move || { - let reader = pin!(reader); - let mut reader = SyncIoBridge::new(reader); - let mut writer = options - .open(&path) + let mut file = options + .open(&path) + .await + .map_err(|err| NarWriteError::create_file_error(path.clone(), err))?; + loop { + trace!("Writing to file {:?}", path); + let buf = reader + .fill_buf() + .await .map_err(|err| NarWriteError::create_file_error(path.clone(), err))?; - loop { - trace!("Writing to file {:?}", path); - let buf = reader - .fill_buf() - .map_err(|err| NarWriteError::create_file_error(path.clone(), err))?; - if buf.is_empty() { - break; - } - let amt = buf.len(); - writer - .write_all(buf) - .map_err(|err| NarWriteError::create_file_error(path.clone(), err))?; - reader.consume(amt); + if buf.is_empty() { + break; } - writer - .flush() + let amt = buf.len(); + file.write_all(buf) + .await .map_err(|err| NarWriteError::create_file_error(path.clone(), err))?; - Ok(()) - }); - self.state = Some(handle); + reader.consume(amt); + } + file.flush() + .await + .map_err(|err| NarWriteError::create_file_error(path.clone(), err))?; } NarEvent::Symlink { name, target } => { let name = if self.use_case_hack { @@ -221,13 +158,12 @@ where NarWriteError::path_utf8_error(path, err) })? .to_owned(); - self.state = Some(spawn_blocking(move || { - #[cfg(unix)] - { - symlink(target_os, &path) - .map_err(|err| NarWriteError::create_symlink_error(path, err)) - } - })); + #[cfg(unix)] + { + tokio::fs::symlink(target_os, &path) + .await + .map_err(|err| NarWriteError::create_symlink_error(path, err))?; + } } NarEvent::StartDirectory { name } => { let name = if self.use_case_hack { @@ -244,10 +180,9 @@ where let path = join_name(&self.path, &name)?; self.path = path; let path = self.path.clone(); - self.state = Some(spawn_blocking(|| { - let path = path; - create_dir(&path).map_err(|err| NarWriteError::create_dir_error(path, err)) - })); + tokio::fs::create_dir(&path) + .await + .map_err(|err| NarWriteError::create_dir_error(path, err))?; } NarEvent::EndDirectory => { if self.use_case_hack { @@ -259,18 +194,33 @@ where Ok(()) } - fn poll_flush( - self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - >>::poll_ready(self, cx) + /// Consume a stream of NAR events and restore them to the filesystem. + pub async fn restore(mut self, stream: S) -> Result<(), NarWriteError> + where + S: Stream, + U: Into, NarWriteError>>, + R: AsyncBufRead + Send + Unpin, + { + use futures_util::StreamExt as _; + futures_util::pin_mut!(stream); + while let Some(item) = stream.next().await { + let event = item.into()?; + self.process_event(event).await?; + } + Ok(()) } +} - fn poll_close( - self: std::pin::Pin<&mut Self>, - cx: &mut std::task::Context<'_>, - ) -> Poll> { - >>::poll_ready(self, cx) +fn join_name(path: &Path, name: &[u8]) -> Result { + if name.is_empty() { + Ok(path.to_owned()) + } else { + let name_os = name.to_os_str().map_err(|err| { + let lossy = name.to_os_str_lossy(); + let path = path.join(lossy); + NarWriteError::path_utf8_error(path, err) + })?; + Ok(path.join(name_os)) } } @@ -348,11 +298,10 @@ impl RestoreOptions { S: Stream, U: Into, NarWriteError>>, P: Into, - R: AsyncBufRead + Send + 'static, + R: AsyncBufRead + Send + Unpin, { - use futures_util::stream::StreamExt as _; let restorer = NarRestorer::new_restorer(path, self.use_case_hack); - stream.map(|item| item.into()).forward(restorer).await + restorer.restore(stream).await } } @@ -367,7 +316,7 @@ where S: Stream, U: Into, NarWriteError>>, P: Into, - R: AsyncBufRead + Send + 'static, + R: AsyncBufRead + Send + Unpin, { RestoreOptions::new().restore(stream, path).await } diff --git a/harmonia-protocol/src/lib.rs b/harmonia-protocol/src/lib.rs index 3cb3d966..02385964 100644 --- a/harmonia-protocol/src/lib.rs +++ b/harmonia-protocol/src/lib.rs @@ -1,5 +1,5 @@ // SPDX-FileCopyrightText: 2024 griff -// SPDX-FileCopyrightText: 2025 Jörg Thalheim +// SPDX-FileCopyrightText: 2026 Jörg Thalheim // SPDX-License-Identifier: EUPL-1.2 OR MIT // // This crate is derived from Nix.rs (https://github.com/griff/Nix.rs) diff --git a/harmonia-store-derivation/Cargo.toml b/harmonia-store-derivation/Cargo.toml index e409f413..3e17b1a5 100644 --- a/harmonia-store-derivation/Cargo.toml +++ b/harmonia-store-derivation/Cargo.toml @@ -33,7 +33,6 @@ zerocopy = { version = "0.8", features = [ "derive" ] } harmonia-store-derivation = { path = ".", features = [ "test" ] } hex-literal = { workspace = true } rstest = { workspace = true } -tempfile = { workspace = true } [features] test = [ diff --git a/harmonia-store-derivation/src/lib.rs b/harmonia-store-derivation/src/lib.rs index 6f2bb786..783103dc 100644 --- a/harmonia-store-derivation/src/lib.rs +++ b/harmonia-store-derivation/src/lib.rs @@ -1,5 +1,5 @@ // SPDX-FileCopyrightText: 2024 griff -// SPDX-FileCopyrightText: 2025 Jörg Thalheim +// SPDX-FileCopyrightText: 2026 Jörg Thalheim // SPDX-License-Identifier: EUPL-1.2 OR MIT // // This crate is derived from Nix.rs (https://github.com/griff/Nix.rs) diff --git a/harmonia-store-path/Cargo.toml b/harmonia-store-path/Cargo.toml index f6f67b40..8618faeb 100644 --- a/harmonia-store-path/Cargo.toml +++ b/harmonia-store-path/Cargo.toml @@ -1,5 +1,5 @@ # SPDX-FileCopyrightText: 2024 griff (original Nix.rs) -# SPDX-FileCopyrightText: 2025 Jörg Thalheim (Harmonia adaptation) +# SPDX-FileCopyrightText: 2026 Jörg Thalheim (Harmonia adaptation) # SPDX-License-Identifier: MIT [package] From e4d58f106db5b8415117dc195b456b359b58023f Mon Sep 17 00:00:00 2001 From: John Ericson Date: Sat, 6 Jun 2026 15:50:43 -0400 Subject: [PATCH 2/2] store: add `harmonia-store-ref-scan` Add a new `harmonia-store-ref-scan` crate with a streaming `RefScanSink` (Boyer-Moore-style window over the nix-base32 alphabet) for post-build reference discovery without a second disk walk. Co-authored-by: Amaan Qureshi --- Cargo.lock | 12 + Cargo.toml | 2 + docs/architecture/harmonia-store-structure.md | 2 + harmonia-store-ref-scan/Cargo.toml | 26 ++ harmonia-store-ref-scan/src/lib.rs | 336 ++++++++++++++++++ scripts/dependency-diagram.py | 1 + 6 files changed, 379 insertions(+) create mode 100644 harmonia-store-ref-scan/Cargo.toml create mode 100644 harmonia-store-ref-scan/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index db1116d4..3382aea0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1304,6 +1304,18 @@ dependencies = [ "serde_json", ] +[[package]] +name = "harmonia-store-ref-scan" +version = "0.0.0-alpha.0" +dependencies = [ + "futures-util", + "harmonia-file-nar", + "harmonia-store-path", + "harmonia-utils-base-encoding", + "tempfile", + "tokio", +] + [[package]] name = "harmonia-store-remote" version = "3.1.0" diff --git a/Cargo.toml b/Cargo.toml index 509fb528..a990a29b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,6 +8,7 @@ members = [ "harmonia-utils-hash", "harmonia-store-content-address", "harmonia-store-derivation", + "harmonia-store-ref-scan", "harmonia-store-db", "harmonia-store-path", "harmonia-utils-signature", @@ -108,6 +109,7 @@ harmonia-store-derivation = { path = "harmonia-store-derivation" } harmonia-store-nar-info = { path = "harmonia-store-nar-info" } harmonia-store-path = { path = "harmonia-store-path" } harmonia-store-path-info = { path = "harmonia-store-path-info" } +harmonia-store-ref-scan = { path = "harmonia-store-ref-scan" } harmonia-store-remote = { path = "harmonia-store-remote" } harmonia-utils-base-encoding = { path = "harmonia-utils-base-encoding" } harmonia-utils-hash = { path = "harmonia-utils-hash" } diff --git a/docs/architecture/harmonia-store-structure.md b/docs/architecture/harmonia-store-structure.md index 5e53ae93..76b5b18c 100644 --- a/docs/architecture/harmonia-store-structure.md +++ b/docs/architecture/harmonia-store-structure.md @@ -96,6 +96,7 @@ graph BT store-nar-info store-path store-path-info + store-ref-scan end subgraph "File" file-core @@ -110,6 +111,7 @@ graph BT utils-signature --> utils-base-encoding store-path --> utils-hash store-content-address --> store-path + store-ref-scan --> store-path store-derivation --> store-content-address store-derivation --> utils-signature store-path-info --> store-content-address diff --git a/harmonia-store-ref-scan/Cargo.toml b/harmonia-store-ref-scan/Cargo.toml new file mode 100644 index 00000000..1555338d --- /dev/null +++ b/harmonia-store-ref-scan/Cargo.toml @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: 2026 Jörg Thalheim +# SPDX-License-Identifier: MIT + +[package] +name = "harmonia-store-ref-scan" +version = "0.0.0-alpha.0" +authors.workspace = true +edition.workspace = true +license.workspace = true +homepage.workspace = true +repository.workspace = true +description = "Streaming reference scanner for Nix store path outputs" +readme = "README.md" + +[dependencies] +harmonia-store-path = { workspace = true } +harmonia-utils-base-encoding = { workspace = true } + +[dev-dependencies] +futures-util = { workspace = true } +harmonia-file-nar = { workspace = true } +tempfile = { workspace = true } +tokio = { workspace = true, features = [ "macros", "rt-multi-thread" ] } + +[lints] +workspace = true diff --git a/harmonia-store-ref-scan/src/lib.rs b/harmonia-store-ref-scan/src/lib.rs new file mode 100644 index 00000000..83b571a4 --- /dev/null +++ b/harmonia-store-ref-scan/src/lib.rs @@ -0,0 +1,336 @@ +// SPDX-FileCopyrightText: 2026 Jörg Thalheim +// SPDX-License-Identifier: MIT + +//! Reference scanning for store path outputs. +//! +//! After a build completes, we need to discover which store paths are +//! referenced by the output. This module provides [`RefScanSink`], a +//! streaming scanner that can be fed arbitrary byte chunks (typically +//! from a NAR stream) and efficiently finds store path hash references. +//! +//! # Algorithm +//! +//! Rather than searching for each candidate pattern separately (O(n×k)), +//! we use the same approach as Nix's `search()` in `references.cc`: +//! +//! 1. Slide a window of [`HASH_LEN`] bytes across the input. +//! 2. For each window position, validate characters right-to-left against +//! the nix-base32 alphabet. If an invalid character is found at offset j, +//! skip ahead by j+1 positions (Boyer-Moore style). +//! 3. When a valid 32-byte window is found, look it up in a `HashSet`. +//! +//! This gives O(n/32) amortized performance on binary data (most bytes are +//! not in the nix-base32 alphabet), independent of the number of candidates. +//! +//! # Integration with NAR streaming +//! +//! `RefScanSink` implements a push-based interface: call [`RefScanSink::feed`] +//! with each chunk of NAR bytes. This allows scanning for references during +//! NAR serialization (for hash computation) without a separate disk walk, +//! matching Nix's `TeeSink{refsSink, hashSink}` pattern. + +use std::collections::{BTreeSet, HashSet}; + +use harmonia_store_path::{StorePath, StorePathHash}; + +/// Encoded length of a store path hash in nix-base32 (32 bytes). +const HASH_LEN: usize = StorePathHash::encoded_len(); + +/// 256-byte lookup table: `true` for bytes that are valid nix-base32 characters. +/// +/// Built from the canonical alphabet in [`harmonia_utils_base_encoding::base32`]. +const NIX_BASE32_VALID: [bool; 256] = { + let mut table = [false; 256]; + let chars = harmonia_utils_base_encoding::base32::ALPHABET_BYTES; + let mut i = 0; + while i < chars.len() { + table[chars[i] as usize] = true; + i += 1; + } + table +}; + +/// A streaming reference scanner that finds store path hashes in byte data. +/// +/// Feed it chunks of bytes (e.g., from a NAR stream) via [`feed`](Self::feed), +/// then retrieve results with [`found_paths`](Self::found_paths). +/// +/// # Example +/// +/// ```ignore +/// let mut sink = RefScanSink::new(&candidates, Some(&self_path)); +/// for chunk in nar_chunks { +/// sink.feed(&chunk); +/// } +/// let references = sink.found_paths(); +/// ``` +pub struct RefScanSink { + /// Hash strings we're still looking for (removed on match, like Nix). + pending: HashSet<[u8; HASH_LEN]>, + /// Hash strings we've found so far. + seen: HashSet<[u8; HASH_LEN]>, + /// Map from hash bytes back to StorePath for result construction. + back_map: Vec<([u8; HASH_LEN], StorePath)>, + /// Tail bytes from the previous chunk for boundary matching. + tail: Vec, +} + +impl RefScanSink { + /// Create a new scanner for the given candidate store paths. + /// + /// `candidates` is the set of store paths to search for (typically all + /// build inputs). `self_path` is the output path itself, for detecting + /// self-references. + pub fn new(candidates: &BTreeSet, self_path: Option<&StorePath>) -> Self { + let mut pending = HashSet::with_capacity(candidates.len() + 1); + let mut back_map = Vec::with_capacity(candidates.len() + 1); + + for sp in candidates { + let hash_bytes = hash_to_bytes(sp); + pending.insert(hash_bytes); + back_map.push((hash_bytes, sp.clone())); + } + + if let Some(sp) = self_path { + let hash_bytes = hash_to_bytes(sp); + if pending.insert(hash_bytes) { + back_map.push((hash_bytes, sp.clone())); + } + } + + Self { + pending, + seen: HashSet::new(), + back_map, + tail: Vec::with_capacity(HASH_LEN), + } + } + + /// Feed a chunk of bytes to the scanner. + /// + /// Handles boundary matches by keeping a tail buffer from the previous + /// chunk. This mirrors Nix's `RefScanSink::operator()`. + pub fn feed(&mut self, data: &[u8]) { + if self.pending.is_empty() { + return; + } + + // Mirrors Nix's RefScanSink::operator() exactly. + let tail_len = data.len().min(HASH_LEN); + + // Search the overlap region: copy of old tail + start of new data. + // Uses a separate buffer so self.tail can be rebuilt independently. + if !self.tail.is_empty() { + let mut overlap = self.tail.clone(); + overlap.extend_from_slice(&data[..tail_len]); + search(&overlap, &mut self.pending, &mut self.seen); + } + + // Search the current chunk itself. + search(data, &mut self.pending, &mut self.seen); + + // Rebuild tail: keep up to HASH_LEN bytes total + // (suffix of old tail + suffix of new data). + let rest = HASH_LEN - tail_len; + if rest < self.tail.len() { + self.tail.drain(..self.tail.len() - rest); + } + self.tail.extend_from_slice(&data[data.len() - tail_len..]); + } + + /// Returns the set of store paths whose hashes were found. + pub fn found_paths(&self) -> BTreeSet { + let mut result = BTreeSet::new(); + for (hash_bytes, store_path) in &self.back_map { + if self.seen.contains(hash_bytes) { + result.insert(store_path.clone()); + } + } + result + } +} + +/// Convert a store path's hash to a fixed-size byte array for zero-alloc lookups. +fn hash_to_bytes(sp: &StorePath) -> [u8; HASH_LEN] { + let s = sp.hash().to_string(); + let mut buf = [0u8; HASH_LEN]; + buf.copy_from_slice(s.as_bytes()); + buf +} + +/// Core search algorithm matching Nix's `search()` in `references.cc`. +/// +/// Scans `data` for valid nix-base32 windows of length [`HASH_LEN`]. +/// Uses right-to-left character validation with Boyer-Moore-style skipping: +/// when an invalid character is found at offset j within the window, +/// advance by j+1 positions. On random binary data this skips ~32 bytes +/// per invalid character, giving O(n/32) amortized performance. +/// +/// Matched hashes are moved from `pending` to `seen`. +#[inline] +fn search(data: &[u8], pending: &mut HashSet<[u8; HASH_LEN]>, seen: &mut HashSet<[u8; HASH_LEN]>) { + if data.len() < HASH_LEN { + return; + } + + let mut i = 0; + while i + HASH_LEN <= data.len() { + // Scan the window right-to-left for valid nix-base32 characters. + let mut j = HASH_LEN; + loop { + if j == 0 { + break; + } + j -= 1; + if !NIX_BASE32_VALID[data[i + j] as usize] { + i += j + 1; + break; + } + } + if j > 0 { + // Broke out early due to invalid character, already advanced i. + continue; + } + + // All HASH_LEN characters are valid nix-base32. Check the HashSet. + let window: [u8; HASH_LEN] = data[i..i + HASH_LEN] + .try_into() + .expect("slice length matches HASH_LEN"); + + if pending.remove(&window) { + seen.insert(window); + } + + i += 1; + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + + use futures_util::StreamExt as _; + + /// Helper: dump a path as NAR, feeding each byte chunk through the scanner. + async fn scan_nar_for_references( + path: &std::path::Path, + candidates: &BTreeSet, + self_path: Option<&StorePath>, + ) -> BTreeSet { + let mut sink = RefScanSink::new(candidates, self_path); + + // Stream the NAR and feed chunks to the scanner, just like + // production code would do while computing the NAR hash. + let mut stream = harmonia_file_nar::NarByteStream::new(path.to_path_buf()); + while let Some(chunk) = stream.next().await { + sink.feed(&chunk.unwrap()); + } + + sink.found_paths() + } + + /// Output file containing an input's hash part → input is discovered as a reference. + #[test] + fn test_scan_finds_input_reference() { + let input = StorePath::from_base_path("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-input").unwrap(); + let hash_str = input.hash().to_string(); + let data = format!("some content /nix/store/{hash_str}-input more stuff"); + + let mut candidates = BTreeSet::new(); + candidates.insert(input.clone()); + + let mut sink = RefScanSink::new(&candidates, None); + sink.feed(data.as_bytes()); + + let refs = sink.found_paths(); + assert!(refs.contains(&input), "Should discover input as reference"); + } + + /// Output containing its own hash part → self-reference detected. + #[test] + fn test_scan_finds_self_reference() { + let self_path = StorePath::from_base_path("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-self").unwrap(); + let hash_str = self_path.hash().to_string(); + let data = format!("#!/bin/sh\nexec /nix/store/{hash_str}-self/bin/real \"$@\""); + + let candidates = BTreeSet::new(); + let mut sink = RefScanSink::new(&candidates, Some(&self_path)); + sink.feed(data.as_bytes()); + + let refs = sink.found_paths(); + assert!(refs.contains(&self_path), "Should detect self-reference"); + } + + /// Feed data in every possible chunk size to verify the tail logic + /// handles hashes spanning any number of chunks (2, 3, ... up to N). + #[test] + fn test_scan_across_chunk_boundary() { + let input = StorePath::from_base_path("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-test").unwrap(); + let hash_str = input.hash().to_string(); + let content = format!("prefix{hash_str}suffix"); + let bytes = content.as_bytes(); + + // chunk_size=1 means single-byte feeds (hash spans 32 chunks), + // chunk_size=bytes.len() means one big feed. + for chunk_size in 1..=bytes.len() { + let mut candidates = BTreeSet::new(); + candidates.insert(input.clone()); + let mut sink = RefScanSink::new(&candidates, None); + + for chunk in bytes.chunks(chunk_size) { + sink.feed(chunk); + } + + let refs = sink.found_paths(); + assert!( + refs.contains(&input), + "Should find reference with chunk_size={chunk_size}" + ); + } + } + + /// Input reference discovered when scanning over a NAR stream. + #[tokio::test] + async fn test_nar_scan_finds_input_reference() { + let dir = tempfile::TempDir::new().unwrap(); + let output_dir = dir.path().join("output"); + fs::create_dir(&output_dir).unwrap(); + + let input = StorePath::from_base_path("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-input").unwrap(); + let hash_str = input.hash().to_string(); + + fs::write( + output_dir.join("file.txt"), + format!("some content /nix/store/{hash_str}-input more stuff"), + ) + .unwrap(); + + let mut candidates = BTreeSet::new(); + candidates.insert(input.clone()); + + let refs = scan_nar_for_references(&output_dir, &candidates, None).await; + assert!(refs.contains(&input), "Should discover input as reference"); + } + + /// Self-reference detected when scanning over a NAR stream. + #[tokio::test] + async fn test_nar_scan_finds_self_reference() { + let dir = tempfile::TempDir::new().unwrap(); + let output_dir = dir.path().join("output"); + fs::create_dir(&output_dir).unwrap(); + + let self_path = StorePath::from_base_path("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-self").unwrap(); + let hash_str = self_path.hash().to_string(); + + fs::write( + output_dir.join("wrapper.sh"), + format!("#!/bin/sh\nexec /nix/store/{hash_str}-self/bin/real \"$@\""), + ) + .unwrap(); + + let candidates = BTreeSet::new(); + let refs = scan_nar_for_references(&output_dir, &candidates, Some(&self_path)).await; + assert!(refs.contains(&self_path), "Should detect self-reference"); + } +} diff --git a/scripts/dependency-diagram.py b/scripts/dependency-diagram.py index a092ec06..2b61ca60 100644 --- a/scripts/dependency-diagram.py +++ b/scripts/dependency-diagram.py @@ -151,6 +151,7 @@ def generate_mermaid( "store-nar-info", "store-path", "store-path-info", + "store-ref-scan", } # Sanity check: every store-* crate must be in exactly one list.