From 3fe6e17ad89634cf58c296494210a9c06326e51b Mon Sep 17 00:00:00 2001 From: Oz Date: Sun, 28 Jun 2026 13:01:33 +0000 Subject: [PATCH] fix: bound retained AI file-outline memory per repo (APP-4794) `ai::index::file_outline::build_outline` -> `parse_file_outline` allocates owned `Symbol` strings for every parsable file and retains them per-repo in `RepoOutlines`. Per-file size (3MB) and file count (5000) are capped, but the cumulative outline size is unbounded, so large or numerous repositories grow the in-memory index to multiple GB. A symbolized heap profile from Sentry 7259255054 showed ~6 GB (84%) of live heap retained here (`name.to_owned()` 53%, `collect_vec()` 30%). Track the cumulative heap size of the retained outlines while parsing and stop building new outlines once a generous 256 MiB per-repo budget is exceeded, so the index degrades to a bounded partial outline instead of growing without limit. Typical repos are well under the limit and unaffected. Co-Authored-By: Oz --- crates/ai/src/index/file_outline/mod.rs | 27 ++++++++++ crates/ai/src/index/file_outline/mod_test.rs | 52 ++++++++++++++++++++ crates/ai/src/index/file_outline/native.rs | 32 ++++++++++++ 3 files changed, 111 insertions(+) create mode 100644 crates/ai/src/index/file_outline/mod_test.rs diff --git a/crates/ai/src/index/file_outline/mod.rs b/crates/ai/src/index/file_outline/mod.rs index 39a6380eff..911ed99562 100644 --- a/crates/ai/src/index/file_outline/mod.rs +++ b/crates/ai/src/index/file_outline/mod.rs @@ -146,6 +146,20 @@ pub struct Symbol { pub line_number: usize, } +impl Symbol { + /// Approximate number of heap bytes owned by this symbol (its owned + /// `String`s). Excludes the inline `Symbol` struct itself, which is + /// accounted for by the containing `Vec`'s capacity in + /// [`FileOutline::approx_heap_size`]. 
+ fn approx_heap_size(&self) -> usize { + let comment_bytes = self.comment.as_ref().map_or(0, |lines| { + lines.capacity() * std::mem::size_of::<String>() + + lines.iter().map(String::len).sum::<usize>() + }); + self.name.len() + self.type_prefix.as_ref().map_or(0, String::len) + comment_bytes + } +} + /// Represents the "outline" of a file with all the identifier symbols of interest. #[derive(Debug, Clone, Default)] pub struct FileOutline { @@ -158,6 +172,15 @@ impl FileOutline { self.symbols.as_ref() } + /// Approximate number of heap bytes retained by this outline's symbols. Used + /// to bound the cumulative size of a repository's in-memory outline. + pub(crate) fn approx_heap_size(&self) -> usize { + self.symbols.as_ref().map_or(0, |symbols| { + symbols.capacity() * std::mem::size_of::<Symbol>() + + symbols.iter().map(Symbol::approx_heap_size).sum::<usize>() + }) + } + /// Format the outline into a string. pub fn to_string(&self) -> Option<String> { Some( @@ -183,3 +206,7 @@ impl FileOutline { ) } } + +#[cfg(test)] +#[path = "mod_test.rs"] +mod tests; diff --git a/crates/ai/src/index/file_outline/mod_test.rs b/crates/ai/src/index/file_outline/mod_test.rs new file mode 100644 index 0000000000..491c28c15e --- /dev/null +++ b/crates/ai/src/index/file_outline/mod_test.rs @@ -0,0 +1,52 @@ +use super::{FileOutline, Symbol}; + +fn symbol(name: &str, comment: Option<Vec<&str>>) -> Symbol { + Symbol { + name: name.to_string(), + type_prefix: None, + comment: comment.map(|lines| lines.into_iter().map(str::to_string).collect()), + line_number: 1, + } +} + +#[test] +fn approx_heap_size_is_zero_without_symbols() { + let outline = FileOutline { symbols: None }; + assert_eq!(outline.approx_heap_size(), 0); + + let outline = FileOutline { + symbols: Some(Vec::new()), + }; + assert_eq!(outline.approx_heap_size(), 0); +} + +#[test] +fn approx_heap_size_counts_owned_string_bytes() { + let outline = FileOutline { + symbols: Some(vec![ + symbol("foo", None), + symbol("bar", Some(vec!["a", "bc"])), + ]), + }; + + let size = 
outline.approx_heap_size(); + let symbols = outline.symbols().expect("symbols"); + + // The owned name bytes ("foo" + "bar") and comment bytes ("a" + "bc") are + // all counted. + let owned_bytes = 3 + 3 + 1 + 2; + // Plus the inline `Symbol` array backing the `Vec`. + let vec_bytes = symbols.capacity() * std::mem::size_of::<Symbol>(); + assert!(size >= owned_bytes + vec_bytes, "size = {size}"); +} + +#[test] +fn approx_heap_size_grows_with_more_symbols() { + let small = FileOutline { + symbols: Some(vec![symbol("a", None)]), + }; + let large = FileOutline { + symbols: Some(vec![symbol(&"a".repeat(10_000), None)]), + }; + assert!(large.approx_heap_size() > small.approx_heap_size()); +} diff --git a/crates/ai/src/index/file_outline/native.rs b/crates/ai/src/index/file_outline/native.rs index 3eefa3bcba..4130fc34a0 100644 --- a/crates/ai/src/index/file_outline/native.rs +++ b/crates/ai/src/index/file_outline/native.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::fs; use std::path::Path; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; use anyhow::anyhow; use arborium::tree_sitter::{Parser, Query, QueryCursor, Tree}; @@ -23,6 +24,17 @@ cfg_if::cfg_if! { } } +/// Upper bound on the cumulative heap size (in bytes) of the retained symbol +/// outline for a single repository. Once the outlines built so far exceed this +/// budget, the remaining files are skipped (their outline is left empty) so that +/// a very large repository — or many indexed repositories at once — cannot grow +/// the in-memory index to multiple gigabytes. Typical repositories produce +/// outlines well under this limit and are unaffected. +/// +/// See APP-4794 / Sentry 7259255054: ~6 GB of live heap was retained here as +/// owned `Symbol` strings produced by `parse_file_outline`. +const MAX_OUTLINE_TOTAL_BYTES: usize = 256 * 1024 * 1024; + /// Given a repo path, try to build its outline. An outline is a list of all its files and the symbols /// of interest from each file. 
pub async fn build_outline( @@ -67,12 +79,32 @@ pub async fn build_outline( pool.spawn(move || { // Parse each file in parallel. Note that we have to fold and then reduce given the parallelization. let result = pool.install(|| { + // Track the cumulative heap size of the outlines retained so far and + // stop parsing once we exceed the budget. This bounds the in-memory + // index for pathologically large (or numerous) repos, which could + // otherwise retain multiple GB of `Symbol` strings (APP-4794). + let retained_bytes = AtomicUsize::new(0); + let budget_warned = AtomicBool::new(false); files .par_iter() .map(|metadata| { + if retained_bytes.load(Ordering::Relaxed) >= MAX_OUTLINE_TOTAL_BYTES { + return (metadata.file_id, FileOutline::default()); + } + let outline = parse_file_outline(&metadata.path.to_local_path_lossy()) .ok() .unwrap_or_default(); + retained_bytes.fetch_add(outline.approx_heap_size(), Ordering::Relaxed); + + if retained_bytes.load(Ordering::Relaxed) >= MAX_OUTLINE_TOTAL_BYTES + && !budget_warned.swap(true, Ordering::Relaxed) + { + log::warn!( + "Repo outline reached the {MAX_OUTLINE_TOTAL_BYTES}-byte memory \ + budget; remaining files will be skipped to bound memory usage." + ); + } (metadata.file_id, outline) })