Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions crates/ai/src/index/file_outline/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,20 @@ pub struct Symbol {
pub line_number: usize,
}

impl Symbol {
    /// Approximate number of heap bytes owned by this symbol.
    ///
    /// Counts the allocated capacity of `name`, of `type_prefix` when present,
    /// and — when `comment` is present — the comment `Vec`'s backing array
    /// (`capacity * size_of::<String>()`) plus the capacity of every comment
    /// line. Capacity rather than length is used throughout because the
    /// allocator retains the whole reserved buffer, not just the bytes in use.
    ///
    /// Excludes the inline `Symbol` struct itself, which is accounted for by
    /// the containing `Vec`'s capacity in [`FileOutline::approx_heap_size`].
    fn approx_heap_size(&self) -> usize {
        let type_prefix_bytes = self.type_prefix.as_ref().map_or(0, String::capacity);
        let comment_bytes = self.comment.as_ref().map_or(0, |lines| {
            // Backing array of the `Vec<String>` plus each line's own buffer.
            lines.capacity() * std::mem::size_of::<String>()
                + lines.iter().map(String::capacity).sum::<usize>()
        });
        self.name.capacity() + type_prefix_bytes + comment_bytes
    }
}

/// Represents the "outline" of a file with all the identifier symbols of interest.
#[derive(Debug, Clone, Default)]
pub struct FileOutline {
Expand All @@ -158,6 +172,15 @@ impl FileOutline {
self.symbols.as_ref()
}

/// Approximate number of heap bytes retained by this outline's symbols. Used
/// to bound the cumulative size of a repository's in-memory outline.
pub(crate) fn approx_heap_size(&self) -> usize {
self.symbols.as_ref().map_or(0, |symbols| {
symbols.capacity() * std::mem::size_of::<Symbol>()
+ symbols.iter().map(Symbol::approx_heap_size).sum::<usize>()
})
}

/// Format the outline into a string.
pub fn to_string(&self) -> Option<String> {
Some(
Expand All @@ -183,3 +206,7 @@ impl FileOutline {
)
}
}

#[cfg(test)]
#[path = "mod_test.rs"]
mod tests;
52 changes: 52 additions & 0 deletions crates/ai/src/index/file_outline/mod_test.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
use super::{FileOutline, Symbol};

/// Test helper: builds a `Symbol` called `name`, located on line 1, with no
/// type prefix. When `comment` is provided, each borrowed line is copied into
/// an owned `String`.
fn symbol(name: &str, comment: Option<Vec<&str>>) -> Symbol {
    let owned_comment: Option<Vec<String>> =
        comment.map(|lines| lines.into_iter().map(String::from).collect());

    Symbol {
        name: String::from(name),
        type_prefix: None,
        comment: owned_comment,
        line_number: 1,
    }
}

/// An outline with no symbol list, or with an empty never-allocated one,
/// must report zero retained heap bytes.
#[test]
fn approx_heap_size_is_zero_without_symbols() {
    let empty_outlines = [
        FileOutline { symbols: None },
        FileOutline {
            symbols: Some(Vec::new()),
        },
    ];

    for outline in empty_outlines.iter() {
        assert_eq!(outline.approx_heap_size(), 0);
    }
}

/// The estimate must cover at least every owned name/comment byte plus the
/// inline `Symbol` array backing the outline's `Vec`.
#[test]
fn approx_heap_size_counts_owned_string_bytes() {
    let names = ["foo", "bar"];
    let comment_lines = ["a", "bc"];
    let outline = FileOutline {
        symbols: Some(vec![
            symbol(names[0], None),
            symbol(names[1], Some(comment_lines.to_vec())),
        ]),
    };

    // Bytes of the owned names ("foo" + "bar") and comment lines ("a" + "bc").
    let string_bytes: usize = names
        .iter()
        .chain(comment_lines.iter())
        .map(|text| text.len())
        .sum();
    // Bytes of the inline `Symbol` array held by the `Vec`.
    let backing_array_bytes =
        outline.symbols().expect("symbols").capacity() * std::mem::size_of::<Symbol>();
    let lower_bound = string_bytes + backing_array_bytes;

    let size = outline.approx_heap_size();
    assert!(size >= lower_bound, "size = {size}");
}

/// The estimate must grow both when a symbol owns more string data and when
/// the outline holds more symbols.
#[test]
fn approx_heap_size_grows_with_more_symbols() {
    let small = FileOutline {
        symbols: Some(vec![symbol("a", None)]),
    };

    // Same symbol count, much longer name: the owned string bytes grow.
    let long_name = FileOutline {
        symbols: Some(vec![symbol(&"a".repeat(10_000), None)]),
    };
    assert!(long_name.approx_heap_size() > small.approx_heap_size());

    // Same per-symbol payload, more symbols: both the inline `Symbol` array
    // and the summed per-symbol bytes grow.
    let many_symbols = FileOutline {
        symbols: Some((0..100).map(|_| symbol("a", None)).collect()),
    };
    assert!(many_symbols.approx_heap_size() > small.approx_heap_size());
}
32 changes: 32 additions & 0 deletions crates/ai/src/index/file_outline/native.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};

use anyhow::anyhow;
use arborium::tree_sitter::{Parser, Query, QueryCursor, Tree};
Expand All @@ -23,6 +24,17 @@ cfg_if::cfg_if! {
}
}

/// Upper bound on the cumulative heap size (in bytes) of the retained symbol
/// outline for a single repository: 256 MiB. Once the outlines built so far
/// reach this budget, the remaining files are skipped (their outline is left
/// empty) so that a very large repository cannot grow the in-memory index to
/// multiple gigabytes. Typical repositories produce outlines well under this
/// limit and are unaffected.
///
/// The budget is a soft limit: it is checked before a file is parsed and the
/// running total is only increased afterwards, so files already being parsed
/// in parallel can push the total somewhat past it.
///
/// NOTE(review): the running total is local to each `build_outline` run, so
/// the budget applies per build; many indexed repositories at once can still
/// retain a multiple of this value in total — confirm whether a process-wide
/// cap is also needed.
///
/// See APP-4794 / Sentry 7259255054: ~6 GB of live heap was retained here as
/// owned `Symbol` strings produced by `parse_file_outline`.
const MAX_OUTLINE_TOTAL_BYTES: usize = 256 * 1024 * 1024;

/// Given a repo path, try to build its outline. An outline is a list of all its files and the symbols
/// of interest from each file.
pub async fn build_outline(
Expand Down Expand Up @@ -67,12 +79,32 @@ pub async fn build_outline(
pool.spawn(move || {
// Parse each file in parallel. Note that we have to fold and then reduce given the parallelization.
let result = pool.install(|| {
// Track the cumulative heap size of the outlines retained so far and
// stop parsing once we exceed the budget. This bounds the in-memory
// index for pathologically large (or numerous) repos, which could
// otherwise retain multiple GB of `Symbol` strings (APP-4794).
let retained_bytes = AtomicUsize::new(0);
let budget_warned = AtomicBool::new(false);
files
.par_iter()
.map(|metadata| {
if retained_bytes.load(Ordering::Relaxed) >= MAX_OUTLINE_TOTAL_BYTES {
return (metadata.file_id, FileOutline::default());
}

let outline = parse_file_outline(&metadata.path.to_local_path_lossy())
.ok()
.unwrap_or_default();
retained_bytes.fetch_add(outline.approx_heap_size(), Ordering::Relaxed);

if retained_bytes.load(Ordering::Relaxed) >= MAX_OUTLINE_TOTAL_BYTES
&& !budget_warned.swap(true, Ordering::Relaxed)
{
log::warn!(
"Repo outline reached the {MAX_OUTLINE_TOTAL_BYTES}-byte memory \
budget; remaining files will be skipped to bound memory usage."
);
}

(metadata.file_id, outline)
})
Expand Down