From eb2c844122abc681159e16fdb867f460b28bc168 Mon Sep 17 00:00:00 2001 From: Lionel Henry Date: Thu, 28 May 2026 13:42:21 +0200 Subject: [PATCH] Don't canonicalise file keys Only canonicalise when comparing against paths from external sources (e.g. normalised by R). Canonicalisation may corrupt the identity of paths sent by frontends, and may prevent matching files at all when they are deleted (at which point a non-canonicalised path sent by the frontend can't be matched against canonicalised paths we've stored as keys) --- crates/aether_url/src/lib.rs | 157 ++++++++------------- crates/ark/src/dap/dap_state.rs | 97 ++++++++++++- crates/ark/src/lsp/tests/state_handlers.rs | 20 --- crates/ark_test/src/dummy_frontend.rs | 13 +- crates/oak_db/src/file_exports.rs | 2 +- crates/oak_db/src/imports.rs | 2 +- crates/oak_db/src/tests/test_db.rs | 2 +- crates/oak_scan/src/tests/scheduler.rs | 10 +- crates/oak_scan/src/tests/stale.rs | 2 +- crates/oak_scan/tests/library.rs | 4 +- 10 files changed, 171 insertions(+), 138 deletions(-) diff --git a/crates/aether_url/src/lib.rs b/crates/aether_url/src/lib.rs index 52458b390b..fcec8f0343 100644 --- a/crates/aether_url/src/lib.rs +++ b/crates/aether_url/src/lib.rs @@ -10,117 +10,75 @@ use std::fmt; use stdext::result::ResultExt; use url::Url; -/// Canonicalised file URL identity. +/// Lexically normalised file URL identity. /// -/// # The multi-source URI reconciliation problem +/// Internal identity key for files received from any source (LSP, DAP, +/// scanner, R runtime). Constructed via the same lexical normalisation +/// at every entry point so that two paths the editor considers "the +/// same file" produce the same [`UrlId`]. /// -/// File URIs for the same file can arrive from independent sources, each -/// with its own representation: +/// What we normalise: drive-letter casing on Windows; percent-encoding +/// of `:` (decoded via `Url -> PathBuf -> Url` round-trip). No I/O: +/// `std::fs::canonicalize()`, no symlink resolution. The same input URI +/// produces the same [`UrlId`] whether or not the file exists on disk. /// -/// - **DAP (SetBreakpoints)**: Receives raw file paths from the frontend, -/// converted to URIs via `UrlId::from_file_path`. These are stored as -/// HashMap keys for breakpoint lookup. +/// # Bridging across symlinks /// -/// - **LSP (didChange, etc.)**: Receives URIs directly from the editor -/// client, which may use non-canonical forms (e.g. percent-encoded -/// colons on Windows, or symlinked paths on macOS). +/// R's `normalizePath()` resolves symlinks on its own. A srcref URI +/// from the R runtime may name `/private/tmp/foo.R` while the editor +/// sent us `/tmp/foo.R`. The two don't compare equal, so a `HashMap` +/// keyed on `UrlId` treats them as separate files. Code that needs to +/// match a srcref URI back to an open document or a breakpoint should +/// maintain a secondary index of `fs::canonicalize`d paths and fall +/// back to it on a primary miss. +/// [`crate::dap::dap_state::BreakpointMap`] in `ark` does this for +/// breakpoints. /// -/// - **Execute requests**: A frontend may attach a `code_location` URI -/// that comes straight from the editor's document model, again -/// potentially non-canonical. +/// # Important: don't leak normalised URIs back out /// -/// - **R runtime**: When R evaluates `source()` or annotates code, it -/// passes URIs that went through R's `normalizePath()`, which resolves -/// symlinks to their canonical target (e.g. `/tmp` resolves to `/private/tmp` -/// on macOS), producing a path the editor never sent. More generally, -/// arbitrary R code can create source references that we may end up -/// consuming for breakpoint or debug purposes, and the paths in those -/// references may or may not be canonical. -/// -/// All four sources must agree on file identity. For instance breakpoints set -/// via DAP are looked up in a HashMap keyed by URI when code is executed or -/// sourced, and invalidated when documents change via LSP. -/// -/// # Design decision -/// -/// We solve this by canonicalizing URIs into [`UrlId`] at every entry -/// point, rather than interning paths into opaque IDs (as rust-analyzer -/// does with its VFS `FileId` approach). Interning would be a larger -/// architectural change and is not warranted here since we only need -/// canonical keys at a handful of call sites. -/// -/// Canonicalization uses `std::fs::canonicalize()` to resolve symlinks -/// (e.g. `/tmp` to `/private/tmp` on macOS), round-trips through the -/// filesystem path to normalize encoding variants (e.g. `%3A` to `:` on -/// Windows), and uppercases drive letters on Windows. When the file does -/// not exist on disk, we fall back to the original URI. -/// -/// # Important: canonical URIs must not leak -/// -/// [`UrlId`] is strictly for internal identity. When a URI flows back -/// to R (e.g. in `#line` directives or injected breakpoint calls) or to -/// the frontend (e.g. in DAP stack frames), always use the original raw -/// URI. The frontend (and possibly R code) expects their own URI -/// representation, and a canonical URI (e.g. `/private/tmp/...` instead of -/// `/tmp/...`) could be treated as a different file (e.g. open a new editor in -/// the frontend instead of an existing one). -/// -/// A canonicalized file URI for use as a stable identity key. -/// -/// Wraps a [`Url`] that has been canonicalized to resolve symlinks, -/// normalize encoding variants, and uppercase drive letters on Windows. -/// Use this type in HashMaps and anywhere file identity matters. -/// -/// Construct via [`UrlId::from_url`], [`UrlId::from_file_path`], or -/// [`UrlId::parse`]. -/// -/// On Windows, `std::fs::canonicalize()` returns extended-length paths -/// prefixed with `\\?\` (e.g. `\\?\C:\Users\...`). Projects like Ruff -/// use the `dunce` crate to strip this prefix, but we don't need it -/// because `Url::from_file_path` already handles -/// `Prefix::VerbatimDisk` and produces a clean `file:///C:/...` URI. +/// Even though [`UrlId`] no longer fs-canonicalises, it still +/// uppercases the Windows drive letter and decodes the percent-encoded +/// colon. When sending URIs back to the editor or to R, prefer the +/// original bytes the frontend sent. The frontend treats a URI as the +/// editor's identity for the file; a normalised form may look like a +/// different file to it (e.g. open a new editor pane). #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct UrlId(Url); impl UrlId { - /// Canonicalize a [`Url`] into a [`UrlId`]. + /// Lexically normalise a [`Url`] into a [`UrlId`]. /// - /// Resolves symlinks via `std::fs::canonicalize()` and normalizes - /// encoding variants (e.g. `%3A` to `:` on Windows). On Windows, also - /// uppercases the drive letter. Falls back to the original URI for - /// non-file schemes or when the path can't be resolved. + /// Decodes encoding variants (e.g. `%3A` to `:` on Windows) and + /// uppercases the Windows drive letter. Does no filesystem I/O. + /// Non-`file:` URLs (`ark://`, `untitled:`, ...) pass through + /// untouched. pub fn from_url(uri: Url) -> Self { if uri.scheme() != "file" { return Self(uri); } - let Some(path) = uri.to_file_path().warn_on_err() else { - return Self(uri); + // Round-trip through `PathBuf` so the URI form matches what + // `Url::from_file_path` produces (decoded `%3A`, etc.). Skip + // on error, we let pathological URIs flow through unchanged. + let uri = match uri.to_file_path().warn_on_err() { + Some(path) => Url::from_file_path(&path) + .map_err(|()| anyhow::anyhow!("Failed to convert path to URI: {path:?}")) + .warn_on_err() + .unwrap_or(uri), + None => uri, }; - let path = std::fs::canonicalize(&path).trace_on_err().unwrap_or(path); - let uri = Url::from_file_path(&path) - .map_err(|()| anyhow::anyhow!("Failed to convert path to URI: {path:?}")) - .warn_on_err() - .unwrap_or(uri); - #[cfg(windows)] let uri = uppercase_windows_drive_in_uri(uri); Self(uri) } - /// Wrap a [`Url`] that the caller asserts is already canonical. - pub fn from_canonical(uri: Url) -> Self { - Self(uri) - } - - /// Convert a file path to a canonical [`UrlId`]. + /// Build a [`UrlId`] from a filesystem path. /// - /// Canonicalizes the path to resolve symlinks (e.g. `/var/folders` to - /// `/private/var/folders` on macOS) so the URI matches what R's - /// `normalizePath()` produces. Falls back to the original path if - /// canonicalization fails. + /// Same lexical normalisation as [`Self::from_url`], no filesystem + /// I/O. Errors only if `path` can't be expressed as a URL (e.g. + /// not absolute on platforms that require it). pub fn from_file_path(path: impl AsRef) -> anyhow::Result { let path = path.as_ref(); let url = Url::from_file_path(path) @@ -128,7 +86,7 @@ impl UrlId { Ok(Self::from_url(url)) } - /// Parse a URI string into a canonical [`UrlId`]. + /// Parse a URI string into a [`UrlId`]. pub fn parse(s: &str) -> Result { let url = Url::parse(s)?; Ok(Self::from_url(url)) @@ -223,9 +181,9 @@ mod tests { #[test] #[cfg(not(windows))] - fn test_fallback_for_nonexistent_path() { - // For paths that don't exist, canonicalization falls back to the - // original path so the URI is unchanged. + fn test_nonexistent_path_unchanged() { + // Construction is lexical-only, so nonexistent paths flow + // through unchanged. let uri = Url::parse("file:///nonexistent/path/test.R").unwrap(); let id = UrlId::from_url(uri.clone()); assert_eq!(*id.as_url(), uri); @@ -233,18 +191,23 @@ mod tests { #[test] #[cfg(target_os = "macos")] - fn test_resolves_tmp_symlink() { - // On macOS, `/tmp` is a symlink to `/private/tmp`. `UrlId` should - // resolve it so that URIs from different sources match. + fn test_does_not_resolve_symlinks() { + // On macOS, `/tmp` is a symlink to `/private/tmp`. `UrlId` does + // *not* resolve it; same input bytes produce the same output + // regardless of the symlink graph on disk. Bridging across + // symlinked names is the job of secondary canonical indexes at + // specific seams (e.g. the DAP breakpoint store), not of + // construction. let dir = tempfile::tempdir_in("/tmp").unwrap(); let file = dir.path().join("test.R"); std::fs::write(&file, "").unwrap(); - let non_canonical = Url::from_file_path(&file).unwrap(); - assert!(non_canonical.path().starts_with("/tmp/")); + let original = Url::from_file_path(&file).unwrap(); + assert!(original.path().starts_with("/tmp/")); - let id = UrlId::from_url(non_canonical); - assert!(id.as_url().path().starts_with("/private/tmp/")); + let id = UrlId::from_url(original.clone()); + assert_eq!(*id.as_url(), original); + assert!(id.as_url().path().starts_with("/tmp/")); } #[test] diff --git a/crates/ark/src/dap/dap_state.rs b/crates/ark/src/dap/dap_state.rs index 4488a8d034..4e9bca617d 100644 --- a/crates/ark/src/dap/dap_state.rs +++ b/crates/ark/src/dap/dap_state.rs @@ -6,6 +6,7 @@ // use std::collections::HashMap; +use std::path::PathBuf; use std::sync::Arc; use std::sync::Mutex; @@ -220,6 +221,92 @@ impl DapBackendEvent { } } +/// Breakpoint storage keyed on the verbatim `UrlId` the frontend sent in +/// `setBreakpoints`. A secondary index maps `fs::canonicalize`d paths +/// back to the primary key, bridging R-runtime srcref URIs (which go +/// through `normalizePath()` and therefore arrive symlink-resolved) +/// against the editor's pre-resolution form. +/// +/// `fs::canonicalize` runs at insert time and at lookup-fallback time +/// only. Lookups try the primary first (cheap, no fs touch), then +/// canonicalise the incoming URI and consult the secondary. DAP wire +/// output (`debugInfo` source paths, breakpoint events) round-trips the +/// primary key unchanged, so the frontend always sees the URI it sent. +#[derive(Debug, Default)] +pub struct BreakpointMap { + /// Primary index, keyed on the URI the frontend gave us. + by_url: HashMap)>, + /// `fs::canonicalize`d path -> primary key. Populated at insert + /// only when the URL is a `file:` and `canonicalize` succeeds (i.e. + /// the file exists, which it does at `setBreakpoints` time). + by_canonical: HashMap, +} + +impl BreakpointMap { + pub fn insert(&mut self, url: UrlId, value: (blake3::Hash, Vec)) { + if let Some(canonical) = canonical_path(&url) { + self.by_canonical.insert(canonical, url.clone()); + } + self.by_url.insert(url, value); + } + + pub fn remove(&mut self, url: &UrlId) -> Option<(blake3::Hash, Vec)> { + let primary = self.resolve_primary(url)?.clone(); + self.by_canonical.retain(|_, p| p != &primary); + self.by_url.remove(&primary) + } + + pub fn get(&self, url: &UrlId) -> Option<&(blake3::Hash, Vec)> { + let primary = self.resolve_primary(url)?; + self.by_url.get(primary) + } + + pub fn get_mut(&mut self, url: &UrlId) -> Option<&mut (blake3::Hash, Vec)> { + // Cloning because the mut accessors can't hold a `&self.by_canonical` + // borrow across `&mut self.by_url`. + let primary = self.resolve_primary(url)?.clone(); + self.by_url.get_mut(&primary) + } + + pub fn contains_key(&self, url: &UrlId) -> bool { + self.resolve_primary(url).is_some() + } + + pub fn iter(&self) -> impl Iterator))> { + self.by_url.iter() + } + + pub fn iter_mut( + &mut self, + ) -> impl Iterator))> { + self.by_url.iter_mut() + } + + /// Resolve to the primary key. Tries bare `url` first. On miss, + /// canonicalises `url` and consults the secondary index. The + /// secondary catches cases like an R srcref `/private/tmp/foo.R` + /// (resolved by `normalizePath()`) looking up an editor URI + /// `/tmp/foo.R` (not resolved). + fn resolve_primary<'a>(&'a self, url: &'a UrlId) -> Option<&'a UrlId> { + if self.by_url.contains_key(url) { + return Some(url); + } + let canonical = canonical_path(url)?; + self.by_canonical.get(&canonical) + } +} + +/// `fs::canonicalize` of the path component of a `file:` URL, if any. +/// Returns `None` for non-`file:` URLs (`ark://`, `untitled:`, ...) and +/// when the file isn't on disk. +fn canonical_path(url: &UrlId) -> Option { + if !url.is_file() { + return None; + } + let path = url.to_file_path().ok()?; + std::fs::canonicalize(path).ok() +} + pub struct Dap { /// Whether the REPL is stopped with a browser prompt. pub is_debugging: bool, @@ -241,8 +328,8 @@ pub struct Dap { /// Current call stack pub stack: Option>, - /// Known breakpoints keyed by canonical URI, with document hash - pub breakpoints: HashMap)>, + /// Known breakpoints, with document hash. + pub breakpoints: BreakpointMap, /// Filters for enabled condition breakpoints pub exception_breakpoint_filters: Vec, @@ -306,7 +393,7 @@ impl Dap { is_connected: false, backend_events_tx: None, stack: None, - breakpoints: HashMap::new(), + breakpoints: BreakpointMap::default(), exception_breakpoint_filters: Vec::new(), fallback_sources: HashMap::new(), frame_id_to_variables_reference: HashMap::new(), @@ -864,7 +951,7 @@ mod tests { is_connected: true, backend_events_tx: Some(backend_events_tx), stack: None, - breakpoints: HashMap::new(), + breakpoints: BreakpointMap::default(), exception_breakpoint_filters: Vec::new(), fallback_sources: HashMap::new(), frame_id_to_variables_reference: HashMap::new(), @@ -978,7 +1065,7 @@ mod tests { is_connected: false, backend_events_tx: None, stack: None, - breakpoints: HashMap::new(), + breakpoints: BreakpointMap::default(), exception_breakpoint_filters: Vec::new(), fallback_sources: HashMap::new(), frame_id_to_variables_reference: HashMap::new(), diff --git a/crates/ark/src/lsp/tests/state_handlers.rs b/crates/ark/src/lsp/tests/state_handlers.rs index 79e6cc74b2..727796f67c 100644 --- a/crates/ark/src/lsp/tests/state_handlers.rs +++ b/crates/ark/src/lsp/tests/state_handlers.rs @@ -307,7 +307,6 @@ fn test_r_file_changed_for_unopened_file_updates_contents() { } #[test] -#[ignore = "blocked by UrlId canonicalization gap for missing paths"] fn test_r_file_deleted_for_editor_open_file_is_skipped() { // Mirror of `test_r_file_changed_for_editor_open_file_is_skipped` // for the Deleted kind. The skip check in `apply_watcher_events` @@ -315,17 +314,6 @@ fn test_r_file_deleted_for_editor_open_file_is_skipped() { // protected from deletion too: the buffer stays visible to the // user and queries keep resolving against the editor's // last-pushed content. - // - // The assertion passes today, but only incidentally: the same - // canonicalization gap that blocks the DESCRIPTION test below also - // breaks the skip lookup here. The event URL canonicalizes to the - // raw `/var/...` path (the file is gone, canonicalize fails), - // so the skip set (built from open documents while the file - // existed, i.e. `/private/var/...`) does not contain it. The event - // therefore reaches `remove_watched_file`, which also fails to find - // the file by URL and bails out. Net: file survives, but the skip - // mechanism is never actually exercised. Marked ignore until the - // canonicalization gap is fixed. let tmp = tempfile::tempdir().unwrap(); let path = tmp.path().join("a.R"); fs::write(&path, "disk_v1\n").unwrap(); @@ -351,18 +339,10 @@ fn test_r_file_deleted_for_editor_open_file_is_skipped() { } #[test] -#[ignore = "blocked by UrlId canonicalization gap for missing paths; see dev-notes/notes/oak/2026-05-22-0846-watcher-test-gaps.md"] fn test_description_deleted_demotes_package_to_scripts() { // Inverse of `test_description_created_triggers_root_rescan`: a // DESCRIPTION removed mid-session triggers a root rescan and the // former package's R/ files surface as workspace scripts. - // - // Currently fails because `UrlId::from_url` canonicalizes via - // `std::fs::canonicalize`, which fails for paths that no longer - // exist and falls back to the raw path. On macOS this means the - // Deleted event's URL keeps the `/var/...` prefix while the - // workspace root URL uses `/private/var/...`, and the routing in - // `apply_watcher_events` misses. let tmp = tempfile::tempdir().unwrap(); write_package(&tmp.path().join("pkg"), "pkg", &[("a.R", "x <- 1\n")]); let mut state = workspace_state(tmp.path()); diff --git a/crates/ark_test/src/dummy_frontend.rs b/crates/ark_test/src/dummy_frontend.rs index 80485beca6..1591ec88af 100644 --- a/crates/ark_test/src/dummy_frontend.rs +++ b/crates/ark_test/src/dummy_frontend.rs @@ -1431,7 +1431,12 @@ impl DummyArkFrontend { // Use forward slashes for R compatibility on Windows (backslashes would be // interpreted as escape sequences in R strings) let path = file.path().to_string_lossy().replace('\\', "/"); - let uri_id = UrlId::from_file_path(file.path()).unwrap().to_string(); + // `uri_id` is what R-side code reports back. R's `path_to_file_uri()` + // applies `normalizePath()` which resolves symlinks (e.g. macOS + // `/var/...` -> `/private/var/...`), so canonicalize here so tests + // can compare against R-reported URIs byte for byte. + let canonical = file.path().canonicalize().unwrap(); + let uri_id = UrlId::from_file_path(&canonical).unwrap().to_string(); let filename = file .path() .file_name() @@ -1822,7 +1827,11 @@ impl SourceFile { // Use forward slashes for R compatibility on Windows (backslashes would be // interpreted as escape sequences in R strings) let path = file.path().to_string_lossy().replace('\\', "/"); - let url = UrlId::from_file_path(file.path()).unwrap(); + // `uri_id` is what R reports via `path_to_file_uri()` (which goes + // through `normalizePath()`), so canonicalize here so tests can + // compare against R-reported URIs byte for byte. + let canonical = file.path().canonicalize().unwrap(); + let url = UrlId::from_file_path(&canonical).unwrap(); let uri_id = url.to_string(); // Extract file name diff --git a/crates/oak_db/src/file_exports.rs b/crates/oak_db/src/file_exports.rs index 45de5ebab5..f39b2613b1 100644 --- a/crates/oak_db/src/file_exports.rs +++ b/crates/oak_db/src/file_exports.rs @@ -68,7 +68,7 @@ impl File { name: target_name, .. } => { - let target_url_id = UrlId::from_canonical(target_url.clone()); + let target_url_id = UrlId::from_url(target_url.clone()); let Some(target_file) = db.file_by_url(&target_url_id) else { continue; }; diff --git a/crates/oak_db/src/imports.rs b/crates/oak_db/src/imports.rs index 9ce149b742..f55bc31fc6 100644 --- a/crates/oak_db/src/imports.rs +++ b/crates/oak_db/src/imports.rs @@ -107,7 +107,7 @@ fn resolve_relative_to(anchor_dir: &Path, path: &str) -> Option { let raw: PathBuf = anchor_dir.join(path); let target_path = normalise_path(&raw); let url = Url::from_file_path(&target_path).ok()?; - Some(UrlId::from_canonical(url)) + Some(UrlId::from_url(url)) } /// Resolve `..` and `.` components in `path` lexically, without diff --git a/crates/oak_db/src/tests/test_db.rs b/crates/oak_db/src/tests/test_db.rs index df940712ec..50ec770558 100644 --- a/crates/oak_db/src/tests/test_db.rs +++ b/crates/oak_db/src/tests/test_db.rs @@ -129,7 +129,7 @@ pub(super) fn file_url(name: &str) -> UrlId { } else { Url::parse(&format!("file:///{name}")).unwrap() }; - UrlId::from_canonical(url) + UrlId::from_url(url) } /// Build a fresh empty `RootKind::Workspace` `Root` at `path`. Each diff --git a/crates/oak_scan/src/tests/scheduler.rs b/crates/oak_scan/src/tests/scheduler.rs index 20e7f4907e..845ad5abb0 100644 --- a/crates/oak_scan/src/tests/scheduler.rs +++ b/crates/oak_scan/src/tests/scheduler.rs @@ -262,11 +262,7 @@ fn test_set_workspace_paths_inserts_empty_root_immediately() { let roots = db.workspace_roots().roots(&db).clone(); assert_eq!(roots.len(), 1); assert!(roots[0].packages(&db).is_empty()); - // Path matches. Compare via the URL conversion the scheduler uses - // internally, so this stays cross-platform: macOS canonicalization - // resolves symlinks (`/var` -> `/private/var`), Windows adds a `\\?\` - // UNC prefix, and going through `UrlId::from_file_path` on both sides - // is the only round-trip that matches. - let expected = UrlId::from_file_path(tmp.path()).unwrap(); - assert_eq!(roots[0].path(&db), &expected); + // `UrlId` construction is lexical, so the stored path is the one + // we handed in, byte for byte. + assert_eq!(roots[0].path(&db).to_file_path().unwrap(), tmp.path()); } diff --git a/crates/oak_scan/src/tests/stale.rs b/crates/oak_scan/src/tests/stale.rs index 1f85773ca4..aca0bfaf07 100644 --- a/crates/oak_scan/src/tests/stale.rs +++ b/crates/oak_scan/src/tests/stale.rs @@ -20,7 +20,7 @@ use crate::inputs::DbExt; use crate::inputs::RootExt; fn file_url(s: &str) -> UrlId { - UrlId::from_canonical(Url::parse(&format!("file://{s}")).unwrap()) + UrlId::from_url(Url::parse(&format!("file://{s}")).unwrap()) } #[test] diff --git a/crates/oak_scan/tests/library.rs b/crates/oak_scan/tests/library.rs index dea95332ce..2f8404d8f3 100644 --- a/crates/oak_scan/tests/library.rs +++ b/crates/oak_scan/tests/library.rs @@ -364,9 +364,7 @@ fn test_set_library_paths_resurrected_file_picks_up_disk_contents() { // and is the realistic trigger -- depends on. fn file_url(s: &str) -> UrlId { - // Bypass `UrlId::from_file_path`'s canonicalization (these paths - // don't exist on disk). - UrlId::from_canonical(Url::parse(&format!("file://{s}")).unwrap()) + UrlId::from_url(Url::parse(&format!("file://{s}")).unwrap()) } fn empty_library_root(db: &OakDatabase, path: &str) -> Root {