diff --git a/crates/aether_url/src/lib.rs b/crates/aether_url/src/lib.rs index 52458b390b..fcec8f0343 100644 --- a/crates/aether_url/src/lib.rs +++ b/crates/aether_url/src/lib.rs @@ -10,117 +10,75 @@ use std::fmt; use stdext::result::ResultExt; use url::Url; -/// Canonicalised file URL identity. +/// Lexically normalised file URL identity. /// -/// # The multi-source URI reconciliation problem +/// Internal identity key for files received from any source (LSP, DAP, +/// scanner, R runtime). Constructed via the same lexical normalisation +/// at every entry point so that two paths the editor considers "the +/// same file" produce the same [`UrlId`]. /// -/// File URIs for the same file can arrive from independent sources, each -/// with its own representation: +/// What we normalise: drive-letter casing on Windows; percent-encoding +/// of `:` (decoded via `Url -> PathBuf -> Url` round-trip). No I/O: +/// `std::fs::canonicalize()`, no symlink resolution. The same input URI +/// produces the same [`UrlId`] whether or not the file exists on disk. /// -/// - **DAP (SetBreakpoints)**: Receives raw file paths from the frontend, -/// converted to URIs via `UrlId::from_file_path`. These are stored as -/// HashMap keys for breakpoint lookup. +/// # Bridging across symlinks /// -/// - **LSP (didChange, etc.)**: Receives URIs directly from the editor -/// client, which may use non-canonical forms (e.g. percent-encoded -/// colons on Windows, or symlinked paths on macOS). +/// R's `normalizePath()` resolves symlinks on its own. A srcref URI +/// from the R runtime may name `/private/tmp/foo.R` while the editor +/// sent us `/tmp/foo.R`. The two don't compare equal, so a `HashMap` +/// keyed on `UrlId` treats them as separate files. Code that needs to +/// match a srcref URI back to an open document or a breakpoint should +/// maintain a secondary index of `fs::canonicalize`d paths and fall +/// back to it on a primary miss. +/// [`crate::dap::dap_state::BreakpointMap`] in `ark` does this for +/// breakpoints. /// -/// - **Execute requests**: A frontend may attach a `code_location` URI -/// that comes straight from the editor's document model, again -/// potentially non-canonical. +/// # Important: don't leak normalised URIs back out /// -/// - **R runtime**: When R evaluates `source()` or annotates code, it -/// passes URIs that went through R's `normalizePath()`, which resolves -/// symlinks to their canonical target (e.g. `/tmp` resolves to `/private/tmp` -/// on macOS), producing a path the editor never sent. More generally, -/// arbitrary R code can create source references that we may end up -/// consuming for breakpoint or debug purposes, and the paths in those -/// references may or may not be canonical. -/// -/// All four sources must agree on file identity. For instance breakpoints set -/// via DAP are looked up in a HashMap keyed by URI when code is executed or -/// sourced, and invalidated when documents change via LSP. -/// -/// # Design decision -/// -/// We solve this by canonicalizing URIs into [`UrlId`] at every entry -/// point, rather than interning paths into opaque IDs (as rust-analyzer -/// does with its VFS `FileId` approach). Interning would be a larger -/// architectural change and is not warranted here since we only need -/// canonical keys at a handful of call sites. -/// -/// Canonicalization uses `std::fs::canonicalize()` to resolve symlinks -/// (e.g. `/tmp` to `/private/tmp` on macOS), round-trips through the -/// filesystem path to normalize encoding variants (e.g. `%3A` to `:` on -/// Windows), and uppercases drive letters on Windows. When the file does -/// not exist on disk, we fall back to the original URI. -/// -/// # Important: canonical URIs must not leak -/// -/// [`UrlId`] is strictly for internal identity. When a URI flows back -/// to R (e.g. in `#line` directives or injected breakpoint calls) or to -/// the frontend (e.g. in DAP stack frames), always use the original raw -/// URI. The frontend (and possibly R code) expects their own URI -/// representation, and a canonical URI (e.g. `/private/tmp/...` instead of -/// `/tmp/...`) could be treated as a different file (e.g. open a new editor in -/// the frontend instead of an existing one). -/// -/// A canonicalized file URI for use as a stable identity key. -/// -/// Wraps a [`Url`] that has been canonicalized to resolve symlinks, -/// normalize encoding variants, and uppercase drive letters on Windows. -/// Use this type in HashMaps and anywhere file identity matters. -/// -/// Construct via [`UrlId::from_url`], [`UrlId::from_file_path`], or -/// [`UrlId::parse`]. -/// -/// On Windows, `std::fs::canonicalize()` returns extended-length paths -/// prefixed with `\\?\` (e.g. `\\?\C:\Users\...`). Projects like Ruff -/// use the `dunce` crate to strip this prefix, but we don't need it -/// because `Url::from_file_path` already handles -/// `Prefix::VerbatimDisk` and produces a clean `file:///C:/...` URI. +/// Even though [`UrlId`] no longer fs-canonicalises, it still +/// uppercases the Windows drive letter and decodes the percent-encoded +/// colon. When sending URIs back to the editor or to R, prefer the +/// original bytes the frontend sent. The frontend treats a URI as the +/// editor's identity for the file; a normalised form may look like a +/// different file to it (e.g. open a new editor pane). #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct UrlId(Url); impl UrlId { - /// Canonicalize a [`Url`] into a [`UrlId`]. + /// Lexically normalise a [`Url`] into a [`UrlId`]. /// - /// Resolves symlinks via `std::fs::canonicalize()` and normalizes - /// encoding variants (e.g. `%3A` to `:` on Windows). On Windows, also - /// uppercases the drive letter. Falls back to the original URI for - /// non-file schemes or when the path can't be resolved. + /// Decodes encoding variants (e.g. `%3A` to `:` on Windows) and + /// uppercases the Windows drive letter. Does no filesystem I/O. + /// Non-`file:` URLs (`ark://`, `untitled:`, ...) pass through + /// untouched. pub fn from_url(uri: Url) -> Self { if uri.scheme() != "file" { return Self(uri); } - let Some(path) = uri.to_file_path().warn_on_err() else { - return Self(uri); + // Round-trip through `PathBuf` so the URI form matches what + // `Url::from_file_path` produces (decoded `%3A`, etc.). Skip + // on error, we let pathological URIs flow through unchanged. + let uri = match uri.to_file_path().warn_on_err() { + Some(path) => Url::from_file_path(&path) + .map_err(|()| anyhow::anyhow!("Failed to convert path to URI: {path:?}")) + .warn_on_err() + .unwrap_or(uri), + None => uri, }; - let path = std::fs::canonicalize(&path).trace_on_err().unwrap_or(path); - let uri = Url::from_file_path(&path) - .map_err(|()| anyhow::anyhow!("Failed to convert path to URI: {path:?}")) - .warn_on_err() - .unwrap_or(uri); - #[cfg(windows)] let uri = uppercase_windows_drive_in_uri(uri); Self(uri) } - /// Wrap a [`Url`] that the caller asserts is already canonical. - pub fn from_canonical(uri: Url) -> Self { - Self(uri) - } - - /// Convert a file path to a canonical [`UrlId`]. + /// Build a [`UrlId`] from a filesystem path. /// - /// Canonicalizes the path to resolve symlinks (e.g. `/var/folders` to - /// `/private/var/folders` on macOS) so the URI matches what R's - /// `normalizePath()` produces. Falls back to the original path if - /// canonicalization fails. + /// Same lexical normalisation as [`Self::from_url`], no filesystem + /// I/O. Errors only if `path` can't be expressed as a URL (e.g. + /// not absolute on platforms that require it). pub fn from_file_path(path: impl AsRef) -> anyhow::Result { let path = path.as_ref(); let url = Url::from_file_path(path) @@ -128,7 +86,7 @@ impl UrlId { Ok(Self::from_url(url)) } - /// Parse a URI string into a canonical [`UrlId`]. + /// Parse a URI string into a [`UrlId`]. pub fn parse(s: &str) -> Result { let url = Url::parse(s)?; Ok(Self::from_url(url)) @@ -223,9 +181,9 @@ mod tests { #[test] #[cfg(not(windows))] - fn test_fallback_for_nonexistent_path() { - // For paths that don't exist, canonicalization falls back to the - // original path so the URI is unchanged. + fn test_nonexistent_path_unchanged() { + // Construction is lexical-only, so nonexistent paths flow + // through unchanged. let uri = Url::parse("file:///nonexistent/path/test.R").unwrap(); let id = UrlId::from_url(uri.clone()); assert_eq!(*id.as_url(), uri); @@ -233,18 +191,23 @@ mod tests { #[test] #[cfg(target_os = "macos")] - fn test_resolves_tmp_symlink() { - // On macOS, `/tmp` is a symlink to `/private/tmp`. `UrlId` should - // resolve it so that URIs from different sources match. + fn test_does_not_resolve_symlinks() { + // On macOS, `/tmp` is a symlink to `/private/tmp`. `UrlId` does + // *not* resolve it; same input bytes produce the same output + // regardless of the symlink graph on disk. Bridging across + // symlinked names is the job of secondary canonical indexes at + // specific seams (e.g. the DAP breakpoint store), not of + // construction. let dir = tempfile::tempdir_in("/tmp").unwrap(); let file = dir.path().join("test.R"); std::fs::write(&file, "").unwrap(); - let non_canonical = Url::from_file_path(&file).unwrap(); - assert!(non_canonical.path().starts_with("/tmp/")); + let original = Url::from_file_path(&file).unwrap(); + assert!(original.path().starts_with("/tmp/")); - let id = UrlId::from_url(non_canonical); - assert!(id.as_url().path().starts_with("/private/tmp/")); + let id = UrlId::from_url(original.clone()); + assert_eq!(*id.as_url(), original); + assert!(id.as_url().path().starts_with("/tmp/")); } #[test] diff --git a/crates/ark/src/dap/dap_state.rs b/crates/ark/src/dap/dap_state.rs index 4488a8d034..4e9bca617d 100644 --- a/crates/ark/src/dap/dap_state.rs +++ b/crates/ark/src/dap/dap_state.rs @@ -6,6 +6,7 @@ // use std::collections::HashMap; +use std::path::PathBuf; use std::sync::Arc; use std::sync::Mutex; @@ -220,6 +221,92 @@ impl DapBackendEvent { } } +/// Breakpoint storage keyed on the verbatim `UrlId` the frontend sent in +/// `setBreakpoints`. A secondary index maps `fs::canonicalize`d paths +/// back to the primary key, bridging R-runtime srcref URIs (which go +/// through `normalizePath()` and therefore arrive symlink-resolved) +/// against the editor's pre-resolution form. +/// +/// `fs::canonicalize` runs at insert time and at lookup-fallback time +/// only. Lookups try the primary first (cheap, no fs touch), then +/// canonicalise the incoming URI and consult the secondary. DAP wire +/// output (`debugInfo` source paths, breakpoint events) round-trips the +/// primary key unchanged, so the frontend always sees the URI it sent. +#[derive(Debug, Default)] +pub struct BreakpointMap { + /// Primary index, keyed on the URI the frontend gave us. + by_url: HashMap)>, + /// `fs::canonicalize`d path -> primary key. Populated at insert + /// only when the URL is a `file:` and `canonicalize` succeeds (i.e. + /// the file exists, which it does at `setBreakpoints` time). + by_canonical: HashMap, +} + +impl BreakpointMap { + pub fn insert(&mut self, url: UrlId, value: (blake3::Hash, Vec)) { + if let Some(canonical) = canonical_path(&url) { + self.by_canonical.insert(canonical, url.clone()); + } + self.by_url.insert(url, value); + } + + pub fn remove(&mut self, url: &UrlId) -> Option<(blake3::Hash, Vec)> { + let primary = self.resolve_primary(url)?.clone(); + self.by_canonical.retain(|_, p| p != &primary); + self.by_url.remove(&primary) + } + + pub fn get(&self, url: &UrlId) -> Option<&(blake3::Hash, Vec)> { + let primary = self.resolve_primary(url)?; + self.by_url.get(primary) + } + + pub fn get_mut(&mut self, url: &UrlId) -> Option<&mut (blake3::Hash, Vec)> { + // Cloning because the mut accessors can't hold a `&self.by_canonical` + // borrow across `&mut self.by_url`. + let primary = self.resolve_primary(url)?.clone(); + self.by_url.get_mut(&primary) + } + + pub fn contains_key(&self, url: &UrlId) -> bool { + self.resolve_primary(url).is_some() + } + + pub fn iter(&self) -> impl Iterator))> { + self.by_url.iter() + } + + pub fn iter_mut( + &mut self, + ) -> impl Iterator))> { + self.by_url.iter_mut() + } + + /// Resolve to the primary key. Tries bare `url` first. On miss, + /// canonicalises `url` and consults the secondary index. The + /// secondary catches cases like an R srcref `/private/tmp/foo.R` + /// (resolved by `normalizePath()`) looking up an editor URI + /// `/tmp/foo.R` (not resolved). + fn resolve_primary<'a>(&'a self, url: &'a UrlId) -> Option<&'a UrlId> { + if self.by_url.contains_key(url) { + return Some(url); + } + let canonical = canonical_path(url)?; + self.by_canonical.get(&canonical) + } +} + +/// `fs::canonicalize` of the path component of a `file:` URL, if any. +/// Returns `None` for non-`file:` URLs (`ark://`, `untitled:`, ...) and +/// when the file isn't on disk. +fn canonical_path(url: &UrlId) -> Option { + if !url.is_file() { + return None; + } + let path = url.to_file_path().ok()?; + std::fs::canonicalize(path).ok() +} + pub struct Dap { /// Whether the REPL is stopped with a browser prompt. pub is_debugging: bool, @@ -241,8 +328,8 @@ pub struct Dap { /// Current call stack pub stack: Option>, - /// Known breakpoints keyed by canonical URI, with document hash - pub breakpoints: HashMap)>, + /// Known breakpoints, with document hash. + pub breakpoints: BreakpointMap, /// Filters for enabled condition breakpoints pub exception_breakpoint_filters: Vec, @@ -306,7 +393,7 @@ impl Dap { is_connected: false, backend_events_tx: None, stack: None, - breakpoints: HashMap::new(), + breakpoints: BreakpointMap::default(), exception_breakpoint_filters: Vec::new(), fallback_sources: HashMap::new(), frame_id_to_variables_reference: HashMap::new(), @@ -864,7 +951,7 @@ mod tests { is_connected: true, backend_events_tx: Some(backend_events_tx), stack: None, - breakpoints: HashMap::new(), + breakpoints: BreakpointMap::default(), exception_breakpoint_filters: Vec::new(), fallback_sources: HashMap::new(), frame_id_to_variables_reference: HashMap::new(), @@ -978,7 +1065,7 @@ mod tests { is_connected: false, backend_events_tx: None, stack: None, - breakpoints: HashMap::new(), + breakpoints: BreakpointMap::default(), exception_breakpoint_filters: Vec::new(), fallback_sources: HashMap::new(), frame_id_to_variables_reference: HashMap::new(), diff --git a/crates/ark/src/lsp/tests/state_handlers.rs b/crates/ark/src/lsp/tests/state_handlers.rs index 79e6cc74b2..727796f67c 100644 --- a/crates/ark/src/lsp/tests/state_handlers.rs +++ b/crates/ark/src/lsp/tests/state_handlers.rs @@ -307,7 +307,6 @@ fn test_r_file_changed_for_unopened_file_updates_contents() { } #[test] -#[ignore = "blocked by UrlId canonicalization gap for missing paths"] fn test_r_file_deleted_for_editor_open_file_is_skipped() { // Mirror of `test_r_file_changed_for_editor_open_file_is_skipped` // for the Deleted kind. The skip check in `apply_watcher_events` @@ -315,17 +314,6 @@ fn test_r_file_deleted_for_editor_open_file_is_skipped() { // protected from deletion too: the buffer stays visible to the // user and queries keep resolving against the editor's // last-pushed content. - // - // The assertion passes today, but only incidentally: the same - // canonicalization gap that blocks the DESCRIPTION test below also - // breaks the skip lookup here. The event URL canonicalizes to the - // raw `/var/...` path (the file is gone, canonicalize fails), - // so the skip set (built from open documents while the file - // existed, i.e. `/private/var/...`) does not contain it. The event - // therefore reaches `remove_watched_file`, which also fails to find - // the file by URL and bails out. Net: file survives, but the skip - // mechanism is never actually exercised. Marked ignore until the - // canonicalization gap is fixed. let tmp = tempfile::tempdir().unwrap(); let path = tmp.path().join("a.R"); fs::write(&path, "disk_v1\n").unwrap(); @@ -351,18 +339,10 @@ fn test_r_file_deleted_for_editor_open_file_is_skipped() { } #[test] -#[ignore = "blocked by UrlId canonicalization gap for missing paths; see dev-notes/notes/oak/2026-05-22-0846-watcher-test-gaps.md"] fn test_description_deleted_demotes_package_to_scripts() { // Inverse of `test_description_created_triggers_root_rescan`: a // DESCRIPTION removed mid-session triggers a root rescan and the // former package's R/ files surface as workspace scripts. - // - // Currently fails because `UrlId::from_url` canonicalizes via - // `std::fs::canonicalize`, which fails for paths that no longer - // exist and falls back to the raw path. On macOS this means the - // Deleted event's URL keeps the `/var/...` prefix while the - // workspace root URL uses `/private/var/...`, and the routing in - // `apply_watcher_events` misses. let tmp = tempfile::tempdir().unwrap(); write_package(&tmp.path().join("pkg"), "pkg", &[("a.R", "x <- 1\n")]); let mut state = workspace_state(tmp.path()); diff --git a/crates/ark_test/src/dummy_frontend.rs b/crates/ark_test/src/dummy_frontend.rs index 80485beca6..1591ec88af 100644 --- a/crates/ark_test/src/dummy_frontend.rs +++ b/crates/ark_test/src/dummy_frontend.rs @@ -1431,7 +1431,12 @@ impl DummyArkFrontend { // Use forward slashes for R compatibility on Windows (backslashes would be // interpreted as escape sequences in R strings) let path = file.path().to_string_lossy().replace('\\', "/"); - let uri_id = UrlId::from_file_path(file.path()).unwrap().to_string(); + // `uri_id` is what R-side code reports back. R's `path_to_file_uri()` + // applies `normalizePath()` which resolves symlinks (e.g. macOS + // `/var/...` -> `/private/var/...`), so canonicalize here so tests + // can compare against R-reported URIs byte for byte. + let canonical = file.path().canonicalize().unwrap(); + let uri_id = UrlId::from_file_path(&canonical).unwrap().to_string(); let filename = file .path() .file_name() @@ -1822,7 +1827,11 @@ impl SourceFile { // Use forward slashes for R compatibility on Windows (backslashes would be // interpreted as escape sequences in R strings) let path = file.path().to_string_lossy().replace('\\', "/"); - let url = UrlId::from_file_path(file.path()).unwrap(); + // `uri_id` is what R reports via `path_to_file_uri()` (which goes + // through `normalizePath()`), so canonicalize here so tests can + // compare against R-reported URIs byte for byte. + let canonical = file.path().canonicalize().unwrap(); + let url = UrlId::from_file_path(&canonical).unwrap(); let uri_id = url.to_string(); // Extract file name diff --git a/crates/oak_db/src/file_exports.rs b/crates/oak_db/src/file_exports.rs index 45de5ebab5..f39b2613b1 100644 --- a/crates/oak_db/src/file_exports.rs +++ b/crates/oak_db/src/file_exports.rs @@ -68,7 +68,7 @@ impl File { name: target_name, .. } => { - let target_url_id = UrlId::from_canonical(target_url.clone()); + let target_url_id = UrlId::from_url(target_url.clone()); let Some(target_file) = db.file_by_url(&target_url_id) else { continue; }; diff --git a/crates/oak_db/src/imports.rs b/crates/oak_db/src/imports.rs index 9ce149b742..f55bc31fc6 100644 --- a/crates/oak_db/src/imports.rs +++ b/crates/oak_db/src/imports.rs @@ -107,7 +107,7 @@ fn resolve_relative_to(anchor_dir: &Path, path: &str) -> Option { let raw: PathBuf = anchor_dir.join(path); let target_path = normalise_path(&raw); let url = Url::from_file_path(&target_path).ok()?; - Some(UrlId::from_canonical(url)) + Some(UrlId::from_url(url)) } /// Resolve `..` and `.` components in `path` lexically, without diff --git a/crates/oak_db/src/tests/test_db.rs b/crates/oak_db/src/tests/test_db.rs index df940712ec..50ec770558 100644 --- a/crates/oak_db/src/tests/test_db.rs +++ b/crates/oak_db/src/tests/test_db.rs @@ -129,7 +129,7 @@ pub(super) fn file_url(name: &str) -> UrlId { } else { Url::parse(&format!("file:///{name}")).unwrap() }; - UrlId::from_canonical(url) + UrlId::from_url(url) } /// Build a fresh empty `RootKind::Workspace` `Root` at `path`. Each diff --git a/crates/oak_scan/src/tests/scheduler.rs b/crates/oak_scan/src/tests/scheduler.rs index 20e7f4907e..845ad5abb0 100644 --- a/crates/oak_scan/src/tests/scheduler.rs +++ b/crates/oak_scan/src/tests/scheduler.rs @@ -262,11 +262,7 @@ fn test_set_workspace_paths_inserts_empty_root_immediately() { let roots = db.workspace_roots().roots(&db).clone(); assert_eq!(roots.len(), 1); assert!(roots[0].packages(&db).is_empty()); - // Path matches. Compare via the URL conversion the scheduler uses - // internally, so this stays cross-platform: macOS canonicalization - // resolves symlinks (`/var` -> `/private/var`), Windows adds a `\\?\` - // UNC prefix, and going through `UrlId::from_file_path` on both sides - // is the only round-trip that matches. - let expected = UrlId::from_file_path(tmp.path()).unwrap(); - assert_eq!(roots[0].path(&db), &expected); + // `UrlId` construction is lexical, so the stored path is the one + // we handed in, byte for byte. + assert_eq!(roots[0].path(&db).to_file_path().unwrap(), tmp.path()); } diff --git a/crates/oak_scan/src/tests/stale.rs b/crates/oak_scan/src/tests/stale.rs index 1f85773ca4..aca0bfaf07 100644 --- a/crates/oak_scan/src/tests/stale.rs +++ b/crates/oak_scan/src/tests/stale.rs @@ -20,7 +20,7 @@ use crate::inputs::DbExt; use crate::inputs::RootExt; fn file_url(s: &str) -> UrlId { - UrlId::from_canonical(Url::parse(&format!("file://{s}")).unwrap()) + UrlId::from_url(Url::parse(&format!("file://{s}")).unwrap()) } #[test] diff --git a/crates/oak_scan/tests/library.rs b/crates/oak_scan/tests/library.rs index dea95332ce..2f8404d8f3 100644 --- a/crates/oak_scan/tests/library.rs +++ b/crates/oak_scan/tests/library.rs @@ -364,9 +364,7 @@ fn test_set_library_paths_resurrected_file_picks_up_disk_contents() { // and is the realistic trigger -- depends on. fn file_url(s: &str) -> UrlId { - // Bypass `UrlId::from_file_path`'s canonicalization (these paths - // don't exist on disk). - UrlId::from_canonical(Url::parse(&format!("file://{s}")).unwrap()) + UrlId::from_url(Url::parse(&format!("file://{s}")).unwrap()) } fn empty_library_root(db: &OakDatabase, path: &str) -> Root {