diff --git a/DESCRIPTION b/DESCRIPTION index 6447470..3c1bce4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: cd Title: Climate Departure Analysis from ERA5-Land Reanalysis -Version: 0.3.2 -Date: 2026-06-06 +Version: 0.4.0 +Date: 2026-06-25 Authors@R: c( person("Allan", "Irvine", , "al@newgraphenvironment.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-3495-2128")), @@ -35,9 +35,11 @@ Suggests: rmarkdown, testthat (>= 3.0.0), tidyterra, + withr, zyp Config/testthat/edition: 3 -Imports: +Imports: + curl, dplyr, jsonlite, rappdirs, diff --git a/NAMESPACE b/NAMESPACE index b095deb..7b9540f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(cd_aggregate) export(cd_anomaly) export(cd_baseline) export(cd_cache_clear) +export(cd_cache_fetch) export(cd_cache_info) export(cd_cache_path) export(cd_catalog) diff --git a/NEWS.md b/NEWS.md index 9bbd313..adecac0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# cd 0.4.0 (2026-06-25) + +* On-disk caching wired into the consumer read path, so repeated extractions, report renders, and vignette rebuilds pull each COG from S3 **once** and read locally thereafter — turning the dominant recurring S3 egress driver into a one-time cost. New exported `cd_cache_fetch()` downloads a remote http(s) COG to the cd cache (keyed by URL hash, with a sidecar `.meta` recording the S3 ETag and size), validates freshness with a cheap HTTP HEAD (ETag, falling back to Content-Length), and serves the local copy on a hit. Downloads are size-validated and atomically renamed so a truncated file is never served; a failed HEAD with a cached copy present serves the cache, and `options(cd.cache_revalidate = FALSE)` skips revalidation entirely for offline work. `cd_crop()` and `cd_extract()` gain `cache = TRUE` (default), threading remote reads through the cache while local paths pass through unchanged. 
Live S3 confirmation: a repeat read drops from a full-COG download (megabytes) to a ~1 KB HEAD (or zero network with revalidation off). Adds `curl` to Imports. See the new README "Caching" section, which also documents the GDAL `/vsicurl/` env-var stopgap. ([#76](https://github.com/NewGraphEnvironment/cd/pull/76)) + # cd 0.3.2 (2026-06-06) * Both regional vignettes (kootenay-lake, peace-fwcp) rewritten for new readers: plainer-language opener for the snowpack section ("In BC, most of the year's runoff starts as winter snow…" instead of the "hinge of BC hydrology" metaphor), Trends / Recent-Decade / bias-notes preambles compressed and de-jargoned, Annual snowpack signals intro reduced to a 3-bullet plain-language list, salmonid Interpretation closer tightened to one paragraph with three bold knock-on effects. Figure trim: cut `plot-tmean` (covered by `facet-tmean`), `plot-dtr` (asymmetry numbers already in prose), and `snow-rate-peak` (not load-bearing); fold `plot-tmax` + `plot-tmin` into one 2-panel faceted `plot-tmaxmin`, and `snow-swe-max` + `snow-doy-50` + `snow-fraction` into one 3-panel faceted `snow-annual` (free y-scales). Net per vignette: 3 fewer standalone figures, same coverage. Bibliography: dropped `kouki_etal2023` and `yue_wang2002` (no longer cited); union now 15/15. ([#75](https://github.com/NewGraphEnvironment/cd/pull/75)) diff --git a/R/cd_cache_fetch.R b/R/cd_cache_fetch.R new file mode 100644 index 0000000..8034920 --- /dev/null +++ b/R/cd_cache_fetch.R @@ -0,0 +1,153 @@ +#' Fetch a remote COG through the on-disk cache +#' +#' Given a remote `href` (http/https), downloads the file once to the cd +#' cache directory and returns a local path; subsequent calls read the +#' local copy instead of re-pulling from the network. Freshness is +#' checked with a cheap HTTP HEAD request (comparing the S3 ETag), so a +#' monthly catalog republish is picked up automatically while repeat +#' builds do near-zero egress. 
Local paths — and non-http URLs such as +#' `s3://`, which GDAL reads directly — are returned unchanged. +#' +#' @param href Character. Path or URL to a COG. +#' @param refresh Logical. If `TRUE`, force a re-download even when a +#' valid cached copy exists. Default `FALSE`. +#' @param cache_dir Character. Override the cache location. If `NULL`, +#' uses [cd_cache_path()]. +#' +#' @details +#' Freshness uses the ETag when the server provides one, falling back to +#' the `Content-Length` size when it does not. A host that returns +#' neither validator cannot be proven fresh, so the file is re-downloaded +#' on each call (safe, but un-cached) — S3, the default host, always +#' returns both. Revalidation can be disabled for a fully-offline fast +#' path with `options(cd.cache_revalidate = FALSE)`, which serves any +#' existing cached copy without an HTTP HEAD. When the HEAD fails (e.g. +#' offline) but a cached copy exists, the cached copy is served with a +#' message. Downloads are written to a temporary file, validated against +#' the advertised `Content-Length`, then atomically renamed, so a +#' truncated download is never served as complete. +#' +#' @return Character path to the local (cached) file, or `href` +#' unchanged for local / non-http inputs. 
+#' +#' @examples +#' # Local files pass through untouched: +#' f <- system.file("extdata", "example_climate.tif", package = "cd") +#' identical(cd_cache_fetch(f), f) +#' +#' @export +cd_cache_fetch <- function(href, refresh = FALSE, cache_dir = NULL) { + if (length(href) != 1L || is.na(href) || !cd_is_remote(href)) { + return(href) + } + + dir <- cd_cache_path(cache_dir) + ext <- tools::file_ext(href) + key <- rlang::hash(href) + fname <- if (nzchar(ext)) paste0(key, ".", ext) else key + local_path <- file.path(dir, fname) + meta_path <- paste0(local_path, ".meta") + + have_local <- file.exists(local_path) && file.exists(meta_path) + revalidate <- isTRUE(getOption("cd.cache_revalidate", default = TRUE)) + + # Offline fast path: trust an existing cache without a HEAD request. + if (have_local && !refresh && !revalidate) { + return(local_path) + } + + head <- cd_remote_head(href) + + # HEAD failed (offline / server error): serve a cached copy if present. + if (is.null(head)) { + if (have_local && !refresh) { + rlang::inform( + paste0("cd_cache_fetch: could not reach '", href, + "'; serving cached copy.") + ) + return(local_path) + } + stop("cd_cache_fetch: failed to reach '", href, + "' and no cached copy is available.", call. = FALSE) + } + + # Valid cache: serve local, no download. + if (have_local && !refresh) { + meta <- jsonlite::read_json(meta_path) + if (cd_cache_valid(head, meta)) { + return(local_path) + } + } + + # Download to a temp file, validate size, atomic rename, write meta. + tmp <- tempfile(tmpdir = dir, fileext = if (nzchar(ext)) paste0(".", ext) else "") + on.exit(if (file.exists(tmp)) unlink(tmp), add = TRUE) + cd_remote_download(href, tmp) + + if (!is.null(head$size) && !is.na(head$size)) { + got <- file.size(tmp) + if (is.na(got) || got != head$size) { + stop("cd_cache_fetch: incomplete download of '", href, "' (", + got, " of ", head$size, " bytes).", call. 
= FALSE) + } + } + + if (!file.rename(tmp, local_path)) { + stop("cd_cache_fetch: failed to move the download into the cache for '", + href, "'.", call. = FALSE) + } + jsonlite::write_json( + list(url = href, etag = head$etag, size = head$size, + downloaded_at = format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z")), + meta_path, auto_unbox = TRUE + ) + local_path +} + +#' Is an href a cacheable remote (http/https) URL? +#' @noRd +cd_is_remote <- function(href) { + grepl("^https?://", href) +} + +#' Is a cached copy still valid against fresh HEAD metadata? +#' +#' Prefers the ETag; falls back to Content-Length size when the server +#' (or the stored meta) carries no ETag, so ETag-less hosts still get a +#' cache hit instead of re-downloading on every call. +#' @noRd +cd_cache_valid <- function(head, meta) { + if (!is.null(head$etag) && !is.null(meta$etag)) { + return(identical(head$etag, meta$etag)) + } + if (!is.null(head$size) && !is.na(head$size) && !is.null(meta$size)) { + return(isTRUE(as.numeric(meta$size) == head$size)) + } + FALSE +} + +#' HTTP HEAD a remote COG; return its ETag and size, or NULL on failure. +#' @noRd +cd_remote_head <- function(href) { + handle <- curl::new_handle(nobody = TRUE) + res <- tryCatch( + curl::curl_fetch_memory(href, handle = handle), + error = function(e) NULL + ) + if (is.null(res) || res$status_code >= 400) { + return(NULL) + } + hdrs <- curl::parse_headers_list(res$headers) + etag <- hdrs[["etag"]] + cl <- hdrs[["content-length"]] + list( + etag = if (!is.null(etag)) gsub('"', "", etag) else NULL, + size = if (!is.null(cl)) as.numeric(cl) else NA_real_ + ) +} + +#' Download a remote COG to destfile (binary). +#' @noRd +cd_remote_download <- function(href, destfile) { + curl::curl_download(href, destfile, mode = "wb") +} diff --git a/R/cd_crop.R b/R/cd_crop.R index e84f93b..e4c7ebf 100644 --- a/R/cd_crop.R +++ b/R/cd_crop.R @@ -6,6 +6,9 @@ #' #' @param href Character. Path or URL to a COG or raster file. 
#' @param aoi An `sf` or `SpatVector` polygon to crop to. +#' @param cache Logical. If `TRUE` (default), route remote http(s) hrefs +#' through the on-disk cache via [cd_cache_fetch()] so repeated reads +#' pull from S3 once instead of every call. Local paths are unaffected. #' #' @return A [terra::SpatRaster] cropped and masked to the AOI. #' @@ -19,7 +22,10 @@ #' r #' #' @export -cd_crop <- function(href, aoi) { +cd_crop <- function(href, aoi, cache = TRUE) { + if (isTRUE(cache)) { + href <- cd_cache_fetch(href) + } r <- terra::rast(href) if (inherits(aoi, "sf") || inherits(aoi, "sfc")) { aoi <- terra::vect(aoi) diff --git a/R/cd_extract.R b/R/cd_extract.R index 581cb45..b7efac4 100644 --- a/R/cd_extract.R +++ b/R/cd_extract.R @@ -13,6 +13,10 @@ #' @param periods Character vector of periods to extract. #' Defaults to all periods in `catalog`. #' @param years Optional integer vector to filter specific years. +#' @param cache Logical. If `TRUE` (default), remote COGs are read +#' through the on-disk cache (see [cd_cache_fetch()]) so repeated +#' extractions and report rebuilds download each COG from S3 once +#' rather than on every call. Passed through to [cd_crop()]. 
#' #' @return A tibble with columns: #' \describe{ @@ -36,11 +40,12 @@ cd_extract <- function(catalog, aoi, variables = catalog$variable, periods = catalog$period, - years = NULL) { + years = NULL, + cache = TRUE) { rows <- catalog[catalog$variable %in% variables & catalog$period %in% periods, ] results <- lapply(seq_len(nrow(rows)), function(i) { - r <- cd_crop(rows$href[i], aoi) + r <- cd_crop(rows$href[i], aoi, cache = cache) means <- terra::global(r, fun = "mean", na.rm = TRUE) yr <- as.integer(names(r)) diff --git a/README.md b/README.md index 38cc40f..170d6c8 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,40 @@ cd_summary(trn) cd_compare(ts, window_a = 1956:1960, window_b = 1951:1955) ``` +## Caching + +`cd_extract()` and `cd_crop()` cache each COG on first read, so repeated +extractions, report renders, and vignette rebuilds pull each file from S3 +**once** and read locally thereafter — turning recurring S3 egress into a +one-time cost. Caching is on by default (`cache = TRUE`). + +```r +# First call downloads; later calls read the local cache. +ts <- cd_extract(catalog, aoi) # cache = TRUE by default + +cd_cache_info() # where the cache lives + size +cd_cache_clear() # wipe it +cd_extract(catalog, aoi, cache = FALSE) # bypass the cache for one call +``` + +Freshness is checked with a cheap HTTP HEAD (S3 ETag), so the monthly +catalog republish is picked up automatically; `cd_cache_fetch(href, +refresh = TRUE)` forces a re-download. For a fully-offline session set +`options(cd.cache_revalidate = FALSE)` to serve cached copies without any +network call. + +**Stopgap without the cache.** If you read COGs through GDAL directly +(e.g. 
raw `terra::rast("/vsicurl/...")` outside `cd_crop()`), you can cut +repeat egress within a session by enabling GDAL's `/vsicurl/` cache: + +```r +Sys.setenv(VSI_CACHE = "TRUE", VSI_CACHE_SIZE = "100000000") # 100 MB +Sys.setenv(GDAL_HTTP_MAX_RETRY = "3", GDAL_HTTP_RETRY_DELAY = "1") +``` + +This only persists within one R session; the `cd_*` cache above persists +across sessions, which is what kills recurring report-dev egress. + ## Data The producer pipeline fetches ERA5-Land hourly reanalysis from diff --git a/man/cd_cache_fetch.Rd b/man/cd_cache_fetch.Rd new file mode 100644 index 0000000..51f7ee0 --- /dev/null +++ b/man/cd_cache_fetch.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cd_cache_fetch.R +\name{cd_cache_fetch} +\alias{cd_cache_fetch} +\title{Fetch a remote COG through the on-disk cache} +\usage{ +cd_cache_fetch(href, refresh = FALSE, cache_dir = NULL) +} +\arguments{ +\item{href}{Character. Path or URL to a COG.} + +\item{refresh}{Logical. If \code{TRUE}, force a re-download even when a +valid cached copy exists. Default \code{FALSE}.} + +\item{cache_dir}{Character. Override the cache location. If \code{NULL}, +uses \code{\link[=cd_cache_path]{cd_cache_path()}}.} +} +\value{ +Character path to the local (cached) file, or \code{href} +unchanged for local / non-http inputs. +} +\description{ +Given a remote \code{href} (http/https), downloads the file once to the cd +cache directory and returns a local path; subsequent calls read the +local copy instead of re-pulling from the network. Freshness is +checked with a cheap HTTP HEAD request (comparing the S3 ETag), so a +monthly catalog republish is picked up automatically while repeat +builds do near-zero egress. Local paths — and non-http URLs such as +\verb{s3://}, which GDAL reads directly — are returned unchanged. +} +\details{ +Freshness uses the ETag when the server provides one, falling back to +the \code{Content-Length} size when it does not. 
A host that returns +neither validator cannot be proven fresh, so the file is re-downloaded +on each call (safe, but un-cached) — S3, the default host, always +returns both. Revalidation can be disabled for a fully-offline fast +path with \code{options(cd.cache_revalidate = FALSE)}, which serves any +existing cached copy without an HTTP HEAD. When the HEAD fails (e.g. +offline) but a cached copy exists, the cached copy is served with a +message. Downloads are written to a temporary file, validated against +the advertised \code{Content-Length}, then atomically renamed, so a +truncated download is never served as complete. +} +\examples{ +# Local files pass through untouched: +f <- system.file("extdata", "example_climate.tif", package = "cd") +identical(cd_cache_fetch(f), f) + +} diff --git a/man/cd_crop.Rd b/man/cd_crop.Rd index 4a77d0b..97153f4 100644 --- a/man/cd_crop.Rd +++ b/man/cd_crop.Rd @@ -4,12 +4,16 @@ \alias{cd_crop} \title{Crop and mask a raster to an AOI} \usage{ -cd_crop(href, aoi) +cd_crop(href, aoi, cache = TRUE) } \arguments{ \item{href}{Character. Path or URL to a COG or raster file.} \item{aoi}{An \code{sf} or \code{SpatVector} polygon to crop to.} + +\item{cache}{Logical. If \code{TRUE} (default), route remote http(s) hrefs +through the on-disk cache via \code{\link[=cd_cache_fetch]{cd_cache_fetch()}} so repeated reads +pull from S3 once instead of every call. Local paths are unaffected.} } \value{ A \link[terra:SpatRaster-class]{terra::SpatRaster} cropped and masked to the AOI. diff --git a/man/cd_extract.Rd b/man/cd_extract.Rd index 1576b90..08483f2 100644 --- a/man/cd_extract.Rd +++ b/man/cd_extract.Rd @@ -9,7 +9,8 @@ cd_extract( aoi, variables = catalog$variable, periods = catalog$period, - years = NULL + years = NULL, + cache = TRUE ) } \arguments{ @@ -25,6 +26,11 @@ Defaults to all variables in \code{catalog}.} Defaults to all periods in \code{catalog}.} \item{years}{Optional integer vector to filter specific years.} + +\item{cache}{Logical. 
If \code{TRUE} (default), remote COGs are read +through the on-disk cache (see \code{\link[=cd_cache_fetch]{cd_cache_fetch()}}) so repeated +extractions and report rebuilds download each COG from S3 once +rather than on every call. Passed through to \code{\link[=cd_crop]{cd_crop()}}.} } \value{ A tibble with columns: diff --git a/planning/archive/2026-06-issue-76-cog-cache/.gitkeep b/planning/archive/2026-06-issue-76-cog-cache/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/planning/archive/2026-06-issue-76-cog-cache/README.md b/planning/archive/2026-06-issue-76-cog-cache/README.md new file mode 100644 index 0000000..e419797 --- /dev/null +++ b/planning/archive/2026-06-issue-76-cog-cache/README.md @@ -0,0 +1,35 @@ +## Outcome + +Wired the orphaned `cd_cache` module into the consumer COG read path so +repeated extractions, report renders, and vignette rebuilds read each COG +from a local on-disk cache instead of re-pulling from S3 every call — +killing the dominant recurring S3 egress driver (rtj#168). New exported +`cd_cache_fetch()` downloads a remote http(s) COG once (filename = +`hash(url)` + ext, sidecar `.meta` JSON with the S3 ETag/size/timestamp), +revalidates freshness with a cheap HTTP HEAD (ETag, falling back to +Content-Length size when a host returns no ETag), and serves the local +copy on a hit. Downloads go to a temp file in the cache dir, are +size-validated against Content-Length, then atomically renamed (guarded +against `file.rename` failure) so a truncated file is never served. A +failed HEAD with a cached copy present serves the cache; `options( +cd.cache_revalidate = FALSE)` skips the HEAD entirely for offline work. +`cd_crop()` and `cd_extract()` gained `cache = TRUE` (default), threading +remote reads through the cache while local paths pass through unchanged. + +Built in three phases (core / read-path wiring / docs+release). 
Phase 1 +got a 3-round `/code-check` that caught two real issues — perpetual +re-download for ETag-less hosts (fixed with a Content-Length size +fallback in `cd_cache_valid()`) and an unchecked `file.rename` return +(now `stop()`s on failure). Tests use testthat3e `local_mocked_bindings` +to mock `cd_remote_head`/`cd_remote_download`, keeping the 20 cache tests +CI-safe with no real network. Egress kill was confirmed against the live +S3 catalog: first read of `prcp_annual.tif` pulled 5.26 MB; the second +read was a ~1 KB HEAD with no re-download (0.04 s), and fully offline +(0 s) with revalidation disabled. Released as v0.4.0 (minor — new +exported function, new `cache` args, new `curl` dependency). Known +follow-up surfaced but left out of scope: `planning/` is not in +`.Rbuildignore`, which is the source of several pre-existing `--as-cran` +NOTEs. + +Closed by: PR #76 (commits 39c7833 → f53ba3d on branch +`76-wire-cd-cache-read-path`) diff --git a/planning/archive/2026-06-issue-76-cog-cache/findings.md b/planning/archive/2026-06-issue-76-cog-cache/findings.md new file mode 100644 index 0000000..1ec47fd --- /dev/null +++ b/planning/archive/2026-06-issue-76-cog-cache/findings.md @@ -0,0 +1,83 @@ +# Findings — Wire cd_cache into the COG read path (#76) + +## Plan-mode exploration (2026-06-23) + +### Read path +- `cd_extract()` (`R/cd_extract.R:43`) loops catalog rows through + `cd_crop()`. `cd_crop()` (`R/cd_crop.R:23`) calls `terra::rast(href)` + directly on the remote URL — no caching. +- Only `cd_extract` calls `cd_crop` internally. `cd_crop` is also + exported and called directly (e.g. vignette spatial-tmean live + equivalent). Routing `cd_crop` through the cache covers both. + +### Orphaned cache module +- `R/cd_cache.R` ships `cd_cache_path()`, `cd_cache_clear()`, + `cd_cache_info()` backed by `rappdirs::user_cache_dir("cd")`. No + fetch-through layer; nothing in `R/` calls them. 
+ +### Invalidation signal +- STAC items (`inst/extdata/example_catalog.json`, writer in + `R/cd_stac_catalog.R:118`) carry only `cd:variable`, `cd:period`, + `datetime`, `start_datetime`, `end_datetime`. **No `updated` field, + no etag.** So the only freshness signal is an HTTP HEAD on the COG + URL (S3 ETag / Last-Modified). Confirmed: invalidation must be + HEAD-based, per the issue's own suggestion. + +### Dependencies +- `curl` and `httr` both available transitively, but neither in + DESCRIPTION Imports. Plan adds `curl` (lighter) to Imports. + +### CI safety +- Exported-function examples use local `system.file()` paths → + cache passthrough, no network in `R CMD check` examples. +- Vignettes load pre-computed `.rds` (`inst/vignette-data/`), not live + S3, so pkgdown CI never exercises the live fetch path. Cache wiring + won't break CI. + +## Decisions captured +- Whole-COG cache (not AOI-cropped subsets). +- Filename = hash(url); sidecar `.meta` holds etag/size/timestamp. +- HEAD-always revalidation + `options(cd.cache_revalidate = FALSE)` + opt-out (user-approved). +- Atomic temp→rename with Content-Length size validation. + +## Egress confirmation (live S3 smoke test, 2026-06-24) + +Ran `cd_cache_fetch()` against the real catalog +(`https://stac-era5-land.s3.us-west-2.amazonaws.com/prcp_annual.tif`, +5.26 MB) in a throwaway cache dir. Not a committed test (needs network); +documented here as the issue's "second knit does ~zero egress" check. + +| Call | Time | Network | +|------|------|---------| +| 1st fetch | 0.8 s | full 5.26 MB download | +| 2nd fetch (HEAD revalidate) | 0.04 s | ~1 KB HEAD only, file mtime unchanged → no re-download | +| 3rd fetch (`cd.cache_revalidate = FALSE`) | 0.000 s | zero network | + +Sidecar `.meta` captured the real S3 ETag (`bb297f3a…`) and +Content-Length (5518610). So a repeat report/vignette build drops from +N × full-COG egress to N × ~1 KB HEAD (or zero with revalidate off). 
+Confirms the fix kills the recurring egress driver in rtj#168. + +## Issue context + +(full body) + +The consumer read path re-downloads every COG from S3 on every call. +`cd_extract()` loops each catalog row through `cd_crop()`, which does +`terra::rast(href)` directly on a `/vsicurl/` URL. GDAL's `/vsicurl/` +only keeps a small in-memory chunk cache per session (~16 MB default) +with no on-disk persistence across R sessions. So every separate report +render, appendix knit, or vignette build re-pulls the full overviews + +tiles for each AOI from scratch. Multiple dev iterations × multiple AOIs +× all variables/periods → hundreds of GB of repeated downloads. This is +the likely dominant driver of S3 egress (~$17 / ~290 GB May 2026, +rtj#168). Self-inflicted, recurring, avoidable. + +`R/cd_cache.R` ships the cache helpers but nothing in `R/` calls them. +Wiring it into the read path turns repeated builds from network pulls +into local reads. + +References: +- NewGraphEnvironment/rtj#168 — account-wide S3 cost guardrails (this is + the source-side fix for the egress that issue alarms on). diff --git a/planning/archive/2026-06-issue-76-cog-cache/progress.md b/planning/archive/2026-06-issue-76-cog-cache/progress.md new file mode 100644 index 0000000..b460806 --- /dev/null +++ b/planning/archive/2026-06-issue-76-cog-cache/progress.md @@ -0,0 +1,27 @@ +# Progress — Wire cd_cache into the COG read path (#76) + +## Session 2026-06-23 + +- Plan-mode exploration of read path, orphaned cache module, STAC + invalidation signal, HTTP deps, CI safety — phases approved by user +- User chose HEAD-always revalidation + `cd.cache_revalidate` opt-out +- Created branch `76-wire-cd-cache-read-path` off main +- Scaffolded PWF baseline from issue #76 with approved phases +- Phase 1 complete: `cd_cache_fetch()` + helpers (`cd_is_remote`, + `cd_cache_valid`, `cd_remote_head`, `cd_remote_download`), `curl` + Imports + `withr` Suggests, 20 CI-safe tests. 
Full suite FAIL 0 / + 219 PASS, lint clean. `/code-check` 3 rounds: 2 fixes (etag→size + fallback for header-poor hosts, `file.rename` failure guard), round + 3 clean. +- Phase 2 complete: `cd_crop(..., cache = TRUE)` routes remote hrefs + through `cd_cache_fetch` (local passthrough), `cd_extract(..., cache + = TRUE)` threads it through. Backward-compatible (default TRUE); + `cache=TRUE`/`FALSE` output identical for local COGs (asserted). + Full suite FAIL 0 / 206 PASS. Trivial param-threading over the + 3-round-reviewed core — judgment-reviewed, not re-looped. +- Phase 3 complete: README "Caching" section + GDAL stopgap note; live + S3 smoke test confirmed egress kill (5.26 MB first read → 0.04 s + HEAD-only second read → 0 s offline); codetools clean on all new + functions; NEWS + version bump 0.3.2 → 0.4.0 (minor). Pre-existing + `--as-cran` NOTEs (planning/ detritus + vignettes) untouched. +- Next: `/planning-archive`, then `/gh-pr-push` diff --git a/planning/archive/2026-06-issue-76-cog-cache/task_plan.md b/planning/archive/2026-06-issue-76-cog-cache/task_plan.md new file mode 100644 index 0000000..9439f44 --- /dev/null +++ b/planning/archive/2026-06-issue-76-cog-cache/task_plan.md @@ -0,0 +1,62 @@ +# Task: Wire cd_cache into the COG read path so repeated builds read locally (#76) + +## Problem + +The consumer read path re-downloads every COG from S3 on every call. +`cd_extract()` loops each catalog row through `cd_crop()`, which does +`terra::rast(href)` directly on a `/vsicurl/` URL. GDAL's `/vsicurl/` +only keeps a small in-memory chunk cache per session with no on-disk +persistence across R sessions, so every report render, appendix knit, +or vignette build re-pulls the full COG for each AOI from scratch. This +is the likely dominant driver of S3 egress (~$17 / ~290 GB in May 2026, +NewGraphEnvironment/rtj#168). 
The fix is half-built: `R/cd_cache.R` ships +`cd_cache_path()` / `cd_cache_clear()` / `cd_cache_info()` but nothing in +`R/` calls them — the module is orphaned. + +## Approved design (from plan-mode exploration) + +- **Whole-COG caching** (dedupes across overlapping AOIs, simplest key). +- **Cache key:** `hash(url)` for the cached filename (keeps `.tif` + extension so terra reads it), with a sidecar `.meta` JSON holding + `{url, etag, size, downloaded_at}`. Keying the filename by url-hash + (not etag) means a republish overwrites in place — self-cleaning. +- **Invalidation:** cheap HTTP HEAD per read (ETag + Content-Length via + `curl`). A HEAD costs <1 KB versus a multi-megabyte COG body. ETag match → serve local; + else re-download. STAC items carry no `updated`/etag field, so HEAD is + the only available freshness signal. +- **Revalidation cadence:** HEAD-always for correctness, with an opt-out + `options(cd.cache_revalidate = FALSE)` for a fully-offline fast path. +- **Offline fallback:** HEAD fails + local copy present → serve cached. +- **Partial-download guard:** download to temp, validate byte size vs + `Content-Length`, atomic rename — never serve a truncated file. +- **New dependency:** add `curl` to DESCRIPTION Imports. 
+ +## Phase 1: Fetch-through-cache core + +- [x] New `R/cd_cache_fetch.R`: `cd_cache_fetch(href, refresh = FALSE, cache_dir = NULL)` returns a local path +- [x] Local paths and non-http hrefs (e.g. `s3://`) pass through untouched +- [x] HEAD → ETag/size; serve local on ETag match (size fallback when ETag absent), else download-temp → size-validate → atomic rename → write `.meta` +- [x] `refresh = TRUE` forces re-download; `options(cd.cache_revalidate = FALSE)` skips HEAD; offline-with-local-copy serves cached + messages +- [x] Add `curl` to DESCRIPTION Imports (and `withr` to Suggests for tests) +- [x] `tests/testthat/test-cd_cache_fetch.R`: 20 tests via mocked fetcher (`local_mocked_bindings`), CI-safe — passthrough, key/meta creation, ETag + size revalidation, partial-download rejection, refresh, offline fallback, revalidate opt-out + +## Phase 2: Wire into read path + +- [x] `cd_crop(href, aoi, cache = TRUE)` — route remote hrefs through `cd_cache_fetch`, local passthrough (`isTRUE(cache)` guard) +- [x] `cd_extract(..., cache = TRUE)` — thread `cache` through to `cd_crop` +- [x] Update roxygen (`@param cache`, runnable examples stay local/passthrough) +- [x] Extend `test-cd_crop.R` / `test-cd_extract.R`: `cache = TRUE` with local file passes through, output identical to `cache = FALSE` + +## Phase 3: Docs, README stopgap, egress confirmation + +- [x] README "Caching" section: behavior + GDAL `/vsicurl/` env-var stopgap (`VSI_CACHE`, `GDAL_HTTP_*`) +- [x] Confirmed second read does ~zero egress via live S3 smoke test (5.26 MB → 0.04 s HEAD-only; documented in findings.md) +- [x] `devtools::document()` clean; `check()` adds zero new issues (pre-existing `--as-cran` NOTEs are `planning/` detritus + vignettes; `curl`/`withr` wiring clean per codetools) +- [x] NEWS entry + version bump 0.3.2 → 0.4.0 (minor — new exported `cd_cache_fetch()`, `cache` args, `curl` dep) as final commit + +## Validation + +- [x] Tests pass (full suite FAIL 0) +- [x] `/code-check` clean (3-round on core; 
judgment-reviewed on Phase 2 threading) +- [x] PWF checkboxes match landed work +- [ ] `/planning-archive` on completion
diff --git a/tests/testthat/test-cd_cache_fetch.R b/tests/testthat/test-cd_cache_fetch.R
new file mode 100644
index 0000000..ce071f2
--- /dev/null
+++ b/tests/testthat/test-cd_cache_fetch.R
@@ -0,0 +1,212 @@
+# Path to the example .tif bundled with the package; the mocked downloads
+# below copy it into the cache in place of a real network transfer.
+fixture <- function() {
+  system.file("extdata", "example_climate.tif", package = "cd")
+}
+
+test_that("local paths pass through untouched", {
+  f <- fixture()
+  expect_identical(cd_cache_fetch(f), f)
+})
+
+test_that("non-http and degenerate inputs pass through", {
+  expect_identical(cd_cache_fetch("s3://bucket/key.tif"), "s3://bucket/key.tif")
+  expect_identical(cd_cache_fetch("/local/abs/path.tif"), "/local/abs/path.tif")
+  expect_identical(cd_cache_fetch(NA_character_), NA_character_)
+})
+
+test_that("remote fetch downloads, validates size, and writes meta", {
+  # local_tempfile() hands back a fresh, not-yet-created path and removes
+  # whatever ends up there when the test exits, even after an error.
+  tmp <- withr::local_tempfile(pattern = "cd_cache")
+  fx <- fixture()
+  sz <- file.size(fx)
+
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = "v1", size = sz),
+    cd_remote_download = function(href, destfile) file.copy(fx, destfile)
+  )
+
+  url <- "https://example.com/data/tmean-annual.tif"
+  out <- cd_cache_fetch(url, cache_dir = tmp)
+
+  expect_true(file.exists(out))
+  expect_equal(file.size(out), sz)
+  meta <- jsonlite::read_json(paste0(out, ".meta"))
+  expect_equal(meta$etag, "v1")
+  expect_equal(meta$url, url)
+})
+
+test_that("matching ETag serves the cached copy without re-downloading", {
+  tmp <- withr::local_tempfile(pattern = "cd_cache")
+  fx <- fixture()
+  sz <- file.size(fx)
+
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = "v1", size = sz),
+    cd_remote_download = function(href, destfile) file.copy(fx, destfile)
+  )
+  url <- "https://example.com/data/x.tif"
+  first <- cd_cache_fetch(url, cache_dir = tmp)
+
+  # Second call: download must NOT be invoked.
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = "v1", size = sz),
+    cd_remote_download = function(href, destfile) stop("should not download")
+  )
+  second <- cd_cache_fetch(url, cache_dir = tmp)
+  expect_identical(first, second)
+})
+
+test_that("changed ETag triggers a re-download", {
+  tmp <- withr::local_tempfile(pattern = "cd_cache")
+  fx <- fixture()
+  sz <- file.size(fx)
+  url <- "https://example.com/data/x.tif"
+
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = "v1", size = sz),
+    cd_remote_download = function(href, destfile) file.copy(fx, destfile)
+  )
+  cd_cache_fetch(url, cache_dir = tmp)
+
+  downloaded <- 0L
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = "v2", size = sz),
+    cd_remote_download = function(href, destfile) {
+      downloaded <<- downloaded + 1L
+      file.copy(fx, destfile)
+    }
+  )
+  cd_cache_fetch(url, cache_dir = tmp)
+  expect_equal(downloaded, 1L)
+  meta <- jsonlite::read_json(
+    paste0(cd_cache_fetch(url, cache_dir = tmp), ".meta")
+  )
+  expect_equal(meta$etag, "v2")
+})
+
+test_that("refresh = TRUE forces a re-download even on ETag match", {
+  tmp <- withr::local_tempfile(pattern = "cd_cache")
+  fx <- fixture()
+  sz <- file.size(fx)
+  url <- "https://example.com/data/x.tif"
+
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = "v1", size = sz),
+    cd_remote_download = function(href, destfile) file.copy(fx, destfile)
+  )
+  cd_cache_fetch(url, cache_dir = tmp)
+
+  downloaded <- 0L
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = "v1", size = sz),
+    cd_remote_download = function(href, destfile) {
+      downloaded <<- downloaded + 1L
+      file.copy(fx, destfile)
+    }
+  )
+  cd_cache_fetch(url, cache_dir = tmp, refresh = TRUE)
+  expect_equal(downloaded, 1L)
+})
+
+test_that("incomplete download (size mismatch) is rejected", {
+  tmp <- withr::local_tempfile(pattern = "cd_cache")
+  fx <- fixture()
+
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = "v1", size = file.size(fx) + 1),
+    cd_remote_download = function(href, destfile) file.copy(fx, destfile)
+  )
+  expect_error(
+    cd_cache_fetch("https://example.com/data/x.tif", cache_dir = tmp),
+    "incomplete download"
+  )
+  # No partial file left behind under the cache key.
+  expect_length(list.files(tmp, pattern = "\\.tif$"), 0L)
+})
+
+test_that("offline with a cached copy serves it; without one, errors", {
+  tmp <- withr::local_tempfile(pattern = "cd_cache")
+  fx <- fixture()
+  sz <- file.size(fx)
+  url <- "https://example.com/data/x.tif"
+
+  # Populate the cache first.
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = "v1", size = sz),
+    cd_remote_download = function(href, destfile) file.copy(fx, destfile)
+  )
+  cached <- cd_cache_fetch(url, cache_dir = tmp)
+
+  # Now go "offline": HEAD returns NULL.
+  local_mocked_bindings(
+    cd_remote_head = function(href) NULL,
+    cd_remote_download = function(href, destfile) stop("offline")
+  )
+  expect_message(
+    out <- cd_cache_fetch(url, cache_dir = tmp),
+    "serving cached copy"
+  )
+  expect_identical(out, cached)
+
+  # A different (un-cached) URL while offline errors.
+  expect_error(
+    cd_cache_fetch("https://example.com/data/other.tif", cache_dir = tmp),
+    "no cached copy"
+  )
+})
+
+test_that("ETag-less server falls back to size and still cache-hits", {
+  tmp <- withr::local_tempfile(pattern = "cd_cache")
+  fx <- fixture()
+  sz <- file.size(fx)
+  url <- "https://example.com/data/x.tif"
+
+  # Server returns no ETag, only a size.
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = NULL, size = sz),
+    cd_remote_download = function(href, destfile) file.copy(fx, destfile)
+  )
+  first <- cd_cache_fetch(url, cache_dir = tmp)
+
+  # Second call (still no ETag): size matches -> must NOT re-download.
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = NULL, size = sz),
+    cd_remote_download = function(href, destfile) stop("should not download")
+  )
+  expect_identical(cd_cache_fetch(url, cache_dir = tmp), first)
+
+  # Size changes -> re-download.
+  downloaded <- 0L
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = NULL, size = sz + 1),
+    cd_remote_download = function(href, destfile) {
+      downloaded <<- downloaded + 1L
+      # produce a file of the new advertised size so validation passes
+      writeBin(c(readBin(fx, "raw", sz), as.raw(0)), destfile)
+    }
+  )
+  cd_cache_fetch(url, cache_dir = tmp)
+  expect_equal(downloaded, 1L)
+})
+
+test_that("cd.cache_revalidate = FALSE serves cache without a HEAD", {
+  tmp <- withr::local_tempfile(pattern = "cd_cache")
+  fx <- fixture()
+  sz <- file.size(fx)
+  url <- "https://example.com/data/x.tif"
+
+  local_mocked_bindings(
+    cd_remote_head = function(href) list(etag = "v1", size = sz),
+    cd_remote_download = function(href, destfile) file.copy(fx, destfile)
+  )
+  cached <- cd_cache_fetch(url, cache_dir = tmp)
+
+  withr::local_options(cd.cache_revalidate = FALSE)
+  local_mocked_bindings(
+    cd_remote_head = function(href) stop("should not HEAD"),
+    cd_remote_download = function(href, destfile) stop("should not download")
+  )
+  expect_identical(cd_cache_fetch(url, cache_dir = tmp), cached)
+})
diff --git a/tests/testthat/test-cd_cache_fetch_live.R b/tests/testthat/test-cd_cache_fetch_live.R
new file mode 100644
index 0000000..e88d2c0
--- /dev/null
+++ b/tests/testthat/test-cd_cache_fetch_live.R
@@ -0,0 +1,45 @@
+# Integration test against the live S3 catalog. Skipped on CRAN, on CI
+# (which has no business pulling COGs from S3), and when offline; a local
+# `devtools::test()` with a network exercises the real curl HEAD + ETag
+# parsing + download path that the mocked unit tests in
+# test-cd_cache_fetch.R deliberately stub out.
+
+test_that("cd_cache_fetch round-trips a real COG from live S3", {
+  skip_on_cran()
+  skip_on_ci()
+  skip_if_offline(host = "stac-era5-land.s3.us-west-2.amazonaws.com")
+  skip_if_not_installed("withr")
+
+  catalog <- tryCatch(cd_catalog(), error = function(e) NULL)
+  skip_if(is.null(catalog) || nrow(catalog) == 0, "live catalog unreachable")
+
+  url <- catalog$href[catalog$period == "annual"][1]
+  skip_if(is.na(url) || !grepl("^https?://", url), "no remote annual COG in catalog")
+
+  cache <- withr::local_tempdir()
+
+  # Cold read: real download, real S3 ETag captured, size validated.
+  p1 <- cd_cache_fetch(url, cache_dir = cache)
+  expect_true(file.exists(p1))
+  meta <- jsonlite::read_json(paste0(p1, ".meta"))
+  expect_true(is.character(meta$etag) && nchar(meta$etag) > 0)
+  # The advertised Content-Length must equal the bytes actually written.
+  expect_equal(meta$size, file.size(p1))
+
+  # Warm read: HEAD revalidation hits, same path, file is NOT rewritten.
+  mtime_cold <- file.info(p1)$mtime
+  p2 <- cd_cache_fetch(url, cache_dir = cache)
+  expect_identical(p2, p1)
+  expect_identical(file.info(p2)$mtime, mtime_cold)
+
+  # The cached file is a valid COG with the same structure as a direct
+  # remote read — i.e. caching serves the real data, not a stale/partial.
+  r_cache <- terra::rast(p2)
+  r_direct <- terra::rast(url)
+  expect_equal(terra::nlyr(r_cache), terra::nlyr(r_direct))
+  expect_equal(names(r_cache), names(r_direct))
+
+  # Offline fast path: serve the cached copy with no network at all.
+  withr::local_options(cd.cache_revalidate = FALSE)
+  expect_identical(cd_cache_fetch(url, cache_dir = cache), p1)
+})
diff --git a/tests/testthat/test-cd_crop.R b/tests/testthat/test-cd_crop.R
index 70728d8..d81c1d0 100644
--- a/tests/testthat/test-cd_crop.R
+++ b/tests/testthat/test-cd_crop.R
@@ -32,3 +32,23 @@ test_that("cd_crop preserves band names", {
 
   expect_equal(names(r), as.character(1951:1960))
 })
+
+test_that("cd_crop with cache = TRUE passes a local href straight through", {
+  href <- system.file("extdata", "example_climate.tif", package = "cd")
+  aoi <- sf::st_read(
+    system.file("extdata", "example_aoi.gpkg", package = "cd"),
+    quiet = TRUE
+  )
+  # Local path is not remote, so cd_cache_fetch returns it unchanged and
+  # nothing is written to the cache. cache = TRUE must not alter results.
+  r_cache <- cd_crop(href, aoi, cache = TRUE)
+  r_nocache <- cd_crop(href, aoi, cache = FALSE)
+
+  expect_s4_class(r_cache, "SpatRaster")
+  expect_equal(terra::nlyr(r_cache), 10)
+  expect_equal(names(r_cache), names(r_nocache))
+  expect_equal(
+    terra::global(r_cache, "mean", na.rm = TRUE)$mean,
+    terra::global(r_nocache, "mean", na.rm = TRUE)$mean
+  )
+})
diff --git a/tests/testthat/test-cd_extract.R b/tests/testthat/test-cd_extract.R
index 1db7bd6..3eb56d8 100644
--- a/tests/testthat/test-cd_extract.R
+++ b/tests/testthat/test-cd_extract.R
@@ -44,6 +44,20 @@ test_that("cd_extract filters by years", {
   expect_equal(ts$year, 1951L:1953L)
 })
 
+test_that("cd_extract cache = TRUE matches cache = FALSE for local COGs", {
+  catalog <- cd_catalog(
+    system.file("extdata", "example_catalog.json", package = "cd")
+  )
+  aoi <- sf::st_read(
+    system.file("extdata", "example_aoi.gpkg", package = "cd"),
+    quiet = TRUE
+  )
+  ts_cache <- cd_extract(catalog, aoi, cache = TRUE)
+  ts_nocache <- cd_extract(catalog, aoi, cache = FALSE)
+
+  expect_equal(ts_cache, ts_nocache)
+})
+
 test_that("cd_extract filters by variables", {
   catalog <- cd_catalog(
     system.file("extdata", "example_catalog.json", package = "cd")