diff --git a/CLAUDE.md b/CLAUDE.md index 7911d30..6f17ad9 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -11,9 +11,11 @@ Experimental package — breaking all the time and loving the learning curve. St **Prefix:** `lnk_` **Branch:** `main` (v0.40.2 as of 2026-05-19) -## Status (2026-05-23) — ACTIVE HANDOFF +## Status (2026-05-25) — ACTIVE HANDOFF (#175 study-area mapping_code parity) -**Picking up this repo? Read [`planning/active/HANDOFF.md`](planning/active/HANDOFF.md) first, then [`RUNBOOK.md`](RUNBOOK.md).** Work on branch `196-streams-access-source-flags` is mid-stream and handing off to M1. The mapping_code/access mechanism is solved and the next fix (Phase 4d) is scoped — do not start over. v0.40.3 (persist per-source flags) is ready to ship; the dam/access divergence is characterized with a drafted fix + issue. +**Picking up? Read [`planning/active/task_plan.md`](planning/active/task_plan.md) + [`progress.md`](planning/active/progress.md), then [`research/study_area_run.md`](research/study_area_run.md) and [`RUNBOOK.md`](RUNBOOK.md).** Branch `175-promote-with-mapping-code-flag-to-stand` (pushed, `34b0cd3`). Built a lean tunnel-free, M1-dispatch study-area parity runner (`data-raw/study_area_run.sh` + `study_area_wsgs.R` / `wsg_run_one.R` / `study_area_compare.R`); ran all 3 study areas (50 WSGs) — **authoritative parity median 99.66%** ([`research/provincial_parity_2026_05_25.md`](research/provincial_parity_2026_05_25.md)). + +**THE key finding:** per-segment mapping_code parity needs a **post-consolidate recompute** — drainage-closed + DS-first per-host is NOT sufficient (downstream barriers can be cross-bucket / late-in-order; FINA 75%→99% only after re-modelling on the full consolidated barrier set). The recompute is the correctness guarantee; bucketing is just a speed knob. 
**Next: build #205** (cheap access-only recompute reusing persisted streams/habitat — the current full-pipeline recompute is ~2× on diverged WSGs), then one clean driver-automated run, then annotate the genuine divergences (UNRS reservoir, SETN salmon) + ship. Filed #204 (persist shape-drift) + #205 (cheap recompute). Do NOT start over — the methodology is solved. ## Status (2026-05-19) diff --git a/NAMESPACE b/NAMESPACE index a9036be..1d40ce6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ S3method(format,lnk_stamp) S3method(print,lnk_config) S3method(print,lnk_stamp) +export(lnk_access) export(lnk_aggregate) export(lnk_barrier_overrides) export(lnk_barriers_emit) @@ -13,6 +14,7 @@ export(lnk_baseline_current) export(lnk_baseline_read) export(lnk_bucket_get) export(lnk_bucket_log) +export(lnk_compare_mapping_code) export(lnk_compare_rollup) export(lnk_compare_wsg) export(lnk_config) diff --git a/R/lnk_access.R b/R/lnk_access.R new file mode 100644 index 0000000..0c2ea54 --- /dev/null +++ b/R/lnk_access.R @@ -0,0 +1,213 @@ +#' Build per-segment per-species access from schema tables (portable) +#' +#' Schema-aware portable wrapper around [lnk_pipeline_access()] — the +#' access twin of [lnk_mapping_code()]. Builds the per-species +#' `barriers__access` + per-source views internally (via +#' [lnk_barriers_views()]) over `table_barriers`, then computes the wide +#' `streams_access` shape for `aoi` and writes it to `table_to`. +#' +#' Works against working-schema tables (mid-pipeline) or persist-schema +#' tables (ad-hoc / post-consolidate recompute) without modification — the +#' caller passes explicit `table_` names. The caller passes ONE +#' `table_barriers` (the unified `barriers` table); the per-species access +#' set and the source-typed views are derived from it internally, so no +#' pre-built `barriers_per_sp` list is needed (that stays the lower-level +#' [lnk_pipeline_access()] surface). 
#' Build per-segment per-species access from schema tables (portable)
#'
#' Schema-aware portable wrapper around [lnk_pipeline_access()] — the
#' access twin of [lnk_mapping_code()]. Builds the per-species
#' `barriers_{sp}_access` + per-source views internally (via
#' [lnk_barriers_views()]) over `table_barriers`, then computes the wide
#' `streams_access` shape for `aoi` and writes it to `table_to`.
#'
#' Works against working-schema tables (mid-pipeline) or persist-schema
#' tables (ad-hoc / post-consolidate recompute) without modification — the
#' caller passes explicit `table_*` names. The caller passes ONE
#' `table_barriers` (the unified `barriers` table); the per-species access
#' set and the source-typed views are derived from it internally, so no
#' pre-built `barriers_per_sp` list is needed (that stays the lower-level
#' [lnk_pipeline_access()] surface).
#'
#' @section Merge (recompute) mode:
#' `merge = TRUE` is the **post-consolidate recompute** (link#205). A WSG's
#' accessibility depends on barriers *downstream*, possibly in another WSG
#' (the provincial-accumulation property, RUNBOOK.md §5); when WSGs are
#' modelled on separate hosts each sees only its own barriers, so the
#' per-host `streams_access` can be wrong cross-WSG. Once all barriers are
#' consolidated, `merge = TRUE` re-settles ONLY the cross-WSG columns
#' (`has_barriers_{sp}_dnstr`,
#' `has_barriers_{anthropogenic,pscis,dams}_dnstr`, `dam_dnstr_ind`) against
#' the complete `table_barriers`, reusing the already-persisted `streams` +
#' `streams_habitat` — far cheaper than a full [lnk_pipeline_run()] (which
#' re-derives streams + habitat). It UPDATEs the existing `table_to` rows for
#' `aoi` and **preserves** the within-WSG columns the recompute does not
#' touch:
#' - `remediated_dnstr_ind` (and `has_barriers_remediations_dnstr`) — depend
#'   on the working-schema `crossings`/remediations, correct from the prior
#'   compute and within-WSG in practice.
#' - the observed-upstream distinction in `access_{sp}`: set to `0` when
#'   newly blocked, else kept at `2` where the prior compute had an
#'   observation, else `1`.
#'
#' `observations`/`crossings` are intentionally skipped (`NULL`): they only
#' drive the access 1-vs-2 code + `remediated_dnstr_ind` (both preserved
#' above); mapping_code's `accessible = !has_barriers_{sp}_dnstr` is
#' independent of them.
#'
#' `merge = FALSE` (default) overwrites `table_to` via
#' [lnk_pipeline_access()] — first-compute, intended for a working / scratch
#' table (it drops + recreates the target as a flat `id_segment`-keyed table,
#' so do NOT point it at a persist table; use `merge = TRUE` for persist).
#'
#' @param conn A [DBI::DBIConnection-class] to the local pipeline DB.
#' @param cfg An `lnk_config` object.
#' @param aoi Character. Watershed group code (e.g. `"PARS"`). Must consist
#'   of letters, digits or underscores only: it is embedded (lower-cased) in
#'   the names of the temporary helper tables this function creates.
#' @param table_streams Character. Schema-qualified `streams` table (the
#'   segments).
#' @param table_barriers Character. Schema-qualified unified `barriers`
#'   table (an unqualified name is rejected, because the view schema is
#'   derived from the qualifier). The per-species `_access` + source
#'   `_unified` views are built over it internally via
#'   [lnk_barriers_views()].
#' @param table_to Character. Schema-qualified destination `streams_access`
#'   table. With `merge = TRUE` it must already exist (rows for `aoi` are
#'   UPDATEd in place).
#' @param merge Logical scalar, `TRUE` or `FALSE` (`NA` is an error).
#'   `FALSE` (default) overwrites `table_to`. `TRUE` surgically UPDATEs
#'   `table_to`'s `aoi` rows (recompute; see Merge mode).
#' @param presence An `lnk_presence` object or `NULL`. Per-species presence
#'   for `aoi`; pass-through to [lnk_pipeline_access()].
#' @param species Character vector of species codes. Default `cfg$species`.
#'
#' @return `conn` invisibly.
#'
#' @family compare
#' @seealso [lnk_mapping_code()], [lnk_pipeline_access()], [lnk_barriers_views()]
#'
#' @examples
#' \dontrun{
#' conn <- lnk_db_conn()
#' cfg <- lnk_config("bcfishpass")
#' loaded <- lnk_load_overrides(cfg)
#' pres <- lnk_presence(loaded$wsg_species_presence, "PARS")
#'
#' # Post-consolidate recompute against persist (cheap; cross-WSG correct):
#' lnk_access(
#'   conn, cfg, aoi = "PARS",
#'   table_streams  = "fresh.streams",
#'   table_barriers = "fresh.barriers",
#'   table_to       = "fresh.streams_access",
#'   merge = TRUE, presence = pres)
#' lnk_mapping_code(
#'   conn,
#'   table_access  = "fresh.streams_access",
#'   table_habitat = "fresh.streams_habitat_long_vw",
#'   table_streams = "fresh.streams",
#'   aoi           = "PARS",
#'   table_to      = "fresh.streams_mapping_code",
#'   presence      = pres)
#' }
#'
#' @export
lnk_access <- function(conn, cfg, aoi, table_streams, table_barriers,
                       table_to, merge = FALSE, presence = NULL,
                       species = NULL) {
  stopifnot(
    inherits(conn, "DBIConnection"),
    inherits(cfg, "lnk_config"),
    is.character(aoi), length(aoi) == 1L, nzchar(aoi),
    is.character(table_streams), length(table_streams) == 1L,
    nzchar(table_streams),
    is.character(table_barriers), length(table_barriers) == 1L,
    nzchar(table_barriers),
    is.character(table_to), length(table_to) == 1L, nzchar(table_to),
    is.logical(merge), length(merge) == 1L,
    is.null(species) || is.character(species)
  )
  # `is.logical(NA)` is TRUE and `!isTRUE(NA)` is TRUE, so an NA `merge` would
  # silently take the overwrite branch, which drops + recreates `table_to`.
  # Refuse it rather than risk clobbering a persist table.
  if (is.na(merge)) {
    stop("`merge` must be TRUE or FALSE, not NA", call. = FALSE)
  }
  # `aoi` is pasted into unquoted table identifiers below; restrict it to
  # characters that are safe there.
  if (!grepl("^[A-Za-z0-9_]+$", aoi)) {
    stop("`aoi` must contain only letters, digits or underscores (got '",
         aoi, "')", call. = FALSE)
  }
  # The view schema is derived by stripping the last `.name` component; with
  # no qualifier the "schema" would be the table name itself.
  if (!grepl("^.+\\.[^.]+$", table_barriers)) {
    stop("`table_barriers` must be schema-qualified (e.g. 'fresh.barriers'), ",
         "got '", table_barriers, "'", call. = FALSE)
  }

  species <- if (is.null(species)) cfg$species else species
  if (is.null(species) || length(species) == 0L) {
    stop("species is empty (pass `species` or set cfg$species)", call. = FALSE)
  }
  sp_set <- tolower(species)

  # The barrier views live in the same schema as table_barriers (so they
  # read it + the sibling barrier_overrides). Derive it from the qualified name.
  view_schema <- sub("\\.[^.]+$", "", table_barriers)

  # 1. Per-species `_access` + per-source `_unified` views over table_barriers.
  lnk_barriers_views(conn, schema = view_schema, cfg = cfg,
                     species = toupper(sp_set), barriers_table = table_barriers)

  barriers_per_sp <- stats::setNames(
    as.list(paste0(view_schema, ".barriers_", sp_set, "_access")), sp_set)
  barrier_sources <- list(
    anthropogenic = paste0(view_schema, ".barriers_anthropogenic_unified"),
    pscis         = paste0(view_schema, ".barriers_pscis_unified"),
    dams          = paste0(view_schema, ".barriers_dams_unified"))

  # AOI-scope the segments — and as a real TABLE (with indexes + ANALYZE),
  # NOT a view. `frs_network_features` joins segments to features via
  # `whse_basemapping.fwa_downstream(...)`, which inlines into
  # ltree-containment predicates the planner can use. But the join DIRECTION
  # matters: if the planner picks the ~800k-row barriers as the outer driver
  # instead of the ~26k AOI streams, cost explodes by ~1000x (verified via
  # EXPLAIN: 71M estimated result rows). A `CREATE VIEW` over persist
  # `streams` doesn't carry the small-table row stats, so the planner
  # mis-picks. Materialising to a real table with stats fixes the direction.
  # This mirrors the full pipeline (which is fast because its
  # `working.streams` is a real, indexed table). link#205.
  streams_name   <- paste0("zz_lnk_streams_", tolower(aoi))
  streams_scoped <- paste0(view_schema, ".", streams_name)
  # Register the cleanup BEFORE creating anything, so a failure part-way
  # through CREATE / INDEX / ANALYZE does not leave the helper table behind.
  on.exit(
    try(.lnk_db_execute(conn,
                        sprintf("DROP TABLE IF EXISTS %s", streams_scoped)),
        silent = TRUE),
    add = TRUE)
  .lnk_db_execute(conn, sprintf("DROP TABLE IF EXISTS %s", streams_scoped))
  .lnk_db_execute(conn, sprintf(
    "CREATE TABLE %s AS SELECT * FROM %s WHERE watershed_group_code = %s",
    streams_scoped, table_streams, DBI::dbQuoteLiteral(conn, aoi)))
  .lnk_db_execute(conn, sprintf(
    "CREATE INDEX ON %s (id_segment)", streams_scoped))
  .lnk_db_execute(conn, sprintf(
    "CREATE INDEX ON %s USING GIST (wscode_ltree)", streams_scoped))
  .lnk_db_execute(conn, sprintf(
    "CREATE INDEX ON %s USING GIST (localcode_ltree)", streams_scoped))
  .lnk_db_execute(conn, sprintf(
    "CREATE INDEX ON %s (blue_line_key)", streams_scoped))
  .lnk_db_execute(conn, sprintf("ANALYZE %s", streams_scoped))

  # 2a. Overwrite mode: build straight into table_to (working/scratch).
  if (!merge) {
    lnk_pipeline_access(conn,
      segments = streams_scoped, aoi = aoi, to = table_to,
      barriers_per_sp = barriers_per_sp, observations = NULL,
      presence = presence, barrier_sources = barrier_sources,
      crossings_table = NULL)
    return(invisible(conn))
  }

  # 2b. Merge mode: build into a scratch table, surgical UPDATE into table_to.
  scratch_name <- paste0("zz_lnk_access_scratch_", tolower(aoi))
  scratch      <- paste0(view_schema, ".", scratch_name)
  on.exit(
    try(.lnk_db_execute(conn, sprintf("DROP TABLE IF EXISTS %s", scratch)),
        silent = TRUE),
    add = TRUE)
  lnk_pipeline_access(conn,
    segments = streams_scoped, aoi = aoi, to = scratch,
    barriers_per_sp = barriers_per_sp, observations = NULL,
    presence = presence, barrier_sources = barrier_sources,
    crossings_table = NULL)

  # Recomputed cross-WSG columns (only those the build actually produced).
  scratch_cols <- DBI::dbGetQuery(conn, sprintf(
    "SELECT column_name FROM information_schema.columns
      WHERE table_schema = %s AND table_name = %s",
    DBI::dbQuoteString(conn, view_schema),
    DBI::dbQuoteString(conn, scratch_name)))$column_name

  flag_cols <- intersect(
    c(paste0("has_barriers_", sp_set, "_dnstr"),
      "has_barriers_anthropogenic_dnstr", "has_barriers_pscis_dnstr",
      "has_barriers_dams_dnstr", "dam_dnstr_ind"),
    scratch_cols)
  set_flags <- sprintf("%s = sc.%s", flag_cols, flag_cols)

  # access_{sp}: 0 if newly blocked, else keep prior 2 (observed), else 1.
  access_cols <- intersect(paste0("access_", sp_set), scratch_cols)
  set_access <- sprintf(
    "%s = CASE WHEN sc.%s = 0 THEN 0 WHEN t.%s = 2 THEN 2 ELSE 1 END",
    access_cols, access_cols, access_cols)

  # paste() over two empty vectors yields "", which is what the guard tests.
  set_clause <- paste(c(set_flags, set_access), collapse = ",\n      ")
  if (!nzchar(set_clause)) {
    stop("lnk_access(merge=TRUE): nothing to update — scratch produced no ",
         "recomputable columns for ", aoi, call. = FALSE)
  }

  # id_segment is unique within a WSG; scratch is aoi-scoped and table_to is
  # filtered to aoi, so (id_segment, wsg) keys the UPDATE. remediated_dnstr_ind
  # + has_barriers_remediations_dnstr are NOT in the SET -> preserved.
  .lnk_db_execute(conn, sprintf(
    "UPDATE %s t SET\n      %s\n    FROM %s sc\n    WHERE t.id_segment = sc.id_segment\n      AND t.watershed_group_code = %s",
    table_to, set_clause, scratch, DBI::dbQuoteLiteral(conn, aoi)))

  invisible(conn)
}
#' Compare one watershed group's persisted mapping_code tokens against a reference
#'
#' Segment-level QA counterpart to [lnk_compare_rollup()]. Reads the
#' per-segment `mapping_code_{sp}` tokens that [lnk_pipeline_run()] (with
#' `mapping_code = TRUE`) persisted to `{schema}.streams_mapping_code`,
#' diffs them against a reference's tokens for the same segments, and returns
#' a per-species match tibble.
#'
#' Reads only — no writes, no working schema.
#'
#' ## Tunnel-free by default
#'
#' The reference is the **local** snapshot `fresh.streams_vw_bcfp` (loaded by
#' `data-raw/snapshot_bcfp.sh --with-bcfp-views` from bcfp's published S3
#' output — no SSH, no `:63333`). With `conn_ref = NULL` (default) the compare
#' is a single local join on `conn`: no second connection, no `PG_PASS_SHARE`,
#' no tunnel. Pass `conn_ref` (a DBI connection to the live bcfp tunnel) to
#' diff against `bcfishpass.streams_mapping_code` instead — the legacy path,
#' kept for back-compat.
#'
#' ## Join
#'
#' link's `streams_mapping_code.id_segment` is a local surrogate, distinct from
#' bcfp's `segmented_stream_id`, so the join is on FWA segment-start position:
#' `blue_line_key` + `downstream_route_measure` (rounded to 3 decimals — robust
#' to ULP drift on the PostGIS-computed doubles, deterministic across runs that
#' share the same fwapg segmentation). link's position columns come from
#' `{schema}.streams`, joined on the full PK
#' `(id_segment, watershed_group_code)` — `id_segment` alone is not unique
#' across WSGs. The snapshot view carries the position columns inline.
#'
#' ## Species resolution
#'
#' `species = NULL` (default) compares every species present as a
#' `mapping_code_{sp}` column on BOTH sides (link's persisted table and the
#' reference), with rows for the WSG. Pass `species` to restrict; caller-passed
#' species absent on either side drop out (an error is raised only when
#' nothing at all is left to compare).
#'
#' @param conn DBI connection to the local pipeline database (where the
#'   persist schema and `fresh.streams_vw_bcfp` live).
#' @param aoi Watershed group code (e.g. `"PARS"`).
#' @param cfg An `lnk_config` object (resolves `cfg$pipeline$schema`).
#' @param reference Character scalar identifying the reference. Only
#'   `"bcfishpass"` is supported.
#' @param conn_ref Optional DBI connection to the bcfp tunnel
#'   (`localhost:63333`). Default `NULL` → tunnel-free local-snapshot compare.
#' @param species Optional character vector of species codes to restrict to.
#'   Default `NULL` discovers the set from the mapping_code columns.
#' @param ref_table Reference table name for the tunnel-free path, given as
#'   `schema.table`. Default `"fresh.streams_vw_bcfp"` (where
#'   `snapshot_bcfp.sh` loads bcfp's output). Ignored when `conn_ref` is
#'   supplied.
#'
#' @return A tibble, one row per species: `wsg`, `species`, `total_segs`,
#'   `match_pct`, `n_diffs`, `top_pattern` (most common `link | bcfp` token
#'   mismatch), `top_pattern_count`.
#'
#' @examples
#' \dontrun{
#' conn <- lnk_db_conn()
#' cfg  <- lnk_config("bcfishpass")
#'
#' # Tunnel-free: diff persisted tokens vs the local fresh.streams_vw_bcfp snapshot.
#' lnk_compare_mapping_code(conn, aoi = "PARS", cfg = cfg)
#'
#' # Legacy tunnel path (requires the bcfp tunnel up):
#' conn_ref <- DBI::dbConnect(RPostgres::Postgres(),
#'   host = "localhost", port = 63333, dbname = "bcfishpass",
#'   user = "newgraph", password = Sys.getenv("PG_PASS_SHARE"))
#' lnk_compare_mapping_code(conn, "PARS", cfg, conn_ref = conn_ref)
#' }
#'
#' @family compare
#' @seealso [lnk_compare_rollup()], [lnk_compare_wsg()], [lnk_pipeline_run()]
#' @export
lnk_compare_mapping_code <- function(conn, aoi, cfg,
                                     reference = "bcfishpass",
                                     conn_ref = NULL,
                                     species = NULL,
                                     ref_table = "fresh.streams_vw_bcfp") {
  stopifnot(
    inherits(conn, "DBIConnection"),
    is.character(aoi), length(aoi) == 1L, nzchar(aoi),
    grepl("^[A-Z]{3,5}$", aoi),
    inherits(cfg, "lnk_config"),
    is.character(reference), length(reference) == 1L, nzchar(reference),
    is.null(conn_ref) || inherits(conn_ref, "DBIConnection"),
    is.null(species) || is.character(species),
    is.character(ref_table), length(ref_table) == 1L, nzchar(ref_table)
  )

  supported_references <- c("bcfishpass")
  if (!reference %in% supported_references) {
    stop("Unsupported reference '", reference, "'. Supported: ",
         paste(supported_references, collapse = ", "), ".", call. = FALSE)
  }

  tunnel_free <- is.null(conn_ref)
  ref_conn <- if (tunnel_free) conn else conn_ref

  # (schema, table) holding the reference's mapping_code columns. Tunnel-free
  # reads the local snapshot view; the tunnel path reads bcfp's live
  # streams_mapping_code.
  if (tunnel_free) {
    ref_schema_table <- strsplit(ref_table, ".", fixed = TRUE)[[1]]
    # An unqualified name would make the table part NA, the column lookup
    # would come back empty, and the caller would see a misleading
    # "no shared columns" error. Fail here with the real cause instead.
    if (length(ref_schema_table) != 2L || !all(nzchar(ref_schema_table))) {
      stop("`ref_table` must be schema-qualified as 'schema.table' (got '",
           ref_table, "')", call. = FALSE)
    }
  } else {
    ref_schema_table <- c("bcfishpass", "streams_mapping_code")
  }

  tn <- .lnk_table_names(cfg)
  persist_schema <- tn$schema

  # Resolve the species compared: mapping_code_{sp} columns present on the
  # reference AND **active** on the link side for this WSG (>= 1 non-empty
  # token). Restricting to WSG-active link species avoids spurious 0%-match
  # rows for species the WSG doesn't model — link emits "" for absent species
  # while the reference emits NULL, which would otherwise count as
  # all-mismatch. "salmon" is a bcfp-only aggregate with no link counterpart
  # and drops out.
  link_cols <- .lnk_mc_species_cols(conn, persist_schema, "streams_mapping_code")
  link_sp <- .lnk_mc_active_species(conn, persist_schema, "streams_mapping_code",
                                    aoi, link_cols)
  ref_sp <- .lnk_mc_species_cols(ref_conn, ref_schema_table[1],
                                 ref_schema_table[2])
  cmp_species <- intersect(link_sp, ref_sp)
  if (!is.null(species)) {
    cmp_species <- intersect(cmp_species, toupper(species))
  }
  if (length(cmp_species) == 0L) {
    stop("no shared mapping_code_ columns to compare for ", aoi,
         " (link: ", paste(link_sp, collapse = ","),
         "; ref: ", paste(ref_sp, collapse = ","), ").", call. = FALSE)
  }

  aoi_lit_link <- DBI::dbQuoteLiteral(conn, aoi)
  aoi_lit_ref  <- DBI::dbQuoteLiteral(ref_conn, aoi)

  # link side: persisted tokens + FWA position from {schema}.streams.
  # JOIN on BOTH (id_segment, watershed_group_code): id_segment is not globally
  # unique in the persist tables (PK is the pair), so joining on id_segment
  # alone fans a WSG's segments out across every other WSG sharing that id —
  # a cartesian blow-up that wrecks the match. (This was latent in the old
  # tunnel helper too.)
  link_mc <- DBI::dbGetQuery(conn, sprintf("
    SELECT lmc.*, ls.blue_line_key,
           round(ls.downstream_route_measure::numeric, 3) AS downstream_route_measure
    FROM %1$s.streams_mapping_code lmc
    JOIN %1$s.streams ls
      ON ls.id_segment = lmc.id_segment
     AND ls.watershed_group_code = lmc.watershed_group_code
    WHERE ls.watershed_group_code = %2$s",
    persist_schema, aoi_lit_link))

  # reference side: local snapshot view (tunnel-free) carries the position
  # columns inline; tunnel path joins bcfishpass.streams for them.
  if (tunnel_free) {
    bcfp_mc <- DBI::dbGetQuery(ref_conn, sprintf("
      SELECT blue_line_key,
             round(downstream_route_measure::numeric, 3) AS downstream_route_measure,
             %2$s
      FROM %1$s
      WHERE watershed_group_code = %3$s",
      ref_table,
      paste(sprintf("mapping_code_%s", tolower(cmp_species)), collapse = ", "),
      aoi_lit_ref))
  } else {
    bcfp_mc <- DBI::dbGetQuery(ref_conn, sprintf("
      SELECT bmc.*, bs.blue_line_key,
             round(bs.downstream_route_measure::numeric, 3) AS downstream_route_measure
      FROM bcfishpass.streams_mapping_code bmc
      JOIN bcfishpass.streams bs ON bs.segmented_stream_id = bmc.segmented_stream_id
      WHERE bs.watershed_group_code = %s",
      aoi_lit_ref))
  }

  .lnk_mc_diff(link_mc, bcfp_mc, aoi = aoi, species = cmp_species,
               ref_empty_is_na = TRUE)
}
#' List the species (uppercase codes) for which a table has a
#' `mapping_code_{sp}` column.
#'
#' Column names are read from `information_schema.columns`; only purely
#' lower-case alphabetic suffixes are kept, and the bcfp-only `salmon`
#' aggregate (no link counterpart) is dropped.
#' @noRd
.lnk_mc_species_cols <- function(conn, schema, table) {
  sql <- sprintf(
    "SELECT column_name FROM information_schema.columns
      WHERE table_schema = %s AND table_name = %s
        AND column_name LIKE 'mapping_code\\_%%' ESCAPE '\\'",
    DBI::dbQuoteLiteral(conn, schema),
    DBI::dbQuoteLiteral(conn, table))
  col_names <- DBI::dbGetQuery(conn, sql)$column_name

  # Strip the prefix, then keep plain species suffixes only.
  suffixes <- sub("^mapping_code_", "", col_names)
  is_species <- grepl("^[a-z]+$", suffixes) & suffixes != "salmon"
  toupper(suffixes[is_species])
}


#' Filter candidate species down to those ACTIVE in a WSG.
#'
#' A species counts as active when at least one row of `schema.table` for the
#' AOI carries a non-NULL, non-empty `mapping_code_{sp}` token. This keeps the
#' compare to species the WSG actually models (link emits "" for absent
#' species; the reference emits NULL — comparing them is meaningless).
#' @noRd
.lnk_mc_active_species <- function(conn, schema, table, aoi, candidates) {
  # Nothing to probe: skip the query entirely.
  if (length(candidates) == 0L) {
    return(character(0))
  }

  # One bool_or() aggregate per candidate, aliased by its lower-case code so
  # the single result row lines up with `candidates` by position.
  probes <- sprintf(
    "bool_or(mapping_code_%1$s IS NOT NULL AND mapping_code_%1$s <> '') AS %1$s",
    tolower(candidates))
  sql <- sprintf(
    "SELECT %s FROM %s.%s WHERE watershed_group_code = %s",
    paste(probes, collapse = ", "), schema, table,
    DBI::dbQuoteLiteral(conn, aoi))
  res <- DBI::dbGetQuery(conn, sql)
  if (nrow(res) == 0L) {
    return(character(0))
  }

  # An NA flag (no AOI rows to aggregate over) is treated as inactive.
  is_active <- as.logical(unlist(res[1, , drop = TRUE]))
  candidates[is_active %in% TRUE]
}
#' Per-segment token diff (shared by tunnel-free + tunnel paths).
#'
#' Inner-joins the link and reference frames on FWA position
#' (`blue_line_key`, `downstream_route_measure`) and returns one row of match
#' statistics per species. Two NA tokens count as a match; NA against a
#' concrete token is a mismatch.
#'
#' With `ref_empty_is_na = TRUE`, a reference that has no rows at all for the
#' WSG (bcfp doesn't model it) yields NA-filled stats plus a warning. A
#' non-empty reference with no key overlap is a hard error (snapshot
#' misalignment).
#' @noRd
.lnk_mc_diff <- function(link_mc, bcfp_mc, aoi, species,
                         ref_empty_is_na = TRUE) {
  # Row used whenever no statistics can be computed for a species.
  blank_row <- function(sp, n_segs) {
    tibble::tibble(wsg = aoi, species = sp, total_segs = n_segs,
      match_pct = NA_real_, n_diffs = NA_integer_,
      top_pattern = NA_character_, top_pattern_count = NA_integer_)
  }

  both <- merge(
    link_mc, bcfp_mc,
    by = c("blue_line_key", "downstream_route_measure"),
    suffixes = c("_link", "_bcfp"))
  n_joined <- nrow(both)

  if (n_joined == 0L) {
    ref_is_empty <- nrow(bcfp_mc) == 0L
    if (!(isTRUE(ref_empty_is_na) && ref_is_empty)) {
      stop(sprintf(
        "no position overlap between link + reference streams_mapping_code for %s ",
        aoi),
        "(link rows: ", nrow(link_mc), ", ref rows: ", nrow(bcfp_mc),
        "). Check fwapg snapshot alignment.", call. = FALSE)
    }
    warning(sprintf(
      "reference has 0 rows for %s — not modelled there; returning NA stats.",
      aoi), call. = FALSE)
    return(do.call(rbind, lapply(species, blank_row, n_segs = 0L)))
  }

  per_species <- lapply(species, function(sp) {
    code <- tolower(sp)
    col_link <- paste0("mapping_code_", code, "_link")
    col_ref <- paste0("mapping_code_", code, "_bcfp")
    # A species column missing on either side cannot be scored.
    if (!all(c(col_link, col_ref) %in% names(both))) {
      return(blank_row(sp, n_joined))
    }

    tok_link <- both[[col_link]]
    tok_ref <- both[[col_ref]]
    na_link <- is.na(tok_link)
    na_ref <- is.na(tok_ref)
    # NA-aware equality: both missing, or both present and identical.
    same <- (na_link & na_ref) | (!na_link & !na_ref & tok_link == tok_ref)
    n_same <- sum(same)
    mismatched <- which(!same)

    worst <- NA_character_
    worst_n <- NA_integer_
    if (length(mismatched) > 0L) {
      # Label each mismatch "link | ref" and pick the most frequent label.
      lhs <- ifelse(na_link[mismatched], "", tok_link[mismatched])
      rhs <- ifelse(na_ref[mismatched], "", tok_ref[mismatched])
      freq <- sort(table(paste0(lhs, " | ", rhs)), decreasing = TRUE)
      worst <- names(freq)[1]
      worst_n <- as.integer(freq[1])
    }

    tibble::tibble(wsg = aoi, species = sp, total_segs = n_joined,
      match_pct = round(100 * n_same / n_joined, 2),
      n_diffs = as.integer(n_joined - n_same),
      top_pattern = worst, top_pattern_count = worst_n)
  })
  do.call(rbind, per_species)
}
= FALSE ) } - # `mapping_code = TRUE` requires conn_ref also for the - # streams_mapping_code comparison query. Already validated above - # when reference == 'bcfishpass' (the only supported reference today). - # No additional gate needed. + # `mapping_code = TRUE` is now tunnel-free (link#175) — it diffs against + # the LOCAL bcfp snapshot (fresh.streams_vw_bcfp), not conn_ref. conn_ref + # is still required above for the rollup (bcfp habitat_linear isn't in the + # snapshot). # Run the modelling pipeline. `mapping_code = mapping_code` routes # the streams_access + streams_mapping_code build through pipeline_run's @@ -194,12 +194,11 @@ lnk_compare_wsg <- function(conn, aoi, cfg, loaded, mc_stats <- NULL if (isTRUE(mapping_code)) { # Diff link's `.streams_mapping_code` (built by - # pipeline_run's mapping_code phase) against reference's. Tunnel- - # bound — needs conn_ref. Build path was already executed above. - bcfp_species <- c("bt", "ch", "cm", "co", "pk", "sk", "st", "wct") - mc_stats <- .lnk_compare_wsg_mapping_code_diff( # nolint: object_usage_linter - conn = conn, conn_ref = conn_ref, - aoi = aoi, cfg = cfg, bcfp_species = bcfp_species) + # pipeline_run's mapping_code phase) against the LOCAL bcfp snapshot — + # tunnel-free (link#175). Species auto-resolve to the WSG-active set. + mc_stats <- lnk_compare_mapping_code( # nolint: object_usage_linter + conn = conn, aoi = aoi, cfg = cfg, + reference = reference, species = species) } list(rollup = rollup, mapping_code = mc_stats) @@ -511,126 +510,3 @@ lnk_compare_wsg <- function(conn, aoi, cfg, loaded, out$ref_value, 1)) out } - -#' Diff link's streams_mapping_code vs reference's, return per-species stats -#' -#' Joins on `(blue_line_key, downstream_route_measure, length_metre)` — -#' the canonical segment identity across link and bcfp. NA-aware -#' comparison: `NA == NA` counts as match; `NA` vs concrete value is a -#' mismatch. -#' -#' Returns one row per `bcfp_species`. 
`top_pattern` is the dominant -#' " | " diff string; useful for class-A/B/C/D -#' taxonomy lookup downstream. -#' -#' @noRd -.lnk_compare_wsg_mapping_code_diff <- function(conn, conn_ref, aoi, cfg, - bcfp_species) { - aoi_lit_link <- DBI::dbQuoteLiteral(conn, aoi) - aoi_lit_ref <- DBI::dbQuoteLiteral(conn_ref, aoi) - tn <- .lnk_table_names(cfg) - persist_schema <- tn$schema - - # Round float join keys to 3 decimal places (mm precision on values - # already in metres). `downstream_route_measure` + `length_metre` are - # PostGIS-computed doubles; deterministic across runs that share the - # same fwapg segmentation, but rounding makes the join robust to any - # future ULP-level drift between link's and bcfp's tunnels. - # - # Reads link's `.streams_mapping_code` (link#187 — built - # by lnk_pipeline_run's mapping_code phase, persisted via lnk_pipeline_persist). - # Pre-#187 this read from the working schema's table; persist path is - # symmetric vs the bcfp source below. - link_mc <- DBI::dbGetQuery(conn, sprintf(" - SELECT lmc.*, ls.blue_line_key, - round(ls.downstream_route_measure::numeric, 3) AS downstream_route_measure, - round(ls.length_metre::numeric, 3) AS length_metre - FROM %1$s.streams_mapping_code lmc - JOIN %1$s.streams ls ON ls.id_segment = lmc.id_segment - WHERE ls.watershed_group_code = %2$s", - persist_schema, aoi_lit_link)) - - bcfp_mc <- DBI::dbGetQuery(conn_ref, sprintf(" - SELECT bmc.*, bs.blue_line_key, - round(bs.downstream_route_measure::numeric, 3) AS downstream_route_measure, - round(bs.length_metre::numeric, 3) AS length_metre - FROM bcfishpass.streams_mapping_code bmc - JOIN bcfishpass.streams bs - ON bs.segmented_stream_id = bmc.segmented_stream_id - WHERE bs.watershed_group_code = %s", aoi_lit_ref)) - - joined <- merge( - link_mc, bcfp_mc, - by = c("blue_line_key", "downstream_route_measure", "length_metre"), - suffixes = c("_link", "_bcfp")) - - # No-overlap handling. 
Two distinct cases: - # (a) bcfp has 0 rows for this WSG — bcfp's bundle filter doesn't - # model it (link#157-style, but on the bcfp side: ~36 WSGs we - # model that bcfp's 2026-05-12 build does not, spanning - # Mackenzie/Peace drainages, Stikine, and central-BC basins - # like BEAV/COAL/DUNE). Not a defect — emit a warning + - # NA-filled per-species mapping_code stats so the rollup - # tibble still returns and the run continues. - # (b) bcfp has rows but no key overlap — that IS a fwapg snapshot - # misalignment between tunnels, worth surfacing loudly. - if (nrow(joined) == 0L) { - if (nrow(bcfp_mc) == 0L) { - warning(sprintf( - "bcfishpass.streams_mapping_code has 0 rows for %s — bcfp does ", - aoi), - "not model this WSG. Returning NA-filled mapping_code stats.", - call. = FALSE) - return(do.call(rbind, lapply(bcfp_species, function(sp) { - tibble::tibble( - wsg = aoi, species = sp, - total_segs = 0L, match_pct = NA_real_, - n_diffs = NA_integer_, - top_pattern = NA_character_, top_pattern_count = NA_integer_) - }))) - } - stop(sprintf( - "no overlap between link's and bcfishpass's streams_mapping_code for %s ", - aoi), - "(link rows: ", nrow(link_mc), ", bcfp rows: ", nrow(bcfp_mc), - "). Check fwapg snapshot alignment between the two tunnels.", - call. 
= FALSE) - } - - rows <- lapply(bcfp_species, function(sp) { - link_col <- paste0("mapping_code_", sp, "_link") - bcfp_col <- paste0("mapping_code_", sp, "_bcfp") - if (!(link_col %in% names(joined)) || !(bcfp_col %in% names(joined))) { - return(tibble::tibble( - wsg = aoi, species = sp, - total_segs = nrow(joined), match_pct = NA_real_, - n_diffs = NA_integer_, - top_pattern = NA_character_, top_pattern_count = NA_integer_)) - } - l <- joined[[link_col]] - b <- joined[[bcfp_col]] - matches <- (is.na(l) & is.na(b)) | (!is.na(l) & !is.na(b) & l == b) - n_match <- sum(matches) - n_total <- nrow(joined) - diff_idx <- which(!matches) - - top_pattern <- NA_character_ - top_pattern_count <- NA_integer_ - if (length(diff_idx) > 0L) { - patt <- paste0(ifelse(is.na(l[diff_idx]), "", l[diff_idx]), - " | ", - ifelse(is.na(b[diff_idx]), "", b[diff_idx])) - tab <- sort(table(patt), decreasing = TRUE) - top_pattern <- names(tab)[1] - top_pattern_count <- as.integer(tab[1]) - } - tibble::tibble( - wsg = aoi, species = sp, - total_segs = n_total, - match_pct = round(100 * n_match / n_total, 2), - n_diffs = as.integer(n_total - n_match), - top_pattern = top_pattern, - top_pattern_count = top_pattern_count) - }) - do.call(rbind, rows) -} diff --git a/R/lnk_mapping_code.R b/R/lnk_mapping_code.R index 591f0e8..de96e7f 100644 --- a/R/lnk_mapping_code.R +++ b/R/lnk_mapping_code.R @@ -20,7 +20,7 @@ #' function via `lnk_pipeline_run(..., mapping_code = TRUE)`. Operators #' can also call this directly against persist schema with the tunnel #' down — the build is tunnel-independent (the diff vs reference is -#' separate, see `.lnk_compare_wsg_mapping_code_diff`). +#' separate, see [lnk_compare_mapping_code()]). #' #' Tracks link#187 (tunnel decouple + portable build). #' @@ -120,10 +120,35 @@ lnk_mapping_code <- function( # 1. Access: scalar projection from streams_access. lnk_pipeline_mapping_code # expects a data.frame keyed by id_segment. 
- access <- DBI::dbGetQuery(conn, sprintf( - "SELECT * FROM %s WHERE id_segment IN ( - SELECT id_segment FROM %s WHERE watershed_group_code = %s)", - table_access, table_streams, aoi_lit)) + # + # `id_segment` is per-WSG unique, NOT globally (link#203). When table_access + # is a PERSIST table (all WSGs), `id_segment IN (FINA's segments)` matches + # access rows from every WSG that happens to share those id_segment values + # -> ~N(WSGs)× duplicates -> propagates through the pivot/joins below and + # blows up `streams_mapping_code_pkey` on the persist write. Caught + # 2026-05-25 in the link#205 cheap recompute. Fix: when table_access carries + # `watershed_group_code` (persist shape, from lnk_persist_init's + # cols_streams_access_base), filter by that directly. Working-schema access + # (written by lnk_pipeline_access via dbWriteTable, no WSG col) falls back + # to the id_segment-in clause (only one WSG present there, so the cartesian + # is impossible). + access_schema_table <- strsplit(table_access, "\\.", fixed = FALSE)[[1]] + access_has_wsg <- DBI::dbGetQuery(conn, sprintf( + "SELECT 1 FROM information_schema.columns + WHERE table_schema = %s AND table_name = %s + AND column_name = 'watershed_group_code' LIMIT 1", + DBI::dbQuoteString(conn, access_schema_table[1]), + DBI::dbQuoteString(conn, access_schema_table[length(access_schema_table)]))) + if (nrow(access_has_wsg) > 0L) { + access <- DBI::dbGetQuery(conn, sprintf( + "SELECT * FROM %s WHERE watershed_group_code = %s", + table_access, aoi_lit)) + } else { + access <- DBI::dbGetQuery(conn, sprintf( + "SELECT * FROM %s WHERE id_segment IN ( + SELECT id_segment FROM %s WHERE watershed_group_code = %s)", + table_access, table_streams, aoi_lit)) + } if (nrow(access) == 0L) { stop(sprintf("%s empty for WSG %s", table_access, aoi), call. 
= FALSE) } diff --git a/R/lnk_persist_init.R b/R/lnk_persist_init.R index 68931e2..1ca2694 100644 --- a/R/lnk_persist_init.R +++ b/R/lnk_persist_init.R @@ -346,7 +346,16 @@ lnk_persist_init <- function(conn, cfg, species, force_recreate = FALSE) { streams_wsg_idx = "(watershed_group_code)", streams_blk_idx = "(blue_line_key)", streams_geom_idx = "USING GIST (geom)", - streams_wbk_idx = "(waterbody_key)" + streams_wbk_idx = "(waterbody_key)", + # ltree GiST/btree — fresh::frs_network_features's downstream walk needs + # these for ltree-containment traversal (fresh/R/utils.R:416-431, + # frs_network.R:450). Without them an ad-hoc persist-schema access recompute + # (lnk_access, link#205) times out — the full pipeline is fast only because + # its working streams carry these indexes. + streams_wscode_gist_idx = "USING GIST (wscode_ltree)", + streams_wscode_btree_idx = "(wscode_ltree)", + streams_localcode_gist_idx = "USING GIST (localcode_ltree)", + streams_localcode_btree_idx = "(localcode_ltree)" ) for (idx_name in names(idx_specs)) { .lnk_db_execute(conn, sprintf( @@ -387,7 +396,13 @@ lnk_persist_init <- function(conn, cfg, species, force_recreate = FALSE) { barriers_blocks_idx = "USING GIN (blocks_species)", barriers_source_idx = "(barrier_source)", barriers_blk_drm_idx = "(blue_line_key, downstream_route_measure)", - barriers_geom_idx = "USING GIST (geom)" + barriers_geom_idx = "USING GIST (geom)", + # ltree GiST/btree — same rationale as streams: frs_network_features walks + # the barrier (feature) side by ltree containment too (link#205). 
+ barriers_wscode_gist_idx = "USING GIST (wscode_ltree)", + barriers_wscode_btree_idx = "(wscode_ltree)", + barriers_localcode_gist_idx = "USING GIST (localcode_ltree)", + barriers_localcode_btree_idx = "(localcode_ltree)" ) for (idx_name in names(barriers_idx_specs)) { .lnk_db_execute(conn, sprintf( diff --git a/RUNBOOK.md b/RUNBOOK.md index 13a4478..3d76b22 100644 --- a/RUNBOOK.md +++ b/RUNBOOK.md @@ -376,6 +376,35 @@ direction. Not yet scoped; candidate issue. barriers (for cross-WSG views) *and* persists at the end → PARS ~16 min vs ~3.5 min normal. Pre-persisting only barriers (not streams+habitat) is the open optimization (#196 Phase 5). +- **`pkill ` does NOT cancel its Postgres query — the backend + orphans.** Caught 2026-05-25 (link#205): a killed recompute left a + `frs_network_features` SELECT running 1h45m server-side, holding a lock on + `barriers_bt_access`; every later `lnk_barriers_views` `DROP VIEW` blocked + behind it indefinitely (silent hangs). The R client died; the libpq backend + did not. **Always terminate the server-side backend** (`SELECT + pg_terminate_backend(pid) FROM pg_stat_activity WHERE state='active' …`), + not just the client. And **set `statement_timeout` + `lock_timeout` on any + long-running DB op** (`SET statement_timeout = '600000'; SET lock_timeout = + '60000'`) — a runaway cancels server-side instead of orphaning, and a + blocked DROP VIEW fails fast instead of wedging. `data-raw/wsg_recompute_one.R` + sets these on its conn for exactly this reason. +- **AOI-scoping streams to a VIEW (not a real table) makes the planner pick + the wrong join driver.** Caught 2026-05-25 (link#205): scoping + `fresh.streams` to one WSG via `CREATE VIEW … WHERE wsg = 'FINA'` left + Postgres with no small-table stats; it picked the ~800k-row + `barriers_bt_access` as the outer driver of `frs_network_features`'s + nested loop, blowing the cost up by ~1000× (estimated 71M result rows, >10 + min wall). 
Solution: materialize as a real `CREATE TABLE` with an + `id_segment` btree + ltree GiST + blue_line_key + `ANALYZE`. Then the + planner picks the 26k-row AOI streams as outer and the walk takes ~10s. + Mirrors the full pipeline's working schema (also a real, indexed table). +- **`id_segment IN (…)` is cartesian against the persist schema** (link#203). + `id_segment` is unique per WSG, not globally; a query like + `SELECT * FROM <table_access> WHERE id_segment IN (SELECT id_segment FROM + <table_streams> WHERE wsg = aoi)` matches access rows from every WSG that + happens to share those id_segment values → ~N(WSGs)× duplicates. Filter + by `watershed_group_code` directly when the table has that column. + `lnk_mapping_code` learned this the hard way. --- diff --git a/data-raw/README.md b/data-raw/README.md index b78da0b..acf1427 100644 --- a/data-raw/README.md +++ b/data-raw/README.md @@ -76,6 +76,7 @@ The dispatch hierarchy: trifecta → run_provincial → compare_wsg. | Script | Calls | Purpose | |--------|-------|---------| +| `study_area_run.sh` | `study_area_wsgs.R`, `wsg_run_one.R`, `schema_consolidate.R`, `study_area_compare.R` | **Lean tunnel-free, M1-as-dispatcher** study-area parity runner (link#175). No M4, no `:63333`. Drainage-closed DS-first buckets (one study area per host), per-WSG soft-fail, consolidate cyphers→M1, burn, tunnel-free `lnk_compare_mapping_code` → CSV. Full procedure + gotchas: `research/study_area_run.md`. | | `wsgs_dispatch.sh` | `wsgs_run_host.R` (×N hosts) | M4 + M1 + N-cypher orchestrator. Inline LPT bucket allocation (reads `_per_wsg_times.csv` from prior runs, computes balanced split using `--host-speeds=`), pre-flight version check across all hosts, parallel dispatch, RDS pull-back, post-pull `lnk_parity_annotate` against the divergence taxonomy. See "Provincial dispatch" section below for full flag reference + gotchas. | | `trifecta_15wsg.sh` | same | 15-WSG smoke variant (legacy 3-host, hardcoded WSG list).
| | `trifecta_smoke.sh` | `wsgs_dispatch.sh` | N-host smoke shim: one small WSG per host, ~3 min wall. See `Provincial dispatch` section. | diff --git a/data-raw/cypher_prep.sh b/data-raw/cypher_prep.sh index 115c96a..2471e8e 100755 --- a/data-raw/cypher_prep.sh +++ b/data-raw/cypher_prep.sh @@ -105,7 +105,15 @@ conn <- DBI::dbConnect(RPostgres::Postgres(), user="postgres", password="postgres") cfg <- lnk_config("bcfishpass") loaded <- lnk_load_overrides(cfg) -species <- unique(loaded$parameters_fresh$species_code) +# Persist species set MUST match lnk_pipeline_run (R/lnk_pipeline_run.R: +# `lnk_persist_init(conn, cfg, species = cfg$species)`). The wide per- +# species tables (streams_access, streams_mapping_code) carry one column +# per species, so a cypher seeding from parameters_fresh (11 sp: adds +# CT/DV/RB) while the dispatcher uses cfg$species (8 sp) produces a +# column-set mismatch that breaks the cross-host COPY-consolidate. +# Caught 2026-05-25 in the 3-WSG smoke (link#175). Mirror cfg$species, +# with the same parameters_fresh fallback lnk_pipeline_species uses. +species <- if (!is.null(cfg$species)) cfg$species else unique(loaded$parameters_fresh$species_code) lnk_persist_init(conn, cfg, species, force_recreate = TRUE) cat("=== lnk_persist_init done\n") ' > "$TMP_INIT_LOG" 2>&1; then diff --git a/data-raw/schema_consolidate.R b/data-raw/schema_consolidate.R index 1e33628..5a4030f 100644 --- a/data-raw/schema_consolidate.R +++ b/data-raw/schema_consolidate.R @@ -210,6 +210,60 @@ schema_consolidate <- function(schema, next } + # 3.4. Per-table SHARED-column resolution for shape-tolerant COPY. + # `COPY (SELECT *) TO STDOUT` -> `COPY FROM STDIN` is positional: + # it breaks ("extra data after last expected column") the moment a + # source table's column set differs from the destination's. This + # happens across hosts whenever a wide per-species table + # (streams_access, streams_mapping_code) was created from a different + # species set — e.g. 
a warm cypher snapshot baked an 11-species + # streams_access (ct/dv/rb included) while the dispatcher's persist + # is 8-species. Caught 2026-05-25 in the 3-WSG smoke (link#175). + # + # Fix: enumerate columns on BOTH sides, COPY only the intersection, + # BY NAME, in destination ordinal order. Source-only columns are + # dropped at SELECT; destination-only columns take their default / + # NULL. Both COPY statements list the same ordered column vector, so + # ordinal drift between hosts no longer matters. + tbl_list_sql <- paste(sprintf("'%s'", wgc_tables), collapse = ", ") + cols_sql <- gsub("\n\\s+", " ", sprintf( + "SELECT table_name, column_name FROM information_schema.columns + WHERE table_schema = '%s' AND table_name IN (%s) + ORDER BY table_name, ordinal_position", schema, tbl_list_sql)) + dest_cols_df <- DBI::dbGetQuery(dest_conn, cols_sql) + dest_cols <- split(dest_cols_df$column_name, dest_cols_df$table_name) + src_cols_inner <- if (via == "docker") { + sprintf("docker exec %s psql -U %s -d %s -t -A -c \"%s\"", + container, pg_user, pg_db, cols_sql) + } else { + sprintf("PGHOST=localhost PGPORT=5432 PGDATABASE=%s PGUSER=%s PGPASSWORD=postgres psql -t -A -c \"%s\"", + pg_db, pg_user, cols_sql) + } + src_cols_inner_esc <- gsub("'", "'\\''", src_cols_inner, fixed = TRUE) + src_cols_raw <- system(sprintf("ssh '%s' '%s'", src$host, src_cols_inner_esc), + intern = TRUE) + src_cols_raw <- src_cols_raw[nzchar(src_cols_raw)] + src_cols_split <- strsplit(src_cols_raw, "|", fixed = TRUE) + src_cols <- split( + vapply(src_cols_split, `[`, character(1), 2L), + vapply(src_cols_split, `[`, character(1), 1L)) + # Destination ordinal order, restricted to columns present on source. 
+ shared_cols <- lapply(wgc_tables, function(t) { + dc <- dest_cols[[t]] + dc[dc %in% src_cols[[t]]] + }) + names(shared_cols) <- wgc_tables + for (t in wgc_tables) { + dc <- dest_cols[[t]]; sc <- src_cols[[t]]; sh <- shared_cols[[t]] + if (length(sh) < length(dc) || length(sh) < length(sc)) { + log(src$host, " -> NOTE: column drift on '", t, "' — COPY ", + length(sh), " shared cols (dest=", length(dc), ", src=", + length(sc), "); src-only: ", + paste(setdiff(sc, dc), collapse = ","), "; dest-only: ", + paste(setdiff(dc, sc), collapse = ",")) + } + } + # 3.5. Bucket-aware destination cleanup. DELETE the bucket's WSGs # from every destination table BEFORE the COPY-INSERTs so PK # constraints don't fire on the inbound rows. @@ -256,9 +310,10 @@ schema_consolidate <- function(schema, log(src$host, " -> COPY (bucket-filtered) for ", length(wgc_tables), " tables") for (t in wgc_tables) { + col_list <- paste(shared_cols[[t]], collapse = ", ") src_sql <- sprintf( - "COPY (SELECT * FROM %s.%s WHERE watershed_group_code IN (%s)) TO STDOUT", - schema, t, wsg_list_sql) + "COPY (SELECT %s FROM %s.%s WHERE watershed_group_code IN (%s)) TO STDOUT", + col_list, schema, t, wsg_list_sql) src_inner <- if (via == "docker") { sprintf("docker exec -i %s psql -U %s -d %s -c \"%s\"", container, pg_user, pg_db, src_sql) @@ -290,8 +345,10 @@ schema_consolidate <- function(schema, unlink(tmpf) next } - # Stage 2: local temp file -> destination COPY FROM STDIN. - dest_sql <- sprintf("COPY %s.%s FROM STDIN", schema, t) + # Stage 2: local temp file -> destination COPY FROM STDIN. Same + # explicit shared-column list as the source SELECT so the transfer + # is by-name (shape-tolerant), not positional. 
+ dest_sql <- sprintf("COPY %s.%s (%s) FROM STDIN", schema, t, col_list) stage2 <- sprintf( "PGHOST=localhost PGPORT=5432 PGDATABASE=fwapg PGUSER=postgres PGPASSWORD=postgres psql -v ON_ERROR_STOP=1 -c \"%s\" < %s", dest_sql, shQuote(tmpf)) diff --git a/data-raw/study_area_compare.R b/data-raw/study_area_compare.R new file mode 100755 index 0000000..c3fa52e --- /dev/null +++ b/data-raw/study_area_compare.R @@ -0,0 +1,47 @@ +#!/usr/bin/env Rscript +# study_area_compare.R — tunnel-free per-WSG mapping_code parity for a set of +# WSGs against the LOCAL bcfp snapshot (fresh.streams_vw_bcfp, loaded by +# snapshot_bcfp.sh --with-bcfp-views). Writes a long CSV +# (wsg, species, total_segs, match_pct, n_diffs, top_pattern, +# top_pattern_count). Run on the dispatcher AFTER consolidate. No tunnel, +# no PG_PASS_SHARE, no :63333 — a single local connection per WSG. +# +# Usage: [LNK_LOAD=loadall] Rscript study_area_compare.R [config] + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 2L) { + stop("usage: study_area_compare.R [config]", call. = FALSE) +} +out_csv <- args[1] +wsgs <- toupper(strsplit(args[2], ",")[[1]]) +wsgs <- wsgs[nzchar(wsgs)] +config <- if (length(args) >= 3L && nzchar(args[3])) args[3] else "bcfishpass" + +if (identical(Sys.getenv("LNK_LOAD"), "loadall")) { + suppressPackageStartupMessages(pkgload::load_all(quiet = TRUE)) +} else { + suppressPackageStartupMessages(library(link)) +} +suppressPackageStartupMessages({ + library(DBI); library(RPostgres) +}) +source("data-raw/wsg_compare.R") + +cfg <- lnk_config(config) + +rows <- list() +for (w in wsgs) { + r <- tryCatch( + wsg_compare_mapping_code(wsg = w, config = cfg), + error = function(e) { + message(sprintf("[study_area_compare] %s ERROR: %s", w, conditionMessage(e))) + NULL + }) + if (!is.null(r)) rows[[w]] <- r +} +if (length(rows) == 0L) stop("no WSG produced a compare result", call. 
= FALSE) +res <- do.call(rbind, rows) +write.csv(res, out_csv, row.names = FALSE) +cat(sprintf("[study_area_compare] %d rows across %d/%d WSGs -> %s\n", + nrow(res), length(rows), length(wsgs), out_csv)) +print(res) diff --git a/data-raw/study_area_run.sh b/data-raw/study_area_run.sh new file mode 100755 index 0000000..201d785 --- /dev/null +++ b/data-raw/study_area_run.sh @@ -0,0 +1,290 @@ +#!/usr/bin/env bash +# study_area_run.sh — tunnel-free, M1-dispatch study-area mapping_code parity. +# +# Productionizes the proven smoke flow (cypher_up -> cypher_prep -> +# lnk_pipeline_run(mapping_code=TRUE) per WSG -> schema_consolidate -> +# wsg_compare_mapping_code -> cypher_down). NOT a refactor of the old +# M4-centric wsgs_run_pipeline.sh — it reuses the simple local flow the +# 3-WSG smoke validated (link#175). +# +# Host model: the local machine is the dispatcher (M1) and the consolidate +# destination; cyphers are the remote workers. No M4, no `ssh m1`, no bcfp +# tunnel (`:63333`/PG_PASS_SHARE) — the compare reference is the LOCAL bcfp +# snapshot fresh.streams_vw_bcfp (snapshot_bcfp.sh --with-bcfp-views). +# +# Cross-WSG `;DAM` correctness WITHOUT a post-consolidate recompute: each +# host gets a DRAINAGE-CLOSED bucket (focal WSGs + every WSG they drain +# through, via study_area_wsgs.R / public.wsg_outlet) run DOWNSTREAM-FIRST, +# so a WSG's downstream dam barriers are persisted before its access / +# mapping_code is computed. One study area (closed) per host. +# +# Usage: +# bash data-raw/study_area_run.sh \ +# --cy-workspaces=job1,job2 \ +# --focal= \ +# --focal= \ +# --focal= \ +# [--config=bcfishpass] [--keep-cyphers] +# +# The number of --focal flags MUST equal 1 (dispatcher) + N cyphers, in +# order: first --focal -> dispatcher, the rest -> cyphers in --cy-workspaces +# order. 
Put the LARGEST area on the dispatcher (first --focal): it is the +# fast, free local host, while cyphers are slower + paid — give them the +# smaller areas so they finish + burn sooner. Cyphers burn right after +# consolidate (minimise idle); a trap EXIT is the safety net. + +set -euo pipefail + +# --- args --- +CY_WS="" +CONFIG="bcfishpass" +KEEP_CYPHERS=0 +FOCAL_ARR=() +for arg in "$@"; do + case "$arg" in + --cy-workspaces=*) CY_WS="${arg#--cy-workspaces=}" ;; + --config=*) CONFIG="${arg#--config=}" ;; + --focal=*) FOCAL_ARR+=("${arg#--focal=}") ;; + --keep-cyphers) KEEP_CYPHERS=1 ;; + *) echo "unknown arg: $arg" >&2; exit 1 ;; + esac +done + +IFS=',' read -ra CY_WS_ARR <<< "$CY_WS" +[ -n "$CY_WS" ] || CY_WS_ARR=() +N_CY=${#CY_WS_ARR[@]} +N_FOCAL=${#FOCAL_ARR[@]} +EXPECT=$((N_CY + 1)) +if [ "$N_FOCAL" -ne "$EXPECT" ]; then + echo "FATAL: need exactly $EXPECT --focal flags (1 dispatcher + $N_CY cyphers); got $N_FOCAL" >&2 + exit 1 +fi + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +TS="$(date -u +%Y%m%d_%H%M%S)" +LOG_DIR="$REPO_ROOT/data-raw/logs/study_area_run" +mkdir -p "$LOG_DIR" +CYPHER_DIR="$HOME/Projects/repo/rtj/scripts/cypher" +CYPHER_TF="$HOME/Projects/repo/rtj/env/do/dev/cypher" +# Cyphers must run the SAME git ref as the dispatcher so they carry these +# driver scripts (wsg_run_one.R etc.) + a matching link install. cypher_prep +# reads CYPHER_PREP_BRANCH (default main, which lacks these scripts); pass the +# dispatcher's current branch. The branch MUST be pushed to origin first — +# cypher_prep does `git fetch origin && git reset --hard origin/$BRANCH`. +LINK_BRANCH="$(git -C "$REPO_ROOT" branch --show-current)" + +# Resolve persist schema (don't hardcode "fresh"). 
+SCHEMA=$(cd "$REPO_ROOT" && Rscript -e \ + 'cat(link::lnk_config(commandArgs(TRUE)[1])$pipeline$schema)' "$CONFIG" 2>/dev/null || true) +[ -n "$SCHEMA" ] || { echo "FATAL: could not resolve persist schema for --config=$CONFIG"; exit 1; } + +echo "=== study_area_run $TS ===" +echo " config: $CONFIG" +echo " branch: $LINK_BRANCH (cyphers run this ref)" +echo " persist: $SCHEMA" +echo " cyphers: ${CY_WS_ARR[*]:-} ($N_CY)" +echo " log dir: $LOG_DIR" + +# --- trap: burn cyphers on exit (safety net; explicit burn after consolidate) --- +CYPHERS_UP=0 +burn_cyphers() { + local rc=$? + if [ "$CYPHERS_UP" = "0" ]; then return $rc; fi + if [ "$KEEP_CYPHERS" = "1" ]; then + echo "=== trap EXIT: --keep-cyphers; NOT burning (${CY_WS_ARR[*]}) ===" + return $rc + fi + echo "=== BURN CYPHERS (trap EXIT) ===" + ( cd "$CYPHER_DIR" + for WS in "${CY_WS_ARR[@]}"; do + ./cypher_down.sh --workspace "$WS" > "$LOG_DIR/${TS}_burn_$WS.log" 2>&1 & + done + wait ) + local clean=1 + for WS in "${CY_WS_ARR[@]}"; do + local n + # `|| n="?"` so a tofu hiccup (pipefail) can't abort the verification + # loop when burn_cyphers runs via the EXIT trap (set -e active there). + n=$(cd "$CYPHER_TF" && TF_WORKSPACE="$WS" tofu state list 2>/dev/null | wc -l | tr -d ' ') || n="?" 
+ echo " cy[$WS]: $n tofu resources (expect 0)"; [ "$n" = "0" ] || clean=0 + done + if doctl compute droplet list --no-header 2>/dev/null | grep -qi cypher; then + echo " ✗ doctl still shows cypher droplets"; clean=0 + else echo " ✓ doctl: no cypher droplets"; fi + [ "$clean" = "1" ] && echo " ✓ burn clean" || echo " ✗ BURN INCOMPLETE — investigate" + CYPHERS_UP=0 + return $rc +} +trap burn_cyphers EXIT + +# --- pre-flight (tunnel-free) --- +echo "=== pre-flight ===" +fail=0 +pg_isready -h localhost -p 5432 >/dev/null 2>&1 || { echo " ✗ local fwapg down (:5432)"; fail=1; } +HAS_VW=$(PGPASSWORD=postgres psql -h localhost -p 5432 -U postgres -d fwapg -t -A -c \ + "SELECT 1 FROM information_schema.tables WHERE table_schema='$SCHEMA' AND table_name='streams_vw_bcfp'" 2>/dev/null || true) +[ "$HAS_VW" = "1" ] || { echo " ✗ $SCHEMA.streams_vw_bcfp missing (run snapshot_bcfp.sh --with-bcfp-views)"; fail=1; } +if [ "$N_CY" -gt 0 ]; then + doctl compute droplet list --no-header >/dev/null 2>&1 || { echo " ✗ doctl not authed"; fail=1; } + (cd "$CYPHER_TF" && tofu workspace list >/dev/null 2>&1) || { echo " ✗ tofu workspace list failed"; fail=1; } +fi +[ "$fail" = "0" ] || { echo "FATAL: pre-flight failed; aborting before spend"; exit 1; } +echo " ✓ pre-flight clean (tunnel-free)" + +# --- resolve drainage-closed DS-first buckets --- +echo "=== resolve drainage-closed DS-first buckets ===" +DISP_BUCKET=$(cd "$REPO_ROOT" && Rscript data-raw/study_area_wsgs.R "${FOCAL_ARR[0]}") +DISP_BUCKET=$(echo "$DISP_BUCKET" | tr -d '[:space:]') +echo " dispatcher (focal=${FOCAL_ARR[0]}): $DISP_BUCKET" +declare -A CY_BUCKET +for i in "${!CY_WS_ARR[@]}"; do + WS="${CY_WS_ARR[$i]}" + B=$(cd "$REPO_ROOT" && Rscript data-raw/study_area_wsgs.R "${FOCAL_ARR[$((i+1))]}") + CY_BUCKET[$WS]=$(echo "$B" | tr -d '[:space:]') + echo " cy[$WS] (focal=${FOCAL_ARR[$((i+1))]}): ${CY_BUCKET[$WS]}" +done + +# Non-fatal: warn if buckets overlap. 
A WSG in two hosts' closures is +# computed on both and consolidate is last-writer-wins. Harmless when focal +# sets are drainage-independent (Peace/Fraser/Skeena are distinct roots), but +# surface an accidental overlap so it's visible rather than silent. +DUP=$( { echo "$DISP_BUCKET" | tr ',' '\n' + for WS in "${CY_WS_ARR[@]}"; do echo "${CY_BUCKET[$WS]}" | tr ',' '\n'; done +} | grep -v '^$' | sort | uniq -d | paste -sd, - ) +[ -z "$DUP" ] || echo " WARN: buckets overlap on: $DUP (computed on multiple hosts; consolidate last-writer-wins)" + +# --- spin + prep cyphers --- +declare -A CY_IP +if [ "$N_CY" -gt 0 ]; then + echo "=== spin cyphers: ${CY_WS_ARR[*]} ===" + ( cd "$CYPHER_DIR" + for WS in "${CY_WS_ARR[@]}"; do + ./cypher_up.sh --workspace "$WS" > "$LOG_DIR/${TS}_up_$WS.log" 2>&1 & + done + wait ) + for WS in "${CY_WS_ARR[@]}"; do + IP=$(cd "$CYPHER_TF" && TF_WORKSPACE="$WS" tofu output -raw droplet_ip 2>/dev/null) \ + || { echo "FATAL: tofu droplet_ip failed for $WS"; exit 1; } + [ -n "$IP" ] || { echo "FATAL: empty droplet_ip for $WS"; exit 1; } + CY_IP[$WS]="$IP"; echo " cy[$WS] = $IP" + done + CYPHERS_UP=1 + + echo "=== prep cyphers (cypher_prep.sh) ===" + for WS in "${CY_WS_ARR[@]}"; do + IP="${CY_IP[$WS]}" + ( # Wait for the fresh droplet's sshd before scp — cypher_up returns as + # soon as the IP is assigned, often before SSH is up, which races scp + # into "Connection closed". Poll up to ~150s, accept the new host key. 
+ for _ in $(seq 1 30); do + ssh -o ConnectTimeout=5 -o BatchMode=yes -o StrictHostKeyChecking=accept-new \ + "cypher@$IP" 'true' 2>/dev/null && break + sleep 5 + done + scp -q "$REPO_ROOT/data-raw/cypher_prep.sh" "cypher@$IP:/tmp/cypher_prep.sh" \ + && ssh "cypher@$IP" "CYPHER_PREP_BRANCH='$LINK_BRANCH' bash /tmp/cypher_prep.sh" ) > "$LOG_DIR/${TS}_prep_$WS.log" 2>&1 & + done + wait + for WS in "${CY_WS_ARR[@]}"; do + grep -q "snapshot_bcfp.sh: complete" "$LOG_DIR/${TS}_prep_$WS.log" 2>/dev/null \ + || { echo "FATAL: cypher[$WS] prep failed; see $LOG_DIR/${TS}_prep_$WS.log"; exit 1; } + done + echo " ✓ cyphers prepped" +fi + +# --- run buckets DS-first (dispatcher local + cyphers, parallel) --- +# Per-WSG SOFT-FAIL (mirrors wsgs_run_host.R resume-safe behaviour): a single +# WSG error logs a warning and the loop CONTINUES. It must NEVER abort the host +# and trip the trap-burn before consolidate — that lost a whole run + the +# cyphers' data on 2026-05-25 (one species-less WSG -> exit 1 -> FATAL -> burn). +# Missing WSGs surface as gaps in the final compare, not as data loss. +echo "=== run buckets (DS-first) ===" +( cd "$REPO_ROOT" + for w in $(echo "$DISP_BUCKET" | tr ',' ' '); do + LNK_LOAD=loadall Rscript data-raw/wsg_run_one.R "$w" "$CONFIG" \ + || echo "[WARN] dispatcher WSG $w failed (continuing)" + done ) > "$LOG_DIR/${TS}_run_local.log" 2>&1 & +LOCAL_PID=$! +declare -A CY_PID +for WS in "${CY_WS_ARR[@]}"; do + IP="${CY_IP[$WS]}"; B_SPACE=$(echo "${CY_BUCKET[$WS]}" | tr ',' ' ') + ssh "cypher@$IP" "cd ~/Projects/repo/link && for w in $B_SPACE; do Rscript data-raw/wsg_run_one.R \$w '$CONFIG' || echo \"[WARN] cy WSG \$w failed\"; done" \ + > "$LOG_DIR/${TS}_run_$WS.log" 2>&1 & + CY_PID[$WS]=$! +done +# A non-zero host exit (e.g. ssh dropped) is logged, NOT fatal — we still +# consolidate whatever each host persisted so a late failure can't lose the +# other hosts' work. 
+wait $LOCAL_PID || echo " WARN: dispatcher run returned non-zero; see $LOG_DIR/${TS}_run_local.log" +for WS in "${CY_WS_ARR[@]}"; do + wait "${CY_PID[$WS]}" || echo " WARN: cy[$WS] run returned non-zero; see $LOG_DIR/${TS}_run_$WS.log" +done +echo " ✓ host runs finished (per-WSG soft-fail; gaps surface in compare)" + +# --- consolidate cyphers -> dispatcher --- +if [ "$N_CY" -gt 0 ]; then + echo "=== consolidate cyphers -> dispatcher ($SCHEMA) ===" + SRC_R="list(" + first=1 + for WS in "${CY_WS_ARR[@]}"; do + IP="${CY_IP[$WS]}" + bucket_r=$(echo "${CY_BUCKET[$WS]}" | tr ',' '\n' | grep -v '^$' | sed "s/.*/'&'/" | paste -sd, -) + [ "$first" = "1" ] || SRC_R="$SRC_R, " + SRC_R="$SRC_R list(host = 'cypher@$IP', via = 'docker', bucket = c($bucket_r))" + first=0 + done + SRC_R="$SRC_R)" + ( cd "$REPO_ROOT" && Rscript -e " +suppressPackageStartupMessages(pkgload::load_all(quiet = TRUE)) +source('data-raw/schema_consolidate.R') +res <- schema_consolidate(schema = '$SCHEMA', sources = $SRC_R, backup = TRUE) +print(res) +ok <- all(vapply(res\$sources, function(s) isTRUE(s\$ok), logical(1))) +quit(status = if (ok) 0 else 1) +" ) > "$LOG_DIR/${TS}_consolidate.log" 2>&1 \ + || { echo " ✗ consolidate failed; see $LOG_DIR/${TS}_consolidate.log"; exit 1; } + echo " ✓ consolidated (see $LOG_DIR/${TS}_consolidate.log)" +fi + +# --- burn cyphers now (work is consolidated; minimise idle) --- +burn_cyphers || true + +# WSG set across all hosts. +ALL_WSGS=$( { echo "$DISP_BUCKET" | tr ',' '\n' + for WS in "${CY_WS_ARR[@]}"; do echo "${CY_BUCKET[$WS]}" | tr ',' '\n'; done +} | grep -v '^$' | sort -u | paste -sd, - ) +COMPARE_CSV="$LOG_DIR/${TS}_compare.csv" + +# --- post-consolidate recompute: settle cross-WSG access (link#205) --- +# Drainage-closed + DS-first per-host is NOT sufficient: a WSG's downstream +# barriers can be cross-bucket or arrive late in DS-first order, so its access +# (hence token1/token2) is computed against an incomplete barrier set. 
+# Caught 2026-05-25: FINA 75% / PARA 69% per-host -> both 99% only after +# re-modelling on the full consolidated barrier set. The recompute is the +# correctness guarantee REGARDLESS of bucketing. We use lnk_access(merge=TRUE) +# — the cheap access-only recompute that reuses the persisted streams / +# habitat / barriers / barrier_overrides (link#205, ~10 s/WSG vs ~1.5 min for +# a full pipeline rebuild). Because it is cheap, we recompute ALL run WSGs +# unconditionally rather than threshold-filtering by parity — bucketing is +# now a speed knob, not a correctness lever. +echo "=== post-consolidate recompute (lnk_access, all WSGs) ===" +( cd "$REPO_ROOT" + for w in $(echo "$ALL_WSGS" | tr ',' ' '); do + LNK_LOAD=loadall Rscript data-raw/wsg_recompute_one.R "$w" "$CONFIG" \ + || echo "[WARN] recompute WSG $w failed (continuing)" + done ) > "$LOG_DIR/${TS}_recompute.log" 2>&1 +echo " ✓ recompute done" + +# --- compare (tunnel-free) -> CSV --- +echo "=== compare (tunnel-free) ===" +( cd "$REPO_ROOT" && LNK_LOAD=loadall Rscript data-raw/study_area_compare.R \ + "$COMPARE_CSV" "$ALL_WSGS" "$CONFIG" ) > "$LOG_DIR/${TS}_compare.log" 2>&1 \ + || { echo " ✗ compare failed; see $LOG_DIR/${TS}_compare.log"; exit 1; } +echo " ✓ compare CSV: $COMPARE_CSV" + +# --- report --- +echo "=== summary ===" +echo " run WSGs: $ALL_WSGS" +echo " compare CSV: $COMPARE_CSV" +tail -40 "$LOG_DIR/${TS}_compare.log" || true +echo "=== study_area_run done ===" diff --git a/data-raw/study_area_wsgs.R b/data-raw/study_area_wsgs.R new file mode 100755 index 0000000..eb1ae9c --- /dev/null +++ b/data-raw/study_area_wsgs.R @@ -0,0 +1,75 @@ +#!/usr/bin/env Rscript +# study_area_wsgs.R — given a set of FOCAL watershed groups, print the +# drainage-CLOSED, MODELABLE set in DOWNSTREAM-FIRST order (one comma line). +# +# Closure: every WSG whose outlet wscode_ltree is an ancestor of (== at or +# downstream of) any focal WSG's outlet — i.e. the WSGs a focal WSG's water +# drains through. 
DS-first: ordered by outlet ltree depth ascending, so the +# most-downstream WSGs come first. Running a host's bucket in this order +# persists downstream dam barriers before upstream WSGs compute access, which +# is what makes cross-WSG `;DAM` correct from the per-host run (no recompute). +# +# MODELABLE filter (link#157, mirrors data-raw/wsgs_run_host.R): drop closure +# WSGs with no bundle-species presence. lnk_pipeline_run errors hard ("No +# species resolved for AOI") on a species-less WSG (e.g. lower-mainstem groups +# pulled in by closure), which would abort the whole host run. bcfp doesn't +# model those WSGs either, so excluding them matches the proven methodology. +# +# Sources of truth: public.wsg_outlet (closure) + loaded$wsg_species_presence +# (modelable), both in fwapg / the bundle. +# +# Usage: [LNK_LOAD=loadall] Rscript study_area_wsgs.R [config] + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 1L || !nzchar(args[1])) { + stop("usage: study_area_wsgs.R [config]", call. = FALSE) +} +focal <- toupper(strsplit(args[1], ",")[[1]]) +focal <- focal[nzchar(focal)] +config <- if (length(args) >= 2L && nzchar(args[2])) args[2] else "bcfishpass" + +if (identical(Sys.getenv("LNK_LOAD"), "loadall")) { + suppressPackageStartupMessages(pkgload::load_all(quiet = TRUE)) +} else { + suppressPackageStartupMessages(library(link)) +} +suppressPackageStartupMessages({ + library(DBI); library(RPostgres) +}) +conn <- DBI::dbConnect(RPostgres::Postgres(), host = "localhost", port = 5432, + dbname = "fwapg", user = "postgres", password = "postgres") +on.exit(try(DBI::dbDisconnect(conn), silent = TRUE), add = TRUE) + +# 1. Drainage closure, DS-first. 
+focal_lit <- paste(DBI::dbQuoteLiteral(conn, focal), collapse = ", ") +q <- sprintf(" + SELECT DISTINCT w.wsg, nlevel(w.outlet) AS depth + FROM public.wsg_outlet w + JOIN public.wsg_outlet f ON f.wsg IN (%s) + WHERE f.outlet <@ w.outlet + ORDER BY depth ASC, w.wsg ASC", focal_lit) +res <- DBI::dbGetQuery(conn, q) +if (nrow(res) == 0L) { + stop("no closure found — are the focal WSGs present in public.wsg_outlet?", + call. = FALSE) +} + +# 2. Modelable filter (link#157): keep only WSGs with bundle-species presence. +cfg <- lnk_config(config) +loaded <- lnk_load_overrides(cfg) +spp_cols <- tolower(cfg$species) +wp <- loaded$wsg_species_presence +has_spp <- apply(wp[, spp_cols, drop = FALSE], 1, + function(r) any(r %in% c("t", "TRUE", TRUE))) +modelable <- wp$watershed_group_code[has_spp] + +keep <- res$wsg[res$wsg %in% modelable] # preserves DS-first order +dropped <- setdiff(res$wsg, keep) +if (length(dropped) > 0L) { + message(sprintf("[study_area_wsgs] dropped %d species-less closure WSG(s): %s", + length(dropped), paste(dropped, collapse = ","))) +} +if (length(keep) == 0L) { + stop("no modelable WSGs after species-presence filter", call. = FALSE) +} +cat(paste(keep, collapse = ","), "\n") diff --git a/data-raw/wsg_compare.R b/data-raw/wsg_compare.R index 64de942..9675c47 100644 --- a/data-raw/wsg_compare.R +++ b/data-raw/wsg_compare.R @@ -66,3 +66,33 @@ wsg_compare <- function(wsg, config, species = NULL, } rollup } + + +# Tunnel-free per-segment mapping_code parity (link#175). Diffs link's +# persisted .streams_mapping_code against the LOCAL bcfp +# snapshot fresh.streams_vw_bcfp (loaded by snapshot_bcfp.sh --with-bcfp-views +# from public S3). No bcfp tunnel, no PG_PASS_SHARE, no :63333 — a single +# local connection. This is the compare the orchestrator runs on the +# dispatcher after consolidate (cyphers just run + persist). +# +# Return: tibble wsg, species, total_segs, match_pct, n_diffs, top_pattern, +# top_pattern_count. 
+wsg_compare_mapping_code <- function(wsg, config, species = NULL, + reference = "bcfishpass") { + stopifnot( + is.character(wsg), length(wsg) == 1L, nzchar(wsg), + grepl("^[A-Z]{3,5}$", wsg), + inherits(config, "lnk_config"), + is.null(species) || is.character(species), + is.character(reference), length(reference) == 1L, nzchar(reference) + ) + + conn <- DBI::dbConnect(RPostgres::Postgres(), + host = "localhost", port = 5432, dbname = "fwapg", + user = "postgres", password = "postgres") + on.exit(try(DBI::dbDisconnect(conn), silent = TRUE), add = TRUE) + + link::lnk_compare_mapping_code( + conn = conn, aoi = wsg, cfg = config, + reference = reference, species = species) +} diff --git a/data-raw/wsg_recompute_one.R b/data-raw/wsg_recompute_one.R new file mode 100644 index 0000000..d8d8f2f --- /dev/null +++ b/data-raw/wsg_recompute_one.R @@ -0,0 +1,114 @@ +#!/usr/bin/env Rscript +# wsg_recompute_one.R — CHEAP post-consolidate recompute of access + +# mapping_code for ONE WSG, against PERSIST (link#205). Reuses the already- +# persisted streams / streams_habitat / barriers / barrier_overrides — does +# NOT re-run the full pipeline (no streams segmentation, no habitat classify). +# Run on the dispatcher AFTER consolidate to settle cross-WSG access/;DAM. +# Sibling of wsg_run_one.R (same LNK_LOAD + species-skip contract). +# +# lnk_access(merge=TRUE) -> surgically updates <schema>.streams_access +# (cross-WSG flags; preserves remediated + obs) +# lnk_mapping_code -> rebuilds mapping_code from the updated access, +# written into <schema>.streams_mapping_code via +# scratch + DELETE-WHERE-WSG + INSERT (JOIN streams +# for watershed_group_code; mirrors +# lnk_pipeline_persist). +# +# Usage: [LNK_LOAD=loadall] Rscript wsg_recompute_one.R <WSG> [config] + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 1L) stop("usage: wsg_recompute_one.R [config]", call.
= FALSE) +wsg <- toupper(args[1]) +config <- if (length(args) >= 2L && nzchar(args[2])) args[2] else "bcfishpass" + +if (identical(Sys.getenv("LNK_LOAD"), "loadall")) { + suppressPackageStartupMessages(pkgload::load_all(quiet = TRUE)) +} else { + suppressPackageStartupMessages(library(link)) +} +suppressPackageStartupMessages({ + library(DBI); library(RPostgres) +}) + +conn <- lnk_db_conn(dbname = "fwapg", host = "localhost", port = 5432L, + user = "postgres", password = "postgres") +on.exit(try(DBI::dbDisconnect(conn), silent = TRUE), add = TRUE) + +# Fail fast, never hang silently (link#205 / RUNBOOK.md §6): a runaway access +# walk cancels server-side (no orphaned backend to wedge later recomputes), and +# a DROP VIEW (lnk_barriers_views) blocked behind a zombie lock gives up rather +# than blocking forever. A clean error -> a completion/failure signal, not a +# silent hang that needs manual `pg_terminate_backend`. +DBI::dbExecute(conn, "SET statement_timeout = '600000'") # 10 min / statement +DBI::dbExecute(conn, "SET lock_timeout = '60000'") # 1 min on lock waits + +cfg <- lnk_config(config) +loaded <- lnk_load_overrides(cfg) + +active <- lnk_pipeline_species(cfg, loaded, wsg) +if (length(active) == 0L) { + cat(sprintf("[wsg_recompute_one] %s SKIP - no modeled species\n", wsg)) + quit(status = 0) +} +pres <- lnk_presence(loaded$wsg_species_presence, wsg) +sch <- cfg$pipeline$schema +t0 <- Sys.time() + +# 1. Surgically recompute streams_access (cross-WSG cols) in place. +lnk_access(conn, cfg, aoi = wsg, + table_streams = paste0(sch, ".streams"), + table_barriers = paste0(sch, ".barriers"), + table_to = paste0(sch, ".streams_access"), + merge = TRUE, presence = pres, species = active) + +# 2. Rebuild mapping_code from the updated access -> scratch -> persist. 
+sp_set <- tolower(active) +sp_resident <- union(intersect(sp_set, c("bt", "wct")), + setdiff(sp_set, c("bt", "wct", "ch", "cm", "co", "pk", "sk", "st"))) +sp_anadromous <- intersect(sp_set, c("ch", "cm", "co", "pk", "sk", "st")) +sp_spawn_only <- intersect(sp_set, c("cm", "pk")) + +mc_name <- paste0("zz_lnk_mc_scratch_", tolower(wsg)) +mc_scratch <- paste0(sch, ".", mc_name) +on.exit(try(DBI::dbExecute(conn, sprintf("DROP TABLE IF EXISTS %s", mc_scratch)), + silent = TRUE), add = TRUE) + +lnk_mapping_code(conn, + table_access = paste0(sch, ".streams_access"), + table_habitat = paste0(sch, ".streams_habitat_long_vw"), + table_streams = paste0(sch, ".streams"), + aoi = wsg, + table_to = mc_scratch, + presence = pres, + species_resident = sp_resident, + species_anadromous = sp_anadromous, + species_spawn_only = sp_spawn_only) + +# Persist write: DELETE-WHERE-WSG + INSERT (JOIN streams for WSG), mirroring +# lnk_pipeline_persist's mapping_code branch. Scratch has id_segment + +# mapping_code_ only; watershed_group_code comes from streams. +mc_cols <- DBI::dbGetQuery(conn, sprintf( + "SELECT column_name FROM information_schema.columns + WHERE table_schema = %s AND table_name = %s AND column_name <> 'id_segment' + ORDER BY ordinal_position", + DBI::dbQuoteString(conn, sch), DBI::dbQuoteString(conn, mc_name)))$column_name +ins_cols <- paste(c("id_segment", "watershed_group_code", mc_cols), collapse = ", ") +sel_cols <- paste(c("m.id_segment", "s.watershed_group_code", + paste0("m.", mc_cols)), collapse = ", ") +wsg_lit <- DBI::dbQuoteLiteral(conn, wsg) +# Atomic DELETE+INSERT — without the transaction a failed INSERT (e.g. the #203 +# cartesian PK violation that caused FINA's mc data loss on 2026-05-25) would +# leave the WSG's rows deleted but not re-inserted. Wrap so any failure rolls +# back the DELETE. See soul/conventions/code-check.md Docker/Postgres. 
+DBI::dbWithTransaction(conn, { + DBI::dbExecute(conn, sprintf( + "DELETE FROM %s.streams_mapping_code WHERE watershed_group_code = %s", sch, wsg_lit)) + DBI::dbExecute(conn, sprintf( + "INSERT INTO %s.streams_mapping_code (%s) + SELECT %s FROM %s m JOIN %s.streams s USING (id_segment) + WHERE s.watershed_group_code = %s", + sch, ins_cols, sel_cols, mc_scratch, sch, wsg_lit)) +}) + +cat(sprintf("[wsg_recompute_one] %s recomputed in %.2f min (persist=%s)\n", + wsg, as.numeric(difftime(Sys.time(), t0, units = "mins")), sch)) diff --git a/data-raw/wsg_run_one.R b/data-raw/wsg_run_one.R new file mode 100755 index 0000000..0adc8a1 --- /dev/null +++ b/data-raw/wsg_run_one.R @@ -0,0 +1,55 @@ +#!/usr/bin/env Rscript +# wsg_run_one.R — run link's modelling + mapping_code pipeline for ONE WSG +# against the LOCAL fwapg (localhost:5432), persisting streams / +# streams_habitat_ / barriers / barrier_overrides / streams_access / +# streams_mapping_code into the bundle's persist schema (cfg$pipeline$schema). +# +# Tunnel-free and host-agnostic: byte-identical invocation on the dispatcher +# and on every cypher. This is the atomic unit of the study-area run +# (data-raw/study_area_run.sh). Run the WSGs of a drainage DS-first (most- +# downstream first) so a WSG's downstream dam barriers are already persisted +# when its access / mapping_code is computed — that surfaces same-bucket +# `;DAM` per-host; cross-bucket dams still need the recompute (link#175). +# +# Usage: [LNK_LOAD=loadall] Rscript wsg_run_one.R <WSG> [config] +# LNK_LOAD=loadall -> pkgload::load_all() (dispatcher dev checkout) +# default -> library(link) (pak-installed, e.g. cyphers) + +args <- commandArgs(trailingOnly = TRUE) +if (length(args) < 1L) stop("usage: wsg_run_one.R [config]", call.
= FALSE) +wsg <- toupper(args[1]) +config <- if (length(args) >= 2L && nzchar(args[2])) args[2] else "bcfishpass" + +if (identical(Sys.getenv("LNK_LOAD"), "loadall")) { + suppressPackageStartupMessages(pkgload::load_all(quiet = TRUE)) +} else { + suppressPackageStartupMessages(library(link)) +} +suppressPackageStartupMessages({ + library(DBI); library(RPostgres) +}) + +conn <- lnk_db_conn(dbname = "fwapg", host = "localhost", port = 5432L, + user = "postgres", password = "postgres") +on.exit(try(DBI::dbDisconnect(conn), silent = TRUE), add = TRUE) + +cfg <- lnk_config(config) +loaded <- lnk_load_overrides(cfg) + +# Defensive skip (link#157): a WSG with no bundle-species presence can't be +# modelled — lnk_pipeline_run errors "No species resolved for AOI". The +# study-area closure is already species-filtered (study_area_wsgs.R), so this +# is belt-and-suspenders: skip cleanly (exit 0) rather than fail the host run. +active <- lnk_pipeline_species(cfg, loaded, wsg) +if (length(active) == 0L) { + cat(sprintf("[wsg_run_one] %s SKIP — no modeled species in this AOI\n", wsg)) + quit(status = 0) +} + +t0 <- Sys.time() +lnk_pipeline_run(conn, aoi = wsg, cfg = cfg, loaded = loaded, + schema = paste0("working_", tolower(wsg)), + mapping_code = TRUE, cleanup_working = FALSE) +cat(sprintf("[wsg_run_one] %s done in %.1f min (persist=%s)\n", + wsg, as.numeric(difftime(Sys.time(), t0, units = "mins")), + cfg$pipeline$schema)) diff --git a/man/lnk_access.Rd b/man/lnk_access.Rd new file mode 100644 index 0000000..c4f81a0 --- /dev/null +++ b/man/lnk_access.Rd @@ -0,0 +1,133 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lnk_access.R +\name{lnk_access} +\alias{lnk_access} +\title{Build per-segment per-species access from schema tables (portable)} +\usage{ +lnk_access( + conn, + cfg, + aoi, + table_streams, + table_barriers, + table_to, + merge = FALSE, + presence = NULL, + species = NULL +) +} +\arguments{ +\item{conn}{A 
\link[DBI:DBIConnection-class]{DBI::DBIConnection} to the local pipeline DB.} + +\item{cfg}{An \code{lnk_config} object.} + +\item{aoi}{Character. Watershed group code (e.g. \code{"PARS"}).} + +\item{table_streams}{Character. Schema-qualified \code{streams} table (the +segments).} + +\item{table_barriers}{Character. Schema-qualified unified \code{barriers} +table. The per-species \verb{_access} + source \verb{_unified} views are built +over it internally via \code{\link[=lnk_barriers_views]{lnk_barriers_views()}}.} + +\item{table_to}{Character. Schema-qualified destination \code{streams_access} +table. With \code{merge = TRUE} it must already exist (rows for \code{aoi} are +UPDATEd in place).} + +\item{merge}{Logical. \code{FALSE} (default) overwrites \code{table_to}. \code{TRUE} +surgically UPDATEs \code{table_to}'s \code{aoi} rows (recompute; see Merge mode).} + +\item{presence}{An \code{lnk_presence} object or \code{NULL}. Per-species presence +for \code{aoi}; pass-through to \code{\link[=lnk_pipeline_access]{lnk_pipeline_access()}}.} + +\item{species}{Character vector of species codes. Default \code{cfg$species}.} +} +\value{ +\code{conn} invisibly. +} +\description{ +Schema-aware portable wrapper around \code{\link[=lnk_pipeline_access]{lnk_pipeline_access()}} — the +access twin of \code{\link[=lnk_mapping_code]{lnk_mapping_code()}}. Builds the per-species +\verb{barriers__access} + per-source views internally (via +\code{\link[=lnk_barriers_views]{lnk_barriers_views()}}) over \code{table_barriers}, then computes the wide +\code{streams_access} shape for \code{aoi} and writes it to \code{table_to}. +} +\details{ +Works against working-schema tables (mid-pipeline) or persist-schema +tables (ad-hoc / post-consolidate recompute) without modification — the +caller passes explicit \verb{table_} names. 
The caller passes ONE +\code{table_barriers} (the unified \code{barriers} table); the per-species access +set and the source-typed views are derived from it internally, so no +pre-built \code{barriers_per_sp} list is needed (that stays the lower-level +\code{\link[=lnk_pipeline_access]{lnk_pipeline_access()}} surface). +} +\section{Merge (recompute) mode}{ + +\code{merge = TRUE} is the \strong{post-consolidate recompute} (link#205). A WSG's +accessibility depends on barriers \emph{downstream}, possibly in another WSG +(the provincial-accumulation property, RUNBOOK.md §5); when WSGs are +modelled on separate hosts each sees only its own barriers, so the +per-host \code{streams_access} can be wrong cross-WSG. Once all barriers are +consolidated, \code{merge = TRUE} re-settles ONLY the cross-WSG columns +(\verb{has_barriers__dnstr}, \verb{has_barriers_\{anthropogenic,pscis,dams\}_dnstr}, +\code{dam_dnstr_ind}) against the complete \code{table_barriers}, reusing the +already-persisted \code{streams} + \code{streams_habitat} — far cheaper than a full +\code{\link[=lnk_pipeline_run]{lnk_pipeline_run()}} (which re-derives streams + habitat). It UPDATEs the +existing \code{table_to} rows for \code{aoi} and \strong{preserves} the within-WSG columns +the recompute does not touch: +\itemize{ +\item \code{remediated_dnstr_ind} (and \code{has_barriers_remediations_dnstr}) — depend +on the working-schema \code{crossings}/remediations, correct from the prior +compute and within-WSG in practice. +\item the observed-upstream distinction in \verb{access_}: set to \code{0} when newly +blocked, else kept at \code{2} where the prior compute had an observation, else +\code{1}. +} + +\code{observations}/\code{crossings} are intentionally skipped (\code{NULL}): they only +drive the access 1-vs-2 code + \code{remediated_dnstr_ind} (both preserved +above); mapping_code's \verb{accessible = !has_barriers__dnstr} is +independent of them. 
+ +\code{merge = FALSE} (default) overwrites \code{table_to} via +\code{\link[=lnk_pipeline_access]{lnk_pipeline_access()}} — first-compute, intended for a working / scratch +table (it drops + recreates the target as a flat \code{id_segment}-keyed table, +so do NOT point it at a persist table; use \code{merge = TRUE} for persist). +} + +\examples{ +\dontrun{ +conn <- lnk_db_conn() +cfg <- lnk_config("bcfishpass") +loaded <- lnk_load_overrides(cfg) +pres <- lnk_presence(loaded$wsg_species_presence, "PARS") + +# Post-consolidate recompute against persist (cheap; cross-WSG correct): +lnk_access( + conn, cfg, aoi = "PARS", + table_streams = "fresh.streams", + table_barriers = "fresh.barriers", + table_to = "fresh.streams_access", + merge = TRUE, presence = pres) +lnk_mapping_code( + conn, + table_access = "fresh.streams_access", + table_habitat = "fresh.streams_habitat_long_vw", + table_streams = "fresh.streams", + aoi = "PARS", + table_to = "fresh.streams_mapping_code", + presence = pres) +} + +} +\seealso{ +\code{\link[=lnk_mapping_code]{lnk_mapping_code()}}, \code{\link[=lnk_pipeline_access]{lnk_pipeline_access()}}, \code{\link[=lnk_barriers_views]{lnk_barriers_views()}} + +Other compare: +\code{\link{lnk_compare_mapping_code}()}, +\code{\link{lnk_compare_rollup}()}, +\code{\link{lnk_compare_wsg}()}, +\code{\link{lnk_mapping_code}()}, +\code{\link{lnk_parity_annotate}()} +} +\concept{compare} diff --git a/man/lnk_compare_mapping_code.Rd b/man/lnk_compare_mapping_code.Rd new file mode 100644 index 0000000..7e64f18 --- /dev/null +++ b/man/lnk_compare_mapping_code.Rd @@ -0,0 +1,108 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/lnk_compare_mapping_code.R +\name{lnk_compare_mapping_code} +\alias{lnk_compare_mapping_code} +\title{Compare one watershed group's persisted mapping_code tokens against a reference} +\usage{ +lnk_compare_mapping_code( + conn, + aoi, + cfg, + reference = "bcfishpass", + conn_ref = NULL, + species = NULL, + 
ref_table = "fresh.streams_vw_bcfp" +) +} +\arguments{ +\item{conn}{DBI connection to the local pipeline database (where +\verb{} and \code{fresh.streams_vw_bcfp} live).} + +\item{aoi}{Watershed group code (e.g. \code{"PARS"}).} + +\item{cfg}{An \code{lnk_config} object (resolves \code{cfg$pipeline$schema}).} + +\item{reference}{Character scalar identifying the reference. Only +\code{"bcfishpass"} is supported.} + +\item{conn_ref}{Optional DBI connection to the bcfp tunnel +(\code{localhost:63333}). Default \code{NULL} → tunnel-free local-snapshot compare.} + +\item{species}{Optional character vector of species codes to restrict to. +Default \code{NULL} discovers the set from the mapping_code columns.} + +\item{ref_table}{Reference table name for the tunnel-free path. Default +\code{"fresh.streams_vw_bcfp"} (where \code{snapshot_bcfp.sh} loads bcfp's output).} +} +\value{ +A tibble, one row per species: \code{wsg}, \code{species}, \code{total_segs}, +\code{match_pct}, \code{n_diffs}, \code{top_pattern} (most common \code{link | bcfp} token +mismatch), \code{top_pattern_count}. +} +\description{ +Segment-level QA counterpart to \code{\link[=lnk_compare_rollup]{lnk_compare_rollup()}}. Reads the +per-segment \verb{mapping_code_} tokens that \code{\link[=lnk_pipeline_run]{lnk_pipeline_run()}} (with +\code{mapping_code = TRUE}) persisted to \verb{.streams_mapping_code}, +diffs them against a reference's tokens for the same segments, and returns +a per-species match tibble. +} +\details{ +Reads only — no writes, no working schema. +\subsection{Tunnel-free by default}{ + +The reference is the \strong{local} snapshot \code{fresh.streams_vw_bcfp} (loaded by +\code{data-raw/snapshot_bcfp.sh --with-bcfp-views} from bcfp's published S3 +output — no SSH, no \verb{:63333}). With \code{conn_ref = NULL} (default) the compare +is a single local join on \code{conn}: no second connection, no \code{PG_PASS_SHARE}, +no tunnel. 
Pass \code{conn_ref} (a DBI connection to the live bcfp tunnel) to +diff against \code{bcfishpass.streams_mapping_code} instead — the legacy path, +kept for back-compat. +} + +\subsection{Join}{ + +link's \code{streams_mapping_code.id_segment} is a local surrogate, distinct from +bcfp's \code{segmented_stream_id}, so the join is on FWA segment-start position: +\code{blue_line_key} + \code{downstream_route_measure} (rounded to 3 decimals — robust +to ULP drift on the PostGIS-computed doubles, deterministic across runs that +share the same fwapg segmentation). link's position columns come from +\verb{.streams}, joined on the full PK +\verb{(id_segment, watershed_group_code)} — \code{id_segment} alone is not unique +across WSGs. The snapshot view carries the position columns inline. +} + +\subsection{Species resolution}{ + +\code{species = NULL} (default) compares every species present as a +\verb{mapping_code_} column on BOTH sides (link's persisted table and the +reference), with rows for the WSG. Pass \code{species} to restrict; caller-passed +species absent on either side drop out (no error). +} +} +\examples{ +\dontrun{ +conn <- lnk_db_conn() +cfg <- lnk_config("bcfishpass") + +# Tunnel-free: diff persisted tokens vs the local fresh.streams_vw_bcfp snapshot. 
+lnk_compare_mapping_code(conn, aoi = "PARS", cfg = cfg) + +# Legacy tunnel path (requires the bcfp tunnel up): +conn_ref <- DBI::dbConnect(RPostgres::Postgres(), + host = "localhost", port = 63333, dbname = "bcfishpass", + user = "newgraph", password = Sys.getenv("PG_PASS_SHARE")) +lnk_compare_mapping_code(conn, "PARS", cfg, conn_ref = conn_ref) +} + +} +\seealso{ +\code{\link[=lnk_compare_rollup]{lnk_compare_rollup()}}, \code{\link[=lnk_compare_wsg]{lnk_compare_wsg()}}, \code{\link[=lnk_pipeline_run]{lnk_pipeline_run()}} + +Other compare: +\code{\link{lnk_access}()}, +\code{\link{lnk_compare_rollup}()}, +\code{\link{lnk_compare_wsg}()}, +\code{\link{lnk_mapping_code}()}, +\code{\link{lnk_parity_annotate}()} +} +\concept{compare} diff --git a/man/lnk_compare_rollup.Rd b/man/lnk_compare_rollup.Rd index 924fd33..6e4bec2 100644 --- a/man/lnk_compare_rollup.Rd +++ b/man/lnk_compare_rollup.Rd @@ -90,6 +90,8 @@ print(rollup) \code{\link[=lnk_parity_annotate]{lnk_parity_annotate()}} Other compare: +\code{\link{lnk_access}()}, +\code{\link{lnk_compare_mapping_code}()}, \code{\link{lnk_compare_wsg}()}, \code{\link{lnk_mapping_code}()}, \code{\link{lnk_parity_annotate}()} diff --git a/man/lnk_compare_wsg.Rd b/man/lnk_compare_wsg.Rd index ea95b7d..1a05626 100644 --- a/man/lnk_compare_wsg.Rd +++ b/man/lnk_compare_wsg.Rd @@ -148,6 +148,8 @@ print(result_mc$mapping_code) \code{\link[=lnk_parity_annotate]{lnk_parity_annotate()}} Other compare: +\code{\link{lnk_access}()}, +\code{\link{lnk_compare_mapping_code}()}, \code{\link{lnk_compare_rollup}()}, \code{\link{lnk_mapping_code}()}, \code{\link{lnk_parity_annotate}()} diff --git a/man/lnk_mapping_code.Rd b/man/lnk_mapping_code.Rd index 62314eb..6accf15 100644 --- a/man/lnk_mapping_code.Rd +++ b/man/lnk_mapping_code.Rd @@ -88,7 +88,7 @@ This function replaces the inline assembly previously buried inside function via \code{lnk_pipeline_run(..., mapping_code = TRUE)}. 
Operators can also call this directly against persist schema with the tunnel down — the build is tunnel-independent (the diff vs reference is -separate, see \code{.lnk_compare_wsg_mapping_code_diff}). +separate, see \code{\link[=lnk_compare_mapping_code]{lnk_compare_mapping_code()}}). Tracks link#187 (tunnel decouple + portable build). } @@ -121,6 +121,8 @@ lnk_mapping_code( } \seealso{ Other compare: +\code{\link{lnk_access}()}, +\code{\link{lnk_compare_mapping_code}()}, \code{\link{lnk_compare_rollup}()}, \code{\link{lnk_compare_wsg}()}, \code{\link{lnk_parity_annotate}()} diff --git a/man/lnk_parity_annotate.Rd b/man/lnk_parity_annotate.Rd index 4044905..6e088b0 100644 --- a/man/lnk_parity_annotate.Rd +++ b/man/lnk_parity_annotate.Rd @@ -60,6 +60,8 @@ stopifnot(nrow(unexplained) == 0L) \code{\link[=lnk_compare_wsg]{lnk_compare_wsg()}} Other compare: +\code{\link{lnk_access}()}, +\code{\link{lnk_compare_mapping_code}()}, \code{\link{lnk_compare_rollup}()}, \code{\link{lnk_compare_wsg}()}, \code{\link{lnk_mapping_code}()} diff --git a/planning/active/findings.md b/planning/active/findings.md index 15496d9..b238a59 100644 --- a/planning/active/findings.md +++ b/planning/active/findings.md @@ -1,84 +1,45 @@ -# Findings — mapping_code accessibility, reproduce bcfp `barriers_` (#200) +# Findings — tunnel-free `lnk_compare_mapping_code` + orchestrator (#175) ## Issue context -link's per-species mapping_code accessibility (`barriers_per_sp` → `accessible`) uses `barriers__unified` = ALL barriers (incl dams/PSCIS/modelled) where the species is in `blocks_species`. bcfp's per-species access set is natural barriers only (gradient@species-threshold ∪ falls ∪ subsurface), MINUS upstream observation/habitat overrides, ∪ all `user_barriers_definite`. Dams are never in the access set — token2 descriptor only. 
Consequence: every segment below a dam reads inaccessible → `;DAM`/`;MODELLED`/`;ASSESSED` second token suppressed (token2 correctly gated on `accessible`) → link emits bare `SPAWN`/`REAR` where bcfp emits `SPAWN;DAM`. +#175 (updated 2026-05-24): promote `with_mapping_code` flag → stand-alone `lnk_compare_mapping_code()` export, sibling to `lnk_compare_rollup`. Post-#200 refinement: make the reference the **local snapshot** `fresh.streams_vw_bcfp`, not the `:63333` tunnel. Also (folded in): the provincial orchestrator's tunnel-free + M1-dispatch + post-consolidate cross-WSG recompute. Supersedes #167 (tunnel-drops → autossh; tunnel-free obviates it). -## bcfp mechanism (read authoritatively from `smnorris/bcfishpass@e12c1a5`, 2026-05-23) +## Mechanism (mapped this session) -All 5 per-species access models (`model/01_access/sql/model_access_{bt,ch_cm_co_pk_sk,wct,ct_dv_rb,st}.sql`) share one structure: -``` -barriers_ = ( gradient@species-classes ∪ falls ∪ subsurface ) - MINUS (barriers with upstream observation OR confirmed habitat, in the species' obs-set) - ∪ ALL barriers_user_definite -- override-EXEMPT (comment: "include *all* user added features, even those below observations") -``` -Per-species axes (the only differences across models): +- **Existing compare** (`R/lnk_compare_wsg.R`): `.lnk_compare_wsg_mapping_code_diff(conn, conn_ref, …)` diffs link's `.streams_mapping_code` vs `bcfishpass.streams_mapping_code` over the **tunnel** (`conn_ref`, `:63333`), joined on `segmented_stream_id`. Returns per-species `wsg, species, total_segs, match_pct, n_diffs`. `lnk_compare_rollup` is the km-rollup sibling (also tunnel, queries `bcfishpass.habitat_linear_`). +- **Tunnel-free swap**: the snapshot (`snapshot_bcfp.sh --with-bcfp-views`) already loads bcfp's published streams output into local `fresh.streams_vw_bcfp` (province-wide, has `mapping_code_` + `blue_line_key` + `downstream_route_measure`). So the compare = local join, same DB, no `conn_ref`. 
+- **Join key**: link's `.streams_mapping_code.id_segment` is a local surrogate (≠ bcfp `segmented_stream_id`). Join via `.streams` to get `blue_line_key` + `downstream_route_measure`, then match `fresh.streams_vw_bcfp` on `(blue_line_key, round(downstream_route_measure,1))`. This is the validated query: PARS BT 98.95%, LFRA BT 97.77% / CO 97.90%. +- **Reference build verified**: `s3://fresh-bc/bcfishpass/log.json` → `v0.7.15-14-ge12c1a5` (2026-05-20 rebuild); our snapshot matches. Next rebuild Tue 2026-05-27. -| Model | Gradient classes | Obs/habitat species | -|---|---|---| -| BT | 25, 30 | BT,CH,CM,CO,PK,SK,ST | -| salmon (CH,CM,CO,PK,SK) | 15, 20, 25, 30 | CH,CM,CO,PK,SK | -| ST | 20, 25, 30 | CH,CM,CO,PK,SK,ST | -| WCT | 20, 25, 30 | (wct) | -| CT,DV,RB | 25, 30 | BT,DV,CT,RB | +## Orchestrator gaps (predates #200 + tunnel-free) -- **`barriers_user_definite.sql`** materializes the definite table with a synthesized deterministic id + ltree resolved by joining the raw user CSV to `whse_basemapping.fwa_stream_networks_sp` (segment whose `[downstream_route_measure, upstream_route_measure)` contains the barrier). link's existing FALLS branch in `lnk_barriers_unify.R:221-242` does the **identical** join. -- **`load_streams_access.sql`** — `barriers__dnstr` (per-species, natural+definite) is separate from `barriers_anthropogenic_dnstr`/`barriers_dams_dnstr` (descriptors). `access_` = 0 if a downstream barrier exists, else 1/2 (obs-aware). token2 gate (`load_streams_mapping_code.sql`) = `barriers__dnstr = array[]` — identical to link's `ifelse(accessible, mc_barrier, NA)`. -- **Province-wide accumulation**: each `barriers_` is per-WSG-built but accumulated into one province-wide table, so cross-WSG downstream walks (PARS→PCEA→UPCE) see the correct override-applied set. +- `wsgs_run_pipeline.sh` pre-flight hard-requires `:63333` + `PG_PASS_SHARE` (lines ~179-181). M4-centric: hardcoded `ssh m1` (229/280/284), "snapshot on M4+M1", LPT host model `m4/m1/cy`. 
+- `wsg_compare.R` → `lnk_compare_rollup(reference="bcfishpass")` connects `conn_ref` to the tunnel (lines ~44-46). Each host runs compare per-WSG → each needs the tunnel. +- **Cross-WSG `;DAM` gap**: `wsgs_run_host.R` computes mapping_code per-WSG against only the host's local bucket barriers, **before** consolidate → cross-WSG dams in other buckets invisible. No post-consolidate recompute. Fix = Step 9b recompute on the merged schema (the two-pass), then one tunnel-free compare. Simplification: cyphers run+persist only (no compare, no tunnel); dispatcher recomputes + compares once. +- `cypher_prep.sh` installs link from `main` by default → cyphers get v0.40.4 automatically now that #200 is merged. ✓ (no branch-push needed). -## link mapping (every ingredient already exists) +## Study areas (the validation scope) -| bcfp ingredient | link object | state | -|---|---|---| -| gradient@species-threshold | `access_gradient_max` → `blocks_species` (gradient CASE in `lnk_barriers_unify`) | ✓ correct | -| obs/habitat override | `lnk_barrier_overrides` → `.barrier_overrides` (uses `fwa_upstream`, topological/cross-WSG) | ✓ computed; **per-WSG only, not persisted** | -| user_definite | `.barriers_definite` (`lnk_pipeline_prepare.R:182-200`; CSV cols, no id/ltree; empty-fallback = blk+drm only) | **per-WSG only, not in persist barriers** | -| natural barriers | persist `barriers` (gradient/falls/subsurface families) | ✓ province-wide | +From the `fish_passage_*_reporting` repos' `wsg_code`/`wsg` params: +- **Peace** (`fish_passage_peace_2025_reporting` index.Rmd): CARP, CRKD, FINA, FINL, FIRE, FOXR, INGR, LOMI, MESI, NATR, OSPK, PARA, PARS, PCEA, TOOD, UOMI (16) +- **Fraser** (`fish_passage_fraser_2025_reporting` index.Rmd): LCHL, NECR, FRAN, MORK, UFRA, WILL, TABR, LSAL (8) +- **Skeena** (`fish_passage_skeena_2024_reporting` `0160-load-bcfishpass-data.R`): BULK, MORR, ZYMO, KISP, KLUM (5) +- 29 focal → **52 with downstream-closure** (LFRA, MFRA, UPCE, LPCE, LBTN, LSKE, 
USKE, MSKE, …). Closure + DS-first order derivable from `public.wsg_outlet` (per-WSG outlet `wscode_ltree`, materialized this session) via `@>` ancestry. Major drainages by root wscode: Fraser `100` (68), Peace `200` (65), Columbia `300`/ELKR (17), Skeena `400` (12). -`lnk_barrier_overrides` output is `(blue_line_key, downstream_route_measure, species_code)` and currently feeds only `lnk_pipeline_classify` (habitat), NOT the access path. +## Cypher capability (proven 2026-05-24) -## The design decision (why province-wide, not a per-WSG view) +M1 fired up 3 cyphers in parallel (`cypher_up.sh --workspace job1/2/3`, ~3 min each from warm snapshot `228350154`), verified ready, burned clean (`cypher_down.sh`, 0 tofu resources). DO auth + Tailscale confirmed. `wsgs_run_pipeline.sh` is the all-in-one (spin→prep→dispatch→consolidate→compare→burn via `trap EXIT`). -The access set is a downstream `frs_network_features` walk that **crosses WSG boundaries**. A per-WSG `_access` view (subtract only the current WSG's overrides from province-wide natural barriers) is quietly wrong for any natural barrier in a downstream/sibling WSG — the cross-WSG twin of the dam bug. Rejected. Correct design: **persist all three access inputs province-wide** (natural ✓, override → new persist table, user_definite → `USER_DEFINITE` persist family), persisted **together per WSG** so any persisted WSG is internally consistent. Caveat (single-WSG run sees only persisted WSGs) is identical to today's natural barriers and bcfp's accumulation — handled by the provincial orchestrator. +## Phase 1 done + id_segment bug (2026-05-24) -Approach A (definite as unify family) + persist overrides was chosen over the issue-draft's per-WSG view-union (B') after a Plan-agent review and the user's explicit "make it provincial, don't ship a 2/200 one-off." `barriers_definite` lacks id+ltree → resolved via the FALLS-pattern FWA join. No `cols_barriers` DDL change (USER_DEFINITE is a new row-source, same columns). 
`barrier_overrides` persist uses a single `cols_barrier_overrides` vector for DDL+INSERT (avoids the v0.40.3 matched-pair drift). +`lnk_compare_mapping_code()` built tunnel-free (reads local `fresh.streams_vw_bcfp`); `.lnk_compare_wsg_mapping_code_diff` delegates; shared merge in `.lnk_mc_diff`. **Live PARS BT 98.95% tunnel-free** (reproduced the hand-validation). WSG-active species resolution added (PARS → BT only; CO empty in upper Peace, correctly excluded — avoids spurious 0%). -## Verified facts (this session) +**id_segment is NOT globally unique** (per-WSG row index): `fresh.streams` = 1,542,427 rows but only **80,555 distinct id_segment** (~19× repeat across WSGs; unique only on the persist PK `(id_segment, watershed_group_code)`). Any persist join on `id_segment` ALONE is a ~19-22× cartesian. Found two: `lnk_compare_mapping_code` (fixed in build) and **`lnk_compare_rollup`** (3 joins — was inflating km ~22×: PARS BT spawning_km 36,820 → 1,681; **tactically fixed to full-PK joins this phase**). Safe: `lnk_compare_wsg`/`lnk_pipeline_persist` (working schema = single WSG). -- Persist pattern is `cols_*`-vector-driven; `cols_barriers` drives both DDL (`lnk_persist_init`) and INSERT (`lnk_pipeline_persist.R:94,99`). -- break (`lnk_pipeline_break.R:112`) + classify (`lnk_pipeline_classify.R:223`) read `.barriers_definite` separately → adding `USER_DEFINITE` to persist `barriers` does NOT double-count. -- `barriers_anthropogenic/dams/pscis_unified` filter by `barrier_source` → `USER_DEFINITE` doesn't pollute them. `lnk_compare_*` don't read `_unified`. -- `frs_network_features` (fresh) needs `feature_id_col, blue_line_key, downstream_route_measure, wscode_ltree, localcode_ltree` on the feature table. The `phase4d_plan_draft.md:37` draft wrongly dropped the ltree cols. -- `whse_basemapping.fwa_stream_networks_sp` present in local fwapg; `fresh.streams_vw_bcfp` loaded (4.23M rows, PARS 43,660, carries `mapping_code_bt`) — tunnel-free parity baseline. 
bcfp baseline `v0.7.15-14-ge12c1a5`. - -## Phase 4 validation — PARS BT (2026-05-23) - -Run `lnk_pipeline_run("PARS", mapping_code=TRUE)` against local docker fwapg (bcfp baseline `v0.7.15-14-ge12c1a5` in `fresh.streams_vw_bcfp`). - -**Result: 99.04% per-segment match vs bcfp** (42,701 / 43,114 joined on `blue_line_key` + rounded `downstream_route_measure`). The headline #200 fix works: -- token1 collapse GONE — `ACCESS`/`SPAWN`/`REAR` emit (was bare `SPAWN`/`REAR`). Counts ≈ bcfp (`SPAWN;DAM` 5293 vs 5263, `REAR;DAM` 2213 vs 2191). -- token2 `;DAM` emerges — dam-downstream-but-accessible segments now annotate `;DAM` not `;NONE`. - -**Cross-WSG dependency confirmed (the provincial design's whole point):** the FIRST PARS run (PARS only) emitted `;NONE` because the Bennett/Peace Canyon dams live in PCEA/UPCE, which weren't persisted. After persisting PCEA + UPCE barriers (`mapping_code=FALSE`) and re-running PARS, the cross-WSG downstream walk saw the dams → `;DAM`. token2/`barrier_sources` is unchanged by #200; it just needs the downstream WSGs in persist. - -Residual ~1% (413 segs): token1 `ACCESS`↔`REAR` swaps (habitat-presence threshold — dimensions/rules, not #200) + a few token2 `DAM`↔`MODELLED` next-downstream-ordering edges. Not the dam-access divergence. - -### Phase 4 validation — LFRA (anadromous; Coquitlam/Alouette/Stave/Ruskin dams) - -`lnk_pipeline_run("LFRA", mapping_code=TRUE)`. Match vs bcfp: **LFRA/bt 97.77%, LFRA/co 97.90%** (26,651 segs each). LFRA coho DAM-token count link **4672 vs bcfp 4636** — the dam descriptor + above-dam path works for anadromous salmon, not just resident BT. LFRA drains to the ocean (lowest Fraser group) so its dams are in-WSG — single run, no cross-WSG persist needed (unlike PARS→PCEA/UPCE). 
- -Residual ~2%: token1 `ACCESS`↔`SPAWN`/`REAR` (spawning/rearing **habitat-presence** — token1 habitat fires regardless of access per RUNBOOK §4; governed by `frs_habitat_classify`/dimensions, unaffected by #200) + small token2 `DAM`↔`MODELLED`/`NONE` next-downstream-ordering edges. The dam-access fix itself (token2 DAM on accessible dam-downstream segments) matches. - -**Acceptance MET** for both resident + anadromous. Remaining habitat-token1 parity is a separate, pre-existing concern (habitat rules), not #200. - -### Stale-persist-table drift (pre-existing, surfaced in Phase 4) - -LFRA first failed: `column "has_barriers_ch_dnstr" of relation "streams_access" does not exist`. The M1 `fresh.streams_access` / `streams_mapping_code` were stale at bt+co width (old pre-v0.40.2 runs); `lnk_persist_init`'s `CREATE IF NOT EXISTS` won't widen an existing table, and `.lnk_validate_persist_table` only detects GENERATED-column drift, not species-count drift. `lnk_pipeline_run` correctly sizes persist_init to `cfg$species` (8) — so DROPping the two stale wide tables + re-running recreated them full-width. NOT a #200 bug (production provincial runs start clean). Possible follow-up: extend drift-validate to species-column count. - -**Real bug caught + fixed during the run:** `barrier_overrides` PK was `(blk, drm, species_code)` — UPCE's persist INSERT collided with PCEA's because the SAME override position is computed by two adjacent WSG runs (boundary streams whose `blue_line_key` spans WSGs). Fixed: PK now includes `watershed_group_code` (mirrors `cols_barriers`), so per-WSG DELETE+INSERT is clean; the WSG-agnostic access anti-join makes the duplicate harmless. +**bcfp `segmented_stream_id` (verified):** globally unique (4.23M distinct); position-derived from `blue_line_key` + `downstream_route_measure` (data dictionary confirms); text; integer part = blk, fraction = `round(measure,3)`-derived; segmentation-dependent. 
Root fix = make link `id_segment` likewise position-derived → filed **#203** (also enables direct `id_segment == segmented_stream_id` joins). ## Open / watch -- Cross-WSG override correctness — the provincial design should fix it; **verify on LFRA** in Phase 4 (don't assume). -- `remediated_dnstr_ind` divergence is bcfp's own bug (`smnorris/bcfishpass#690`) — whitelist in the parity diff. -- Pre-existing (out of scope): `lnk_pipeline_run.R:228` `pscis = .barriers_pscis` vs view name `barriers_pscis_unified` — watch in Phase 4. +- Reference freshness: re-snapshot if the run slips past Tue 2026-05-27 (next bcfp rebuild). Orchestrator Step 1+2 re-snapshots each host automatically. +- Residual ~1-2% mapping_code mismatch is token1 habitat-presence (dimensions/rules), not the dam-access fix — don't chase under #175. +- Base public inputs (pscis/cabd/obs) pulled live each snapshot → slight drift vs bcfp's frozen build inputs; small (98.95% confirms). diff --git a/planning/active/progress.md b/planning/active/progress.md index 14a6c9b..65d50c1 100644 --- a/planning/active/progress.md +++ b/planning/active/progress.md @@ -1,14 +1,34 @@ -# Progress — mapping_code accessibility, reproduce bcfp `barriers_` (#200) - -## Session 2026-05-23 - -- M4→M1 handoff resumed; v0.40.3 shipped (PR #199 merged `46b2042`, tagged). -- Environment up on M1: docker fresh-db, link 0.40.3, bcfp snapshot reloaded (`v0.7.15-14-ge12c1a5`, `streams_vw_bcfp` loaded locally), local `:63333→:5432` forwarder (db_newgraph tunnel key deauthorized on M1 — not blocking). -- Read bcfp access machinery end-to-end (5 species models + `barriers_user_definite.sql` + `load_streams_access.sql`) — see findings. -- Plan-agent design review: rejected per-WSG view (B'); user pushed for province-wide correctness. Final design = persist all access inputs province-wide (USER_DEFINITE family + persist barrier_overrides + `_access` view). Plan approved. 
-- Archived #196 PWF; created branch `200-mapping-code-accessibility-reproduce-bcf`; scaffolded #200 PWF baseline. -- Phase 1 done: `USER_DEFINITE` family in `lnk_barriers_unify` (FALLS-pattern FWA ltree join); `cols_barrier_overrides` + DDL in `lnk_persist_init`; persist copy in `lnk_pipeline_persist` (pre-persist auto-picks it up). Unit tests 96 pass; DB-smoke validated DDL + branch (empty-fallback safe, one-row resolves ltree/geom). Both configs persist to schema `fresh` (provincial `fresh_default` is a runtime `--schema` override). Commit `a82a7fc`. -- Phase 2 done: `barriers__access` view in `lnk_barriers_views` (natural-only + override anti-join + definite-exempt). DB-smoke: 904,262 natural rows, 0 non-natural. code-check clean. Commit `f758e44`. -- Phase 3 done: repoint `barriers_per_sp` → `_access` + comment. Commit `c412d68`. -- Phase 4 done: validated PARS BT 98.95%, LFRA BT 97.77% / CO 97.90% vs `streams_vw_bcfp`; `;DAM` tokens correct. Found+fixed `barrier_overrides` PK (added `watershed_group_code` — boundary-stream cross-WSG collision). Worked around stale bt+co persist tables (DROP + recreate full-width). PARS needed PCEA+UPCE persisted first (cross-WSG dams). Commit `2ea3a7d`. -- Phase 5: RUNBOOK §2a/§3/§5/§7 updated; NEWS + DESCRIPTION → 0.40.4; temp scripts removed. test 1193 pass (lone FAIL = env db_conn). Next: archive + open PR (stop before merge per user). +# Progress — tunnel-free `lnk_compare_mapping_code` + orchestrator (#175) + +## Session 2026-05-24 + +- #200 (v0.40.4) merged via PR #202; cyphers proven (3-way spin/burn) + tunnel-free reference verified (`v0.7.15-14-ge12c1a5`). +- Edited #175 + #167 in-body (no new issues): #175 now scopes tunnel-free `lnk_compare_mapping_code` + orchestrator (M1-dispatch + post-consolidate recompute); #167 superseded by tunnel-free. +- Plan-mode exploration; phases approved by user. +- Archived #200 PWF; created branch `175-promote-with-mapping-code-flag-to-stand` off main; scaffolded #175 PWF baseline. 
+- Phase 1 done: `lnk_compare_mapping_code()` tunnel-free + `.lnk_compare_wsg_mapping_code_diff` delegates + `.lnk_mc_diff` shared. Live PARS BT 98.95% tunnel-free; 1216 tests pass (lone FAIL = env db_conn). Caught id_segment ~22× cartesian → fixed `lnk_compare_rollup` (full-PK joins) + WSG-active species resolution; filed #203 (position-derived globally-unique id_segment, bcfp-verified). +- Phase 2 done: `lnk_compare_wsg(mapping_code=TRUE)` now tunnel-free (routes through `lnk_compare_mapping_code`, no conn_ref; rollup still tunnel — snapshot lacks habitat_linear). Removed dead `.lnk_compare_wsg_mapping_code_diff`; added `wsg_compare_mapping_code()` (tunnel-free orchestrator entry, verified PARS 98.95% w/ PG_PASS_SHARE unset). 93 compare / 1216 total pass. +- Next: Phase 3 — orchestrator (`wsgs_run_pipeline.sh` drop :63333 pre-flight + M1-dispatch + Step 9b post-consolidate recompute; `wsgs_run_host.R` cyphers run+persist only). + +## Session 2026-05-25 + +- 3-WSG smoke (CRKD@M1, LCHL@cy1, ZYMO@cy2): plumbing spin→prep→run→consolidate→burn works; caught wide-table shape drift — cyphers' `streams_access`/`streams_mapping_code` had 11 species cols (CT/DV/RB) vs M1's 8 → positional COPY-consolidate failed. Cyphers burned + confirmed gone (0 tofu resources each). +- Root cause: `cypher_prep.sh` seeded `lnk_persist_init` from `parameters_fresh` (11 sp) while `lnk_pipeline_run` + dispatcher use `cfg$species` (8 sp). Warm snapshot predates wide tables (#187) — not a snapshot artifact. +- Phase 3a done (per user steer "build abstract, shouldn't matter which machine / how many species cols, don't hardcode"): + - `schema_consolidate.R` → shape-tolerant COPY (runtime shared-column intersection, copy-by-name, dest ordinal order). No hardcoded species/cols/host. + - `cypher_prep.sh` → persist species = `cfg$species` (mirrors `lnk_pipeline_run`). + - Filed #204 (persist_init species-column-set drift detection + abstract/no-hardcode north star). `/code-check` clean. 
+- Next: Phase 3 orchestrator (M1-dispatch generalization — "shouldn't matter which machine runs"), then re-run the 3-WSG smoke to confirm consolidate of the two wide tables, then study-area parity. +- Phase 3 REVISED (user: "are these already dealt with in our start-to-finish scripts?"): the smoke flow already does M1-dispatch + tunnel-free + abstract consolidate. Dropped the 30-edit old-orchestrator refactor. Built 4 lean reusable scripts (study_area_wsgs.R / wsg_run_one.R / study_area_compare.R / study_area_run.sh); cross-WSG `;DAM` solved via drainage-closed buckets + DS-first (no recompute), validated on public.wsg_outlet. /code-check clean. Commit 0673025. Filed #204. +- **Phase 4 — Stage A PASS (dispatcher-only PARS, $0):** driver + tunnel-free pre-flight + DS-first run + tunnel-free compare all work. **Cross-WSG `;DAM` CONFIRMED** — PARS top_pattern `ACCESS;DAM;INTERMITTENT` (Bennett dams in PCEA/UPCE ran DS-first). match% BT: FINA 99.8 / PARA 99.3 / PARS 99.0 / PCEA 99.8 / UPCE 99.6 (LBTN/LPCE no bcfp ref). CSV: data-raw/logs/study_area_run/20260525_090730_compare.csv. +- **Phase 5 — full 3-area run** (52 WSGs, zero overlap: Peace 19 / Fraser 24 / Skeena 9). Several iterations + a data-loss incident, all root-caused + fixed from the RECORDS (user steer: "look in our records — there's a runbook"): + - Fixed cypher prep scp race (SSH-readiness wait, 583a4ab) and branch-not-on-cyphers (`CYPHER_PREP_BRANCH`, 7e96b10 — branch must be pushed). + - **Data-loss incident (root-caused):** closure pulled in species-less WSG LEUT → `lnk_pipeline_run` "No species resolved for AOI" → dispatcher `|| exit 1` → driver FATAL → **trap burned cyphers with un-consolidated Peace+Skeena data**. One bad WSG lost a whole run. (Cyphers were NOT externally destroyed — the driver's own trap burned them.) 
+ - **Records review:** `research/provincial_run_runbook.md` (archive→spin→snapshot→install→smoke→dispatch→verify→consolidate→burn) + `data-raw/wsgs_run_host.R:88` already solved this via the **#157 species-presence filter** + resume-safe **per-WSG soft-fail**. My lean driver had the right shape (tunnel-free, M1-dispatch) but dropped that robustness. + - **Fixes (65d26ca):** `study_area_wsgs.R` filters closure to bundle-species presence (Fraser drops LEUT,LNRS); `study_area_run.sh` per-WSG soft-fail (never abort host/burn before consolidate); `wsg_run_one.R` defensive skip. /code-check clean. + - **Cost reality (runbook §6):** cyphers ~$0.06/hr each — "minimize idle" = don't leave up for HOURS (2026-05-12 10-hr incident), not shave minutes. My early-burn agonizing was over cents; dropped it. + - Re-run launched (pid 37285): dispatcher=Fraser (on M1 per user), cy1=Peace, cy2=Skeena. Log /tmp/fullrun4.log. + - **Full run completed (fullrun4, 50 WSGs):** consolidate (incl wide tables — #204 fix validated) + burn clean + tunnel-free compare. **median match 99.7%, mean 98.7%**, 150 rows (50 WSGs × 7 sp). + - **METHODOLOGY FINDING (the deliverable):** drainage-closed + DS-first per-host is **NOT sufficient** for cross-WSG access/`;DAM`. 6/50 WSGs diverged (FINA 75.5%, PARA 68.6%, UNRS, LFRA, LKEL, SETN) — their downstream barriers were cross-bucket / late-in-DS-order → access computed on incomplete barriers. Re-modelling on the full consolidated barrier set fixes them exactly (FINA→99.8%, PARA→99.3%). The orchestrator has NO recompute (provincial run compares km rollups, not per-segment mapping_code) — so this is new to #175. + - **Correct durable methodology = distribute (any bucketing) → consolidate → POST-CONSOLIDATE RECOMPUTE (diverged WSGs) → compare.** Bucketing is now a speed knob, not a correctness lever. Added to `study_area_run.sh` (recompute any-species-`<99%`, re-compare). 
Filed **#205** (cheap access-only recompute reusing persisted streams/habitat → makes recompute-ALL bulletproof + fast). Docs: `research/study_area_run.md`. + - Next: commit driver recompute + docs; re-run from the top to produce the authoritative post-recompute parity CSV; then `research/provincial_parity_2026_05_25.md` + CLAUDE.md status. diff --git a/planning/active/task_plan.md b/planning/active/task_plan.md index 3de484c..1aa127e 100644 --- a/planning/active/task_plan.md +++ b/planning/active/task_plan.md @@ -1,50 +1,80 @@ -# Task: mapping_code accessibility — reproduce bcfp `barriers_` (natural-only + override), provincially consistent (#200) +# Task: tunnel-free `lnk_compare_mapping_code` + provincial orchestrator for 4-way study-area parity (#175) -link's per-species mapping_code accessibility uses `barriers__unified` = ALL barriers (incl dams) where the species ∈ `blocks_species`, so dam-downstream segments read inaccessible and lose their `;DAM` token2. bcfp's access set is natural-only (gradient@species-threshold ∪ falls ∪ subsurface) MINUS observation/habitat override ∪ all user_definite — dams annotate (token2), never block. Fix: make all access inputs province-wide-persisted (natural ✓ already, override + user_definite added), build a `barriers__access` view over them, repoint `barriers_per_sp`. Full design: `planning/active/findings.md` + `RUNBOOK.md` §5. +Promote the `with_mapping_code` flag to a stand-alone `lnk_compare_mapping_code()` export, made **tunnel-free** (reference = local snapshot `fresh.streams_vw_bcfp`, not the `:63333` bcfp tunnel), then fix the provincial orchestrator (M1-dispatch + post-consolidate cross-WSG recompute) so the 3 study areas (Peace 16 / Fraser 8 / Skeena 5, ~52 drainage-closed WSGs) run correctly 4-way (3 cyphers + M1). Full design: `planning/active/findings.md` + issue #175 (updated 2026-05-24). Relates to SRED NewGraphEnvironment/sred-2025-2026#24. 
-## Phase 1 — make the override + user_definite province-wide +## Phase 1 — `lnk_compare_mapping_code()` standalone + tunnel-free (reusable core) -- [x] `USER_DEFINITE` family in `lnk_barriers_unify` (mirror FALLS branch: FWA-join for ltree; source `.barriers_definite`; `blocks_species`=all; reference only `blue_line_key`+`downstream_route_measure` for empty-fallback safety). No persist DDL change. -- [x] `cols_barrier_overrides` vector + `CREATE TABLE IF NOT EXISTS .barrier_overrides` in `lnk_persist_init` (one vector drives DDL + INSERT). -- [x] Persist `barrier_overrides` (DELETE-WHERE-WSG + INSERT, add `''` as `watershed_group_code`) in `lnk_pipeline_persist`; probe-gated. -- [x] Pre-persist auto-handled: the mapping_code-phase pre-persist (`lnk_pipeline_run.R:188`) already calls the full `lnk_pipeline_persist`, which now persists `barrier_overrides` — no separate edit needed. -- [x] Tests: unify `USER_DEFINITE` branch SQL; persist_init `barrier_overrides` DDL; persist INSERT projection. (96 pass) -- [x] DB-smoke: `barrier_overrides` DDL creates in `fresh`; USER_DEFINITE branch parses + resolves ltree/geom (empty-fallback safe + one-row). -- [x] `/code-check` (round 1 clean) + commit. +- [x] New export `R/lnk_compare_mapping_code.R`: tunnel-free (reference = local `fresh.streams_vw_bcfp`, `conn_ref=NULL` default; tunnel path kept for back-compat). Joins `.streams_mapping_code` → `.streams` on the **full PK** `(id_segment, watershed_group_code)`, diffs vs the snapshot on `(blue_line_key, round(measure,3))` per **WSG-active** species. Returns `wsg, species, total_segs, match_pct, n_diffs, top_pattern, top_pattern_count`. +- [x] Refactor `.lnk_compare_wsg_mapping_code_diff` → delegates; shared merge/match in `.lnk_mc_diff`. +- [x] Tests: `test-lnk_compare_mapping_code.R` (arg-val + `.lnk_mc_diff` compose + live PARS BT) + adapted the moved test in `test-lnk_compare_wsg.R`. 93 compare tests pass; **live PARS BT 98.95% tunnel-free**. 
+- [x] **Bug caught + fixed:** `id_segment` is per-WSG (not globally unique; 80,555 distinct / 1.5M rows → ~22× cartesian on `id_segment`-alone persist joins). Fixed `lnk_compare_rollup`'s 3 joins to full PK (PARS BT spawning_km 36,820 → 1,681). Added WSG-active species resolution (avoids spurious 0% for absent species). Filed root issue **#203** (position-derived globally-unique `id_segment`, bcfp-style). +- [ ] `/code-check` + commit. -## Phase 2 — `barriers__access` view over province-wide inputs +## Phase 2 — compare-family wiring + back-compat -- [x] Per-species `_access` view in `lnk_barriers_views`: natural (`barrier_source IN ('GRADIENT','FALLS','SUBSURFACE_FLOW','USER_DEFINITE')`) over persist `barriers`, `NOT EXISTS` anti-join over `barrier_overrides` (derived from the barriers source schema via `sub()`), `USER_DEFINITE` override-exempt, expose `barriers__access_id`, keep `wscode_ltree`/`localcode_ltree`, alias `b`. -- [x] Tests: `_access` view SQL per species; counts updated (22→38, 10→14). 30 pass. -- [x] DB-smoke: `barriers_bt_access` valid + queryable; 904,262 natural rows, 0 non-natural (dams/anthropogenic excluded) vs `_unified` 1,045,358. -- [x] `/code-check` (clean) + commit. +- [x] `lnk_compare_wsg(mapping_code = TRUE)` routes through `lnk_compare_mapping_code` tunnel-free (no `conn_ref` for the mapping_code lens; rollup still uses the tunnel — snapshot lacks `habitat_linear`). Removed dead `.lnk_compare_wsg_mapping_code_diff` helper; fixed the `lnk_mapping_code` doc ref. +- [x] `data-raw/wsg_compare.R`: added `wsg_compare_mapping_code()` — tunnel-free (local conn only, no `PG_PASS_SHARE`/`:63333`). The dispatcher's post-consolidate compare entry. +- [x] Tests: repointed `lnk_compare_wsg` composition test to mock `lnk_compare_mapping_code`; 93 compare pass / 1216 total (lone FAIL = env db_conn). Live `wsg_compare_mapping_code("PARS")` = 98.95% with `PG_PASS_SHARE` unset. +- [ ] `/code-check` + commit. 
-## Phase 3 — repoint `barriers_per_sp` → `_access` +## Phase 3a — consolidate/persist shape-tolerance (3-WSG smoke fixes, #204) -- [x] `lnk_pipeline_run` (:215-217) `..._unified` → `..._access`; rewrote KNOWN-DIVERGENT comment (:200-213) to describe the landed fix. -- [x] `test-lnk_pipeline_run.R` asserts no `_unified`/`barriers_per_sp` name → no update needed. -- [x] Full suite 1193 pass (the lone FAIL is the env-only `db_conn` test — needs the real db_newgraph tunnel: `.Renviron` `PG_*_SHARE` → `:63333` w/ airvine/bcfishpass creds; CI skips it via `skip_if_no_db`). Phase 3 repoint validated by Phase 2's code-check (consumption confirmed). -- [x] commit. +- [x] `data-raw/schema_consolidate.R`: shape-tolerant COPY — enumerate columns on both hosts, COPY the shared set **by name** in dest ordinal order (was positional `SELECT *` → `FROM STDIN`, which broke on any species-column-count drift). Host- and species-count-agnostic; nothing hardcoded. Sibling to #185. +- [x] `data-raw/cypher_prep.sh`: align persist species to `cfg$species` (matches `lnk_pipeline_run` R/lnk_pipeline_run.R:157), not `parameters_fresh` (11 sp incl CT/DV/RB). Removes the cross-host wide-table drift at source. +- [x] Filed #204 (persist_init blind to species-column-set drift; abstract/no-hardcode north star). `/code-check` clean (round 1 — 0 findings). -## Phase 4 — DB validation (hard gate): PARS + LFRA vs `fresh.streams_vw_bcfp` +## Phase 3 — orchestrator: REVISED 2026-05-25 (reuse smoke flow, NOT old-orchestrator refactor) -- [x] Rebuild PARS (BT) `mapping_code=TRUE`; `;DAM` tokens now emit (`SPAWN;DAM` 5293≈5263 bcfp). **98.95% match.** Needed PCEA+UPCE barriers persisted first (cross-WSG dams) — confirms the provincial design. -- [x] Rebuild LFRA (anadromous; Coquitlam/Alouette/Stave/Ruskin). **bt 97.77%, co 97.90%**; coho DAM tokens 4672≈4636 bcfp. Dams in-WSG (drains to ocean), single run. 
-- [x] Found + fixed real bug: `barrier_overrides` PK needed `watershed_group_code` (boundary-stream overrides shared across adjacent WSGs collided). Found + worked around stale-persist-table drift (bt+co wide tables → DROP + recreate full-width). -- [x] Residual ~1-2% characterized: token1 habitat-presence (`ACCESS`↔`SPAWN`/`REAR`, dimensions/rules) + minor token2 ordering — NOT the dam-access divergence. Recorded in `findings.md`. +**Decision (user steer "are these already dealt with in our start-to-finish scripts?"):** the 3-WSG smoke already proved an M1-dispatch, tunnel-free, abstract flow that BYPASSES the old M4-centric `wsgs_run_pipeline.sh`/`wsgs_dispatch.sh`/`wsgs_run_host.R`. Those carry M4/tunnel baggage and are NOT being refactored. Discarded the Plan-agent's 30-edit refactor. -## Phase 5 — docs + release +The proven flow = `cypher_up.sh` → `cypher_prep.sh` → per-host `lnk_pipeline_run(aoi=WSG, mapping_code=TRUE)` (local, no tunnel, no M4) → `schema_consolidate(sources=list({host,via,bucket}))` (shape-tolerant) → `wsg_compare_mapping_code(wsg,cfg)` (tunnel-free) → `cypher_down.sh`. -- [x] `RUNBOOK.md` §2a/§3/§5/§7 updated (fix landed; province-wide override/definite persistence; provincial-accumulation note). -- [x] `NEWS.md` + `DESCRIPTION` → 0.40.4. -- [x] Temp validation scripts removed. -- [ ] `/planning-archive`, `/gh-pr-push`. +- [x] ~~Cross-WSG `;DAM` solved WITHOUT recompute~~ **CORRECTED 2026-05-25: drainage-closed + DS-first is NOT sufficient.** It reduces but doesn't eliminate cross-WSG access gaps — downstream barriers can be cross-bucket or arrive late in DS-first order. Full run showed FINA 75.5% / PARA 68.6% per-host → 99%+ only after re-modelling on the full consolidated barrier set. **A POST-CONSOLIDATE RECOMPUTE is required** (see Phase 5 + #205). Drainage-closed bucketing is now just a speed knob (less divergence → less recompute), not a correctness lever. 
+- [x] Built 4 lean reusable scripts (reuse existing cypher_up/prep/down + schema_consolidate + lnk_pipeline_run + wsg_compare_mapping_code): + - `data-raw/study_area_wsgs.R` — closure + DS-first list via `public.wsg_outlet`. + - `data-raw/wsg_run_one.R` — `lnk_pipeline_run(mapping_code=TRUE)` for one WSG, local, host-agnostic (`LNK_LOAD=loadall` dispatcher / `library(link)` cyphers). + - `data-raw/study_area_compare.R` — tunnel-free `wsg_compare_mapping_code` loop → CSV. + - `data-raw/study_area_run.sh` — driver: pre-flight (tunnel-free) → spin → prep → run DS-first buckets (dispatcher local + cyphers) → consolidate cyphers→dispatcher → BURN (minimise idle) → compare → CSV. trap-EXIT burn safety net. +- [x] `/code-check` (1 fresh-eyes round): fixed burn-verification pipefail (`|| n="?"`), added bucket-overlap warning; accepted empty-array idiom (M1 bash 5.3). Committed. + +## Phase 4 — end-to-end mechanics (lean flow) + +- [x] Stage A (dispatcher-only PARS, $0): driver + tunnel-free pre-flight + DS-first + compare validated; PARS `ACCESS;DAM;INTERMITTENT`. +- [x] Full 3-area run (fullrun4, 50 WSGs, M1+2cy): spin → prep → run → consolidate (incl wide tables, #204 fix validated) → burn clean → compare. Caught the prep-race (583a4ab), branch-on-cyphers (7e96b10), data-loss-via-trap-burn (→ species filter #157 + soft-fail, 65d26ca). + +## Phase 5 — study-area parity run + the recompute finding + +- [x] **Authoritative parity obtained** (post-recompute, all 50 WSGs on M1): **median 99.66%, mean 99.11%**, 130/148 rows ≥99%. Genuine divergences (recompute-stable → taxonomy): SETN salmon ~94%, UNRS BT 61.8%. +- [x] **Methodology finding:** post-consolidate recompute REQUIRED (drainage-closed insufficient). Driver does it (recompute diverged WSGs <99% via full pipeline, then re-compare). 
Filed **#205** — the full-pipeline recompute is ~2× on diverged WSGs (re-runs streams/habitat to redo cheap access); the cheap access-only recompute (reuse persisted streams/habitat) makes recompute-ALL bulletproof + ~1×. +- [x] **DECIDED: build #205** (cheap recompute) → Phase 7 below. Then one clean driver-automated run that's both validated AND fast. +- [ ] Annotate SETN/UNRS via `research/bcfp_divergence_taxonomy.yml` (lnk_parity_annotate). + +## Phase 6 — docs + release + +- [x] `research/study_area_run.md` (lean flow + recompute methodology + gotchas); `data-raw/README.md` driver entry; memory. +- [ ] `research/provincial_parity_2026_05_25.md` (authoritative numbers — written this compact-prep). +- [ ] RUNBOOK (recompute step); NEWS + DESCRIPTION bump; `/planning-archive`; `/gh-pr-push` (after #205 + clean run). + +## Phase 7 — #205 cheap access-only recompute (the efficiency keystone) + +Plan: `~/.claude/plans/atomic-conjuring-tome.md`. Pre-flight DONE (PSCIS in persist barriers ✓; preservable phase-1 cols ✓). Builds on this `175-` branch. + +- [x] **7a.** `R/lnk_access.R` — portable access builder (twin of `lnk_mapping_code`). Builds per-species `_access` + source views internally; AOI-scopes streams as a **real TABLE** with indexes + `ANALYZE` (CRITICAL — view didn't carry stats, planner picked the wrong join direction, blew up cost ~1000×); `merge=TRUE` does surgical UPDATE preserving `remediated_dnstr_ind` + observed `access_=2`. `devtools::document()` regenerates NAMESPACE + man. +- [x] **#205 ancillary fixes surfaced + fixed:** + - `R/lnk_persist_init.R` — added ltree GIST/btree indexes on persist `streams` + `barriers` (matches `fresh::utils.R:416-431`; required for `frs_network_features` traversal). + - `R/lnk_mapping_code.R` — fix #203 cross-WSG cartesian (access read used `id_segment IN ...` which was 50×-duplicated against persist; branch by `watershed_group_code` column presence). 
+ - `data-raw/wsg_recompute_one.R` — sets `statement_timeout`/`lock_timeout` so a runaway/locked query fails fast instead of orphaning a server-side backend (RUNBOOK §6). +- [x] **7b.** `data-raw/wsg_recompute_one.R` (sibling of `wsg_run_one.R`) — `lnk_access(merge=TRUE)` + `lnk_mapping_code` + DELETE/INSERT into `.streams_mapping_code`. `study_area_run.sh` wired to call it + switched to recompute-ALL (cheap → bulletproof, no threshold). Docs: `research/study_area_run.md` past-tense + `RUNBOOK.md` §6 (orphaned-backend / `statement_timeout` / view-vs-table planner gotcha / #203 cartesian). +- [ ] **7c.** `tests/testthat/test-lnk_access.R` (deferred — M1 integration test is the proof; unit test is follow-up). +- [x] **M1 validation:** FINA cheap recompute **11.86 s wall** (vs ~90 s full pipeline = ~8× faster), bcfp parity **99.8% / 57 diffs / `ACCESS;DAM` top** — IDENTICAL to the full-pipeline recompute. Methodology is correct + cheap. +- [ ] `/code-check` (deferred: empirical integration validation passed; review on PR). ## Validation -- [x] `devtools::test()` 1193 pass (lone FAIL = env-only `db_conn`, needs real tunnel; CI skips). -- [~] `devtools::check()` — CI runs it green (no DB → db_conn skips); local can't be fully green without the db_newgraph tunnel. Noted in PR. -- [x] Phase 4 DB parity (hard gate): PARS BT 98.95%, LFRA BT 97.77% / CO 97.90%; `;DAM` correct. -- [x] `/code-check` clean on Phases 1-2; Phase 3 covered by Phase 2 review; Phase 4 PK fix reasoned + tested. -- [x] PWF checkboxes match landed work. -- [ ] `/planning-archive` on completion. 
+- [ ] `devtools::test()` green; Phase 1 live-DB reproduces PARS BT ≈ 98.95% tunnel-free +- [ ] #205: `lnk_access(merge=TRUE)` recompute reproduces full-pipeline numbers in ~seconds +- [ ] Phase 4 4-WSG run completes spin→…→burn, cyphers torn down clean +- [ ] Phase 5 study-area `match_pct` recorded; PARS shows `;DAM` +- [ ] `/code-check` clean on each commit +- [ ] PWF checkboxes match landed work +- [ ] `/planning-archive` on completion diff --git a/planning/archive/2026-05-issue-200-mapping-code-accessibility/README.md b/planning/archive/2026-05-issue-200-mapping-code-accessibility/README.md new file mode 100644 index 0000000..ce219e4 --- /dev/null +++ b/planning/archive/2026-05-issue-200-mapping-code-accessibility/README.md @@ -0,0 +1,11 @@ +## Outcome + +Reproduced bcfp's per-species accessibility so dam-downstream segments emit the dam descriptor (`SPAWN;DAM`/`REAR;DAM`/`ACCESS;DAM`) instead of a bare habitat token. The mapping_code phase now drives `accessible` from a new per-species `barriers_<sp>_access` view (`lnk_barriers_views`) = natural barriers only (gradient@species-threshold ∪ falls ∪ subsurface) MINUS the observation/habitat override, ∪ all user-definite (override-exempt). Dams stay in `barrier_sources` (token2 only). + +The load-bearing design decision (after a Plan-agent review and user push-back on a per-WSG shortcut): **all three access inputs are persisted province-wide** so the cross-WSG downstream walk is correct in every WSG, not just the run's own — natural barriers (already), `user_barriers_definite` (new `USER_DEFINITE` family in `lnk_barriers_unify`, ltree-resolved via the FWA join), and the override (new `<schema>.barrier_overrides` table). A per-WSG view would have been quietly wrong for natural barriers in downstream/sibling WSGs. + +Validated against `bcfishpass@v0.7.15`: PARS BT 98.95%, LFRA BT 97.77% / CO 97.90% per-segment mapping_code match. 
The DB run caught + fixed a real bug — `barrier_overrides` PK needed `watershed_group_code` (boundary-stream override positions are computed by two adjacent WSG runs and collided) — and surfaced the provincial-accumulation property: PARS only emits `;DAM` once PCEA+UPCE (holding the Bennett/Peace Canyon dams it drains through) are persisted. Residual ~1-2% is token1 habitat-presence (`ACCESS`↔`SPAWN`/`REAR`, governed by dimensions/rules), a separate pre-existing concern, not the dam-access fix. + +Follow-ups: #201 (blocks_species redesign + evidence-based dam-override, builds on this natural-access foundation); possible drift-validate extension for persist species-column count (stale bt+co wide tables surfaced during validation). Mechanism documented in `RUNBOOK.md` §5. + +Closed by: PR #202 (squash `2beb42f`), tagged **v0.40.4**. Commits a82a7fc → e4353b6. diff --git a/planning/archive/2026-05-issue-200-mapping-code-accessibility/findings.md b/planning/archive/2026-05-issue-200-mapping-code-accessibility/findings.md new file mode 100644 index 0000000..15496d9 --- /dev/null +++ b/planning/archive/2026-05-issue-200-mapping-code-accessibility/findings.md @@ -0,0 +1,84 @@ +# Findings — mapping_code accessibility, reproduce bcfp `barriers_` (#200) + +## Issue context + +link's per-species mapping_code accessibility (`barriers_per_sp` → `accessible`) uses `barriers__unified` = ALL barriers (incl dams/PSCIS/modelled) where the species is in `blocks_species`. bcfp's per-species access set is natural barriers only (gradient@species-threshold ∪ falls ∪ subsurface), MINUS upstream observation/habitat overrides, ∪ all `user_barriers_definite`. Dams are never in the access set — token2 descriptor only. Consequence: every segment below a dam reads inaccessible → `;DAM`/`;MODELLED`/`;ASSESSED` second token suppressed (token2 correctly gated on `accessible`) → link emits bare `SPAWN`/`REAR` where bcfp emits `SPAWN;DAM`. 
+ +## bcfp mechanism (read authoritatively from `smnorris/bcfishpass@e12c1a5`, 2026-05-23) + +All 5 per-species access models (`model/01_access/sql/model_access_{bt,ch_cm_co_pk_sk,wct,ct_dv_rb,st}.sql`) share one structure: +``` +barriers_ = ( gradient@species-classes ∪ falls ∪ subsurface ) + MINUS (barriers with upstream observation OR confirmed habitat, in the species' obs-set) + ∪ ALL barriers_user_definite -- override-EXEMPT (comment: "include *all* user added features, even those below observations") +``` +Per-species axes (the only differences across models): + +| Model | Gradient classes | Obs/habitat species | +|---|---|---| +| BT | 25, 30 | BT,CH,CM,CO,PK,SK,ST | +| salmon (CH,CM,CO,PK,SK) | 15, 20, 25, 30 | CH,CM,CO,PK,SK | +| ST | 20, 25, 30 | CH,CM,CO,PK,SK,ST | +| WCT | 20, 25, 30 | (wct) | +| CT,DV,RB | 25, 30 | BT,DV,CT,RB | + +- **`barriers_user_definite.sql`** materializes the definite table with a synthesized deterministic id + ltree resolved by joining the raw user CSV to `whse_basemapping.fwa_stream_networks_sp` (segment whose `[downstream_route_measure, upstream_route_measure)` contains the barrier). link's existing FALLS branch in `lnk_barriers_unify.R:221-242` does the **identical** join. +- **`load_streams_access.sql`** — `barriers__dnstr` (per-species, natural+definite) is separate from `barriers_anthropogenic_dnstr`/`barriers_dams_dnstr` (descriptors). `access_` = 0 if a downstream barrier exists, else 1/2 (obs-aware). token2 gate (`load_streams_mapping_code.sql`) = `barriers__dnstr = array[]` — identical to link's `ifelse(accessible, mc_barrier, NA)`. +- **Province-wide accumulation**: each `barriers_` is per-WSG-built but accumulated into one province-wide table, so cross-WSG downstream walks (PARS→PCEA→UPCE) see the correct override-applied set. 
+ +## link mapping (every ingredient already exists) + +| bcfp ingredient | link object | state | +|---|---|---| +| gradient@species-threshold | `access_gradient_max` → `blocks_species` (gradient CASE in `lnk_barriers_unify`) | ✓ correct | +| obs/habitat override | `lnk_barrier_overrides` → `.barrier_overrides` (uses `fwa_upstream`, topological/cross-WSG) | ✓ computed; **per-WSG only, not persisted** | +| user_definite | `.barriers_definite` (`lnk_pipeline_prepare.R:182-200`; CSV cols, no id/ltree; empty-fallback = blk+drm only) | **per-WSG only, not in persist barriers** | +| natural barriers | persist `barriers` (gradient/falls/subsurface families) | ✓ province-wide | + +`lnk_barrier_overrides` output is `(blue_line_key, downstream_route_measure, species_code)` and currently feeds only `lnk_pipeline_classify` (habitat), NOT the access path. + +## The design decision (why province-wide, not a per-WSG view) + +The access set is a downstream `frs_network_features` walk that **crosses WSG boundaries**. A per-WSG `_access` view (subtract only the current WSG's overrides from province-wide natural barriers) is quietly wrong for any natural barrier in a downstream/sibling WSG — the cross-WSG twin of the dam bug. Rejected. Correct design: **persist all three access inputs province-wide** (natural ✓, override → new persist table, user_definite → `USER_DEFINITE` persist family), persisted **together per WSG** so any persisted WSG is internally consistent. Caveat (single-WSG run sees only persisted WSGs) is identical to today's natural barriers and bcfp's accumulation — handled by the provincial orchestrator. + +Approach A (definite as unify family) + persist overrides was chosen over the issue-draft's per-WSG view-union (B') after a Plan-agent review and the user's explicit "make it provincial, don't ship a 2/200 one-off." `barriers_definite` lacks id+ltree → resolved via the FALLS-pattern FWA join. 
No `cols_barriers` DDL change (USER_DEFINITE is a new row-source, same columns). `barrier_overrides` persist uses a single `cols_barrier_overrides` vector for DDL+INSERT (avoids the v0.40.3 matched-pair drift). + +## Verified facts (this session) + +- Persist pattern is `cols_*`-vector-driven; `cols_barriers` drives both DDL (`lnk_persist_init`) and INSERT (`lnk_pipeline_persist.R:94,99`). +- break (`lnk_pipeline_break.R:112`) + classify (`lnk_pipeline_classify.R:223`) read `.barriers_definite` separately → adding `USER_DEFINITE` to persist `barriers` does NOT double-count. +- `barriers_anthropogenic/dams/pscis_unified` filter by `barrier_source` → `USER_DEFINITE` doesn't pollute them. `lnk_compare_*` don't read `_unified`. +- `frs_network_features` (fresh) needs `feature_id_col, blue_line_key, downstream_route_measure, wscode_ltree, localcode_ltree` on the feature table. The `phase4d_plan_draft.md:37` draft wrongly dropped the ltree cols. +- `whse_basemapping.fwa_stream_networks_sp` present in local fwapg; `fresh.streams_vw_bcfp` loaded (4.23M rows, PARS 43,660, carries `mapping_code_bt`) — tunnel-free parity baseline. bcfp baseline `v0.7.15-14-ge12c1a5`. + +## Phase 4 validation — PARS BT (2026-05-23) + +Run `lnk_pipeline_run("PARS", mapping_code=TRUE)` against local docker fwapg (bcfp baseline `v0.7.15-14-ge12c1a5` in `fresh.streams_vw_bcfp`). + +**Result: 99.04% per-segment match vs bcfp** (42,701 / 43,114 joined on `blue_line_key` + rounded `downstream_route_measure`). The headline #200 fix works: +- token1 collapse GONE — `ACCESS`/`SPAWN`/`REAR` emit (was bare `SPAWN`/`REAR`). Counts ≈ bcfp (`SPAWN;DAM` 5293 vs 5263, `REAR;DAM` 2213 vs 2191). +- token2 `;DAM` emerges — dam-downstream-but-accessible segments now annotate `;DAM` not `;NONE`. + +**Cross-WSG dependency confirmed (the provincial design's whole point):** the FIRST PARS run (PARS only) emitted `;NONE` because the Bennett/Peace Canyon dams live in PCEA/UPCE, which weren't persisted. 
After persisting PCEA + UPCE barriers (`mapping_code=FALSE`) and re-running PARS, the cross-WSG downstream walk saw the dams → `;DAM`. token2/`barrier_sources` is unchanged by #200; it just needs the downstream WSGs in persist. + +Residual ~1% (413 segs): token1 `ACCESS`↔`REAR` swaps (habitat-presence threshold — dimensions/rules, not #200) + a few token2 `DAM`↔`MODELLED` next-downstream-ordering edges. Not the dam-access divergence. + +### Phase 4 validation — LFRA (anadromous; Coquitlam/Alouette/Stave/Ruskin dams) + +`lnk_pipeline_run("LFRA", mapping_code=TRUE)`. Match vs bcfp: **LFRA/bt 97.77%, LFRA/co 97.90%** (26,651 segs each). LFRA coho DAM-token count link **4672 vs bcfp 4636** — the dam descriptor + above-dam path works for anadromous salmon, not just resident BT. LFRA drains to the ocean (lowest Fraser group) so its dams are in-WSG — single run, no cross-WSG persist needed (unlike PARS→PCEA/UPCE). + +Residual ~2%: token1 `ACCESS`↔`SPAWN`/`REAR` (spawning/rearing **habitat-presence** — token1 habitat fires regardless of access per RUNBOOK §4; governed by `frs_habitat_classify`/dimensions, unaffected by #200) + small token2 `DAM`↔`MODELLED`/`NONE` next-downstream-ordering edges. The dam-access fix itself (token2 DAM on accessible dam-downstream segments) matches. + +**Acceptance MET** for both resident + anadromous. Remaining habitat-token1 parity is a separate, pre-existing concern (habitat rules), not #200. + +### Stale-persist-table drift (pre-existing, surfaced in Phase 4) + +LFRA first failed: `column "has_barriers_ch_dnstr" of relation "streams_access" does not exist`. The M1 `fresh.streams_access` / `streams_mapping_code` were stale at bt+co width (old pre-v0.40.2 runs); `lnk_persist_init`'s `CREATE IF NOT EXISTS` won't widen an existing table, and `.lnk_validate_persist_table` only detects GENERATED-column drift, not species-count drift. 
`lnk_pipeline_run` correctly sizes persist_init to `cfg$species` (8) — so DROPping the two stale wide tables + re-running recreated them full-width. NOT a #200 bug (production provincial runs start clean). Possible follow-up: extend drift-validate to species-column count. + +**Real bug caught + fixed during the run:** `barrier_overrides` PK was `(blk, drm, species_code)` — UPCE's persist INSERT collided with PCEA's because the SAME override position is computed by two adjacent WSG runs (boundary streams whose `blue_line_key` spans WSGs). Fixed: PK now includes `watershed_group_code` (mirrors `cols_barriers`), so per-WSG DELETE+INSERT is clean; the WSG-agnostic access anti-join makes the duplicate harmless. + +## Open / watch + +- Cross-WSG override correctness — the provincial design should fix it; **verify on LFRA** in Phase 4 (don't assume). +- `remediated_dnstr_ind` divergence is bcfp's own bug (`smnorris/bcfishpass#690`) — whitelist in the parity diff. +- Pre-existing (out of scope): `lnk_pipeline_run.R:228` `pscis = .barriers_pscis` vs view name `barriers_pscis_unified` — watch in Phase 4. diff --git a/planning/archive/2026-05-issue-200-mapping-code-accessibility/progress.md b/planning/archive/2026-05-issue-200-mapping-code-accessibility/progress.md new file mode 100644 index 0000000..14a6c9b --- /dev/null +++ b/planning/archive/2026-05-issue-200-mapping-code-accessibility/progress.md @@ -0,0 +1,14 @@ +# Progress — mapping_code accessibility, reproduce bcfp `barriers_` (#200) + +## Session 2026-05-23 + +- M4→M1 handoff resumed; v0.40.3 shipped (PR #199 merged `46b2042`, tagged). +- Environment up on M1: docker fresh-db, link 0.40.3, bcfp snapshot reloaded (`v0.7.15-14-ge12c1a5`, `streams_vw_bcfp` loaded locally), local `:63333→:5432` forwarder (db_newgraph tunnel key deauthorized on M1 — not blocking). +- Read bcfp access machinery end-to-end (5 species models + `barriers_user_definite.sql` + `load_streams_access.sql`) — see findings. 
+- Plan-agent design review: rejected per-WSG view (B'); user pushed for province-wide correctness. Final design = persist all access inputs province-wide (USER_DEFINITE family + persist barrier_overrides + `_access` view). Plan approved. +- Archived #196 PWF; created branch `200-mapping-code-accessibility-reproduce-bcf`; scaffolded #200 PWF baseline. +- Phase 1 done: `USER_DEFINITE` family in `lnk_barriers_unify` (FALLS-pattern FWA ltree join); `cols_barrier_overrides` + DDL in `lnk_persist_init`; persist copy in `lnk_pipeline_persist` (pre-persist auto-picks it up). Unit tests 96 pass; DB-smoke validated DDL + branch (empty-fallback safe, one-row resolves ltree/geom). Both configs persist to schema `fresh` (provincial `fresh_default` is a runtime `--schema` override). Commit `a82a7fc`. +- Phase 2 done: `barriers__access` view in `lnk_barriers_views` (natural-only + override anti-join + definite-exempt). DB-smoke: 904,262 natural rows, 0 non-natural. code-check clean. Commit `f758e44`. +- Phase 3 done: repoint `barriers_per_sp` → `_access` + comment. Commit `c412d68`. +- Phase 4 done: validated PARS BT 98.95%, LFRA BT 97.77% / CO 97.90% vs `streams_vw_bcfp`; `;DAM` tokens correct. Found+fixed `barrier_overrides` PK (added `watershed_group_code` — boundary-stream cross-WSG collision). Worked around stale bt+co persist tables (DROP + recreate full-width). PARS needed PCEA+UPCE persisted first (cross-WSG dams). Commit `2ea3a7d`. +- Phase 5: RUNBOOK §2a/§3/§5/§7 updated; NEWS + DESCRIPTION → 0.40.4; temp scripts removed. test 1193 pass (lone FAIL = env db_conn). Next: archive + open PR (stop before merge per user). 
diff --git a/planning/archive/2026-05-issue-200-mapping-code-accessibility/task_plan.md b/planning/archive/2026-05-issue-200-mapping-code-accessibility/task_plan.md new file mode 100644 index 0000000..3de484c --- /dev/null +++ b/planning/archive/2026-05-issue-200-mapping-code-accessibility/task_plan.md @@ -0,0 +1,50 @@ +# Task: mapping_code accessibility — reproduce bcfp `barriers_` (natural-only + override), provincially consistent (#200) + +link's per-species mapping_code accessibility uses `barriers__unified` = ALL barriers (incl dams) where the species ∈ `blocks_species`, so dam-downstream segments read inaccessible and lose their `;DAM` token2. bcfp's access set is natural-only (gradient@species-threshold ∪ falls ∪ subsurface) MINUS observation/habitat override ∪ all user_definite — dams annotate (token2), never block. Fix: make all access inputs province-wide-persisted (natural ✓ already, override + user_definite added), build a `barriers__access` view over them, repoint `barriers_per_sp`. Full design: `planning/active/findings.md` + `RUNBOOK.md` §5. + +## Phase 1 — make the override + user_definite province-wide + +- [x] `USER_DEFINITE` family in `lnk_barriers_unify` (mirror FALLS branch: FWA-join for ltree; source `.barriers_definite`; `blocks_species`=all; reference only `blue_line_key`+`downstream_route_measure` for empty-fallback safety). No persist DDL change. +- [x] `cols_barrier_overrides` vector + `CREATE TABLE IF NOT EXISTS .barrier_overrides` in `lnk_persist_init` (one vector drives DDL + INSERT). +- [x] Persist `barrier_overrides` (DELETE-WHERE-WSG + INSERT, add `''` as `watershed_group_code`) in `lnk_pipeline_persist`; probe-gated. +- [x] Pre-persist auto-handled: the mapping_code-phase pre-persist (`lnk_pipeline_run.R:188`) already calls the full `lnk_pipeline_persist`, which now persists `barrier_overrides` — no separate edit needed. +- [x] Tests: unify `USER_DEFINITE` branch SQL; persist_init `barrier_overrides` DDL; persist INSERT projection. 
(96 pass) +- [x] DB-smoke: `barrier_overrides` DDL creates in `fresh`; USER_DEFINITE branch parses + resolves ltree/geom (empty-fallback safe + one-row). +- [x] `/code-check` (round 1 clean) + commit. + +## Phase 2 — `barriers__access` view over province-wide inputs + +- [x] Per-species `_access` view in `lnk_barriers_views`: natural (`barrier_source IN ('GRADIENT','FALLS','SUBSURFACE_FLOW','USER_DEFINITE')`) over persist `barriers`, `NOT EXISTS` anti-join over `barrier_overrides` (derived from the barriers source schema via `sub()`), `USER_DEFINITE` override-exempt, expose `barriers__access_id`, keep `wscode_ltree`/`localcode_ltree`, alias `b`. +- [x] Tests: `_access` view SQL per species; counts updated (22→38, 10→14). 30 pass. +- [x] DB-smoke: `barriers_bt_access` valid + queryable; 904,262 natural rows, 0 non-natural (dams/anthropogenic excluded) vs `_unified` 1,045,358. +- [x] `/code-check` (clean) + commit. + +## Phase 3 — repoint `barriers_per_sp` → `_access` + +- [x] `lnk_pipeline_run` (:215-217) `..._unified` → `..._access`; rewrote KNOWN-DIVERGENT comment (:200-213) to describe the landed fix. +- [x] `test-lnk_pipeline_run.R` asserts no `_unified`/`barriers_per_sp` name → no update needed. +- [x] Full suite 1193 pass (the lone FAIL is the env-only `db_conn` test — needs the real db_newgraph tunnel: `.Renviron` `PG_*_SHARE` → `:63333` w/ airvine/bcfishpass creds; CI skips it via `skip_if_no_db`). Phase 3 repoint validated by Phase 2's code-check (consumption confirmed). +- [x] commit. + +## Phase 4 — DB validation (hard gate): PARS + LFRA vs `fresh.streams_vw_bcfp` + +- [x] Rebuild PARS (BT) `mapping_code=TRUE`; `;DAM` tokens now emit (`SPAWN;DAM` 5293≈5263 bcfp). **98.95% match.** Needed PCEA+UPCE barriers persisted first (cross-WSG dams) — confirms the provincial design. +- [x] Rebuild LFRA (anadromous; Coquitlam/Alouette/Stave/Ruskin). **bt 97.77%, co 97.90%**; coho DAM tokens 4672≈4636 bcfp. Dams in-WSG (drains to ocean), single run. 
+- [x] Found + fixed real bug: `barrier_overrides` PK needed `watershed_group_code` (boundary-stream overrides shared across adjacent WSGs collided). Found + worked around stale-persist-table drift (bt+co wide tables → DROP + recreate full-width). +- [x] Residual ~1-2% characterized: token1 habitat-presence (`ACCESS`↔`SPAWN`/`REAR`, dimensions/rules) + minor token2 ordering — NOT the dam-access divergence. Recorded in `findings.md`. + +## Phase 5 — docs + release + +- [x] `RUNBOOK.md` §2a/§3/§5/§7 updated (fix landed; province-wide override/definite persistence; provincial-accumulation note). +- [x] `NEWS.md` + `DESCRIPTION` → 0.40.4. +- [x] Temp validation scripts removed. +- [ ] `/planning-archive`, `/gh-pr-push`. + +## Validation + +- [x] `devtools::test()` 1193 pass (lone FAIL = env-only `db_conn`, needs real tunnel; CI skips). +- [~] `devtools::check()` — CI runs it green (no DB → db_conn skips); local can't be fully green without the db_newgraph tunnel. Noted in PR. +- [x] Phase 4 DB parity (hard gate): PARS BT 98.95%, LFRA BT 97.77% / CO 97.90%; `;DAM` correct. +- [x] `/code-check` clean on Phases 1-2; Phase 3 covered by Phase 2 review; Phase 4 PK fix reasoned + tested. +- [x] PWF checkboxes match landed work. +- [ ] `/planning-archive` on completion. diff --git a/research/provincial_parity_2026_05_25.md b/research/provincial_parity_2026_05_25.md new file mode 100644 index 0000000..3dbb810 --- /dev/null +++ b/research/provincial_parity_2026_05_25.md @@ -0,0 +1,66 @@ +# Study-area mapping_code parity — 2026-05-25 + +First tunnel-free, M1-dispatch, per-segment `mapping_code` parity across the 3 +FWCP study areas (link#175). Run via `data-raw/study_area_run.sh`; procedure + +methodology in `research/study_area_run.md`. + +## Run metadata + +- **Scope:** 50 WSGs (29 focal + drainage closure, species-filtered) across + Peace / Fraser / Skeena. 
+- **Hosts:** dispatcher = M1 (Fraser bucket), cy1 = Peace, cy2 = Skeena (2 DO + cyphers, burned at end — confirmed 0 tofu resources / no droplets). +- **Reference:** local bcfp snapshot `fresh.streams_vw_bcfp` (tunnel-free); + compare = `lnk_compare_mapping_code` per WSG-active species. +- **link:** branch `175-promote-with-mapping-code-flag-to-stand` @ `34b0cd3`. +- **Numbers are post-recompute** (see methodology) — M1's `fresh` holds the + full consolidated + recomputed state; full table `/tmp/authoritative.csv`. + +## Headline + +| Metric | Value | +|---|---| +| Rows (WSG × active species) | 150 (50 WSGs × ~3 sp avg) | +| **Median match** | **99.66%** | +| Mean match | 99.11% | +| Rows ≥ 99% | 130 / 148 (88%) | +| Median BT | 99.57% | + +## Genuine divergences (recompute-stable → taxonomy) + +These did NOT improve on re-modelling against the full consolidated barrier set, +so they're real methodology departures (not cross-WSG accumulation gaps) — the +kind tracked in `research/bcfp_divergence_taxonomy.yml`: + +| WSG | species | match% | likely class | +|---|---|---|---| +| UNRS | BT | 61.8% | Kenney reservoir / dam-override (CABD dam passability) | +| SETN | CH/CM/CO/PK/SK/ST | 93.7–94.8% | SK-geography / salmon class | + +Next: `lnk_parity_annotate` against the taxonomy; the acceptance bar is 0 +UNEXPLAINED at |diff|≥2% after annotation. + +## Methodology finding (the load-bearing result) + +Per-segment access (hence `mapping_code` token1/token2) depends on barriers +**downstream**, possibly in a different WSG (provincial-accumulation, RUNBOOK +§5). Distributed hosts see only their own bucket's barriers mid-run. + +**Drainage-closed + DS-first per-host is NOT sufficient** — it reduces but +doesn't eliminate the gap (downstream barriers can be cross-bucket or arrive +late in DS-first order). Per-host this run produced FINA 75.5% / PARA 68.6% / +LFRA & LKEL low; all → 99%+ after **re-modelling on the full consolidated +barrier set**. 
So the correct, machine/WSG-agnostic methodology is: + +**distribute (any bucketing) → consolidate → POST-CONSOLIDATE RECOMPUTE → +compare.** The recompute is the correctness guarantee; bucketing is only a +speed knob. + +### Open efficiency issue (#205) + +Today the recompute re-runs the **full pipeline** on diverged WSGs — ~2× cost +on those WSGs (re-derives streams/habitat just to redo the cheap access step), +and recompute-ALL would be ~2× overall (defeating distribution). **#205** is the +cheap access-only recompute (reuse persisted streams/habitat) that makes +recompute-ALL bulletproof and ~1×. Build it before the next run, then one clean +driver-automated run that is both validated and fast. diff --git a/research/study_area_run.md b/research/study_area_run.md new file mode 100644 index 0000000..dae38bd --- /dev/null +++ b/research/study_area_run.md @@ -0,0 +1,113 @@ +# Study-area run (tunnel-free, M1-dispatch) + +Lean alternative to the 5-host `provincial_run_runbook.md` for running the **3 +FWCP study areas** (Peace / Fraser / Skeena) mapping_code parity. Reuses the +proven per-WSG build + cypher lifecycle but is **tunnel-free** (compare = +local bcfp snapshot, no `:63333`) and **M1-as-dispatcher** (no M4). Built for +link#175. Companion: `provincial_run_runbook.md` (shared mechanics), +`RUNBOOK.md` (the access/mapping_code machinery), `data-raw/README.md`. + +## One command + +```bash +cd ~/Projects/repo/link +# largest area on the dispatcher (fast/free M1); smaller areas on the cyphers +bash data-raw/study_area_run.sh \ + --cy-workspaces=job1,job2 \ + --focal= \ # -> dispatcher (M1) + --focal= \ # -> cy1 (job1) + --focal= # -> cy2 (job2) +``` + +Focal lists live in `~/.claude/.../memory/study-areas-peace-fraser-skeena.md`. +`--focal` count MUST equal `1 + N(--cy-workspaces)`; first = dispatcher, rest = +cyphers in order. Dispatcher-only (no cyphers): omit `--cy-workspaces`, pass one +`--focal`. 
Pre-req: dispatcher has `fresh.streams_vw_bcfp` +(`snapshot_bcfp.sh --with-bcfp-views`); branch pushed to origin (cyphers pull it). + +## What it does + +1. **Pre-flight** (tunnel-free): local fwapg up, `fresh.streams_vw_bcfp` present, + doctl/tofu (only if cyphers). +2. **Drainage-closed DS-first buckets** (`study_area_wsgs.R`): each focal set → + its closure (every WSG it drains through, via `public.wsg_outlet`, + `f.outlet <@ w.outlet`) ordered downstream-first (`nlevel(outlet) ASC`), + then **filtered to bundle-species presence** (link#157). +3. **Spin + prep** cyphers (`cypher_up.sh`, `cypher_prep.sh` with + `CYPHER_PREP_BRANCH=`). +4. **Run** each host's bucket DS-first (`wsg_run_one.R` = + `lnk_pipeline_run(mapping_code=TRUE)`), dispatcher local + cyphers via ssh, + per-WSG **soft-fail**. +5. **Consolidate** cyphers → dispatcher (`schema_consolidate.R`, shape-tolerant). +6. **Burn** cyphers (then a trap-EXIT safety net). +7. **Post-consolidate recompute ALL run WSGs** on the dispatcher via + `wsg_recompute_one.R` → [lnk_access()] `merge=TRUE` (cheap access-only, + reuses persisted streams/habitat/barriers; ~10s/WSG) + `lnk_mapping_code`. + Because it is cheap, every run WSG is re-settled — bucketing is a speed + knob, not a correctness lever (link#205). +8. **Compare** all run WSGs tunnel-free (`study_area_compare.R`) → CSV. + +## Post-consolidate recompute — the correctness guarantee + +Each WSG's accessibility (hence its `mapping_code` token1 ACCESS/SPAWN/REAR and +token2 DAM/…) depends on whether a blocking barrier exists **downstream** — +possibly in a *different* WSG (the provincial-accumulation property, +`RUNBOOK.md` §5). When WSGs are distributed across machines, each machine holds +only its own bucket's barriers while it runs, so a WSG's access is computed +against an **incomplete** barrier set → wrong tokens. 
+ +**Drainage-closed + DS-first bucketing is NOT sufficient on its own.** It +*reduces* divergence (downstream often persists first within a bucket) but does +not eliminate it: downstream barriers can be cross-bucket, or arrive late in +DS-first order. Caught 2026-05-25 — FINA 75.5% / PARA 68.6% per-host → both +**99%+** after re-modelling on the full consolidated barrier set. + +So the methodology is **distribute (any bucketing) → consolidate → recompute → +compare**, and the *recompute* is what makes it correct **regardless of machine +count or WSG assignment**. The recompute is **`lnk_access(merge=TRUE)` + +`lnk_mapping_code`** (link#205): cheap access-only, reusing the persisted +streams / habitat / barriers / barrier_overrides — no full pipeline, ~10s/WSG +(FINA: 11.86s wall vs ~90s full pipeline, identical bcfp parity). Three +non-obvious things had to be true for it to be cheap: +1. AOI-scope the segments as a **real table** (with indexes + `ANALYZE`), + not a view — otherwise the planner picks the ~800k-row barriers as the + outer driver and the join cost explodes by ~1000×. +2. Persist `streams` / `barriers` need **ltree GIST/btree indexes** + (`lnk_persist_init` builds them; matches `fresh`'s working-table pattern). +3. `lnk_mapping_code` must filter access by `watershed_group_code` when the + table has that column (persist) — the original `id_segment IN (…)` query is + cartesian against persist because `id_segment` is per-WSG, not globally + unique (link#203). + +## Gotchas that cost real time (2026-05-25) + +- **A per-WSG FATAL burns the cyphers with un-consolidated data.** A + species-less closure WSG (LEUT) errored `No species resolved for AOI` → + `|| exit 1` → driver FATAL → trap `cypher_down` → an entire run's Peace+Skeena + data gone. **Fixes:** species-presence filter the closure (link#157) AND + per-WSG soft-fail (never abort a host before consolidate). The cyphers were + NOT externally destroyed — the driver's own trap burned them. 
+- **Cyphers check out `main` by default.** The driver scripts + branch link only + exist on the feature branch → cyphers must run `CYPHER_PREP_BRANCH=`, + and the branch must be **pushed** first (`cypher_prep` does `git fetch origin + && git reset --hard origin/$BRANCH`). +- **Fresh-droplet sshd race.** `cypher_up` returns when the IP is assigned, + before sshd is up → scp `Connection closed`. Poll ssh before scp. +- **Wide-table shape drift across hosts.** `streams_access` / + `streams_mapping_code` carry one column per species; a host seeding persist + from `parameters_fresh` (11 sp) vs `cfg$species` (8 sp) breaks the positional + COPY-consolidate. `cypher_prep` now uses `cfg$species`; `schema_consolidate` + COPYs shared columns by name (link#204). +- **Cypher cost is ~$0.06/hr each.** "Minimize idle" means don't leave them up + for HOURS (the 2026-05-12 10-hr incident), not shave minutes. Don't + over-engineer early-burn for cents. + +## Scripts + +| Script | Role | +|---|---| +| `data-raw/study_area_run.sh` | driver: pre-flight → spin → prep → run DS-first → consolidate → burn → recompute → compare | +| `data-raw/study_area_wsgs.R` | focal → drainage-closed, DS-first, species-filtered WSG list | +| `data-raw/wsg_run_one.R` | one WSG: `lnk_pipeline_run(mapping_code=TRUE)`, local, host-agnostic | +| `data-raw/wsg_recompute_one.R` | one WSG cheap post-consolidate recompute (`lnk_access(merge=TRUE)` + `lnk_mapping_code`) — link#205. 
Sets `statement_timeout`/`lock_timeout` so a runaway/locked query fails fast | +| `data-raw/study_area_compare.R` | tunnel-free `lnk_compare_mapping_code` loop → CSV | diff --git a/tests/testthat/test-lnk_compare_mapping_code.R b/tests/testthat/test-lnk_compare_mapping_code.R new file mode 100644 index 0000000..f40a49d --- /dev/null +++ b/tests/testthat/test-lnk_compare_mapping_code.R @@ -0,0 +1,97 @@ +# Tests for lnk_compare_mapping_code — tunnel-free per-segment token compare (#175) + +cfg_fixture <- function() lnk_config("bcfishpass") + +test_that("lnk_compare_mapping_code validates arguments", { + cfg <- cfg_fixture() + conn <- structure(list(), class = "DBIConnection") + expect_error(lnk_compare_mapping_code("notconn", "PARS", cfg)) + expect_error(lnk_compare_mapping_code(conn, "", cfg)) + expect_error(lnk_compare_mapping_code(conn, "toolongwsg", cfg)) + expect_error(lnk_compare_mapping_code(conn, "PARS", list())) + expect_error(lnk_compare_mapping_code(conn, "PARS", cfg, reference = "nope"), + "Unsupported reference") + expect_error(lnk_compare_mapping_code(conn, "PARS", cfg, conn_ref = "notconn")) +}) + +test_that(".lnk_mc_diff computes per-species match stats + top mismatch pattern", { + # Three segments share FWA position keys; bt differs on seg 3 + # (REAR vs REAR;DAM), co matches everywhere. 
+ link_mc <- data.frame( + blue_line_key = c(1, 2, 3), + downstream_route_measure = c(10, 20, 30), + length_metre = c(5, 5, 5), + mapping_code_bt = c("ACCESS;DAM", "SPAWN;DAM", "REAR"), + mapping_code_co = c("ACCESS", "SPAWN", "REAR"), + stringsAsFactors = FALSE) + bcfp_mc <- data.frame( + blue_line_key = c(1, 2, 3), + downstream_route_measure = c(10, 20, 30), + length_metre = c(5, 5, 5), + mapping_code_bt = c("ACCESS;DAM", "SPAWN;DAM", "REAR;DAM"), + mapping_code_co = c("ACCESS", "SPAWN", "REAR"), + stringsAsFactors = FALSE) + + out <- link:::.lnk_mc_diff(link_mc, bcfp_mc, aoi = "PARS", + species = c("BT", "CO")) + + expect_equal(sort(out$species), c("BT", "CO")) + bt <- out[out$species == "BT", ] + co <- out[out$species == "CO", ] + expect_equal(bt$total_segs, 3L) + expect_equal(bt$match_pct, round(100 * 2 / 3, 2)) + expect_equal(bt$n_diffs, 1L) + expect_equal(bt$top_pattern, "REAR | REAR;DAM") + expect_equal(bt$top_pattern_count, 1L) + expect_equal(co$match_pct, 100) + expect_equal(co$n_diffs, 0L) + expect_true(is.na(co$top_pattern)) +}) + +test_that(".lnk_mc_diff NA-fills when reference has no rows for the WSG", { + link_mc <- data.frame( + blue_line_key = 1, downstream_route_measure = 10, length_metre = 5, + mapping_code_bt = "ACCESS", stringsAsFactors = FALSE) + bcfp_mc <- link_mc[0, ] + expect_warning( + out <- link:::.lnk_mc_diff(link_mc, bcfp_mc, aoi = "XXXX", + species = "BT"), + "0 rows") + expect_equal(out$total_segs, 0L) + expect_true(is.na(out$match_pct)) +}) + +test_that(".lnk_mc_diff errors on non-empty reference with no key overlap", { + link_mc <- data.frame( + blue_line_key = 1, downstream_route_measure = 10, length_metre = 5, + mapping_code_bt = "ACCESS", stringsAsFactors = FALSE) + bcfp_mc <- data.frame( + blue_line_key = 999, downstream_route_measure = 99, length_metre = 9, + mapping_code_bt = "ACCESS", stringsAsFactors = FALSE) + expect_error( + link:::.lnk_mc_diff(link_mc, bcfp_mc, aoi = "PARS", species = "BT"), + "no position overlap") +}) 
+ +# -- live DB: tunnel-free PARS compare vs the local snapshot -------------- + +test_that("lnk_compare_mapping_code reproduces PARS BT parity tunnel-free", { + conn <- skip_if_no_db() + # Needs a prior PARS mapping_code=TRUE run (fresh.streams_mapping_code) + + # the bcfp snapshot (fresh.streams_vw_bcfp). Skip cleanly if absent. + have <- tryCatch({ + a <- DBI::dbGetQuery(conn, "SELECT 1 FROM fresh.streams_mapping_code WHERE watershed_group_code='PARS' LIMIT 1") + b <- DBI::dbGetQuery(conn, "SELECT 1 FROM fresh.streams_vw_bcfp WHERE watershed_group_code='PARS' LIMIT 1") + nrow(a) > 0 && nrow(b) > 0 + }, error = function(e) FALSE) + if (!isTRUE(have)) { + testthat::skip("PARS streams_mapping_code or streams_vw_bcfp snapshot not present") + } + + out <- lnk_compare_mapping_code(conn, aoi = "PARS", cfg = cfg_fixture(), + species = "BT") + expect_s3_class(out, "tbl_df") + expect_equal(out$species, "BT") + expect_gt(out$total_segs, 40000) # PARS ~43k segments + expect_gt(out$match_pct, 95) # validated ~98.95% tunnel-free +}) diff --git a/tests/testthat/test-lnk_compare_wsg.R b/tests/testthat/test-lnk_compare_wsg.R index 63f904b..d9b7bf7 100644 --- a/tests/testthat/test-lnk_compare_wsg.R +++ b/tests/testthat/test-lnk_compare_wsg.R @@ -315,8 +315,8 @@ test_that("lnk_compare_wsg composes mapping_code branch when mapping_code=TRUE", habitat_type = "spawning", unit = "km", link_value = 10, ref_value = 11, diff_pct = -9.1) } - m_mc_diff <- function(...) { - calls <<- c(calls, "mapping_code_diff") + m_mc <- function(...) 
{ + calls <<- c(calls, "mapping_code") tibble::tibble( wsg = "ADMS", species = "BT", total_segs = 100L, match_pct = 99.5, n_diffs = 0L, @@ -327,7 +327,7 @@ test_that("lnk_compare_wsg composes mapping_code branch when mapping_code=TRUE", with_mocked_bindings( lnk_pipeline_run = m_pipeline_run, lnk_compare_rollup = m_rollup, - .lnk_compare_wsg_mapping_code_diff = m_mc_diff, + lnk_compare_mapping_code = m_mc, { with_mocked_bindings( dbExecute = m_exec, @@ -349,7 +349,7 @@ test_that("lnk_compare_wsg composes mapping_code branch when mapping_code=TRUE", # cleanup_working passes straight through (no special-case retention # needed since build runs inside pipeline_run, not in compare_wsg). # Then rollup reads persisted state, then diff fires against persist. - expect_equal(calls, c("pipeline_run", "rollup", "mapping_code_diff")) + expect_equal(calls, c("pipeline_run", "rollup", "mapping_code")) expect_false(pipeline_args$cleanup_working) expect_true(pipeline_args$mapping_code) @@ -368,42 +368,27 @@ test_that("lnk_compare_wsg composes mapping_code branch when mapping_code=TRUE", # corresponding reference-gate assertion happens in the # "lnk_compare_wsg validates reference" test elsewhere in this file. -test_that(".lnk_compare_wsg_mapping_code_diff computes per-species stats with top_pattern", { - # 4 segments. Species BT: 1 match, 3 mismatches all with same pattern. - # Species CH: all 4 match. Species CM: 2 match, 2 different patterns. +test_that(".lnk_mc_diff computes per-species stats with top_pattern (merge logic)", { + # Merge/match logic moved to .lnk_mc_diff (link#175); test it directly — + # no DB mock needed. 4 segments. BT: 1 match, 3 same-pattern mismatches. + # CH: all 4 match. CM: 2 match, 2 different patterns. 
link_mc <- data.frame( id_segment = 1:4, mapping_code_bt = c("ACCESS;NONE", "ACCESS;NONE", "ACCESS;NONE", "ACCESS;NONE"), mapping_code_ch = c("ACCESS;NONE", "ACCESS;NONE", "ACCESS;NONE", "ACCESS;NONE"), mapping_code_cm = c("ACCESS;NONE", "ACCESS;NONE", "REAR;NONE", "ACCESS;NONE"), blue_line_key = 1:4, downstream_route_measure = c(10, 20, 30, 40), - length_metre = 100, stringsAsFactors = FALSE ) bcfp_mc <- data.frame( - segmented_stream_id = 1:4, + blue_line_key = 1:4, downstream_route_measure = c(10, 20, 30, 40), mapping_code_bt = c("ACCESS;NONE", "ACCESS;MODELLED", "ACCESS;MODELLED", "ACCESS;MODELLED"), mapping_code_ch = c("ACCESS;NONE", "ACCESS;NONE", "ACCESS;NONE", "ACCESS;NONE"), mapping_code_cm = c("ACCESS;NONE", "ACCESS;MODELLED", "SPAWN;NONE", "ACCESS;NONE"), - blue_line_key = 1:4, downstream_route_measure = c(10, 20, 30, 40), - length_metre = 100, stringsAsFactors = FALSE ) - m_q <- function(conn, sql) { - if (grepl("FROM bcfishpass", sql)) bcfp_mc else link_mc - } - with_mocked_bindings( - dbGetQuery = m_q, - dbQuoteLiteral = function(...) DBI::SQL("'ADMS'"), - .package = "DBI", - { - result <- link:::.lnk_compare_wsg_mapping_code_diff( - conn = mock_conn(), conn_ref = mock_conn(), - aoi = "ADMS", cfg = mock_cfg(), - bcfp_species = c("bt", "ch", "cm") - ) - } - ) + result <- link:::.lnk_mc_diff(link_mc, bcfp_mc, aoi = "ADMS", + species = c("bt", "ch", "cm")) expect_s3_class(result, "tbl_df") expect_equal(nrow(result), 3L)