diff --git a/.gitignore b/.gitignore index 0d25f59..4f4fd5e 100644 --- a/.gitignore +++ b/.gitignore @@ -171,3 +171,6 @@ notebooks/analysis/mavedb_files urn:*.json tmp:*.json *_mapping_*.json + +# Agent settings +.claude/ diff --git a/src/dcd_mapping/align.py b/src/dcd_mapping/align.py index d9fd335..24ff8c2 100644 --- a/src/dcd_mapping/align.py +++ b/src/dcd_mapping/align.py @@ -177,7 +177,12 @@ def _run_blat( cmd.extend(shlex.split(target_args)) cmd.extend( - [f"-minScore={min_score}", f"-out={out_format}", str(query_file), out_file] + [ + f"-minScore={min_score}", + f"-out={out_format}", + str(query_file), + out_file, + ] ) _logger.debug("Running BLAT command: %s", " ".join(cmd)) diff --git a/src/dcd_mapping/vrs_map.py b/src/dcd_mapping/vrs_map.py index 04dbee7..28c3795 100644 --- a/src/dcd_mapping/vrs_map.py +++ b/src/dcd_mapping/vrs_map.py @@ -8,7 +8,7 @@ from Bio.Seq import Seq from bioutils.accessions import infer_namespace from cool_seq_tool.schemas import AnnotationLayer, Strand -from ga4gh.core import ga4gh_identify, sha512t24u +from ga4gh.core import sha512t24u from ga4gh.vrs._internal.models import ( Allele, Haplotype, @@ -47,6 +47,7 @@ VrsMapResult, ) from dcd_mapping.transcripts import TxSelectError +from dcd_mapping.vrs_utils import identify_allele, normalize_and_identify __all__ = ["vrs_map"] @@ -1005,10 +1006,7 @@ def _construct_vrs_allele( else: allele = translate_ref_identical_to_vrs(hgvs_string) - normalize(allele, data_proxy=get_seqrepo()) - - allele.id = ga4gh_identify(allele) - alleles.append(allele) + alleles.append(normalize_and_identify(allele)) continue allele = translate_hgvs_to_vrs(hgvs_string) @@ -1040,8 +1038,7 @@ def _construct_vrs_allele( ) allele.state = _rle_to_lse(allele.state, allele.location) - # Run ga4gh_identify to assign VA digest - allele.id = ga4gh_identify(allele) + allele.id = identify_allele(allele) alleles.append(allele) if not alleles: diff --git a/src/dcd_mapping/vrs_utils.py b/src/dcd_mapping/vrs_utils.py new file mode 100644 index 0000000..c96ed23 --- /dev/null +++ b/src/dcd_mapping/vrs_utils.py @@ -0,0 +1,51 @@ +"""VRS allele identification helpers. + +Centralizes the digest-correctness invariant for GA4GH VRS alleles: the +``ga4gh_identify`` Merkle tree caches sub-object digests on the object after +first identification, so any subsequent mutation (refgetAccession swap, +normalization, state coercion) leaves a stale id unless the cached digests +are cleared first. All allele identification in dcd_mapping must route +through :func:`identify_allele` so the digest is always recomputed from +current content. +""" + +from ga4gh.core import ga4gh_identify +from ga4gh.vrs._internal.models import Allele, SequenceLocation +from ga4gh.vrs.normalize import normalize + +from dcd_mapping.lookup import get_seqrepo + + +def identify_allele(allele: Allele) -> str: + """Clear cached digests and return a fresh GA4GH identifier for *allele*. + + ``ga4gh_identify`` is a Merkle-tree: it calls ``get_or_create_digest`` on + sub-objects, returning any cached value without recomputing. Clearing both + the location digest and the allele digest first ensures the id is always + derived from the current object content — not from a value set before a + refgetAccession mutation or normalization. + """ + if isinstance(allele.location, SequenceLocation): + allele.location.digest = None + + allele.digest = None + digest = ga4gh_identify(allele) + if digest is None: + raise ValueError("Failed to compute GA4GH identifier for allele") # noqa: EM101 + + return digest + + +def normalize_and_identify(allele: Allele) -> Allele: + """Normalize *allele* and stamp it with a freshly computed GA4GH digest. + + Pairs the two finalize steps every VRS allele construction path needs. + Routing identification through :func:`identify_allele` (rather than + ``ga4gh_identify`` directly) is the invariant that protects against the + Merkle-tree's stale-digest behavior after mutation -- so any allele + construction site that bypasses this helper risks reintroducing the + stale-digest bug. + """ + allele = normalize(allele, data_proxy=get_seqrepo()) + allele.id = identify_allele(allele) + return allele diff --git a/tests/test_vrs_utils.py b/tests/test_vrs_utils.py new file mode 100644 index 0000000..88e811f --- /dev/null +++ b/tests/test_vrs_utils.py @@ -0,0 +1,110 @@ +"""Tests for VRS allele identification helpers. + +These tests run offline: ``ga4gh_identify`` is pure computation given a +well-formed VRS object, and ``normalize`` is patched so no SeqRepo or UTA +access is required. +""" + +import pytest +from ga4gh.vrs._internal.models import ( + Allele, + LiteralSequenceExpression, + SequenceLocation, + SequenceReference, +) + +from dcd_mapping.vrs_utils import identify_allele, normalize_and_identify + + +def _make_allele(start: int = 1, end: int = 2, sequence: str = "A") -> Allele: + """Build a minimal in-memory VRS Allele with no cached digests.""" + return Allele( + location=SequenceLocation( + sequenceReference=SequenceReference( + refgetAccession="SQ.0123456789abcdef0123456789abcdef" + ), + start=start, + end=end, + ), + state=LiteralSequenceExpression(sequence=sequence), + ) + + +def test_identify_allele_returns_va_digest(): + assert identify_allele(_make_allele()).startswith("ga4gh:VA.") + + +def test_identify_allele_is_content_addressed(): + # Identical content -> identical digest; differing content -> differing digest. + a = _make_allele() + b = _make_allele() + c = _make_allele(start=5, end=6) + assert identify_allele(a) == identify_allele(b) + assert identify_allele(a) != identify_allele(c) + + +def test_identify_allele_clears_stale_digests(): + """The whole point of the helper: a cached pre-mutation digest must not + leak into the post-mutation identifier. + + Reproduces the bug pattern where an allele is constructed, its location + gets a refgetAccession mutation, and then it's identified. If the helper + is bypassed and ``ga4gh_identify`` is called directly, the Merkle-tree + returns the stale digest without recomputing, so the identifier doesn't + reflect the current content. By clearing cached digests first, the helper + ensures the identifier is always correct even if the input allele has been + mutated since the last identification. + """ + allele = _make_allele(start=5, end=6) + allele.location.digest = "ga4gh:SL.stale-location-digest" + allele.digest = "ga4gh:VA.stale-allele-digest" + + identifier = identify_allele(allele) + + assert identifier.startswith("ga4gh:VA.") + assert identifier != "ga4gh:VA.stale-allele-digest" + # The identifier must match a freshly-built allele with the same content, + # not anything derived from the stale digests. + assert identifier == identify_allele(_make_allele(start=5, end=6)) + + +def test_normalize_and_identify_pairs_both_steps(mocker): + """normalize_and_identify normalizes then identifies in that order.""" + allele = _make_allele() + # Pass-through normalize so the test stays offline (no SeqRepo). + mock_normalize = mocker.patch( + "dcd_mapping.vrs_utils.normalize", side_effect=lambda a, **_: a + ) + mocker.patch("dcd_mapping.vrs_utils.get_seqrepo", return_value=mocker.MagicMock()) + + result = normalize_and_identify(allele) + + mock_normalize.assert_called_once() + assert result is allele + assert result.id is not None + assert result.id.startswith("ga4gh:VA.") + + +def test_normalize_and_identify_clears_stale_digest_through_helper(mocker): + """Stale digests survive a no-op normalize, so the identify step must + still clear them. Verifies the pairing actually delivers the invariant. + """ + allele = _make_allele() + allele.location.digest = "ga4gh:SL.stale" + allele.digest = "ga4gh:VA.stale" + mocker.patch("dcd_mapping.vrs_utils.normalize", side_effect=lambda a, **_: a) + mocker.patch("dcd_mapping.vrs_utils.get_seqrepo", return_value=mocker.MagicMock()) + + result = normalize_and_identify(allele) + + assert result.id != "ga4gh:VA.stale" + assert result.id.startswith("ga4gh:VA.") + + +def test_identify_allele_raises_when_digest_unobtainable(mocker): + """When ``ga4gh_identify`` returns ``None`` (malformed input), surface it + as a ValueError rather than silently stamping an unidentifiable allele. + """ + mocker.patch("dcd_mapping.vrs_utils.ga4gh_identify", return_value=None) + with pytest.raises(ValueError, match="Failed to compute GA4GH identifier"): + identify_allele(_make_allele())