Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,6 @@ notebooks/analysis/mavedb_files
urn:*.json
tmp:*.json
*_mapping_*.json

# Agent settings
.claude/
7 changes: 6 additions & 1 deletion src/dcd_mapping/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,12 @@ def _run_blat(
cmd.extend(shlex.split(target_args))

cmd.extend(
[f"-minScore={min_score}", f"-out={out_format}", str(query_file), out_file]
[
f"-minScore={min_score}",
f"-out={out_format}",
str(query_file),
out_file,
]
)
_logger.debug("Running BLAT command: %s", " ".join(cmd))

Expand Down
11 changes: 4 additions & 7 deletions src/dcd_mapping/vrs_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from Bio.Seq import Seq
from bioutils.accessions import infer_namespace
from cool_seq_tool.schemas import AnnotationLayer, Strand
from ga4gh.core import ga4gh_identify, sha512t24u
from ga4gh.core import sha512t24u
from ga4gh.vrs._internal.models import (
Allele,
Haplotype,
Expand Down Expand Up @@ -47,6 +47,7 @@
VrsMapResult,
)
from dcd_mapping.transcripts import TxSelectError
from dcd_mapping.vrs_utils import identify_allele, normalize_and_identify

__all__ = ["vrs_map"]

Expand Down Expand Up @@ -1005,10 +1006,7 @@ def _construct_vrs_allele(
else:
allele = translate_ref_identical_to_vrs(hgvs_string)

normalize(allele, data_proxy=get_seqrepo())

allele.id = ga4gh_identify(allele)
alleles.append(allele)
alleles.append(normalize_and_identify(allele))
continue

allele = translate_hgvs_to_vrs(hgvs_string)
Expand Down Expand Up @@ -1040,8 +1038,7 @@ def _construct_vrs_allele(
)
allele.state = _rle_to_lse(allele.state, allele.location)

# Run ga4gh_identify to assign VA digest
allele.id = ga4gh_identify(allele)
allele.id = identify_allele(allele)
alleles.append(allele)

if not alleles:
Expand Down
51 changes: 51 additions & 0 deletions src/dcd_mapping/vrs_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""VRS allele identification helpers.

Centralizes the digest-correctness invariant for GA4GH VRS alleles: the
``ga4gh_identify`` Merkle tree caches sub-object digests on the object after
first identification, so any subsequent mutation (refgetAccession swap,
normalization, state coercion) leaves a stale id unless the cached digests
are cleared first. All allele identification in dcd_mapping must route
through :func:`identify_allele` so the digest is always recomputed from
current content.
"""

from ga4gh.core import ga4gh_identify
from ga4gh.vrs._internal.models import Allele, SequenceLocation
from ga4gh.vrs.normalize import normalize

from dcd_mapping.lookup import get_seqrepo


def identify_allele(allele: Allele) -> str:
"""Clear cached digests and return a fresh GA4GH identifier for *allele*.

``ga4gh_identify`` is a Merkle-tree: it calls ``get_or_create_digest`` on
sub-objects, returning any cached value without recomputing. Clearing both
the location digest and the allele digest first ensures the id is always
derived from the current object content — not from a value set before a
refgetAccession mutation or normalization.
"""
if isinstance(allele.location, SequenceLocation):
allele.location.digest = None

allele.digest = None
digest = ga4gh_identify(allele)
if digest is None:
raise ValueError("Failed to compute GA4GH identifier for allele") # noqa: EM101

return digest


def normalize_and_identify(allele: Allele) -> Allele:
"""Normalize *allele* and stamp it with a freshly computed GA4GH digest.

Pairs the two finalize steps every VRS allele construction path needs.
Routing identification through :func:`identify_allele` (rather than
``ga4gh_identify`` directly) is the invariant that protects against the
Merkle-tree's stale-digest behavior after mutation -- so any allele
construction site that bypasses this helper risks reintroducing the
stale-digest bug.
"""
allele = normalize(allele, data_proxy=get_seqrepo())
allele.id = identify_allele(allele)
return allele
110 changes: 110 additions & 0 deletions tests/test_vrs_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Tests for VRS allele identification helpers.

These tests run offline: ``ga4gh_identify`` is pure computation given a
well-formed VRS object, and ``normalize`` is patched so no SeqRepo or UTA
access is required.
"""

import pytest
from ga4gh.vrs._internal.models import (
Allele,
LiteralSequenceExpression,
SequenceLocation,
SequenceReference,
)

from dcd_mapping.vrs_utils import identify_allele, normalize_and_identify


def _make_allele(start: int = 1, end: int = 2, sequence: str = "A") -> Allele:
"""Build a minimal in-memory VRS Allele with no cached digests."""
return Allele(
location=SequenceLocation(
sequenceReference=SequenceReference(
refgetAccession="SQ.0123456789abcdef0123456789abcdef"
),
start=start,
end=end,
),
state=LiteralSequenceExpression(sequence=sequence),
)


def test_identify_allele_returns_va_digest():
assert identify_allele(_make_allele()).startswith("ga4gh:VA.")


def test_identify_allele_is_content_addressed():
# Identical content -> identical digest; differing content -> differing digest.
a = _make_allele()
b = _make_allele()
c = _make_allele(start=5, end=6)
assert identify_allele(a) == identify_allele(b)
assert identify_allele(a) != identify_allele(c)


def test_identify_allele_clears_stale_digests():
"""The whole point of the helper: a cached pre-mutation digest must not
leak into the post-mutation identifier.

Reproduces the bug pattern where an allele is constructed, its location
gets a refgetAccession mutation, and then it's identified. If the helper
is bypassed and ``ga4gh_identify`` is called directly, the Merkle-tree
returns the stale digest without recomputing, so the identifier doesn't
reflect the current content. By clearing cached digests first, the helper
ensures the identifier is always correct even if the input allele has been
mutated since the last identification.
"""
allele = _make_allele(start=5, end=6)
allele.location.digest = "ga4gh:SL.stale-location-digest"
allele.digest = "ga4gh:VA.stale-allele-digest"

identifier = identify_allele(allele)

assert identifier.startswith("ga4gh:VA.")
assert identifier != "ga4gh:VA.stale-allele-digest"
# The identifier must match a freshly-built allele with the same content,
# not anything derived from the stale digests.
assert identifier == identify_allele(_make_allele(start=5, end=6))


def test_normalize_and_identify_pairs_both_steps(mocker):
"""normalize_and_identify normalizes then identifies in that order."""
allele = _make_allele()
# Pass-through normalize so the test stays offline (no SeqRepo).
mock_normalize = mocker.patch(
"dcd_mapping.vrs_utils.normalize", side_effect=lambda a, **_: a
)
mocker.patch("dcd_mapping.vrs_utils.get_seqrepo", return_value=mocker.MagicMock())

result = normalize_and_identify(allele)

mock_normalize.assert_called_once()
assert result is allele
assert result.id is not None
assert result.id.startswith("ga4gh:VA.")


def test_normalize_and_identify_clears_stale_digest_through_helper(mocker):
"""Stale digests survive a no-op normalize, so the identify step must
still clear them. Verifies the pairing actually delivers the invariant.
"""
allele = _make_allele()
allele.location.digest = "ga4gh:SL.stale"
allele.digest = "ga4gh:VA.stale"
mocker.patch("dcd_mapping.vrs_utils.normalize", side_effect=lambda a, **_: a)
mocker.patch("dcd_mapping.vrs_utils.get_seqrepo", return_value=mocker.MagicMock())

result = normalize_and_identify(allele)

assert result.id != "ga4gh:VA.stale"
assert result.id.startswith("ga4gh:VA.")


def test_identify_allele_raises_when_digest_unobtainable(mocker):
"""When ``ga4gh_identify`` returns ``None`` (malformed input), surface it
as a ValueError rather than silently stamping an unidentifiable allele.
"""
mocker.patch("dcd_mapping.vrs_utils.ga4gh_identify", return_value=None)
with pytest.raises(ValueError, match="Failed to compute GA4GH identifier"):
identify_allele(_make_allele())
Loading