Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
288 changes: 288 additions & 0 deletions malariagen_data/anoph/admixture_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,288 @@
from typing import Tuple, Optional
import allel
import numpy as np
import numba
from . import base_params
from . import admixture_stats_params
from .snp_data import AnophelesSnpData
from .sample_metadata import AnophelesSampleMetadata
from ..util import _check_types
from numpydoc_decorator import doc


@numba.njit
def _remap_observed(ac_full):
"""Remap allele indices to remove unobserved alleles."""
n_variants = ac_full.shape[0]
n_alleles = ac_full.shape[1]
# Create the output array - this is an allele mapping array,
# that specifies how to recode allele indices.
mapping = np.empty((n_variants, n_alleles), dtype=np.int32)
mapping[:] = -1
# Iterate over variants.
for i in range(n_variants):

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't seem optimised but I don't have a better approach in mind right now.

# This will be the new index that we are mapping this allele to, if the
# allele count is not zero.
j_out = 0
# Iterate over columns (alleles) in the input array.
for j in range(n_alleles):
# Access the count for the jth allele.
c = ac_full[i, j]
if c > 0:
# We have found a non-zero allele count, remap the allele.
mapping[i, j] = j_out
j_out += 1
return mapping


class AnophelesAdmixtureAnalysis(
AnophelesSnpData,
AnophelesSampleMetadata,
):
def __init__(
self,
**kwargs,
):
# N.B., this class is designed to work cooperatively, and
# so it's important that any remaining parameters are passed
# to the superclass constructor.
super().__init__(**kwargs)

@_check_types
@doc(
summary="""
Compute Patterson's f3 statistic for a specified recipient population and two source populations.
""",
returns="""
A NumPy float of the f3 value, standard error (SE) and Z-score.
""",
)
def patterson_f3(
self,
recipient_query: base_params.sample_query,

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given the lack of symmetry between the various queries, giving them different parameter descriptions might be helpful.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although it is called recipient query here, it is basically the same (just a sample query) and is more to avoid confusion over which population to test.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a "naive" user asking for these function's help should be given more info on the assumptions associated to each query than the current:
"""
A pandas query string to be evaluated against the sample metadata, to
select samples to be included in the returned data. E.g., "country ==
'Uganda'". If the query returns zero results, a warning will be
emitted with fuzzy-match suggestions for possible typos or case
mismatches.
"""
For instance, while it makes sense that cohort D is the outgroup in f4/D, I think it is worth telling the user explicitly, in particular when it is treated differently.

source1_query: base_params.sample_query,
source2_query: base_params.sample_query,
region: base_params.region,
site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
n_jack: base_params.n_jack = 200,
max_missing_an: base_params.max_missing_an = 0,
segregating_mode: admixture_stats_params.segregating_mode = "recipient",
cohort_size: Optional[
base_params.cohort_size
] = admixture_stats_params.cohort_size_default,
min_cohort_size: Optional[
base_params.min_cohort_size
] = admixture_stats_params.min_cohort_size_default,
max_cohort_size: Optional[
base_params.max_cohort_size
] = admixture_stats_params.max_cohort_size_default,
) -> Tuple[float, float, float]:
# Purely for conciseness, internally here we use the scikit-allel convention
# of labelling the recipient population "C" and the source populations as
# "A" and "B".

# Compute allele counts for the three cohorts.
acc = self.snp_allele_counts(
sample_query=recipient_query,
region=region,
site_mask=site_mask,
cohort_size=cohort_size,
min_cohort_size=min_cohort_size,
max_cohort_size=max_cohort_size,
)
aca = self.snp_allele_counts(
sample_query=source1_query,
region=region,
site_mask=site_mask,
cohort_size=cohort_size,
min_cohort_size=min_cohort_size,
max_cohort_size=max_cohort_size,
)
acb = self.snp_allele_counts(
sample_query=source2_query,
region=region,
site_mask=site_mask,
cohort_size=cohort_size,
min_cohort_size=min_cohort_size,
max_cohort_size=max_cohort_size,
)

# Locate biallelic variants and deal with missingness.
ac = acc + aca + acb
loc_bi = allel.AlleleCountsArray(ac).is_biallelic()
anc = np.sum(acc, axis=1)
ana = np.sum(aca, axis=1)
anb = np.sum(acb, axis=1)
an = np.sum(ac, axis=1) # no. required for sample size
n_chroms = an.max()
an_missing = n_chroms - an
# In addition to applying the max_missing_an threshold, also make sure that
# all three cohorts have nonzero allele counts.
loc_nomiss = (an_missing <= max_missing_an) & (anc > 0) & (ana > 0) & (anb > 0)
loc_sites = loc_bi & loc_nomiss
acc_bi = acc[loc_sites]
aca_bi = aca[loc_sites]
acb_bi = acb[loc_sites]
ac_bi = ac[loc_sites]

# Squeeze the allele counts so we end up with only two columns.
sqz_mapping = _remap_observed(ac_bi)
acc_sqz = allel.AlleleCountsArray(acc_bi).map_alleles(sqz_mapping, max_allele=1)
aca_sqz = allel.AlleleCountsArray(aca_bi).map_alleles(sqz_mapping, max_allele=1)
acb_sqz = allel.AlleleCountsArray(acb_bi).map_alleles(sqz_mapping, max_allele=1)

# Keep only segregating variants.
if segregating_mode == "recipient":
loc_seg = acc_sqz.is_segregating()
elif segregating_mode == "all":
loc_seg = (
acc_sqz.is_segregating()
& aca_sqz.is_segregating()
& acb_sqz.is_segregating()
)
else:
raise ValueError("Invalid value for 'segregating_mode' parameter.")
if loc_seg.sum() == 0:
raise ValueError("No segregating variants found")
else:
print(f"Using {loc_seg.sum()} segregating variants for F3 calculation.")
acc_seg = acc_sqz[loc_seg]
aca_seg = aca_sqz[loc_seg]
acb_seg = acb_sqz[loc_seg]

# Compute f3 statistic.
blen = acc_seg.shape[0] // n_jack
f3, se, z, _, _ = allel.average_patterson_f3(
acc_seg,
aca_seg,
acb_seg,
blen=blen,
normed=True,
)

return f3, se, z

@_check_types
@doc(
summary="""
Compute Patterson's f4 statistic or ABBA-BABA test for specified cohorts A, B, C and D where D is the outgroup.
""",
returns="""
A NumPy float of the f4 value, standard error (SE) and Z-score.
""",
)
def patterson_f4(
self,
a_query: base_params.sample_query,
b_query: base_params.sample_query,
c_query: base_params.sample_query,
d_query: base_params.sample_query,
region: base_params.region,
site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
n_jack: base_params.n_jack = 200,
max_missing_an: base_params.max_missing_an = 0,
segregating_mode: admixture_stats_params.segregating_mode = "recipient",
cohort_size: Optional[
base_params.cohort_size
] = admixture_stats_params.cohort_size_default,
min_cohort_size: Optional[
base_params.min_cohort_size
] = admixture_stats_params.min_cohort_size_default,
max_cohort_size: Optional[
base_params.max_cohort_size
] = admixture_stats_params.max_cohort_size_default,
) -> Tuple[float, float, float]:
# Compute allele counts for the four cohorts.
aca = self.snp_allele_counts(
sample_query=a_query,
region=region,
site_mask=site_mask,
cohort_size=cohort_size,
min_cohort_size=min_cohort_size,
max_cohort_size=max_cohort_size,
)

acb = self.snp_allele_counts(
sample_query=b_query,
region=region,
site_mask=site_mask,
cohort_size=cohort_size,
min_cohort_size=min_cohort_size,
max_cohort_size=max_cohort_size,
)

acc = self.snp_allele_counts(
sample_query=c_query,
region=region,
site_mask=site_mask,
cohort_size=cohort_size,
min_cohort_size=min_cohort_size,
max_cohort_size=max_cohort_size,
)

acd = self.snp_allele_counts(
sample_query=d_query,
region=region,
site_mask=site_mask,
cohort_size=cohort_size,
min_cohort_size=min_cohort_size,
max_cohort_size=max_cohort_size,
)

# Locate biallelic variants and deal with missingness.
ac = acc + aca + acb

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why no acd?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is because D is the outgroup, its been a very long time since the stats function was written so my memory is hazy but I remember considering this point and I believe there would much less or not enough variation for analysis if the outgroup was also taken into account. We do not except them to share anywhere near as much variation as the other populations.

loc_bi = allel.AlleleCountsArray(ac).is_biallelic()
ana = np.sum(acc, axis=1)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's probably not going to cause an error, but that's very confusing!

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please could you give some more explanation here/a suggested change. Thanks.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, ana should be np.sum(aca, axis=1), not np.sum(acc, axis=1), I believe.

anb = np.sum(aca, axis=1)
anc = np.sum(acb, axis=1)
an = np.sum(ac, axis=1) # no. required for sample size
n_chroms = an.max()
an_missing = n_chroms - an
# In addition to applying the max_missing_an threshold, also make sure that
# all four cohorts have nonzero allele counts.
loc_nomiss = (an_missing <= max_missing_an) & (anc > 0) & (ana > 0) & (anb > 0)
loc_sites = loc_bi & loc_nomiss
aca_bi = acc[loc_sites]
acb_bi = aca[loc_sites]
acc_bi = acb[loc_sites]
acd_bi = acd[loc_sites]
ac_bi = aca_bi + acb_bi + acc_bi + acd_bi

# Squeeze the allele counts so we end up with only two columns.
sqz_mapping = _remap_observed(ac_bi)
aca_sqz = allel.AlleleCountsArray(acc_bi).map_alleles(sqz_mapping, max_allele=1)
acb_sqz = allel.AlleleCountsArray(aca_bi).map_alleles(sqz_mapping, max_allele=1)
acc_sqz = allel.AlleleCountsArray(acb_bi).map_alleles(sqz_mapping, max_allele=1)
acd_sqz = allel.AlleleCountsArray(acd_bi).map_alleles(sqz_mapping, max_allele=1)

# Keep only segregating variants.
if segregating_mode == "recipient":
loc_seg = acc_sqz.is_segregating()
elif segregating_mode == "all":
loc_seg = (
aca_sqz.is_segregating()
& acb_sqz.is_segregating()
& acc_sqz.is_segregating()
)
else:
raise ValueError("Invalid value for 'segregating_mode' parameter.")
if loc_seg.sum() == 0:
raise ValueError("No segregating variants found")
else:
print(f"Using {loc_seg.sum()} segregating variants for F4 calculation.")
aca_seg = acc_sqz[loc_seg]
acb_seg = aca_sqz[loc_seg]
acc_seg = acb_sqz[loc_seg]
acd_seg = acd_sqz[loc_seg]

# Compute f4 statistic.
blen = acc_seg.shape[0] // n_jack
f4, se, z, _, _ = allel.average_patterson_d(

@jonbrenas jonbrenas Jun 11, 2026

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not an expert here, but I think D and f4 are slightly different. The code for patterson_d states that you can get f4 by ignoring the den but the average_patterson_d doesn't do that, and thus returns (as the name implies) D and not f4.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are correct, the description is slightly off and should be revised to D.

aca_seg,
acb_seg,
acc_seg,
acd_seg,
blen=blen,
)

return f4, se, z
17 changes: 17 additions & 0 deletions malariagen_data/anoph/admixture_stats_params.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Parameter definitions for admixture functions."""

from typing import Optional
import typing_extensions
from . import base_params

cohort_size_default: Optional[base_params.cohort_size] = None
min_cohort_size_default: base_params.min_cohort_size = 10
max_cohort_size_default: base_params.max_cohort_size = 50

segregating_mode: typing_extensions.TypeAlias = typing_extensions.Annotated[
str,
"""
Define the way segregating variants are chosen. Available options are "recipient" or "all" to use
sites segregating within both the recipient and source population cohorts.
""",
]
2 changes: 2 additions & 0 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from .anoph.to_vcf import SnpVcfExporter
from .anoph.g123 import AnophelesG123Analysis
from .anoph.fst import AnophelesFstAnalysis
from .anoph.admixture_stats import AnophelesAdmixtureAnalysis
from .anoph.h12 import AnophelesH12Analysis
from .anoph.h1x import AnophelesH1XAnalysis
from .anoph.phenotypes import AnophelesPhenotypeData
Expand Down Expand Up @@ -86,6 +87,7 @@ class AnophelesDataResource(
AnophelesH12Analysis,
AnophelesG123Analysis,
AnophelesFstAnalysis,
AnophelesAdmixtureAnalysis,
AnophelesHetAnalysis,
AnophelesHapFrequencyAnalysis,
AnophelesDistanceAnalysis,
Expand Down
Loading
Loading