-
Notifications
You must be signed in to change notification settings - Fork 170
Admixture f3 and f4 statistics function. #1330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,288 @@ | ||
| from typing import Tuple, Optional | ||
| import allel | ||
| import numpy as np | ||
| import numba | ||
| from . import base_params | ||
| from . import admixture_stats_params | ||
| from .snp_data import AnophelesSnpData | ||
| from .sample_metadata import AnophelesSampleMetadata | ||
| from ..util import _check_types | ||
| from numpydoc_decorator import doc | ||
|
|
||
|
|
||
| @numba.njit | ||
| def _remap_observed(ac_full): | ||
| """Remap allele indices to remove unobserved alleles.""" | ||
| n_variants = ac_full.shape[0] | ||
| n_alleles = ac_full.shape[1] | ||
| # Create the output array - this is an allele mapping array, | ||
| # that specifies how to recode allele indices. | ||
| mapping = np.empty((n_variants, n_alleles), dtype=np.int32) | ||
| mapping[:] = -1 | ||
| # Iterate over variants. | ||
| for i in range(n_variants): | ||
| # This will be the new index that we are mapping this allele to, if the | ||
| # allele count is not zero. | ||
| j_out = 0 | ||
| # Iterate over columns (alleles) in the input array. | ||
| for j in range(n_alleles): | ||
| # Access the count for the jth allele. | ||
| c = ac_full[i, j] | ||
| if c > 0: | ||
| # We have found a non-zero allele count, remap the allele. | ||
| mapping[i, j] = j_out | ||
| j_out += 1 | ||
| return mapping | ||
|
|
||
|
|
||
| class AnophelesAdmixtureAnalysis( | ||
| AnophelesSnpData, | ||
| AnophelesSampleMetadata, | ||
| ): | ||
| def __init__( | ||
| self, | ||
| **kwargs, | ||
| ): | ||
| # N.B., this class is designed to work cooperatively, and | ||
| # so it's important that any remaining parameters are passed | ||
| # to the superclass constructor. | ||
| super().__init__(**kwargs) | ||
|
|
||
| @_check_types | ||
| @doc( | ||
| summary=""" | ||
| Compute Patterson's f3 statistic for a specified recipient population and two source populations. | ||
| """, | ||
| returns=""" | ||
| A NumPy float of the f3 value, standard error (SE) and Z-score. | ||
| """, | ||
| ) | ||
| def patterson_f3( | ||
| self, | ||
| recipient_query: base_params.sample_query, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Given the lack of symmetry between the various queries, giving them different parameter descriptions might be helpful.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Although it is called recipient query here, it is basically the same (just a sample query) and is more to avoid confusion over which population to test.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think a "naive" user asking for these function's help should be given more info on the assumptions associated to each query than the current: |
||
| source1_query: base_params.sample_query, | ||
| source2_query: base_params.sample_query, | ||
| region: base_params.region, | ||
| site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, | ||
| n_jack: base_params.n_jack = 200, | ||
| max_missing_an: base_params.max_missing_an = 0, | ||
| segregating_mode: admixture_stats_params.segregating_mode = "recipient", | ||
| cohort_size: Optional[ | ||
| base_params.cohort_size | ||
| ] = admixture_stats_params.cohort_size_default, | ||
| min_cohort_size: Optional[ | ||
| base_params.min_cohort_size | ||
| ] = admixture_stats_params.min_cohort_size_default, | ||
| max_cohort_size: Optional[ | ||
| base_params.max_cohort_size | ||
| ] = admixture_stats_params.max_cohort_size_default, | ||
| ) -> Tuple[float, float, float]: | ||
| # Purely for conciseness, internally here we use the scikit-allel convention | ||
| # of labelling the recipient population "C" and the source populations as | ||
| # "A" and "B". | ||
|
|
||
| # Compute allele counts for the three cohorts. | ||
| acc = self.snp_allele_counts( | ||
| sample_query=recipient_query, | ||
| region=region, | ||
| site_mask=site_mask, | ||
| cohort_size=cohort_size, | ||
| min_cohort_size=min_cohort_size, | ||
| max_cohort_size=max_cohort_size, | ||
| ) | ||
| aca = self.snp_allele_counts( | ||
| sample_query=source1_query, | ||
| region=region, | ||
| site_mask=site_mask, | ||
| cohort_size=cohort_size, | ||
| min_cohort_size=min_cohort_size, | ||
| max_cohort_size=max_cohort_size, | ||
| ) | ||
| acb = self.snp_allele_counts( | ||
| sample_query=source2_query, | ||
| region=region, | ||
| site_mask=site_mask, | ||
| cohort_size=cohort_size, | ||
| min_cohort_size=min_cohort_size, | ||
| max_cohort_size=max_cohort_size, | ||
| ) | ||
|
|
||
| # Locate biallelic variants and deal with missingness. | ||
| ac = acc + aca + acb | ||
| loc_bi = allel.AlleleCountsArray(ac).is_biallelic() | ||
| anc = np.sum(acc, axis=1) | ||
| ana = np.sum(aca, axis=1) | ||
| anb = np.sum(acb, axis=1) | ||
| an = np.sum(ac, axis=1) # no. required for sample size | ||
| n_chroms = an.max() | ||
| an_missing = n_chroms - an | ||
| # In addition to applying the max_missing_an threshold, also make sure that | ||
| # all three cohorts have nonzero allele counts. | ||
| loc_nomiss = (an_missing <= max_missing_an) & (anc > 0) & (ana > 0) & (anb > 0) | ||
| loc_sites = loc_bi & loc_nomiss | ||
| acc_bi = acc[loc_sites] | ||
| aca_bi = aca[loc_sites] | ||
| acb_bi = acb[loc_sites] | ||
| ac_bi = ac[loc_sites] | ||
|
|
||
| # Squeeze the allele counts so we end up with only two columns. | ||
| sqz_mapping = _remap_observed(ac_bi) | ||
| acc_sqz = allel.AlleleCountsArray(acc_bi).map_alleles(sqz_mapping, max_allele=1) | ||
| aca_sqz = allel.AlleleCountsArray(aca_bi).map_alleles(sqz_mapping, max_allele=1) | ||
| acb_sqz = allel.AlleleCountsArray(acb_bi).map_alleles(sqz_mapping, max_allele=1) | ||
|
|
||
| # Keep only segregating variants. | ||
| if segregating_mode == "recipient": | ||
| loc_seg = acc_sqz.is_segregating() | ||
| elif segregating_mode == "all": | ||
| loc_seg = ( | ||
| acc_sqz.is_segregating() | ||
| & aca_sqz.is_segregating() | ||
| & acb_sqz.is_segregating() | ||
| ) | ||
| else: | ||
| raise ValueError("Invalid value for 'segregating_mode' parameter.") | ||
| if loc_seg.sum() == 0: | ||
| raise ValueError("No segregating variants found") | ||
| else: | ||
| print(f"Using {loc_seg.sum()} segregating variants for F3 calculation.") | ||
| acc_seg = acc_sqz[loc_seg] | ||
| aca_seg = aca_sqz[loc_seg] | ||
| acb_seg = acb_sqz[loc_seg] | ||
|
|
||
| # Compute f3 statistic. | ||
| blen = acc_seg.shape[0] // n_jack | ||
| f3, se, z, _, _ = allel.average_patterson_f3( | ||
| acc_seg, | ||
| aca_seg, | ||
| acb_seg, | ||
| blen=blen, | ||
| normed=True, | ||
| ) | ||
|
|
||
| return f3, se, z | ||
|
|
||
| @_check_types | ||
| @doc( | ||
| summary=""" | ||
| Compute Patterson's f4 statistic or ABBA-BABA test for specified cohorts A, B, C and D where D is the outgroup. | ||
| """, | ||
| returns=""" | ||
| A NumPy float of the f4 value, standard error (SE) and Z-score. | ||
| """, | ||
| ) | ||
| def patterson_f4( | ||
| self, | ||
| a_query: base_params.sample_query, | ||
| b_query: base_params.sample_query, | ||
| c_query: base_params.sample_query, | ||
| d_query: base_params.sample_query, | ||
| region: base_params.region, | ||
| site_mask: Optional[base_params.site_mask] = base_params.DEFAULT, | ||
| n_jack: base_params.n_jack = 200, | ||
| max_missing_an: base_params.max_missing_an = 0, | ||
| segregating_mode: admixture_stats_params.segregating_mode = "recipient", | ||
| cohort_size: Optional[ | ||
| base_params.cohort_size | ||
| ] = admixture_stats_params.cohort_size_default, | ||
| min_cohort_size: Optional[ | ||
| base_params.min_cohort_size | ||
| ] = admixture_stats_params.min_cohort_size_default, | ||
| max_cohort_size: Optional[ | ||
| base_params.max_cohort_size | ||
| ] = admixture_stats_params.max_cohort_size_default, | ||
| ) -> Tuple[float, float, float]: | ||
| # Compute allele counts for the four cohorts. | ||
| aca = self.snp_allele_counts( | ||
| sample_query=a_query, | ||
| region=region, | ||
| site_mask=site_mask, | ||
| cohort_size=cohort_size, | ||
| min_cohort_size=min_cohort_size, | ||
| max_cohort_size=max_cohort_size, | ||
| ) | ||
|
|
||
| acb = self.snp_allele_counts( | ||
| sample_query=b_query, | ||
| region=region, | ||
| site_mask=site_mask, | ||
| cohort_size=cohort_size, | ||
| min_cohort_size=min_cohort_size, | ||
| max_cohort_size=max_cohort_size, | ||
| ) | ||
|
|
||
| acc = self.snp_allele_counts( | ||
| sample_query=c_query, | ||
| region=region, | ||
| site_mask=site_mask, | ||
| cohort_size=cohort_size, | ||
| min_cohort_size=min_cohort_size, | ||
| max_cohort_size=max_cohort_size, | ||
| ) | ||
|
|
||
| acd = self.snp_allele_counts( | ||
| sample_query=d_query, | ||
| region=region, | ||
| site_mask=site_mask, | ||
| cohort_size=cohort_size, | ||
| min_cohort_size=min_cohort_size, | ||
| max_cohort_size=max_cohort_size, | ||
| ) | ||
|
|
||
| # Locate biallelic variants and deal with missingness. | ||
| ac = acc + aca + acb | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why no
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is because D is the outgroup, its been a very long time since the stats function was written so my memory is hazy but I remember considering this point and I believe there would much less or not enough variation for analysis if the outgroup was also taken into account. We do not except them to share anywhere near as much variation as the other populations. |
||
| loc_bi = allel.AlleleCountsArray(ac).is_biallelic() | ||
| ana = np.sum(acc, axis=1) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's probably not going to cause an error, but that's very confusing!
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please could you give some more explanation here/a suggested change. Thanks.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, |
||
| anb = np.sum(aca, axis=1) | ||
| anc = np.sum(acb, axis=1) | ||
| an = np.sum(ac, axis=1) # no. required for sample size | ||
| n_chroms = an.max() | ||
| an_missing = n_chroms - an | ||
| # In addition to applying the max_missing_an threshold, also make sure that | ||
| # all four cohorts have nonzero allele counts. | ||
| loc_nomiss = (an_missing <= max_missing_an) & (anc > 0) & (ana > 0) & (anb > 0) | ||
| loc_sites = loc_bi & loc_nomiss | ||
| aca_bi = acc[loc_sites] | ||
| acb_bi = aca[loc_sites] | ||
| acc_bi = acb[loc_sites] | ||
| acd_bi = acd[loc_sites] | ||
| ac_bi = aca_bi + acb_bi + acc_bi + acd_bi | ||
|
|
||
| # Squeeze the allele counts so we end up with only two columns. | ||
| sqz_mapping = _remap_observed(ac_bi) | ||
| aca_sqz = allel.AlleleCountsArray(acc_bi).map_alleles(sqz_mapping, max_allele=1) | ||
| acb_sqz = allel.AlleleCountsArray(aca_bi).map_alleles(sqz_mapping, max_allele=1) | ||
| acc_sqz = allel.AlleleCountsArray(acb_bi).map_alleles(sqz_mapping, max_allele=1) | ||
| acd_sqz = allel.AlleleCountsArray(acd_bi).map_alleles(sqz_mapping, max_allele=1) | ||
|
|
||
| # Keep only segregating variants. | ||
| if segregating_mode == "recipient": | ||
| loc_seg = acc_sqz.is_segregating() | ||
| elif segregating_mode == "all": | ||
| loc_seg = ( | ||
| aca_sqz.is_segregating() | ||
| & acb_sqz.is_segregating() | ||
| & acc_sqz.is_segregating() | ||
| ) | ||
| else: | ||
| raise ValueError("Invalid value for 'segregating_mode' parameter.") | ||
| if loc_seg.sum() == 0: | ||
| raise ValueError("No segregating variants found") | ||
| else: | ||
| print(f"Using {loc_seg.sum()} segregating variants for F4 calculation.") | ||
| aca_seg = acc_sqz[loc_seg] | ||
| acb_seg = aca_sqz[loc_seg] | ||
| acc_seg = acb_sqz[loc_seg] | ||
| acd_seg = acd_sqz[loc_seg] | ||
|
|
||
| # Compute f4 statistic. | ||
| blen = acc_seg.shape[0] // n_jack | ||
| f4, se, z, _, _ = allel.average_patterson_d( | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not an expert here, but I think D and f4 are slightly different. The code for
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are correct, the description is slightly off and should be revised to D. |
||
| aca_seg, | ||
| acb_seg, | ||
| acc_seg, | ||
| acd_seg, | ||
| blen=blen, | ||
| ) | ||
|
|
||
| return f4, se, z | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,17 @@ | ||
| """Parameter definitions for admixture functions.""" | ||
|
|
||
| from typing import Optional | ||
| import typing_extensions | ||
| from . import base_params | ||
|
|
||
| cohort_size_default: Optional[base_params.cohort_size] = None | ||
| min_cohort_size_default: base_params.min_cohort_size = 10 | ||
| max_cohort_size_default: base_params.max_cohort_size = 50 | ||
|
|
||
| segregating_mode: typing_extensions.TypeAlias = typing_extensions.Annotated[ | ||
| str, | ||
| """ | ||
| Define the way segregating variants are chosen. Available options are "recipient" or "all" to use | ||
| sites segregating within both the recipient and source population cohorts. | ||
| """, | ||
| ] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't seem optimised but I don't have a better approach in mind right now.