From 5a0567ad89ed799bab439ad3df8e2a6134d6c60a Mon Sep 17 00:00:00 2001 From: 31puneet Date: Mon, 1 Jun 2026 14:58:56 +0000 Subject: [PATCH 1/2] handle Mapping cohorts in pca --- malariagen_data/anoph/pca.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/malariagen_data/anoph/pca.py b/malariagen_data/anoph/pca.py index d18e2842d..260a21d55 100644 --- a/malariagen_data/anoph/pca.py +++ b/malariagen_data/anoph/pca.py @@ -8,6 +8,7 @@ from ..util import CacheMiss, _check_types, _jitter from . import base_params, pca_params, plotly_params +from .sample_metadata import _locate_cohorts from .snp_data import AnophelesSnpData @@ -89,7 +90,7 @@ def pca( ) -> Tuple[pca_params.df_pca, pca_params.evr]: # Change this name if you ever change the behaviour of this function, to # invalidate any previously cached data. - name = "pca_v8" + name = "pca_v9" # Check that either sample_query xor sample_indices are provided. base_params._validate_sample_selection_params( @@ -118,9 +119,12 @@ def pca( sample_query_options=sample_query_options, ) # N.B., we are going to overwrite the sample_indices parameter here. - groups = df_samples.groupby(cohorts, sort=False) + coh_dict = _locate_cohorts( + cohorts=cohorts, data=df_samples, min_cohort_size=0 + ) ix = [] - for _, group in groups: + for _label, loc_coh in coh_dict.items(): + group = df_samples[loc_coh] if len(group) > max_cohort_size: ix.extend( group.sample( From 0de092bc4cf07b1c42c2fb76979a667ce87ca8ec Mon Sep 17 00:00:00 2001 From: 31puneet Date: Mon, 1 Jun 2026 16:12:27 +0000 Subject: [PATCH 2/2] running ci again