From f540e9e8f0e8d32d11db19bcd8af50f33ba1f2fc Mon Sep 17 00:00:00 2001 From: Aakash Ashok Naik <91958822+naik-aakash@users.noreply.github.com> Date: Sat, 21 Jun 2025 10:15:22 +0200 Subject: [PATCH 1/6] enhance sklearn RR --- modnet/sklearn.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/modnet/sklearn.py b/modnet/sklearn.py index ba59d150..58a29abe 100644 --- a/modnet/sklearn.py +++ b/modnet/sklearn.py @@ -77,7 +77,9 @@ class RR(TransformerMixin, BaseEstimator): """ def __init__( - self, n_feat: Union[None, int] = None, rr_parameters: Union[None, Dict] = None + self, n_feat: Union[None, int] = None, rr_parameters: Union[None, Dict] = None, + n_jobs: Union[None, int] = None, target_nmi_kwargs: Union[None, Dict] = None, + cross_nmi_kwargs: Union[None, Dict] = None ): """Constructor for RR transformer. @@ -87,6 +89,9 @@ def __init__( to constant values instead of using the dynamical evaluation. Expects to find keys `"p"` and `"c"`, containing either a callable that takes `n` as an argument and returns the desired `p` or `c`, or another dictionary containing the key `"value"` that stores a constant value of `p` or `c`. + n_jobs: max number of processes to use when calculating cross NMI. 
+ target_nmi_kwargs: Keyword arguments to be passed down to the modnet.preprocessing.nmi_target + cross_nmi_kwargs: Keyword arguments to be passed down to the modnet.preprocessing.get_cross_nmi """ self.n_feat = n_feat self.rr_parameters = rr_parameters @@ -109,9 +114,17 @@ def fit(self, X, y, nmi_feats_target=None, cross_nmi_feats=None): """ if cross_nmi_feats is None: - cross_nmi_feats = get_cross_nmi(X) + cross_nmi_feats = get_cross_nmi(X, n_jobs=n_jobs, **cross_nmi_kwargs) if nmi_feats_target is None: - nmi_feats_target = nmi_target(X, y) + nmi_feats_target = nmi_target(X, y, **target_nmi_kwargs) + + missing = [x for x in cross_nmi_feats.index if x not in nmi_feats_target.index] + cross_nmi_feats = cross_nmi_feats.drop(missing, axis=0).drop(missing, axis=1) + + missing = [x for x in nmi_feats_target.index if x not in cross_nmi_feats.index] + nmi_feats_target = nmi_feats_target.drop(missing, axis=0) + nmi_feats_target = nmi_feats_target.astype(np.float64) + rr_results = get_features_relevance_redundancy( nmi_feats_target, @@ -121,6 +134,8 @@ def fit(self, X, y, nmi_feats_target=None, cross_nmi_feats=None): ) self.optimal_descriptors = [x["feature"] for x in rr_results] + return self + def transform(self, X, y=None): """Transform the inputs X based on a fitted RR analysis. The best n_feat features are kept and returned. From 8252fa08b075cc060234710e6e5ed18dba9e0c1a Mon Sep 17 00:00:00 2001 From: naik-aakash Date: Sat, 21 Jun 2025 10:28:58 +0200 Subject: [PATCH 2/6] fix linting and undefined variables --- modnet/sklearn.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/modnet/sklearn.py b/modnet/sklearn.py index 58a29abe..31af45ab 100644 --- a/modnet/sklearn.py +++ b/modnet/sklearn.py @@ -24,6 +24,7 @@ (see https://scikit-learn.org/stable/developers/develop.html#instantiation). 
""" +import numpy as np from sklearn.base import BaseEstimator from sklearn.base import RegressorMixin from sklearn.base import TransformerMixin @@ -77,9 +78,12 @@ class RR(TransformerMixin, BaseEstimator): """ def __init__( - self, n_feat: Union[None, int] = None, rr_parameters: Union[None, Dict] = None, - n_jobs: Union[None, int] = None, target_nmi_kwargs: Union[None, Dict] = None, - cross_nmi_kwargs: Union[None, Dict] = None + self, + n_feat: Union[None, int] = None, + rr_parameters: Union[None, Dict] = None, + n_jobs: Union[None, int] = None, + cross_nmi_kwargs: Union[None, Dict] = None, + target_nmi_kwargs: Union[None, Dict] = None, ): """Constructor for RR transformer. @@ -90,12 +94,15 @@ def __init__( containing either a callable that takes `n` as an argument and returns the desired `p` or `c`, or another dictionary containing the key `"value"` that stores a constant value of `p` or `c`. n_jobs: max number of processes to use when calculating cross NMI. - target_nmi_kwargs: Keyword arguments to be passed down to the modnet.preprocessing.nmi_target cross_nmi_kwargs: Keyword arguments to be passed down to the modnet.preprocessing.get_cross_nmi + target_nmi_kwargs: Keyword arguments to be passed down to the modnet.preprocessing.nmi_target """ self.n_feat = n_feat self.rr_parameters = rr_parameters self.optimal_descriptors = [] + self.n_jobs = n_jobs + self.cross_nmi_kwargs = cross_nmi_kwargs if cross_nmi_kwargs is not None else {} + self.target_nmi_kwargs = target_nmi_kwargs if target_nmi_kwargs is not None else {} def fit(self, X, y, nmi_feats_target=None, cross_nmi_feats=None): """Ranking of the features. This is based on relevance and redundancy provided as NMI dataframes. 
@@ -114,9 +121,9 @@ def fit(self, X, y, nmi_feats_target=None, cross_nmi_feats=None): """ if cross_nmi_feats is None: - cross_nmi_feats = get_cross_nmi(X, n_jobs=n_jobs, **cross_nmi_kwargs) + cross_nmi_feats = get_cross_nmi(X, n_jobs=self.n_jobs, **self.cross_nmi_kwargs) if nmi_feats_target is None: - nmi_feats_target = nmi_target(X, y, **target_nmi_kwargs) + nmi_feats_target = nmi_target(X, y, **self.target_nmi_kwargs) missing = [x for x in cross_nmi_feats.index if x not in nmi_feats_target.index] cross_nmi_feats = cross_nmi_feats.drop(missing, axis=0).drop(missing, axis=1) @@ -125,7 +132,6 @@ def fit(self, X, y, nmi_feats_target=None, cross_nmi_feats=None): nmi_feats_target = nmi_feats_target.drop(missing, axis=0) nmi_feats_target = nmi_feats_target.astype(np.float64) - rr_results = get_features_relevance_redundancy( nmi_feats_target, cross_nmi_feats, From 703e3819db6cb991900e29355b459407f930232d Mon Sep 17 00:00:00 2001 From: naik-aakash Date: Sat, 21 Jun 2025 10:32:54 +0200 Subject: [PATCH 3/6] fix linting and undefined variables --- modnet/sklearn.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modnet/sklearn.py b/modnet/sklearn.py index 31af45ab..7e550655 100644 --- a/modnet/sklearn.py +++ b/modnet/sklearn.py @@ -102,7 +102,9 @@ def __init__( self.optimal_descriptors = [] self.n_jobs = n_jobs self.cross_nmi_kwargs = cross_nmi_kwargs if cross_nmi_kwargs is not None else {} - self.target_nmi_kwargs = target_nmi_kwargs if target_nmi_kwargs is not None else {} + self.target_nmi_kwargs = ( + target_nmi_kwargs if target_nmi_kwargs is not None else {} + ) def fit(self, X, y, nmi_feats_target=None, cross_nmi_feats=None): """Ranking of the features. This is based on relevance and redundancy provided as NMI dataframes. 
@@ -121,7 +123,9 @@ def fit(self, X, y, nmi_feats_target=None, cross_nmi_feats=None): """ if cross_nmi_feats is None: - cross_nmi_feats = get_cross_nmi(X, n_jobs=self.n_jobs, **self.cross_nmi_kwargs) + cross_nmi_feats = get_cross_nmi( + X, n_jobs=self.n_jobs, **self.cross_nmi_kwargs + ) if nmi_feats_target is None: nmi_feats_target = nmi_target(X, y, **self.target_nmi_kwargs) From c884a506695d30553e905a5b12c82c49a0ff0d87 Mon Sep 17 00:00:00 2001 From: naik-aakash Date: Tue, 24 Jun 2025 18:19:20 +0200 Subject: [PATCH 4/6] adapt RR sklearn to work for multiple targets --- modnet/sklearn.py | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/modnet/sklearn.py b/modnet/sklearn.py index 7e550655..41277425 100644 --- a/modnet/sklearn.py +++ b/modnet/sklearn.py @@ -33,7 +33,9 @@ get_features_relevance_redundancy, get_cross_nmi, nmi_target, + merge_ranked, ) +from modnet.utils import LOG class MODNetFeaturizer(TransformerMixin, BaseEstimator): @@ -100,6 +102,7 @@ def __init__( self.n_feat = n_feat self.rr_parameters = rr_parameters self.optimal_descriptors = [] + self.optimal_features_by_target = {} self.n_jobs = n_jobs self.cross_nmi_kwargs = cross_nmi_kwargs if cross_nmi_kwargs is not None else {} self.target_nmi_kwargs = ( @@ -126,23 +129,37 @@ def fit(self, X, y, nmi_feats_target=None, cross_nmi_feats=None): cross_nmi_feats = get_cross_nmi( X, n_jobs=self.n_jobs, **self.cross_nmi_kwargs ) + + ranked_lists = [] + if nmi_feats_target is None: - nmi_feats_target = nmi_target(X, y, **self.target_nmi_kwargs) + for name in list(y): + LOG.info(f"Starting NMI computations for target {name}") + X_temp = X.copy() + y_temp = y[[name]] - missing = [x for x in cross_nmi_feats.index if x not in nmi_feats_target.index] - cross_nmi_feats = cross_nmi_feats.drop(missing, axis=0).drop(missing, axis=1) + nmi_feats_target = nmi_target(X_temp, y_temp, **self.target_nmi_kwargs) - missing = [x for x in nmi_feats_target.index if x not 
in cross_nmi_feats.index] - nmi_feats_target = nmi_feats_target.drop(missing, axis=0) - nmi_feats_target = nmi_feats_target.astype(np.float64) + missing = [ + x for x in nmi_feats_target.index if x not in cross_nmi_feats.index + ] + nmi_feats_target = nmi_feats_target.drop(missing, axis=0) + nmi_feats_target = nmi_feats_target.astype(np.float64) - rr_results = get_features_relevance_redundancy( - nmi_feats_target, - cross_nmi_feats, - n_feat=self.n_feat, - rr_parameters=self.rr_parameters, - ) - self.optimal_descriptors = [x["feature"] for x in rr_results] + rr_results = get_features_relevance_redundancy( + nmi_feats_target, + cross_nmi_feats, + n_feat=self.n_feat, + rr_parameters=self.rr_parameters, + ) + + self.optimal_features_by_target[name] = [ + x["feature"] for x in rr_results + ] + + ranked_lists.append(self.optimal_features_by_target[name]) + + self.optimal_descriptors = merge_ranked(ranked_lists) return self From 50020dd111da5ae2f0d6ea26db1bc413a6f30052 Mon Sep 17 00:00:00 2001 From: naik-aakash Date: Wed, 25 Jun 2025 06:06:26 +0200 Subject: [PATCH 5/6] fix accidental error introduced for the case where precomputed NMIs are provided --- modnet/sklearn.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/modnet/sklearn.py b/modnet/sklearn.py index 41277425..bddb0546 100644 --- a/modnet/sklearn.py +++ b/modnet/sklearn.py @@ -125,13 +125,13 @@ def fit(self, X, y, nmi_feats_target=None, cross_nmi_feats=None): Fitted RR transformer """ + ranked_lists = [] + if cross_nmi_feats is None: cross_nmi_feats = get_cross_nmi( X, n_jobs=self.n_jobs, **self.cross_nmi_kwargs ) - ranked_lists = [] - if nmi_feats_target is None: for name in list(y): LOG.info(f"Starting NMI computations for target {name}") @@ -159,7 +159,17 @@ def fit(self, X, y, nmi_feats_target=None, cross_nmi_feats=None): ranked_lists.append(self.optimal_features_by_target[name]) - self.optimal_descriptors = merge_ranked(ranked_lists) + rr_results = 
get_features_relevance_redundancy( + nmi_feats_target, + cross_nmi_feats, + n_feat=self.n_feat, + rr_parameters=self.rr_parameters, + ) + + if ranked_lists: + self.optimal_descriptors = merge_ranked(ranked_lists) + else: + self.optimal_descriptors = [x["feature"] for x in rr_results] return self From f2654d551fb5d15275dce48da2bf727a6ddf76ce Mon Sep 17 00:00:00 2001 From: naik-aakash Date: Wed, 25 Jun 2025 06:17:46 +0200 Subject: [PATCH 6/6] fix duplicated rr computations --- modnet/sklearn.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/modnet/sklearn.py b/modnet/sklearn.py index bddb0546..50a929ce 100644 --- a/modnet/sklearn.py +++ b/modnet/sklearn.py @@ -159,16 +159,15 @@ def fit(self, X, y, nmi_feats_target=None, cross_nmi_feats=None): ranked_lists.append(self.optimal_features_by_target[name]) - rr_results = get_features_relevance_redundancy( - nmi_feats_target, - cross_nmi_feats, - n_feat=self.n_feat, - rr_parameters=self.rr_parameters, - ) - if ranked_lists: self.optimal_descriptors = merge_ranked(ranked_lists) else: + rr_results = get_features_relevance_redundancy( + nmi_feats_target, + cross_nmi_feats, + n_feat=self.n_feat, + rr_parameters=self.rr_parameters, + ) self.optimal_descriptors = [x["feature"] for x in rr_results] return self