From dda054369e8a8ae9b8d166a7116ab983edfeaf8b Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 18 Mar 2026 18:22:03 +0100
Subject: [PATCH 01/44] fix: Parallelization broken in lpca and different
 parallelism backends unified. Removed duplicated imports.

---
 src/pyRATS/_utils.py | 110 ++++++++++++-------------------------------
 1 file changed, 29 insertions(+), 81 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index fd2dd9c..fa9fd0c 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -3,18 +3,12 @@
 from scipy.linalg import svd, svdvals
 from scipy.sparse.linalg import svds
 from sklearn.decomposition import KernelPCA
-from scipy.sparse import csr_matrix, triu
+from scipy.sparse import csr_matrix, triu, block_diag
 import itertools
 
 from joblib import delayed, Parallel
-import multiprocess as mp
-from multiprocess import shared_memory
-
-from scipy.sparse.linalg import svds
-from scipy.linalg import svd
 
 from scipy.spatial.distance import pdist, squareform
-from scipy.sparse import csr_matrix, block_diag
 from sklearn.utils.extmath import svd_flip
 from scipy.sparse.csgraph import (
     minimum_spanning_tree,
@@ -105,7 +99,6 @@ def target_proc(p_num, chunk_sz):
     results = Parallel(n_jobs=n_jobs)(
         delayed(target_proc)(i, chunk_sz) for i in range(n_jobs)
     )
-    results = [target_proc(0, n)]
 
     for i in range(len(results)):
         start_ind, end_ind, Psi_, mu_, var_explained_, n_pc_dir_chosen_ = results[i]
@@ -165,7 +158,7 @@ def kpca(X, d, U, kernel, fit_inverse_transform, n_jobs, verbose=False):
     local_param.model = np.empty(n, dtype=object)
     local_param.zeta = np.zeros(n)
 
-    def target_proc(p_num, chunk_sz, q_):
+    def target_proc(p_num, chunk_sz):
         start_ind = p_num * chunk_sz
         if p_num == (n_jobs - 1):
             end_ind = n
@@ -184,26 +177,16 @@ def target_proc(p_num, chunk_sz, q_):
             U_k = U[k]
             X_k = X[U_k, :]
             model_[k - start_ind].fit(X_k)
-        q_.put((start_ind, end_ind, model_))
+        return start_ind, end_ind, model_
 
-    q_ = mp.Queue()
     chunk_sz = int(n / n_jobs)
-    proc = []
-    for p_num in range(n_jobs):
-        proc.append(
-            mp.Process(target=target_proc, args=(p_num, chunk_sz, q_), daemon=True)
-        )
-        proc[-1].start()
+    results = Parallel(n_jobs=n_jobs)(
+        delayed(target_proc)(p_num, chunk_sz) for p_num in range(n_jobs)
+    )
 
-    for p_num in range(n_jobs):
-        start_ind, end_ind, model_ = q_.get()
+    for start_ind, end_ind, model_ in results:
         local_param.model[start_ind:end_ind] = model_
 
-    q_.close()
-
-    for p_num in range(n_jobs):
-        proc[p_num].join()
-
     if verbose:
         print("local_param: all %d points processed..." % n)
     return local_param
@@ -673,20 +656,6 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs):
     cost = np.zeros(n) + np.inf
     dest = np.zeros(n, dtype="int") - 1
 
-    shm_cost = shared_memory.SharedMemory(create=True, size=cost.nbytes)
-    np_cost = np.ndarray(cost.shape, dtype=cost.dtype, buffer=shm_cost.buf)
-    np_cost[:] = cost[:]
-    shm_dest = shared_memory.SharedMemory(create=True, size=dest.nbytes)
-    np_dest = np.ndarray(dest.shape, dtype=dest.dtype, buffer=shm_dest.buf)
-    np_dest[:] = dest[:]
-
-    shm_cost_name = shm_cost.name
-    cost_shape = cost.shape
-    cost_dtype = cost.dtype
-    shm_dest_name = shm_dest.name
-    dest_shape = dest.shape
-    dest_dtype = dest.dtype
-
     # Vary eta from 2 to eta_{min}
     if verbose:
         print("Constructing intermediate views.")
@@ -699,50 +668,34 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs):
             )
             print("#nodes in views with sz < %d = %d" % (eta, np.sum(n_C[c] < eta)))
 
-        def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c, S):
-            existing_shm_cost = shared_memory.SharedMemory(name=shm_cost_name)
-            cost_ = np.ndarray(
-                cost_shape, dtype=cost_dtype, buffer=existing_shm_cost.buf
-            )
-            existing_shm_dest = shared_memory.SharedMemory(name=shm_dest_name)
-            dest_ = np.ndarray(
-                dest_shape, dtype=dest_dtype, buffer=existing_shm_dest.buf
-            )
-
+        def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
             start_ind = p_num * chunk_sz
             if p_num == (n_jobs - 1):
                 end_ind = n_
             else:
                 end_ind = (p_num + 1) * chunk_sz
 
-            for k in range(start_ind, end_ind):
-                if S is None:
-                    k1 = k
-                else:
-                    k1 = S[k]
-                cost_[k1], dest_[k1] = cost_of_moving(
-                    k1, d_e, neigh_ind[k1], U_[k1], param, c, n_C, Utilde, eta, eta_max
+            cost_ = np.zeros(end_ind - start_ind) + np.inf
+            dest_ = np.zeros(end_ind - start_ind, dtype="int") - 1
+            for i, k in enumerate(range(start_ind, end_ind)):
+                cost_[i], dest_[i] = cost_of_moving(
+                    k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max
                 )
+            return start_ind, end_ind, cost_, dest_
 
-        proc = []
         chunk_sz = int(n / n_jobs)
-        for p_num in range(n_jobs):
-            proc.append(
-                mp.Process(
-                    target=target_proc,
-                    args=(p_num, chunk_sz, n, Utilde, n_C, c, None),
-                    daemon=True,
-                )
-            )
-            proc[-1].start()
-
-        for p_num in range(n_jobs):
-            proc[p_num].join()
+        results = Parallel(n_jobs=n_jobs)(
+            delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c)
+            for p_num in range(n_jobs)
+        )
+        for start_ind, end_ind, cost_, dest_ in results:
+            cost[start_ind:end_ind] = cost_
+            dest[start_ind:end_ind] = dest_
 
         # Compute point with minimum cost
         # Compute k and cost^*
-        k = np.argmin(np_cost)
-        cost_star = np_cost[k]
+        k = np.argmin(cost)
+        cost_star = cost[k]
 
         if verbose:
             print("Costs computed when eta = %d." % eta)
@@ -751,11 +704,10 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c, S):
         total_len_S = 0
         ctr = 0
         while cost_star < np.inf:
-            # print(k, cost_star, flush=True)
             # Move x_k from cluster s to
             # dest_k and update variables
             s = c[k]
-            dest_k = np_dest[k]
+            dest_k = dest[k]
             c[k] = dest_k
             n_C[s] -= 1
             n_C[dest_k] += 1
@@ -769,23 +721,23 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c, S):
             if n_C[s] > 0:
                 S_ = (
                     (c == dest_k)
-                    | (np_dest == dest_k)
+                    | (dest == dest_k)
                     | np.array(U[:, list(Clstr[s])].sum(1), dtype=bool).flatten()
                 )
             else:
-                S_ = (c == dest_k) | (np_dest == dest_k) | (np_dest == s)
+                S_ = (c == dest_k) | (dest == dest_k) | (dest == s)
             S = np.where(S_)[0].tolist()
             len_S = len(S)
             total_len_S += len_S
             ctr += 1
 
             for k in S:
-                np_cost[k], np_dest[k] = cost_of_moving(
+                cost[k], dest[k] = cost_of_moving(
                     k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max
                 )
 
-            k = np.argmin(np_cost)
-            cost_star = np_cost[k]
+            k = np.argmin(cost)
+            cost_star = cost[k]
 
         if verbose:
             print(
@@ -798,10 +750,6 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c, S):
             )
             print("Done with eta = %d." % eta)
 
-    shm_cost.close()
-    shm_cost.unlink()
-    shm_dest.close()
-    shm_dest.unlink()
     return c, n_C
 
 
From a4e1851b734746ef60b6fd5327d7d76d37496184 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Tue, 24 Mar 2026 19:12:29 +0000
Subject: [PATCH 02/44] Tests: Update to CI for new parallel library changes
 plus fix for randomization issue

---
 .github/workflows/ci.yml            | 2 +-
 .gitignore                          | 1 +
 tests/scripts/generate_snapshots.py | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 34c61d6..a0cc7e6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -50,7 +50,7 @@ jobs:
     # Step 2: Generate Expected Baselines (using baseline source code but current scripts)
     - name: Generate Expected Baselines
       run: |
-        git checkout 8e27b77b3247e1ac7447e7ea429e0f0c103a3fb0 -- src/
+        git checkout dda054369e8a8ae9b8d166a7116ab983edfeaf8b -- src/
         python tests/scripts/generate_snapshots.py --path-to-results-dir tests/data/expected ${{ steps.set-flag.outputs.fast_flag }}
         git checkout HEAD -- src/
 
diff --git a/.gitignore b/.gitignore
index b9017f9..1cfc639 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 src/pyRATS/__pycache__/
 src/pyRATS.egg-info
+tests/data
 .vscode
 .DS_Store
 __pycache__
\ No newline at end of file
diff --git a/tests/scripts/generate_snapshots.py b/tests/scripts/generate_snapshots.py
index 21cbf4a..31ad21d 100644
--- a/tests/scripts/generate_snapshots.py
+++ b/tests/scripts/generate_snapshots.py
@@ -127,6 +127,6 @@ def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=Fal
 
     argslist = list(itertools.product(ks, eta_mins, cost_fn_names, dataset_names))
 
-    Parallel(n_jobs=-1)(
+    Parallel(n_jobs=1)(
         delayed(run_model)(*args, dirpath, force_compute) for args in argslist
     )

From 6aac4b186b47f4f349c76c5c5275b45f617dbf59 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 18 Mar 2026 18:25:28 +0100
Subject: [PATCH 03/44] speedup: Parallelization of post-processing neighbour
 distance measures using smart triu and neighbourhood graph checks

---
 src/pyRATS/rats.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index 78c7063..e6be6f9 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -1,6 +1,5 @@
 import numpy as np
 from scipy.sparse import csr_matrix
-from scipy.spatial.distance import squareform
 
 from multiprocessing import cpu_count
 
@@ -429,12 +428,17 @@ def _postprocess(self):
 
         n = self.U.shape[0]
 
-        original_dists = np.empty((n, int((self.k * (self.k - 1) // 2))))
-        for i in range(n):
-            U_i = self.U[i, :].indices
-            d_e_i = self.neighborhood_graph_sym[np.ix_(U_i, U_i)]
-            d_e_mask_ = squareform(d_e_i.toarray())
-            original_dists[i] = d_e_mask_  # These can be stored and reused
+        # Vectorised: gather all pairwise neighbour distances at once.
+        # neigh_inds[i] are the k neighbour indices of point i (same as
+        # connectivity_matrix used below, extracted once here for clarity).
+        neigh_inds = self.U.indices.reshape(n, self.k)  # (n, k)
+        pair_i, pair_j = np.triu_indices(self.k, k=1)   # all upper-triangle pairs
+        row_idx = neigh_inds[:, pair_i].ravel()           # (n * num_pairs,)
+        col_idx = neigh_inds[:, pair_j].ravel()           # (n * num_pairs,)
+        num_pairs = len(pair_i)
+        original_dists = np.asarray(
+            self.neighborhood_graph_sym[row_idx, col_idx]
+        ).reshape(n, num_pairs)
 
         embedded_datapoints = self.param.batched_eval_(
             {

From 803570166a6ca631e2eeb833b09219f2dd104a04 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 18 Mar 2026 18:30:24 +0100
Subject: [PATCH 04/44] speedup: Vectorizing pdist

---
 src/pyRATS/_utils.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index fa9fd0c..b9e9114 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -243,16 +243,10 @@ def batched_pdist(x):
         x = np.array(x)
 
     n, k, _ = x.shape
-    res = np.empty((n, k * (k - 1) // 2))
-    p = 0
-    for i in range(k):
-        a = x[:, i]
-        bs = x[:, i + 1 :]
-
-        cs = bs - a[:, None, :]
-        res[:, p : p + k - i - 1] = np.sum(cs * cs, axis=-1)
-        p += k - i - 1
-    return np.sqrt(res)
+    # Precompute all upper-triangle pair indices (same order as the original loop)
+    pair_i, pair_j = np.triu_indices(k, k=1)          # each shape (num_pairs,)
+    diff = x[:, pair_i, :] - x[:, pair_j, :]          # (n, num_pairs, d)
+    return np.sqrt(np.sum(diff * diff, axis=-1))        # (n, num_pairs)
 
 
 def batched_procrustes_cost(X, Y):

From 01ab3c88fa66bc182b15fcbcda06c56071c8664b Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 18 Mar 2026 18:33:17 +0100
Subject: [PATCH 05/44] fix: Improving error handling and variable descriptions

---
 src/pyRATS/rats.py | 89 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 81 insertions(+), 8 deletions(-)

diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index e6be6f9..4041ab4 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -1,4 +1,5 @@
 import numpy as np
+import warnings
 from scipy.sparse import csr_matrix
 
 from multiprocessing import cpu_count
@@ -32,8 +33,11 @@ class RATS:
         If True, searches for local embeddings that minimize the local distortion.
         Useful for noisy manifolds.
 
-    kpca_kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'} or Callable, default='linear'
-        Kernel used for PCA. See https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html for more information.
+    kpca_kernel : {None, 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'} or Callable, default=None
+        Kernel used for PCA. None uses standard linear PCA (lpca). Any other value
+        uses sklearn.decomposition.KernelPCA. See
+        https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html
+        for more information.
 
     cost_fn_name : {'alignment', 'distortion'}, default='alignment'
         'alignment': Alignment error should be prefered when runtime matters or dealing with noisy manifolds.
@@ -108,18 +112,40 @@ def __init__(
         eps=1e-8,
         patience=5,
         tol=1e-2,
-        n_connected_manifolds=1,
+        n_forced_clusters=1,
         verbose=False,
         n_jobs=-1,
     ):
 
-        assert cost_fn_name in ["alignment", "distortion"]
+        if cost_fn_name not in ["alignment", "distortion"]:
+            raise ValueError(
+                f"cost_fn_name must be 'alignment' or 'distortion', got {cost_fn_name!r}."
+            )
+        if eta_max <= eta_min:
+            raise ValueError(
+                f"eta_max ({eta_max}) must be greater than eta_min ({eta_min})."
+            )
+        if metric != "euclidean":
+            warnings.warn(
+                f"metric={metric!r} is not yet fully supported. Only 'euclidean' is "
+                "used in the embedding step. This parameter will be extended in a "
+                "future release.",
+                FutureWarning,
+                stacklevel=2,
+            )
+        if kpca_fit_inverse_transform:
+            warnings.warn(
+                "kpca_fit_inverse_transform=True is not yet supported and will be "
+                "ignored. Inverse transform support will be added in a future release.",
+                FutureWarning,
+                stacklevel=2,
+            )
 
         self.verbose = verbose
         self.n_jobs = n_jobs
 
         self.d = d
-        self.kpca_kernel = None if kpca_kernel == "linear" else kpca_kernel
+        self.kpca_kernel = kpca_kernel
         self.kpca_fit_inverse_transform = kpca_fit_inverse_transform
         self.k = k
         if cost_fn_name == "distortion":
@@ -134,9 +160,8 @@ def __init__(
         self.max_iter, self.max_internal_iter = max_iter, max_internal_iter
         self.alpha, self.eps = alpha, eps
         self.patience, self.tol = patience, tol
-        self.verbose = verbose
         self.metric = metric
-        self.n_forced_clusters = n_connected_manifolds
+        self.n_forced_clusters = n_forced_clusters
 
         if self.patience is None:
             self.patience = self.max_iter
@@ -208,8 +233,56 @@ def compute_color_of_pts_on_tear(self, y, tear_color_eig_inds=[0, 1, 2]):
             self.Utildeg,
         )
 
+    def fit(self, X):
+        """Fit the model on X without returning the embedding.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            Sample data.
+
+        Returns
+        -------
+        self
+        """
+        raise NotImplementedError(
+            "fit() is not yet implemented. Use fit_transform() instead."
+        )
+
+    def transform(self, X):
+        """Apply the fitted embedding to new data X.
+
+        Parameters
+        ----------
+        X : array-like, shape (n_samples, n_features)
+            New data to embed.
+
+        Returns
+        -------
+        y : array-like, shape (n_samples, d)
+            X transformed in the new space.
+        """
+        raise NotImplementedError(
+            "transform() is not yet implemented. Use fit_transform() instead."
+        )
+
     def inverse_transform(self, y):
-        NotImplementedError()
+        """Map points from the embedding space back to the original space.
+
+        Parameters
+        ----------
+        y : array-like, shape (n_samples, d)
+            Points in the embedding space.
+
+        Returns
+        -------
+        X : array-like, shape (n_samples, n_features)
+            Points in the original space.
+        """
+        raise NotImplementedError(
+            "inverse_transform() is not yet implemented."
+        )
+
 
     def _fit_nbrhd_graph(self, X, sort_results=True):
         """Fitting the neighborhood graph.

From 0e2313bc18e73f348a000fbd8b66102152515d5c Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 18 Mar 2026 18:44:58 +0100
Subject: [PATCH 06/44] style: Code cleanups for neatness and readability.
 Removed unused procrustes code.

---
 pyproject.toml       |   2 +-
 src/pyRATS/_utils.py | 120 ++++++++++---------------------------------
 src/pyRATS/rats.py   |  14 ++---
 3 files changed, 33 insertions(+), 103 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index aae6249..4b63c93 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ where = ["src"]
 [project]
 name = "pyRATS"
 version = "0.0.1"
-description = ""
+description = "Riemannian Alignment of Tangent Spaces (RATS) for dimensionality reduction"
 readme = "README.md"
 license-files = ["License"]
 requires-python = ">=3.9"
diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index b9e9114..36ba659 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -452,7 +452,7 @@ def cost_of_moving_distortion(
             # that is cost_{x_k \rightarrow m}
             cost_x_k_to[i] = compute_zeta(
                 d_e[np.ix_(U_k_U_Utilde_m, U_k_U_Utilde_m)],
-                local_param.eval_({"view_index": m, "data_mask": U_k_U_Utilde_m}),
+                local_param.eval_(m, U_k_U_Utilde_m),
             )
 
         i += 1
@@ -812,8 +812,8 @@ def target_proc(p_num):
             m = W_rows[i]
             mpp = W_cols[i]
             mask = i_mat[m, :].multiply(i_mat[mpp, :]).nonzero()[1]
-            V_mmp = param.eval_({"view_index": m, "data_mask": mask})
-            V_mpm = param.eval_({"view_index": mpp, "data_mask": mask})
+            V_mmp = param.eval_(m, mask)
+            V_mpm = param.eval_(mpp, mask)
             Vbar_mmp = V_mmp - np.mean(V_mmp, 0)[np.newaxis, :]
             Vbar_mpm = V_mpm - np.mean(V_mpm, 0)[np.newaxis, :]
             # Compute ambiguity of the overlaps captured by singular values
@@ -971,7 +971,7 @@ def compute_init_embedding(
         seq_0 = seq[0]
         is_visited_view[seq_0] = True
         y[C[seq_0, :].indices, :] = param.eval_(
-            {"view_index": seq_0, "data_mask": C[seq_0, :].indices}
+            seq_0, C[seq_0, :].indices
         )
         y, is_visited_view = procrustes_init(
             seq, rho, y, is_visited_view, d, Utilde, C, param
@@ -1286,7 +1286,7 @@ def build_ortho_optim(d, Utilde, param, verbose):
 
     for i in range(M):
         Utilde_i = Utilde[i, :].indices
-        X_ = param.eval_({"view_index": i, "data_mask": Utilde_i})
+        X_ = param.eval_(i, Utilde_i)
         sqrt_p_ki = np.sqrt(np.array(W[i, :].data).flatten()[:, None])
         X_ = sqrt_p_ki * X_
         D.append(np.matmul(X_.T, X_))
@@ -1392,14 +1392,14 @@ def procrustes_init(seq, rho, y, is_visited_view, d, Utilde, C, param):
             Utilde_s_mp = Utilde_s.multiply(Utilde[mp, :]).nonzero()[1]
             n_Utilde_s_Z_s[Utilde_s_mp] += 1
             mu_s[Utilde_s_mp, :] += param.eval_(
-                {"view_index": mp, "data_mask": Utilde_s_mp}
+                mp, Utilde_s_mp
             )
 
         # Compute T_s and v_s by aligning the embedding of the overlap
         # between sth view and the views in Z_s, with the centroid mu_s
         temp = n_Utilde_s_Z_s > 0
         mu_s = mu_s[temp, :] / n_Utilde_s_Z_s[temp, np.newaxis]
-        V_s_Z_s = param.eval_({"view_index": s, "data_mask": temp})
+        V_s_Z_s = param.eval_(s, temp)
 
         T_s, v_s = procrustes(V_s_Z_s, mu_s)
 
@@ -1412,72 +1412,11 @@ def procrustes_init(seq, rho, y, is_visited_view, d, Utilde, C, param):
 
         # Compute global embedding of point in sth cluster
         C_s = C[s, :].indices
-        y[C_s, :] = param.eval_({"view_index": s, "data_mask": C_s})
+        y[C_s, :] = param.eval_(s, C_s)
     return y, is_visited_view
 
 
-def procrustes_final(
-    y, d, Utilde, C, intermed_param, seq_of_intermed_views_in_cluster, global_opts
-):
-    M, n = Utilde.shape
-    # Traverse over intermediate views in a random order
-    seq = np.random.permutation(M)
-    is_first_view_in_cluster = np.zeros(M, dtype=bool)
-    # is_first_view_in_cluster[i] = True if the ith view is the first
-    # view in some cluster of views
-    for i in range(len(seq_of_intermed_views_in_cluster)):
-        is_first_view_in_cluster[seq_of_intermed_views_in_cluster[i][0]] = True
-
-    y_due_to_all_views = []
-    for k in range(n):
-        y_due_to_all_views.append({})
-
-    for s in range(M):
-        Utilde_s = Utilde[s, :].nonzero()[1]
-        y_Utilde_s = intermed_param.eval_({"view_index": s, "data_mask": Utilde_s})
-        for k in range(len(Utilde_s)):
-            y_due_to_all_views[Utilde_s[k]][s] = y_Utilde_s[k, :]
-
-    # For a given seq, refine the global embedding
-    for _ in range(global_opts["max_internal_iter"]):
-        for s in seq.tolist():
-            # # Never refine s_0th intermediate view
-            # if is_first_view_in_cluster[s]:
-            #     continue
-
-            Utilde_s = Utilde[s, :].nonzero()[1]
-            mu_s = []
-            Utilde_s_ = []
-            for k_ in range(len(Utilde_s)):
-                k = Utilde_s[k_]
-                if len(y_due_to_all_views[k]) == 1:
-                    continue
-                Utilde_s_.append(k)
-                mu = np.array(list(y_due_to_all_views[k].values())).sum(axis=0)
-                mu = (mu - y_due_to_all_views[k][s]) / (len(y_due_to_all_views[k]) - 1)
-                mu_s.append(mu)
-            mu_s = np.array(mu_s)
-            Utilde_s_ = np.array(Utilde_s_)
-
-            # Compute T_s and v_s by aligning the embedding of the overlap
-            # between sth view and the views in Z_s, with the centroid mu_s
-            V_s_Z_s = intermed_param.eval_({"view_index": s, "data_mask": Utilde_s_})
-            T_s, v_s = procrustes(V_s_Z_s, mu_s)
-
-            # Update T_s, v_s
-            intermed_param.T[s, :, :] = np.matmul(intermed_param.T[s, :, :], T_s)
-            intermed_param.v[s, :] = (
-                np.matmul(intermed_param.v[s, :][np.newaxis, :], T_s) + v_s
-            )
 
-            y_Utilde_s = intermed_param.eval_({"view_index": s, "data_mask": Utilde_s})
-            for k in range(len(Utilde_s)):
-                y_due_to_all_views[Utilde_s[k]][s] = y_Utilde_s[k, :]
-
-    y = np.zeros((n, d))
-    for k in range(n):
-        y[k, :] = np.array(list(y_due_to_all_views[k].values())).mean(axis=0)
-    return y
 
 
 class Param:
@@ -1512,28 +1451,24 @@ def __init__(self, algo="lpca", **kwargs):
 
         # For KPCA etc
         self.model = None
-        self.X = None
-        self.y = None
 
         self.add_dim = False
         self.standardize = False
 
-    def batched_eval_(self, opts):
+    def batched_eval_(self, view_index, data_mask):
         """Maps multiple points to the new space through their local (K-)PCA prameterizations.
 
         Parameters
         ------
-        opts : dict
-            Must have keys
-                view_index : array-like, shape (n_points)
-                    The indices of the parameterization to use.
+        view_index : array-like, shape (n_points)
+            The indices of the parameterization to use.
 
-                data_mask : array-like, shape (n_points, n_neighbors)
-                    List of points to map to the embedding dimension for each point in 'view_index'
+        data_mask : array-like, shape (n_points, n_neighbors)
+            List of points to map to the embedding dimension for each point in 'view_index'
         """
 
-        ks = opts["view_index"]
-        masks = opts["data_mask"]
+        ks = view_index
+        masks = data_mask
 
         if self.algo == "lpca":
             temp = np.matmul(
@@ -1568,22 +1503,23 @@ def batched_eval_(self, opts):
                 temp = temp + self.v[[ks], :]
         return temp
 
-    def eval_(self, opts, apply_b=True):
+    def eval_(self, view_index, data_mask, apply_b=True):
         """Maps points to the new space through their local (K-)PCA prameterizations.
 
         Parameters
         ------
-        opts : dict
-            Must have keys
-                'view_index : int
-                    The index of the parameterization to use.
+        view_index : int
+            The index of the parameterization to use.
+
+        data_mask : array-like, shape (n_neighbors)
+            List of points to map to the embedding dimension.
 
-                data_mask : array-like, shape (n_neighbors)
-                    List of points to map to the embedding dimension.
+        apply_b : bool, default=True
+            Whether to apply the scale transformation b[k].
         """
 
-        k = opts["view_index"]
-        mask = opts["data_mask"]
+        k = view_index
+        mask = data_mask
 
         if self.algo == "lpca":
             temp = np.dot(
@@ -1633,9 +1569,9 @@ def replace_(self, new_param_ind):
         else:  # ISOMAP, LKPCA
             self.model = self.model[new_param_ind]
 
-    def reconstruct_(self, opts):
-        k = opts["view_index"]
-        y_ = opts["embeddings"]
+    def reconstruct_(self, view_index, embeddings):
+        k = view_index
+        y_ = embeddings
         if self.algo == "LPCA":
             temp = (
                 np.dot(
diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index 4041ab4..aaa8dcc 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -514,10 +514,7 @@ def _postprocess(self):
         ).reshape(n, num_pairs)
 
         embedded_datapoints = self.param.batched_eval_(
-            {
-                "view_index": range(n),
-                "data_mask": self.U.indices.reshape(n, self.k),
-            }
+            range(n), self.U.indices.reshape(n, self.k)
         )  # tested and they are equivalent
         embedded_dists = batched_pdist(embedded_datapoints)
         disc_lip_const = np.divide(
@@ -534,7 +531,7 @@ def _postprocess(self):
 
         if self.verbose:
             print(
-                r"Maximum local distoriton before postptocessing is: %0.4f"
+                r"Maximum local distortion before postprocessing is: %0.4f"
                 % np.max(zeta)
             )
 
@@ -564,10 +561,7 @@ def _postprocess(self):
                 ]  # list of indices
 
                 batch_embedded_datapoints = self.param.batched_eval_(
-                    {
-                        "view_index": future_use_phi_of[use_phi_of],
-                        "data_mask": neighbors_to_reconsider,
-                    }
+                    future_use_phi_of[use_phi_of], neighbors_to_reconsider
                 )
                 batch_embedded_dists = batched_pdist(batch_embedded_datapoints)
                 batch_original_dists = original_dists[points_to_reconsider]
@@ -609,6 +603,6 @@ def _postprocess(self):
         self.param.replace_(future_use_phi_of)
         if self.verbose:
             print(
-                f"Maximum local distoriton after post-processing is: %0.4f"
+                f"Maximum local distortion after postprocessing is: %0.4f"
                 % (np.max(self.param.zeta))
             )

From e425db7570fba4ffd9ead46625388a82e0590230 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 18 Mar 2026 18:53:58 +0100
Subject: [PATCH 07/44] CI: New continuous integration testing for speed. Tests
 speed across a range of datapoint numbers

---
 .github/workflows/benchmark.yml     | 35 +++++++++++++++
 tests/scripts/benchmark.py          | 69 +++++++++++++++++++++++++++++
 tests/scripts/generate_snapshots.py |  2 +-
 3 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/benchmark.yml
 create mode 100644 tests/scripts/benchmark.py

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
new file mode 100644
index 0000000..30f15f4
--- /dev/null
+++ b/.github/workflows/benchmark.yml
@@ -0,0 +1,35 @@
+name: Performance Benchmark
+
+on:
+  push:
+    branches: [ '**' ]
+  pull_request:
+    branches: [ 'main' ]
+  workflow_dispatch:
+
+permissions:
+  contents: write
+
+jobs:
+  benchmark:
+    name: RATS Performance Benchmark
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+
+      - name: Install Dependencies
+        run: |
+          pip install --upgrade pip
+          pip install -e .[test]
+
+      - name: Run Benchmark
+        run: |
+          echo "## RATS Performance Benchmarks" >> $GITHUB_STEP_SUMMARY
+          echo "Dataset: Klein Bottle (4D)" >> $GITHUB_STEP_SUMMARY
+          echo "Parameters: k=14, eta_min=5" >> $GITHUB_STEP_SUMMARY
+          python tests/scripts/benchmark.py >> $GITHUB_STEP_SUMMARY
diff --git a/tests/scripts/benchmark.py b/tests/scripts/benchmark.py
new file mode 100644
index 0000000..335e8d3
--- /dev/null
+++ b/tests/scripts/benchmark.py
@@ -0,0 +1,69 @@
+import os
+import sys
+import time
+import json
+import argparse
+
+# Allow importing src and examples
+repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+if repo_root not in sys.path:
+    sys.path.insert(0, repo_root)
+
+from pyRATS import rats
+from examples import datasets
+
+def run_benchmark(n_samples, k=14, eta_min=5, cost_fn_name="distortion"):
+    sys.stderr.write(f"Running benchmark for n={n_samples}, k={k}, eta_min={eta_min}...\n")
+    sys.stderr.flush()
+    
+    # Load dataset
+    ds = datasets.Datasets()
+    X, _, _ = ds.kleinbottle4d(n=n_samples)
+    
+    model = rats.RATS(
+        d=2,
+        k=k,
+        cost_fn_name=cost_fn_name,
+        eta_min=eta_min,
+        max_iter=3,
+        nu=4,
+        verbose=False, # Set to False for cleaner benchmark output
+    )
+    
+    start_time = time.perf_counter()
+    model.fit_transform(X=X)
+    end_time = time.perf_counter()
+    
+    duration = end_time - start_time
+    sys.stderr.write(f"Completed in {duration:.4f}s\n")
+    sys.stderr.flush()
+    return duration
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Run RATS performance benchmarks.")
+    parser.add_argument("--output", type=str, help="Output JSON file path")
+    parser.add_argument("--fast", action="store_true", help="Run only a small subset for testing")
+    args = parser.parse_args()
+
+    if args.fast:
+        sample_sizes = [400, 400, 500]
+    else:
+        sample_sizes = [400, 400 , 500, 1000, 2000, 5000, 10000]
+    
+    results = []
+
+    print("\n| Sample Size | Duration (s) |")
+    print("|-------------|--------------|")
+    for n in sample_sizes:
+        duration = run_benchmark(n)
+        results.append({
+            "name": f"Klein Bottle {n} samples",
+            "unit": "s",
+            "value": duration
+        })
+        print(f"| {n:11} | {duration:12.4f} |")
+
+    if args.output:
+        with open(args.output, "w") as f:
+            json.dump(results, f, indent=2)
+        print(f"\nResults saved to {args.output}")
diff --git a/tests/scripts/generate_snapshots.py b/tests/scripts/generate_snapshots.py
index 31ad21d..9588d12 100644
--- a/tests/scripts/generate_snapshots.py
+++ b/tests/scripts/generate_snapshots.py
@@ -121,7 +121,7 @@ def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=Fal
             "swissrollwithhole",
             "squarewithtwoholes",
         ]
-        ks = [14, 21, 28]
+        ks = [14, 21]
         eta_mins = [5, 10]
         cost_fn_names = ["distortion", "alignment"]
 

From cc84dd7ee6332f14e51d4bdaaa2c73233ce25355 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 18 Mar 2026 19:14:07 +0100
Subject: [PATCH 08/44] speedup: Dhruv's eigh implementation

---
 src/pyRATS/_utils.py | 59 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 52 insertions(+), 7 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index 36ba659..533bf30 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -1206,14 +1206,59 @@ def update(alpha, max_iter, O, CC, M, d):
 
 
 def compute_Lpinv_helpers(W):
-    B_ = W.copy().transpose().astype("float")
+    M, n = W.shape
+    B_ = W.transpose().tocsr().astype("float")
     D_1 = np.asarray(B_.sum(axis=1))
     D_2 = np.asarray(B_.sum(axis=0))
-    D_1_inv_sqrt = np.sqrt(1 / D_1)
-    D_2_inv_sqrt = np.sqrt(1 / D_2)
-    B_tilde = B_.multiply(D_2_inv_sqrt).multiply(D_1_inv_sqrt)
-
-    U12, SS, VT = svd(B_tilde.todense(), full_matrices=False)
+    D_1_inv_sqrt = np.sqrt(1 / D_1).flatten()
+    D_2_inv_sqrt = np.sqrt(1 / D_2).flatten()
+    
+    #B_tilde = B_.multiply(D_2_inv_sqrt).multiply(D_1_inv_sqrt)
+    # Create sparse diagonal matrices
+    D1_diag = diags(D_1_inv_sqrt)
+    D2_diag = diags(D_2_inv_sqrt)
+    
+    # Sparse matrix multiplication is MUCH faster than .multiply()
+    B_tilde = D1_diag @ B_ @ D2_diag
+
+    # U12, SS, VT = svd(B_tilde.todense(), full_matrices=False)
+    # --- OPTIMIZED SVD: Gram Matrix Approach ---
+    if n >= M:
+        # B_tilde is (n, M). C becomes a tiny (M, M) dense matrix.
+        # Sparse matrix multiplication here is incredibly fast.
+        C = (B_tilde.T @ B_tilde).todense()
+        S2, V = eigh(C)
+        
+        # eigh returns ascending order; reverse to match standard SVD output
+        idx = np.argsort(S2)[::-1]
+        S2 = S2[idx]
+        V = V[:, idx]
+        
+        # Recover singular values and right singular vectors
+        SS = np.sqrt(np.maximum(S2, 0))
+        VT = V.T
+        
+        # Recover left singular vectors: U = B_tilde @ V @ diag(1/SS)
+        with np.errstate(divide='ignore', invalid='ignore'):
+            SS_inv = np.where(SS > 1e-10, 1.0 / SS, 0.0)
+            
+        U12 = (B_tilde @ V) * SS_inv[np.newaxis, :]
+    else:
+        # Fallback if M > n: Compute (n, n) Gram matrix instead
+        C = (B_tilde @ B_tilde.T).todense()
+        S2, U12 = eigh(C)
+        
+        idx = np.argsort(S2)[::-1]
+        S2 = S2[idx]
+        U12 = U12[:, idx]
+        
+        SS = np.sqrt(np.maximum(S2, 0))
+        
+        with np.errstate(divide='ignore', invalid='ignore'):
+            SS_inv = np.where(SS > 1e-10, 1.0 / SS, 0.0)
+            
+        VT = (U12.T @ B_tilde) * SS_inv[:, np.newaxis]
+    # -------------------------------------------
     U12, VT = svd_flip(U12, VT)
 
     V = VT.T
@@ -1226,7 +1271,7 @@ def compute_Lpinv_helpers(W):
     U2 = U12[:, m_1:]
     V1 = V[:, :m_1]
     V2 = V[:, m_1:]
-    return [D_1_inv_sqrt, D_2_inv_sqrt, U1, U2, V1, V2, Sigma_1, Sigma_2]
+    return [D_1_inv_sqrt[:,None], D_2_inv_sqrt[None,:], U1, U2, V1, V2, Sigma_1, Sigma_2]
 
 
 # Ngoc-Diep Ho, Paul Van Dooren, On the pseudo-inverse of the Laplacian of a bipartite graph

From 40db85ae42c5668cec203abf1d131eba5a66db49 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 18 Mar 2026 20:55:44 +0000
Subject: [PATCH 09/44] fix: Imports fix

---
 src/pyRATS/_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index 533bf30..ffaef76 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -1,9 +1,9 @@
 import numpy as np
 from sklearn.neighbors import NearestNeighbors
-from scipy.linalg import svd, svdvals
+from scipy.linalg import svd, svdvals, eigh
 from scipy.sparse.linalg import svds
 from sklearn.decomposition import KernelPCA
-from scipy.sparse import csr_matrix, triu, block_diag
+from scipy.sparse import csr_matrix, triu, block_diag, diags
 import itertools
 
 from joblib import delayed, Parallel

From 4641346545b1272a0a2f908cb6eefddcab34d1d3 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 18 Mar 2026 21:39:17 +0000
Subject: [PATCH 10/44] speedup: Vectorizing B construction and avoiding sparse
 to dense conversions

---
 src/pyRATS/_utils.py | 140 +++++++++++++++++++++++++++++--------------
 1 file changed, 94 insertions(+), 46 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index ffaef76..772e066 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -554,10 +554,8 @@ def cost_of_moving_alignment_error(
     # c_U_k_uniq_k = np.union1d(c_U_k_uniq[neighbor_mask], c_k)
     c_U_k_uniq_k = np.union1d(c_U_k_uniq[neighbor_mask], k)
     evals = param.batched_eval_(
-        {
-            "view_index": c_U_k_uniq_k,
-            "data_mask": np.broadcast_to(U_k_list, [len(c_U_k_uniq_k), len(U_k_list)]),
-        }
+        c_U_k_uniq_k,
+        np.broadcast_to(U_k_list, [len(c_U_k_uniq_k), len(U_k_list)]),
     )
 
     m = c_U_k_uniq_k == k
@@ -1278,33 +1276,52 @@ def compute_Lpinv_helpers(W):
 def compute_Lpinv_MT(Lpinv_helpers, B):
     D_1_inv_sqrt, D_2_inv_sqrt, U1, U2, V1, V2, Sigma_1, Sigma_2 = Lpinv_helpers
     n = D_1_inv_sqrt.shape[0]
-    B_mean = B.mean(axis=1)
+    Md = B.shape[0]
+    M = B.shape[1] - n
+
+    B_mean = np.array(B.mean(axis=1))
     if len(B_mean.shape) == 1:
         B_mean = B_mean[:, None]
-    B_n = B - B_mean
-    B_n = np.asarray(B_n)
-    B1T = D_1_inv_sqrt * (B_n[:, :n].T)
-    B2T = D_2_inv_sqrt.T * (B_n[:, n:].T)
 
-    U1TB1T = np.matmul(U1.T, B1T)
-    U2TB1T = np.matmul(U2.T, B1T)
-    V1TB2T = np.matmul(V1.T, B2T)
-    V2TB2T = np.matmul(V2.T, B2T)
+    B1 = B[:, :n]
+    B2 = B[:, n:]
+
+    # Optimized matrix-vector products using identities to avoid full dense B_n
+    # Identity: U^T (diag(D) (B - mu 1^T))^T = (U^T diag(D)) B^T - (U^T diag(D) 1) mu^T
+    
+    # Compute U^T * B1T terms
+    U1T_D1 = (U1 * D_1_inv_sqrt).T  # (m1, n)
+    U1TB1T = (U1T_D1 @ B1.T) - (U1T_D1.sum(axis=1)[:, None] @ B_mean.T)
+
+    U2T_D1 = (U2 * D_1_inv_sqrt).T  # (M-m1, n)
+    U2TB1T = (U2T_D1 @ B1.T) - (U2T_D1.sum(axis=1)[:, None] @ B_mean.T)
+
+    # Compute V^T * B2T terms
+    # D_2_inv_sqrt is (1, M)
+    V1T_D2 = (V1 * D_2_inv_sqrt.T).T  # (m1, M)
+    V1TB2T = (V1T_D2 @ B2.T) - (V1T_D2.sum(axis=1)[:, None] @ B_mean.T)
+
+    V2T_D2 = (V2 * D_2_inv_sqrt.T).T  # (M-m1, M)
+    V2TB2T = (V2T_D2 @ B2.T) - (V2T_D2.sum(axis=1)[:, None] @ B_mean.T)
+
+    # B1T and B2T are needed for the final sum, but we only materialize them once
+    # B1T = D_1_inv_sqrt * (B - B_mean)[:, :n].T
+    B1T = (B1.T.multiply(D_1_inv_sqrt)).toarray() - (D_1_inv_sqrt @ B_mean.T)
 
     temp1 = (
-        -0.75 * np.matmul(U1, U1TB1T)
-        - 0.25 * np.matmul(U1, V1TB2T)
-        + np.matmul(U2, ((Sigma_1 - 1)) * (U2TB1T))
-        + np.matmul(U2, Sigma_2 * (V2TB2T))
+        -0.75 * (U1 @ U1TB1T)
+        - 0.25 * (U1 @ V1TB2T)
+        + (U2 @ ((Sigma_1 - 1) * U2TB1T))
+        + (U2 @ (Sigma_2 * V2TB2T))
         + B1T
     )
     temp1 = temp1 * D_1_inv_sqrt
 
     temp2 = (
-        -0.25 * np.matmul(V1, U1TB1T)
-        + 0.25 * np.matmul(V1, V1TB2T)
-        + np.matmul(V2, Sigma_2 * (U2TB1T))
-        + np.matmul(V2, Sigma_1 * (V2TB2T))
+        -0.25 * (V1 @ U1TB1T)
+        + 0.25 * (V1 @ V1TB2T)
+        + (V2 @ (Sigma_2 * U2TB1T))
+        + (V2 @ (Sigma_1 * V2TB2T))
     )
     temp2 = temp2 * D_2_inv_sqrt.T
 
@@ -1321,48 +1338,79 @@ def compute_CC(D, B, Lpinv_BT):
 def build_ortho_optim(d, Utilde, param, verbose):
     """Compute the Graph-Laplacian's inverse times B^\top."""
     M, n = Utilde.shape
-    B_row_inds = []
-    B_col_inds = []
-    B_vals = []
-    D = []
-
     W = Utilde.astype(float)
-    W_vals = W.data
-
+    W_vals_all = W.data
+    
+    # Vectorized construction of D and B components
+    D_list = []
+    
+    # We still loop over M to build B values, but we use pre-allocated arrays
+    # or better, compute B values in blocks.
+    B_data_vals = []
+    B_cluster_vals = []
+    B_cols = []
+    
     for i in range(M):
         Utilde_i = Utilde[i, :].indices
         X_ = param.eval_(i, Utilde_i)
-        sqrt_p_ki = np.sqrt(np.array(W[i, :].data).flatten()[:, None])
-        X_ = sqrt_p_ki * X_
-        D.append(np.matmul(X_.T, X_))
-
-        row_inds = list(range(d * i, d * (i + 1)))
-        col_inds = Utilde_i.tolist()
+        weights_i = W_vals_all[W.indptr[i]:W.indptr[i+1]]
+        sqrt_p_ki = np.sqrt(weights_i[:, None])
+        
+        # Weighted embeddings for D: sqrt(W) * X
+        X_weighted = sqrt_p_ki * X_
+        D_list.append(X_weighted.T @ X_weighted)
 
-        B_row_inds += row_inds + np.repeat(row_inds, len(col_inds)).tolist()
-        B_col_inds += np.repeat([n + i], d).tolist() + np.tile(col_inds, d).tolist()
+        # Weighted embeddings for B: W * X
+        X_B = weights_i[:, None] * X_
+        B_data_vals.append(X_B.T.flatten())
+        B_cluster_vals.append(np.sum(-X_B.T, axis=1))
+        B_cols.append(Utilde_i)
 
-        X_ = sqrt_p_ki * X_
-        B_vals += np.sum(-X_.T, axis=1).tolist() + X_.T.flatten().tolist()
+    D = block_diag(D_list, format="csr")
+    
+    # Efficiently construct B
+    B_data_vals = np.concatenate(B_data_vals)
+    B_cluster_vals = np.concatenate(B_cluster_vals)
+    
+    B_cols_data = []
+    for i in range(M):
+        B_cols_data.append(np.tile(B_cols[i], d))
+    B_cols_data = np.concatenate(B_cols_data)
+    
+    # Row indices for data values
+    counts = np.diff(W.indptr)
+    B_rows_data = np.repeat(np.arange(M) * d, counts * d) + np.tile(np.arange(d), len(B_cols_data) // d)
+    # Wait, the above tiling is not quite right if counts vary.
+    # Correct rows: repeat each row index (i*d + offset)
+    B_rows_data = []
+    for i in range(M):
+        r = np.arange(i * d, (i + 1) * d)
+        B_rows_data.append(np.repeat(r, counts[i]))
+    B_rows_data = np.concatenate(B_rows_data)
 
-    D = block_diag(D, format="csr")
-    B = csr_matrix((B_vals, (B_row_inds, B_col_inds)), shape=(M * d, n + M))
+    # B indices for cluster nodes
+    B_rows_cluster = np.arange(M * d)
+    B_cols_cluster = np.repeat(np.arange(n, n + M), d)
+    
+    # Combine everything
+    B_row_all = np.concatenate([B_rows_cluster, B_rows_data])
+    B_col_all = np.concatenate([B_cols_cluster, B_cols_data])
+    B_val_all = np.concatenate([B_cluster_vals, B_data_vals])
+    
+    B = csr_matrix((B_val_all, (B_row_all, B_col_all)), shape=(M * d, n + M))
 
     if verbose:
-        print("min and max weights:", np.array(W_vals).min(), np.array(W_vals).max())
-
+        print("min and max weights:", np.array(W_vals_all).min(), np.array(W_vals_all).max())
         print(
             f"Computing Pseudoinverse of a matrix of L of size {n} + {M} multiplied with B",
             flush=True,
         )
-    Lpinv_helpers = compute_Lpinv_helpers(W)
 
+    Lpinv_helpers = compute_Lpinv_helpers(W)
     Lpinv_BT = compute_Lpinv_MT(Lpinv_helpers, B)
     CC = compute_CC(D, B, Lpinv_BT)
 
-    CC_net = CC
-
-    return CC_net, Lpinv_BT, D, B
+    return CC, Lpinv_BT, D, B
 
 
 # unscaled alignment error

From 26528b3106b835e89f5b76950ccdc652a9fc769b Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 18 Mar 2026 23:27:10 +0000
Subject: [PATCH 11/44] speedup: Speedup of zeta function by replacing pdist
 with measurement of row and col via nonzero and dist measure. Plus better csc
 usage

---
 src/pyRATS/_utils.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index 772e066..1d0a0fd 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -470,13 +470,18 @@ def cost_of_moving_distortion(
 
 
 def compute_zeta(d_e_mask0, Psi_k_mask):
-    d_e_mask = d_e_mask0.toarray()
-    if d_e_mask.shape[0] == 1:
+    if d_e_mask0.shape[0] == 1:
         return 1
-    d_e_mask_ = squareform(d_e_mask)
-    mask = d_e_mask_ != 0
-    d_e_mask_ = d_e_mask_[mask]
-    disc_lip_const = pdist(Psi_k_mask)[mask] / d_e_mask_
+    d_e_upper = triu(d_e_mask0, k=1)
+    row, col = d_e_upper.nonzero()
+    if len(row) == 0:
+        return 1
+    
+    d_e_vals = d_e_upper.data
+    diff = Psi_k_mask[row] - Psi_k_mask[col]
+    dist_embedded = np.sqrt(np.sum(diff * diff, axis=1))
+    
+    disc_lip_const = dist_embedded / d_e_vals
     return np.max(disc_lip_const) / (np.min(disc_lip_const) + 1e-12)
 
 
@@ -647,6 +652,7 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs):
 
     cost = np.zeros(n) + np.inf
     dest = np.zeros(n, dtype="int") - 1
+    U_csc = U.tocsc()
 
     # Vary eta from 2 to eta_{min}
     if verbose:
@@ -714,7 +720,7 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
                 S_ = (
                     (c == dest_k)
                     | (dest == dest_k)
-                    | np.array(U[:, list(Clstr[s])].sum(1), dtype=bool).flatten()
+                    | np.array(U_csc[:, list(Clstr[s])].sum(1), dtype=bool).flatten()
                 )
             else:
                 S_ = (c == dest_k) | (dest == dest_k) | (dest == s)

From 5bdc3d701222a5f145f9d4b8fef88c068fdc9876 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Thu, 19 Mar 2026 00:05:12 +0000
Subject: [PATCH 12/44] Merging zeta change

---
 .github/workflows/benchmark.yml | 12 +++++++++---
 tests/scripts/benchmark.py      |  2 +-
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 30f15f4..656d991 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -12,8 +12,14 @@ permissions:
 
 jobs:
   benchmark:
-    name: RATS Performance Benchmark
-    runs-on: ubuntu-latest
+    name: RATS Performance Benchmark (${{ matrix.os }})
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+    runs-on: ${{ matrix.os }}
+    defaults:
+      run:
+        shell: bash
     steps:
       - uses: actions/checkout@v4
 
@@ -29,7 +35,7 @@ jobs:
 
       - name: Run Benchmark
         run: |
-          echo "## RATS Performance Benchmarks" >> $GITHUB_STEP_SUMMARY
+          echo "## RATS Performance Benchmarks (${{ matrix.os }})" >> $GITHUB_STEP_SUMMARY
           echo "Dataset: Klein Bottle (4D)" >> $GITHUB_STEP_SUMMARY
           echo "Parameters: k=14, eta_min=5" >> $GITHUB_STEP_SUMMARY
           python tests/scripts/benchmark.py >> $GITHUB_STEP_SUMMARY
diff --git a/tests/scripts/benchmark.py b/tests/scripts/benchmark.py
index 335e8d3..6824787 100644
--- a/tests/scripts/benchmark.py
+++ b/tests/scripts/benchmark.py
@@ -48,7 +48,7 @@ def run_benchmark(n_samples, k=14, eta_min=5, cost_fn_name="distortion"):
     if args.fast:
         sample_sizes = [400, 400, 500]
     else:
-        sample_sizes = [400, 400 , 500, 1000, 2000, 5000, 10000]
+        sample_sizes = [500 , 500, 1000, 2000, 5000, 10_000, 25_000]
     
     results = []
 

From cb6fcd1f99a6dfbc324708ccd039e8a9ddf6c69f Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Thu, 19 Mar 2026 00:05:12 +0000
Subject: [PATCH 13/44] tests: Updating benchmarking tests to include windows,
 mac, and linux test


From a1a2806d71df5221e7f9656c65e606c26f37fa68 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Thu, 19 Mar 2026 16:48:05 +0000
Subject: [PATCH 14/44] Test: Adding a test benchmark script to give timings
 across different commits

---
 .github/workflows/benchmark.yml | 22 +++++++++++++++++++---
 tests/scripts/benchmark.py      |  2 +-
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 656d991..226ad86 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -12,16 +12,32 @@ permissions:
 
 jobs:
   benchmark:
-    name: RATS Performance Benchmark (${{ matrix.os }})
+    name: RATS Benchmark (${{ matrix.os }}, ${{ matrix.ref_name }})
     strategy:
+      fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
+        include:
+          - ref: "${{ github.sha }}"
+            ref_name: "Current HEAD"
+          - ref: "f1a3f2233e8248ca9610af9a8498122fc557657a"
+            ref_name: "f1a3f22 (Optimized MT)"
+          - ref: "e1d13b7b1d2446a2499add3ed78f3a38e41103bc"
+            ref_name: "e1d13b7 (Merge Unit Testing V1)"
     runs-on: ${{ matrix.os }}
     defaults:
       run:
         shell: bash
     steps:
-      - uses: actions/checkout@v4
+      - name: Checkout Code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ matrix.ref }}
+          fetch-depth: 0
+
+      - name: Overlay Latest Benchmark Script
+        run: |
+          git checkout ${{ github.sha }} -- tests/scripts/benchmark.py
 
       - name: Set up Python
         uses: actions/setup-python@v4
@@ -36,6 +52,6 @@ jobs:
       - name: Run Benchmark
         run: |
           echo "## RATS Performance Benchmarks (${{ matrix.os }})" >> $GITHUB_STEP_SUMMARY
+          echo "Commit: ${{ matrix.ref_name }} (${{ matrix.ref }})" >> $GITHUB_STEP_SUMMARY
           echo "Dataset: Klein Bottle (4D)" >> $GITHUB_STEP_SUMMARY
-          echo "Parameters: k=14, eta_min=5" >> $GITHUB_STEP_SUMMARY
           python tests/scripts/benchmark.py >> $GITHUB_STEP_SUMMARY
diff --git a/tests/scripts/benchmark.py b/tests/scripts/benchmark.py
index 6824787..4297b05 100644
--- a/tests/scripts/benchmark.py
+++ b/tests/scripts/benchmark.py
@@ -48,7 +48,7 @@ def run_benchmark(n_samples, k=14, eta_min=5, cost_fn_name="distortion"):
     if args.fast:
         sample_sizes = [400, 400, 500]
     else:
-        sample_sizes = [500 , 500, 1000, 2000, 5000, 10_000, 25_000]
+        sample_sizes = [500 , 500, 1000, 2000, 5000, 10_000, 20_000]
     
     results = []
 

From ef234188572e251444008ad93e34338504748ab7 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Thu, 19 Mar 2026 16:51:39 +0000
Subject: [PATCH 15/44] Fix: Issue with sha code

---
 .github/workflows/benchmark.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 226ad86..0e04ebf 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -37,7 +37,8 @@ jobs:
 
       - name: Overlay Latest Benchmark Script
         run: |
-          git checkout ${{ github.sha }} -- tests/scripts/benchmark.py
+          git fetch origin ${{ github.sha }} --depth=1
+          git checkout FETCH_HEAD -- tests/scripts/benchmark.py
 
       - name: Set up Python
         uses: actions/setup-python@v4

From bfca42936278dbffd85606ad339dc1e7913f3ae8 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Thu, 19 Mar 2026 17:10:48 +0000
Subject: [PATCH 16/44] Matrixifying refs for benchmarks and removing Dhruv's

---
 .github/workflows/benchmark.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 0e04ebf..617c04b 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -17,13 +17,12 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
+        ref: ["${{ github.sha }}", "f1a3f2233e8248ca9610af9a8498122fc557657a"]
         include:
           - ref: "${{ github.sha }}"
             ref_name: "Current HEAD"
           - ref: "f1a3f2233e8248ca9610af9a8498122fc557657a"
-            ref_name: "f1a3f22 (Optimized MT)"
-          - ref: "e1d13b7b1d2446a2499add3ed78f3a38e41103bc"
-            ref_name: "e1d13b7 (Merge Unit Testing V1)"
+            ref_name: "f1a3f22 (Dhruv Optimized MT)"
     runs-on: ${{ matrix.os }}
     defaults:
       run:

From 274f8413e8e7a7d9c3bc3131833c1980f24a7b63 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Tue, 24 Mar 2026 19:41:36 +0000
Subject: [PATCH 17/44] Efficiency: Memefficiency via Joshua code and and
 updated sha for comparisons

---
 .github/workflows/benchmark.yml |   6 +-
 src/pyRATS/rats.py              | 126 ++++++++++++++++++++------------
 2 files changed, 81 insertions(+), 51 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 617c04b..093c24e 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -17,12 +17,12 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
-        ref: ["${{ github.sha }}", "f1a3f2233e8248ca9610af9a8498122fc557657a"]
+        ref: ["${{ github.sha }}", "5eaa1c6a82eb0a316ff2a6911e0ef7379add0bad"]
         include:
           - ref: "${{ github.sha }}"
             ref_name: "Current HEAD"
-          - ref: "f1a3f2233e8248ca9610af9a8498122fc557657a"
-            ref_name: "f1a3f22 (Dhruv Optimized MT)"
+          - ref: "5eaa1c6a82eb0a316ff2a6911e0ef7379add0bad"
+            ref_name: "5eaa1c6 (Joshua V1)"
     runs-on: ${{ matrix.os }}
     defaults:
       run:
diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index aaa8dcc..e066696 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -505,16 +505,16 @@ def _postprocess(self):
         # neigh_inds[i] are the k neighbour indices of point i (same as
         # connectivity_matrix used below, extracted once here for clarity).
         neigh_inds = self.U.indices.reshape(n, self.k)  # (n, k)
-        pair_i, pair_j = np.triu_indices(self.k, k=1)   # all upper-triangle pairs
-        row_idx = neigh_inds[:, pair_i].ravel()           # (n * num_pairs,)
-        col_idx = neigh_inds[:, pair_j].ravel()           # (n * num_pairs,)
+        pair_i, pair_j = np.triu_indices(self.k, k=1)  # all upper-triangle pairs
+        row_idx = neigh_inds[:, pair_i].ravel()  # (n * num_pairs,)
+        col_idx = neigh_inds[:, pair_j].ravel()  # (n * num_pairs,)
         num_pairs = len(pair_i)
         original_dists = np.asarray(
             self.neighborhood_graph_sym[row_idx, col_idx]
         ).reshape(n, num_pairs)
 
         embedded_datapoints = self.param.batched_eval_(
-            range(n), self.U.indices.reshape(n, self.k)
+            {"view_index": range(n), "data_mask": self.U.indices.reshape(n, self.k)}
         )  # tested and they are equivalent
         embedded_dists = batched_pdist(embedded_datapoints)
         disc_lip_const = np.divide(
@@ -538,60 +538,44 @@ def _postprocess(self):
         connectivity_matrix = self.U.indices.reshape(n, -1)
         param_changed = np.arange(n)
         future_use_phi_of = np.arange(n, dtype=int)
-        future_use_phi_of_ = np.arange(n, dtype=int)
 
         while len(param_changed) > 0:
             reconsider_mask = np.isin(connectivity_matrix, param_changed) & (
                 connectivity_matrix != np.arange(n)[:, None]
             )
-            param_changed = np.array([])
-
-            for i in range(connectivity_matrix.shape[1]):
-
-                use_phi_of = connectivity_matrix[:, i][
-                    reconsider_mask[:, i]
-                ]  # select neighborhood indices of the Phi's to use
-                if len(use_phi_of) == 0:
-                    continue
-                neighbors_to_reconsider = connectivity_matrix[
-                    reconsider_mask[:, i]
-                ]  # select datapoint and neighborhood indices who's neighborhood must be re-evaluated
-                points_to_reconsider = np.where(reconsider_mask[:, i])[
-                    0
-                ]  # list of indices
-
-                batch_embedded_datapoints = self.param.batched_eval_(
-                    future_use_phi_of[use_phi_of], neighbors_to_reconsider
-                )
-                batch_embedded_dists = batched_pdist(batch_embedded_datapoints)
-                batch_original_dists = original_dists[points_to_reconsider]
-
-                disc_lip_const = np.divide(
-                    batch_embedded_dists,
-                    batch_original_dists,
-                    out=np.full_like(batch_embedded_dists, np.nan),
-                    where=batch_original_dists != 0,
-                )
-                zeta_ = np.nanmax(disc_lip_const, axis=1) / (
-                    np.nanmin(disc_lip_const, axis=1) + 1e-12
-                )
 
-                swap_has_improved_point = points_to_reconsider[
-                    np.where(zeta_ < zeta[points_to_reconsider])[0]
-                ]  # indices of points where swapping by the i'th neighbor improves the local distortion
+            def work_range(
+                start_end,
+            ):
+                start, end = start_end
+                return [
+                    self._process_column(
+                        i,
+                        connectivity_matrix,
+                        reconsider_mask[:, i],
+                        self.param,
+                        future_use_phi_of,
+                        original_dists,
+                    )
+                    for i in range(start, end)
+                ]
 
-                param_changed = np.union1d(param_changed, swap_has_improved_point)
-                zeta[points_to_reconsider] = np.minimum(
-                    zeta_, zeta[points_to_reconsider]
-                )
+            chunk_sz = connectivity_matrix.shape[1] // self.n_jobs
+            ranges = [(i * chunk_sz, (i + 1) * chunk_sz) for i in range(self.n_jobs)]
+            ranges[-1] = (ranges[-1][0], connectivity_matrix.shape[1])
+            zetas = Parallel(n_jobs=self.n_jobs)(delayed(work_range)(r) for r in ranges)
 
-                future_use_phi_of_[swap_has_improved_point] = future_use_phi_of[
-                    connectivity_matrix[:, i][swap_has_improved_point]
+            zetas = np.vstack(zetas).T
+
+            zeta_ = np.min(zetas, axis=1)
+            param_changed = np.where(zeta > zeta_)[0]
+            future_use_phi_of[param_changed] = future_use_phi_of[
+                connectivity_matrix[
+                    param_changed, np.argmin(zetas, axis=1)[param_changed]
                 ]
+            ]
+            zeta = np.where(zeta < zeta_, zeta, zeta_)
 
-            future_use_phi_of = (
-                future_use_phi_of_.copy()
-            )  # this is necessary for some reason
             if self.verbose:
                 print(
                     "#Param replaced: %d, max distortion: %f"
@@ -606,3 +590,49 @@ def _postprocess(self):
                 f"Maximum local distortion after postprocessing is: %0.4f"
                 % (np.max(self.param.zeta))
             )
+
+    def _process_column(
+        self,
+        i,
+        connectivity_matrix,
+        reconsider_mask,
+        param,
+        future_use_phi_of,
+        original_dists,
+    ):
+        zeta = np.zeros(len(connectivity_matrix)) + np.inf
+
+        use_phi_of = connectivity_matrix[:, i][
+            reconsider_mask
+        ]  # select neighborhood indices of the Phi's to use
+
+        if len(use_phi_of) == 0:
+            return zeta
+
+        neighbors_to_reconsider = connectivity_matrix[
+            reconsider_mask
+        ]  # select datapoint and neighborhood indices who's neighborhood must be re-evaluated
+        points_to_reconsider = np.where(reconsider_mask)[0]  # list of indices
+
+        batch_embedded_datapoints = param.batched_eval_(
+            {
+                "view_index": future_use_phi_of[use_phi_of],
+                "data_mask": neighbors_to_reconsider,
+            }
+        )
+        batch_embedded_dists = batched_pdist(batch_embedded_datapoints)
+        batch_original_dists = original_dists[points_to_reconsider]
+
+        disc_lip_const = np.divide(
+            batch_embedded_dists,
+            batch_original_dists,
+            out=np.full_like(batch_embedded_dists, np.nan),
+            where=batch_original_dists != 0,
+        )
+
+        zeta_ = np.nanmax(disc_lip_const, axis=1) / (
+            np.nanmin(disc_lip_const, axis=1) + 1e-12
+        )
+        zeta[points_to_reconsider] = zeta_
+        return zeta
+ 
\ No newline at end of file

From 12541995f3debed95161e93d8032a7eedfd334c0 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 18 Mar 2026 16:51:01 +0100
Subject: [PATCH 18/44] Fix: new mode of batch all and importing

---
 examples/__init__.py | 0
 src/pyRATS/rats.py   | 9 ++++-----
 2 files changed, 4 insertions(+), 5 deletions(-)
 create mode 100644 examples/__init__.py

diff --git a/examples/__init__.py b/examples/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index e066696..a56d64d 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -3,6 +3,7 @@
 from scipy.sparse import csr_matrix
 
 from multiprocessing import cpu_count
+from joblib import Parallel, delayed
 
 from pyRATS._utils import (
     nearest_neighbors,
@@ -514,7 +515,7 @@ def _postprocess(self):
         ).reshape(n, num_pairs)
 
         embedded_datapoints = self.param.batched_eval_(
-            {"view_index": range(n), "data_mask": self.U.indices.reshape(n, self.k)}
+            range(n), self.U.indices.reshape(n, self.k)
         )  # tested and they are equivalent
         embedded_dists = batched_pdist(embedded_datapoints)
         disc_lip_const = np.divide(
@@ -615,10 +616,8 @@ def _process_column(
         points_to_reconsider = np.where(reconsider_mask)[0]  # list of indices
 
         batch_embedded_datapoints = param.batched_eval_(
-            {
-                "view_index": future_use_phi_of[use_phi_of],
-                "data_mask": neighbors_to_reconsider,
-            }
+            future_use_phi_of[use_phi_of],
+            neighbors_to_reconsider,
         )
         batch_embedded_dists = batched_pdist(batch_embedded_datapoints)
         batch_original_dists = original_dists[points_to_reconsider]

From a82f82793464170d53f1c93b45966979b0621800 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Tue, 24 Mar 2026 19:55:27 +0000
Subject: [PATCH 19/44] Fix: Past git repo commit needed init

---
 .github/workflows/benchmark.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 093c24e..bd18821 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -38,6 +38,7 @@ jobs:
         run: |
           git fetch origin ${{ github.sha }} --depth=1
           git checkout FETCH_HEAD -- tests/scripts/benchmark.py
+          git checkout FETCH_HEAD -- examples/__init__.py
 
       - name: Set up Python
         uses: actions/setup-python@v4

From d1f3123ebdc6ab9eaa43a50aa4238dadd0691455 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Tue, 24 Mar 2026 19:58:10 +0000
Subject: [PATCH 20/44] Update: Cleaner method for src reconstruction for
 commit comparisons

---
 .github/workflows/benchmark.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index bd18821..4fe417d 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -31,14 +31,14 @@ jobs:
       - name: Checkout Code
         uses: actions/checkout@v4
         with:
-          ref: ${{ matrix.ref }}
+          ref: ${{ github.sha }}
           fetch-depth: 0
 
-      - name: Overlay Latest Benchmark Script
+      - name: Overlay src/ from Comparison Ref
+        if: matrix.ref != github.sha
         run: |
-          git fetch origin ${{ github.sha }} --depth=1
-          git checkout FETCH_HEAD -- tests/scripts/benchmark.py
-          git checkout FETCH_HEAD -- examples/__init__.py
+          git fetch origin ${{ matrix.ref }} --depth=1
+          git checkout FETCH_HEAD -- src/
 
       - name: Set up Python
         uses: actions/setup-python@v4

From 81018d527cab1d13f37ce2ea7ed5e0158c323662 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 25 Mar 2026 15:54:20 +0000
Subject: [PATCH 21/44] Test: Alternative memory efficient, with
 parallelization over num threads, for postprocessing

---
 src/pyRATS/rats.py | 160 ++++++++++++++++++++++++---------------------
 1 file changed, 87 insertions(+), 73 deletions(-)

diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index a56d64d..9ca2c53 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -19,6 +19,57 @@
 from pyRATS._tear_coloring import compute_color_of_pts_on_tear
 
 
+def _postprocess_col_range(
+    start,
+    end,
+    connectivity_matrix,
+    reconsider_mask,
+    param,
+    future_use_phi_of,
+    original_dists,
+    n,
+):
+    """Process a contiguous range of neighbor-columns [start, end) for _postprocess.
+
+    Returns the element-wise minimum distortion and the corresponding column
+    index across the processed range, both as length-n arrays.
+    """
+    local_zeta_min = np.full(n, np.inf)
+    local_best_col = np.zeros(n, dtype=int)
+
+    for i in range(start, end):
+        col_mask = reconsider_mask[:, i]
+        use_phi_of = connectivity_matrix[:, i][col_mask]
+        if len(use_phi_of) == 0:
+            continue
+
+        neighbors_to_reconsider = connectivity_matrix[col_mask]
+        points_to_reconsider = np.where(col_mask)[0]
+
+        batch_embedded_datapoints = param.batched_eval_(
+            future_use_phi_of[use_phi_of], neighbors_to_reconsider
+        )
+        batch_embedded_dists = batched_pdist(batch_embedded_datapoints)
+        batch_original_dists = original_dists[points_to_reconsider]
+
+        disc_lip_const = np.divide(
+            batch_embedded_dists,
+            batch_original_dists,
+            out=np.full_like(batch_embedded_dists, np.nan),
+            where=batch_original_dists != 0,
+        )
+        col_zeta = np.full(n, np.inf)
+        col_zeta[points_to_reconsider] = np.nanmax(disc_lip_const, axis=1) / (
+            np.nanmin(disc_lip_const, axis=1) + 1e-12
+        )
+
+        improved = col_zeta < local_zeta_min
+        local_best_col[improved] = i
+        local_zeta_min = np.minimum(local_zeta_min, col_zeta)
+
+    return local_zeta_min, local_best_col
+
+
 class RATS:
     """Riemannian Alignment of Tangent Spaces
 
@@ -545,37 +596,44 @@ def _postprocess(self):
                 connectivity_matrix != np.arange(n)[:, None]
             )
 
-            def work_range(
-                start_end,
-            ):
-                start, end = start_end
-                return [
-                    self._process_column(
-                        i,
-                        connectivity_matrix,
-                        reconsider_mask[:, i],
-                        self.param,
-                        future_use_phi_of,
-                        original_dists,
-                    )
-                    for i in range(start, end)
-                ]
-
-            chunk_sz = connectivity_matrix.shape[1] // self.n_jobs
-            ranges = [(i * chunk_sz, (i + 1) * chunk_sz) for i in range(self.n_jobs)]
-            ranges[-1] = (ranges[-1][0], connectivity_matrix.shape[1])
-            zetas = Parallel(n_jobs=self.n_jobs)(delayed(work_range)(r) for r in ranges)
-
-            zetas = np.vstack(zetas).T
-
-            zeta_ = np.min(zetas, axis=1)
-            param_changed = np.where(zeta > zeta_)[0]
+            # Split the k columns into n_jobs chunks and process each chunk in
+            # a separate worker.  Each worker returns its local (zeta_min,
+            # best_col) for the columns it owns; we then do a final
+            # min-reduction here.  This mirrors the target_proc pattern used
+            # throughout _utils.py (lpca, best, …) while keeping memory
+            # bounded to one chunk of columns per worker at a time.
+            k_cols = connectivity_matrix.shape[1]
+            chunk_sz = max(1, k_cols // self.n_jobs)
+            ranges = [(p * chunk_sz, (p + 1) * chunk_sz) for p in range(self.n_jobs)]
+            ranges[-1] = (ranges[-1][0], k_cols)  # extend last chunk to cover remainder
+            ranges = [(s, e) for s, e in ranges if s < k_cols]  # drop empty ranges
+
+            results = Parallel(n_jobs=self.n_jobs)(
+                delayed(_postprocess_col_range)(
+                    s, e,
+                    connectivity_matrix,
+                    reconsider_mask,
+                    self.param,
+                    future_use_phi_of,
+                    original_dists,
+                    n,
+                )
+                for s, e in ranges
+            )
+
+            # Reduce chunk results into a single (zeta_min, best_col).
+            zeta_min = np.full(n, np.inf)
+            best_col = np.zeros(n, dtype=int)
+            for chunk_zeta_min, chunk_best_col in results:
+                improved = chunk_zeta_min < zeta_min
+                best_col[improved] = chunk_best_col[improved]
+                zeta_min = np.minimum(zeta_min, chunk_zeta_min)
+
+            param_changed = np.where(zeta > zeta_min)[0]
             future_use_phi_of[param_changed] = future_use_phi_of[
-                connectivity_matrix[
-                    param_changed, np.argmin(zetas, axis=1)[param_changed]
-                ]
+                connectivity_matrix[param_changed, best_col[param_changed]]
             ]
-            zeta = np.where(zeta < zeta_, zeta, zeta_)
+            zeta = np.minimum(zeta, zeta_min)
 
             if self.verbose:
                 print(
@@ -591,47 +649,3 @@ def work_range(
                 f"Maximum local distortion after postprocessing is: %0.4f"
                 % (np.max(self.param.zeta))
             )
-
-    def _process_column(
-        self,
-        i,
-        connectivity_matrix,
-        reconsider_mask,
-        param,
-        future_use_phi_of,
-        original_dists,
-    ):
-        zeta = np.zeros(len(connectivity_matrix)) + np.inf
-
-        use_phi_of = connectivity_matrix[:, i][
-            reconsider_mask
-        ]  # select neighborhood indices of the Phi's to use
-
-        if len(use_phi_of) == 0:
-            return zeta
-
-        neighbors_to_reconsider = connectivity_matrix[
-            reconsider_mask
-        ]  # select datapoint and neighborhood indices who's neighborhood must be re-evaluated
-        points_to_reconsider = np.where(reconsider_mask)[0]  # list of indices
-
-        batch_embedded_datapoints = param.batched_eval_(
-            future_use_phi_of[use_phi_of],
-            neighbors_to_reconsider,
-        )
-        batch_embedded_dists = batched_pdist(batch_embedded_datapoints)
-        batch_original_dists = original_dists[points_to_reconsider]
-
-        disc_lip_const = np.divide(
-            batch_embedded_dists,
-            batch_original_dists,
-            out=np.full_like(batch_embedded_dists, np.nan),
-            where=batch_original_dists != 0,
-        )
-
-        zeta_ = np.nanmax(disc_lip_const, axis=1) / (
-            np.nanmin(disc_lip_const, axis=1) + 1e-12
-        )
-        zeta[points_to_reconsider] = zeta_
-        return zeta
- 
\ No newline at end of file

From 3325634537958454dab2f97ade0953f6bd8f7db7 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 25 Mar 2026 16:43:21 +0000
Subject: [PATCH 22/44] Style: Updating parameter names to match sklearn

---
 README.md                                    |   2 +-
 examples/barbell_example.ipynb               |  10 +-
 examples/curvedtorus_example.ipynb           |  14 +-
 examples/kleinbottle_example.ipynb           |  10 +-
 examples/small_noisyswissroll3_example.ipynb |  12 +-
 examples/small_spherewithhole_example.ipynb  |  10 +-
 examples/small_swissrollwithhole.ipynb       |  10 +-
 examples/squarerwithtwoholes.ipynb           |  10 +-
 src/pyRATS/rats.py                           | 208 +++++++++++++------
 tests/scripts/benchmark.py                   |  14 +-
 tests/scripts/generate_snapshots.py          |  28 +--
 tests/test_e2e.py                            |  14 +-
 12 files changed, 210 insertions(+), 132 deletions(-)

diff --git a/README.md b/README.md
index 064fe03..6d0391e 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ import datasets, vis
 X, labels, _ = datasets.Datasets().kleinbottle4d(n=5000)
 
 # create a RATS object projecting the data to 2d while tearing the manifold
-model = rats.RATS(d=2, k=28, eta_min=5, to_tear=True)
+model = rats.RATS(n_components=2, n_neighbors=28, min_cluster_size=5, tear=True)
 y = model.fit_transform(X)
 
 # compute the gluing instructions along the tear
diff --git a/examples/barbell_example.ipynb b/examples/barbell_example.ipynb
index 910cddb..3072280 100644
--- a/examples/barbell_example.ipynb
+++ b/examples/barbell_example.ipynb
@@ -59,10 +59,10 @@
    "outputs": [],
    "source": [
     "buml_obj = rats.RATS(\n",
-    "    d=2,\n",
-    "    k=28,\n",
-    "    eta_min=5,\n",
-    "    to_tear=True,\n",
+    "    n_components=2,\n",
+    "    n_neighbors=28,\n",
+    "    min_cluster_size=5,\n",
+    "    tear=True,\n",
     ")\n",
     "\n",
     "y = buml_obj.fit_transform(X=X)\n",
@@ -129,4 +129,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/examples/curvedtorus_example.ipynb b/examples/curvedtorus_example.ipynb
index ece11d1..94ef672 100644
--- a/examples/curvedtorus_example.ipynb
+++ b/examples/curvedtorus_example.ipynb
@@ -59,13 +59,13 @@
    "outputs": [],
    "source": [
     "buml_obj = rats.RATS(\n",
-    "    d=2,\n",
-    "    k=21,\n",
-    "    eta_min=5,\n",
+    "    n_components=2,\n",
+    "    n_neighbors=21,\n",
+    "    min_cluster_size=5,\n",
     "    nu=3,\n",
-    "    max_iter=40,\n",
-    "    to_tear=True,\n",
-    "    cost_fn_name='distortion'\n",
+    "    n_iter=40,\n",
+    "    tear=True,\n",
+    "    cost_function='distortion'\n",
     ")\n",
     "\n",
     "y = buml_obj.fit_transform(X=X)\n",
@@ -132,4 +132,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/examples/kleinbottle_example.ipynb b/examples/kleinbottle_example.ipynb
index 8723479..1c23740 100644
--- a/examples/kleinbottle_example.ipynb
+++ b/examples/kleinbottle_example.ipynb
@@ -59,10 +59,10 @@
    "outputs": [],
    "source": [
     "buml_obj = rats.RATS(\n",
-    "    d=2,\n",
-    "    k=28,\n",
-    "    eta_min=5,\n",
-    "    to_tear=True,\n",
+    "    n_components=2,\n",
+    "    n_neighbors=28,\n",
+    "    min_cluster_size=5,\n",
+    "    tear=True,\n",
     ")\n",
     "\n",
     "y = buml_obj.fit_transform(X=X)\n",
@@ -121,4 +121,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/examples/small_noisyswissroll3_example.ipynb b/examples/small_noisyswissroll3_example.ipynb
index f25db13..db22477 100644
--- a/examples/small_noisyswissroll3_example.ipynb
+++ b/examples/small_noisyswissroll3_example.ipynb
@@ -59,11 +59,11 @@
    "outputs": [],
    "source": [
     "buml_obj = rats.RATS(\n",
-    "    d=2,\n",
-    "    k=28,\n",
-    "    eta_min=5,\n",
-    "    to_tear=True,\n",
-    "    cost_fn_name='alignment'\n",
+    "    n_components=2,\n",
+    "    n_neighbors=28,\n",
+    "    min_cluster_size=5,\n",
+    "    tear=True,\n",
+    "    cost_function='alignment'\n",
     ")\n",
     "\n",
     "y = buml_obj.fit_transform(X=X)\n",
@@ -122,4 +122,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/examples/small_spherewithhole_example.ipynb b/examples/small_spherewithhole_example.ipynb
index 2836f35..255464f 100644
--- a/examples/small_spherewithhole_example.ipynb
+++ b/examples/small_spherewithhole_example.ipynb
@@ -59,10 +59,10 @@
    "outputs": [],
    "source": [
     "buml_obj = rats.RATS(\n",
-    "    d=2,\n",
-    "    k=28,\n",
-    "    eta_min=5,\n",
-    "    to_tear=True,\n",
+    "    n_components=2,\n",
+    "    n_neighbors=28,\n",
+    "    min_cluster_size=5,\n",
+    "    tear=True,\n",
     ")\n",
     "\n",
     "y = buml_obj.fit_transform(X=X)\n",
@@ -131,4 +131,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/examples/small_swissrollwithhole.ipynb b/examples/small_swissrollwithhole.ipynb
index 1cb8856..f2ff33a 100644
--- a/examples/small_swissrollwithhole.ipynb
+++ b/examples/small_swissrollwithhole.ipynb
@@ -59,10 +59,10 @@
    "outputs": [],
    "source": [
     "buml_obj = rats.RATS(\n",
-    "    d=2,\n",
-    "    k=28,\n",
-    "    eta_min=5,\n",
-    "    to_tear=True,\n",
+    "    n_components=2,\n",
+    "    n_neighbors=28,\n",
+    "    min_cluster_size=5,\n",
+    "    tear=True,\n",
     ")\n",
     "\n",
     "y = buml_obj.fit_transform(X=X)\n",
@@ -129,4 +129,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/examples/squarerwithtwoholes.ipynb b/examples/squarerwithtwoholes.ipynb
index dbd0cdb..1b90e81 100644
--- a/examples/squarerwithtwoholes.ipynb
+++ b/examples/squarerwithtwoholes.ipynb
@@ -59,10 +59,10 @@
    "outputs": [],
    "source": [
     "buml_obj = rats.RATS(\n",
-    "    d=2,\n",
-    "    k=28,\n",
-    "    eta_min=5,\n",
-    "    to_tear=True,\n",
+    "    n_components=2,\n",
+    "    n_neighbors=28,\n",
+    "    min_cluster_size=5,\n",
+    "    tear=True,\n",
     ")\n",
     "\n",
     "y = buml_obj.fit_transform(X=X)\n",
@@ -130,4 +130,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 5
-}
+}
\ No newline at end of file
diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index 9ca2c53..1de93d6 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -75,69 +75,70 @@ class RATS:
 
     Parameters
     ----------
-    d : int
-       Intrinsic dimension of the manifold.
+    n_components : int, default=2
+        Intrinsic dimension of the manifold (target embedding dimension).
 
-    k : int
+    n_neighbors : int, default=28
         Neighborhood size for local view fitting.
 
-    to_postprocess : bool, default=True
+    postprocess : bool, default=True
         If True, searches for local embeddings that minimize the local distortion.
         Useful for noisy manifolds.
 
-    kpca_kernel : {None, 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'} or Callable, default=None
+    kernel : {None, 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'} or Callable, default=None
         Kernel used for PCA. None uses standard linear PCA (lpca). Any other value
         uses sklearn.decomposition.KernelPCA. See
         https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html
         for more information.
 
-    cost_fn_name : {'alignment', 'distortion'}, default='alignment'
-        'alignment': Alignment error should be prefered when runtime matters or dealing with noisy manifolds.
+    cost_function : {'alignment', 'distortion'}, default='alignment'
+        'alignment': Alignment error should be preferred when runtime matters or dealing with noisy manifolds.
         'distortion': Distortion is slow, but works well on low noise manifolds.
 
-    eta_min : int, default=5
-        Minimum number of points per cluster. A cluster is formed by merging local embeddings that minimize cost_fn_name.
-        Larger clusters improve the runtime, and regularise noisy manifolds.
-        The value must be >= 1.
+    min_cluster_size : int, default=5
+        Minimum number of points per cluster. A cluster is formed by merging local
+        embeddings that minimise cost_function. Larger clusters improve runtime and
+        regularise noisy manifolds. The value must be >= 1.
 
-    eta_max : int, default=25
-        Maximum allowed size of the clusters.
-        The value must be > eta_min.
+    max_cluster_size : int, default=25
+        Maximum allowed size of the clusters. The value must be > min_cluster_size.
 
-    to_tear : bool, default=True
+    tear : bool, default=True
         Whether to tear the manifold.
 
     nu : int, default=3
         The ratio of the size of local views in the embedding against those in the data.
 
-    max_iter : int
-        Number of iterations to refine the global embedding.
-        In total Riemannian gradient descent is run for max_iter * max_internal_iter iterations.
-        For every iteration in max_iter, the alignment of points in the embedding is recomputed and the tear is re-evaluated if to_tear=True.
+    n_iter : int, default=20
+        Number of outer iterations to refine the global embedding.
+        Riemannian gradient descent runs for n_iter * n_iter_inner total steps.
+        For every iteration the alignment of points in the embedding is recomputed
+        and the tear is re-evaluated if tear=True.
 
-    max_internal_iter : int
-        The number of internal iterations used by Riemannian Gradient Descent.
+    n_iter_inner : int, default=100
+        Number of internal iterations used by Riemannian Gradient Descent per outer step.
 
-    alpha : float
-        The step size used in the Riemannian gradient descent.
+    alpha : float, default=0.3
+        Step size used in the Riemannian gradient descent.
 
-    patience : int
-        The number of iterations to wait for error below tolerance
-        to persist before stopping the refinement.
+    n_iter_without_progress : int, default=5
+        The number of outer iterations to wait after the relative change in alignment
+        error and tear size both drop below *tol* before stopping early.
 
-    tol : float
-        The tolerance level for the relative change in the alignment error and the
+    tol : float, default=1e-2
+        Tolerance level for the relative change in the alignment error and the
         relative change in the size of the tear.
 
     metric : str, default='euclidean'
-        To be added in future releases. Metric assumed on the embedding. Currently only 'euclidean' is supported.
+        To be added in future releases. Metric assumed on the embedding.
+        Currently only 'euclidean' is supported.
 
-    kpca_fit_inverse_transform : bool, default=False
-        To be added in future releases. If True, computes the inverse of the embedding transformation.
-        This flag must be only be enabled when running RATS with using the kpca_kernel.
-        If running pca with kpca_kernel=None, kpca_fit_inverse_transform is always True.
+    fit_inverse_transform : bool, default=False
+        To be added in future releases. If True, computes the inverse of the embedding
+        transformation. Must only be enabled together with a non-None kernel.
+        When kernel=None (linear PCA) the inverse is always available.
 
-    verbose : bool
+    verbose : bool, default=False
         If True, print logs to stdout.
 
     n_jobs : int, default=-1
@@ -145,37 +146,114 @@ class RATS:
 
     """
 
+    # Mapping from deprecated parameter names to their new equivalents.
+    _DEPRECATED_PARAMS = {
+        "d": "n_components",
+        "k": "n_neighbors",
+        "max_iter": "n_iter",
+        "max_internal_iter": "n_iter_inner",
+        "patience": "n_iter_without_progress",
+        "cost_fn_name": "cost_function",
+        "eta_min": "min_cluster_size",
+        "eta_max": "max_cluster_size",
+        "kpca_kernel": "kernel",
+        "kpca_fit_inverse_transform": "fit_inverse_transform",
+        "to_postprocess": "postprocess",
+        "to_tear": "tear",
+    }
+
     def __init__(
         self,
-        d=2,
-        kpca_kernel=None,
-        kpca_fit_inverse_transform=False,
-        k=28,
-        cost_fn_name="alignment",
+        n_components=2,
+        kernel=None,
+        fit_inverse_transform=False,
+        n_neighbors=28,
+        cost_function="alignment",
         metric="euclidean",
-        to_postprocess=True,
-        eta_min=5,
-        eta_max=25,
-        to_tear=True,
+        postprocess=True,
+        min_cluster_size=5,
+        max_cluster_size=25,
+        tear=True,
         nu=3,
-        max_iter=20,
-        max_internal_iter=100,
+        n_iter=20,
+        n_iter_inner=100,
         alpha=0.3,
         eps=1e-8,
-        patience=5,
+        n_iter_without_progress=5,
         tol=1e-2,
         n_forced_clusters=1,
         verbose=False,
         n_jobs=-1,
+        **kwargs,
     ):
+        # ------------------------------------------------------------------
+        # Backward-compatibility: accept deprecated parameter names and emit
+        # a DeprecationWarning so old code keeps working for one release.
+        # ------------------------------------------------------------------
+        for old, new in self._DEPRECATED_PARAMS.items():
+            if old in kwargs:
+                warnings.warn(
+                    f"The parameter '{old}' is deprecated and will be removed in a "
+                    f"future release. Use '{new}' instead.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+                # Only override the new-name value if the caller did not also
+                # supply the new name explicitly.
+                locals_snapshot = {
+                    "n_components": n_components,
+                    "kernel": kernel,
+                    "fit_inverse_transform": fit_inverse_transform,
+                    "n_neighbors": n_neighbors,
+                    "cost_function": cost_function,
+                    "postprocess": postprocess,
+                    "min_cluster_size": min_cluster_size,
+                    "max_cluster_size": max_cluster_size,
+                    "tear": tear,
+                    "n_iter": n_iter,
+                    "n_iter_inner": n_iter_inner,
+                    "n_iter_without_progress": n_iter_without_progress,
+                }
+                if old == "d":
+                    n_components = kwargs.pop(old)
+                elif old == "k":
+                    n_neighbors = kwargs.pop(old)
+                elif old == "max_iter":
+                    n_iter = kwargs.pop(old)
+                elif old == "max_internal_iter":
+                    n_iter_inner = kwargs.pop(old)
+                elif old == "patience":
+                    n_iter_without_progress = kwargs.pop(old)
+                elif old == "cost_fn_name":
+                    cost_function = kwargs.pop(old)
+                elif old == "eta_min":
+                    min_cluster_size = kwargs.pop(old)
+                elif old == "eta_max":
+                    max_cluster_size = kwargs.pop(old)
+                elif old == "kpca_kernel":
+                    kernel = kwargs.pop(old)
+                elif old == "kpca_fit_inverse_transform":
+                    fit_inverse_transform = kwargs.pop(old)
+                elif old == "to_postprocess":
+                    postprocess = kwargs.pop(old)
+                elif old == "to_tear":
+                    tear = kwargs.pop(old)
+            else:
+                kwargs.pop(old, None)
+
+        if kwargs:
+            raise TypeError(
+                f"RATS.__init__() got unexpected keyword arguments: {list(kwargs.keys())}"
+            )
 
-        if cost_fn_name not in ["alignment", "distortion"]:
+        if cost_function not in ["alignment", "distortion"]:
             raise ValueError(
-                f"cost_fn_name must be 'alignment' or 'distortion', got {cost_fn_name!r}."
+                f"cost_function must be 'alignment' or 'distortion', got {cost_function!r}."
             )
-        if eta_max <= eta_min:
+        if max_cluster_size <= min_cluster_size:
             raise ValueError(
-                f"eta_max ({eta_max}) must be greater than eta_min ({eta_min})."
+                f"max_cluster_size ({max_cluster_size}) must be greater than "
+                f"min_cluster_size ({min_cluster_size})."
             )
         if metric != "euclidean":
             warnings.warn(
@@ -185,9 +263,9 @@ def __init__(
                 FutureWarning,
                 stacklevel=2,
             )
-        if kpca_fit_inverse_transform:
+        if fit_inverse_transform:
             warnings.warn(
-                "kpca_fit_inverse_transform=True is not yet supported and will be "
+                "fit_inverse_transform=True is not yet supported and will be "
                 "ignored. Inverse transform support will be added in a future release.",
                 FutureWarning,
                 stacklevel=2,
@@ -196,22 +274,22 @@ def __init__(
         self.verbose = verbose
         self.n_jobs = n_jobs
 
-        self.d = d
-        self.kpca_kernel = kpca_kernel
-        self.kpca_fit_inverse_transform = kpca_fit_inverse_transform
-        self.k = k
-        if cost_fn_name == "distortion":
-            self.k_nn0 = max(k, eta_max * k)
+        self.d = n_components
+        self.kpca_kernel = kernel
+        self.kpca_fit_inverse_transform = fit_inverse_transform
+        self.k = n_neighbors
+        if cost_function == "distortion":
+            self.k_nn0 = max(n_neighbors, max_cluster_size * n_neighbors)
         else:
-            self.k_nn0 = k
-        self.cost_fn = cost_fn_name
-        self.to_postprocess = to_postprocess
-        self.eta_min, self.eta_max = eta_min, eta_max
-        self.to_tear = to_tear
+            self.k_nn0 = n_neighbors
+        self.cost_fn = cost_function
+        self.to_postprocess = postprocess
+        self.eta_min, self.eta_max = min_cluster_size, max_cluster_size
+        self.to_tear = tear
         self.nu = nu
-        self.max_iter, self.max_internal_iter = max_iter, max_internal_iter
+        self.max_iter, self.max_internal_iter = n_iter, n_iter_inner
         self.alpha, self.eps = alpha, eps
-        self.patience, self.tol = patience, tol
+        self.patience, self.tol = n_iter_without_progress, tol
         self.metric = metric
         self.n_forced_clusters = n_forced_clusters
 
diff --git a/tests/scripts/benchmark.py b/tests/scripts/benchmark.py
index 4297b05..59a2adc 100644
--- a/tests/scripts/benchmark.py
+++ b/tests/scripts/benchmark.py
@@ -12,8 +12,8 @@
 from pyRATS import rats
 from examples import datasets
 
-def run_benchmark(n_samples, k=14, eta_min=5, cost_fn_name="distortion"):
-    sys.stderr.write(f"Running benchmark for n={n_samples}, k={k}, eta_min={eta_min}...\n")
+def run_benchmark(n_samples, n_neighbors=14, min_cluster_size=5, cost_function="distortion"):
+    sys.stderr.write(f"Running benchmark for n={n_samples}, n_neighbors={n_neighbors}, min_cluster_size={min_cluster_size}...\n")
     sys.stderr.flush()
     
     # Load dataset
@@ -21,11 +21,11 @@ def run_benchmark(n_samples, k=14, eta_min=5, cost_fn_name="distortion"):
     X, _, _ = ds.kleinbottle4d(n=n_samples)
     
     model = rats.RATS(
-        d=2,
-        k=k,
-        cost_fn_name=cost_fn_name,
-        eta_min=eta_min,
-        max_iter=3,
+        n_components=2,
+        n_neighbors=n_neighbors,
+        cost_function=cost_function,
+        min_cluster_size=min_cluster_size,
+        n_iter=3,
         nu=4,
         verbose=False, # Set to False for cleaner benchmark output
     )
diff --git a/tests/scripts/generate_snapshots.py b/tests/scripts/generate_snapshots.py
index 9588d12..4fdcb1b 100644
--- a/tests/scripts/generate_snapshots.py
+++ b/tests/scripts/generate_snapshots.py
@@ -46,8 +46,8 @@ def load_dataset(name):
     return X, labels
 
 
-def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=False):
-    fname = f"dataset={dataset_name}_k={k}_eta_min={eta_min}_cost-fn={cost_fn_name}.res"
+def run_model(n_neighbors, min_cluster_size, cost_function, dataset_name, dirpath, force_compute=False):
+    fname = f"dataset={dataset_name}_k={n_neighbors}_eta_min={min_cluster_size}_cost-fn={cost_function}.res"
     path = os.path.join(dirpath, fname)
     
     if os.path.exists(path) and not force_compute:
@@ -56,11 +56,11 @@ def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=Fal
     X, _ = load_dataset(dataset_name)
 
     model = rats.RATS(
-        d=2,
-        k=k,
-        cost_fn_name=cost_fn_name,
-        eta_min=eta_min,
-        max_iter=3,
+        n_components=2,
+        n_neighbors=n_neighbors,
+        cost_function=cost_function,
+        min_cluster_size=min_cluster_size,
+        n_iter=3,
         nu=4,
         verbose=True,
     )
@@ -108,9 +108,9 @@ def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=Fal
 
     if args.fast_mode:
         dataset_names = ["small_spherewithhole"]
-        ks = [14]
-        eta_mins = [5]
-        cost_fn_names = ["distortion"]
+        n_neighbors_list = [14]
+        min_cluster_sizes = [5]
+        cost_functions = ["distortion"]
     else:
         dataset_names = [
             "curvedtorus3d",
@@ -121,11 +121,11 @@ def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=Fal
             "swissrollwithhole",
             "squarewithtwoholes",
         ]
-        ks = [14, 21]
-        eta_mins = [5, 10]
-        cost_fn_names = ["distortion", "alignment"]
+        n_neighbors_list = [14, 21]
+        min_cluster_sizes = [5, 10]
+        cost_functions = ["distortion", "alignment"]
 
-    argslist = list(itertools.product(ks, eta_mins, cost_fn_names, dataset_names))
+    argslist = list(itertools.product(n_neighbors_list, min_cluster_sizes, cost_functions, dataset_names))
 
     Parallel(n_jobs=1)(
         delayed(run_model)(*args, dirpath, force_compute) for args in argslist
diff --git a/tests/test_e2e.py b/tests/test_e2e.py
index 616b947..480e3a3 100644
--- a/tests/test_e2e.py
+++ b/tests/test_e2e.py
@@ -22,10 +22,10 @@ def get_test_cases():
             # Sample name: dataset=small_spherewithhole_k=14_eta_min=5_cost-fn=distortion.res
             try:
                 dname = fname.split("dataset=")[1].split("_k=")[0]
-                k = int(fname.split("_k=")[1].split("_eta_min=")[0])
-                eta = int(fname.split("_eta_min=")[1].split("_cost-fn=")[0])
-                cfn = fname.split("_cost-fn=")[1].replace(".res", "")
-                cases.append((k, eta, cfn, dname))
+                n_neighbors = int(fname.split("_k=")[1].split("_eta_min=")[0])
+                min_cluster_size = int(fname.split("_eta_min=")[1].split("_cost-fn=")[0])
+                cost_function = fname.split("_cost-fn=")[1].replace(".res", "")
+                cases.append((n_neighbors, min_cluster_size, cost_function, dname))
             except Exception:
                 continue
     return cases
@@ -36,9 +36,9 @@ def get_test_cases():
 if not TEST_CASES:
     TEST_CASES = [(14, 5, "distortion", "dummy_no_baseline")]
 
-@pytest.mark.parametrize("k,eta_min,cost_fn_name,dataset_name", TEST_CASES)
-def test_end_to_end(k, eta_min, cost_fn_name, dataset_name):
-    fname = f"dataset={dataset_name}_k={k}_eta_min={eta_min}_cost-fn={cost_fn_name}.res"
+@pytest.mark.parametrize("n_neighbors,min_cluster_size,cost_function,dataset_name", TEST_CASES)
+def test_end_to_end(n_neighbors, min_cluster_size, cost_function, dataset_name):
+    fname = f"dataset={dataset_name}_k={n_neighbors}_eta_min={min_cluster_size}_cost-fn={cost_function}.res"
     expected_path = os.path.join(EXPECTED_DIR, fname)
     actual_path = os.path.join(ACTUAL_DIR, fname)
     

From d7e10d62e74fd2b80a315dfb070e08cccfbd52dc Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 25 Mar 2026 16:54:22 +0000
Subject: [PATCH 23/44] Fix: Test var names

---
 tests/scripts/benchmark.py          | 35 +++++++++++++++++++++++++++-
 tests/scripts/generate_snapshots.py | 36 ++++++++++++++++++++++++++++-
 2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/tests/scripts/benchmark.py b/tests/scripts/benchmark.py
index 59a2adc..25708c3 100644
--- a/tests/scripts/benchmark.py
+++ b/tests/scripts/benchmark.py
@@ -12,6 +12,39 @@
 from pyRATS import rats
 from examples import datasets
 
+def construct_rats(
+    n_components=2,
+    n_neighbors=14,
+    min_cluster_size=5,
+    cost_function="distortion",
+    n_iter=3,
+    nu=4,
+    verbose=False,
+):
+    """Helper to instantiate RATS with fallback for older versions of the code."""
+    try:
+        # Try new sklearn-style names
+        return rats.RATS(
+            n_components=n_components,
+            n_neighbors=n_neighbors,
+            min_cluster_size=min_cluster_size,
+            cost_function=cost_function,
+            n_iter=n_iter,
+            nu=nu,
+            verbose=verbose,
+        )
+    except TypeError:
+        # Fallback to old names
+        return rats.RATS(
+            d=n_components,
+            k=n_neighbors,
+            eta_min=min_cluster_size,
+            cost_fn_name=cost_function,
+            max_iter=n_iter,
+            nu=nu,
+            verbose=verbose,
+        )
+
 def run_benchmark(n_samples, n_neighbors=14, min_cluster_size=5, cost_function="distortion"):
     sys.stderr.write(f"Running benchmark for n={n_samples}, n_neighbors={n_neighbors}, min_cluster_size={min_cluster_size}...\n")
     sys.stderr.flush()
@@ -20,7 +53,7 @@ def run_benchmark(n_samples, n_neighbors=14, min_cluster_size=5, cost_function="
     ds = datasets.Datasets()
     X, _, _ = ds.kleinbottle4d(n=n_samples)
     
-    model = rats.RATS(
+    model = construct_rats(
         n_components=2,
         n_neighbors=n_neighbors,
         cost_function=cost_function,
diff --git a/tests/scripts/generate_snapshots.py b/tests/scripts/generate_snapshots.py
index 4fdcb1b..1c89122 100644
--- a/tests/scripts/generate_snapshots.py
+++ b/tests/scripts/generate_snapshots.py
@@ -46,6 +46,40 @@ def load_dataset(name):
     return X, labels
 
 
+def construct_rats(
+    n_components=2,
+    n_neighbors=28,
+    min_cluster_size=5,
+    cost_function="alignment",
+    n_iter=20,
+    nu=3,
+    verbose=False,
+):
+    """Helper to instantiate RATS with fallback for older versions of the code."""
+    try:
+        # Try new sklearn-style names
+        return rats.RATS(
+            n_components=n_components,
+            n_neighbors=n_neighbors,
+            min_cluster_size=min_cluster_size,
+            cost_function=cost_function,
+            n_iter=n_iter,
+            nu=nu,
+            verbose=verbose,
+        )
+    except TypeError:
+        # Fallback to old names
+        return rats.RATS(
+            d=n_components,
+            k=n_neighbors,
+            eta_min=min_cluster_size,
+            cost_fn_name=cost_function,
+            max_iter=n_iter,
+            nu=nu,
+            verbose=verbose,
+        )
+
+
 def run_model(n_neighbors, min_cluster_size, cost_function, dataset_name, dirpath, force_compute=False):
     fname = f"dataset={dataset_name}_k={n_neighbors}_eta_min={min_cluster_size}_cost-fn={cost_function}.res"
     path = os.path.join(dirpath, fname)
@@ -55,7 +89,7 @@ def run_model(n_neighbors, min_cluster_size, cost_function, dataset_name, dirpat
 
     X, _ = load_dataset(dataset_name)
 
-    model = rats.RATS(
+    model = construct_rats(
         n_components=2,
         n_neighbors=n_neighbors,
         cost_function=cost_function,

From 2d7695b04791ecc69d7c90ac886f439af2ec3963 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 25 Mar 2026 16:57:31 +0000
Subject: [PATCH 24/44] style: Naming change of buml_obj to model

---
 examples/barbell_example.ipynb               | 6 +++---
 examples/curvedtorus_example.ipynb           | 6 +++---
 examples/kleinbottle_example.ipynb           | 6 +++---
 examples/small_noisyswissroll3_example.ipynb | 6 +++---
 examples/small_spherewithhole_example.ipynb  | 6 +++---
 examples/small_swissrollwithhole.ipynb       | 6 +++---
 examples/squarerwithtwoholes.ipynb           | 6 +++---
 7 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/examples/barbell_example.ipynb b/examples/barbell_example.ipynb
index 3072280..7a967ce 100644
--- a/examples/barbell_example.ipynb
+++ b/examples/barbell_example.ipynb
@@ -58,14 +58,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "buml_obj = rats.RATS(\n",
+    "model = rats.RATS(\n",
     "    n_components=2,\n",
     "    n_neighbors=28,\n",
     "    min_cluster_size=5,\n",
     "    tear=True,\n",
     ")\n",
     "\n",
-    "y = buml_obj.fit_transform(X=X)\n",
+    "y = model.fit_transform(X=X)\n",
     "y"
    ]
   },
@@ -85,7 +85,7 @@
    "outputs": [],
    "source": [
     "tear_color_eig_inds = [0,1,2] # interior has green color so assigned smallest value of i to g-channel\n",
-    "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n",
+    "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n",
     "    y,\n",
     "    tear_color_eig_inds\n",
     ")\n",
diff --git a/examples/curvedtorus_example.ipynb b/examples/curvedtorus_example.ipynb
index 94ef672..eb2a0b3 100644
--- a/examples/curvedtorus_example.ipynb
+++ b/examples/curvedtorus_example.ipynb
@@ -58,7 +58,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "buml_obj = rats.RATS(\n",
+    "model = rats.RATS(\n",
     "    n_components=2,\n",
     "    n_neighbors=21,\n",
     "    min_cluster_size=5,\n",
@@ -68,7 +68,7 @@
     "    cost_function='distortion'\n",
     ")\n",
     "\n",
-    "y = buml_obj.fit_transform(X=X)\n",
+    "y = model.fit_transform(X=X)\n",
     "y"
    ]
   },
@@ -88,7 +88,7 @@
    "outputs": [],
    "source": [
     "tear_color_eig_inds = [6, 1, 4] # interior has green color so assigned smallest value of i to g-channel\n",
-    "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n",
+    "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n",
     "    y,\n",
     "    tear_color_eig_inds\n",
     ")\n",
diff --git a/examples/kleinbottle_example.ipynb b/examples/kleinbottle_example.ipynb
index 1c23740..7684183 100644
--- a/examples/kleinbottle_example.ipynb
+++ b/examples/kleinbottle_example.ipynb
@@ -58,14 +58,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "buml_obj = rats.RATS(\n",
+    "model = rats.RATS(\n",
     "    n_components=2,\n",
     "    n_neighbors=28,\n",
     "    min_cluster_size=5,\n",
     "    tear=True,\n",
     ")\n",
     "\n",
-    "y = buml_obj.fit_transform(X=X)\n",
+    "y = model.fit_transform(X=X)\n",
     "y"
    ]
   },
@@ -77,7 +77,7 @@
    "outputs": [],
    "source": [
     "tear_color_eig_inds = [7, 2, 4]\n",
-    "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n",
+    "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n",
     "    y,\n",
     "    tear_color_eig_inds, \n",
     ")\n",
diff --git a/examples/small_noisyswissroll3_example.ipynb b/examples/small_noisyswissroll3_example.ipynb
index db22477..f21b196 100644
--- a/examples/small_noisyswissroll3_example.ipynb
+++ b/examples/small_noisyswissroll3_example.ipynb
@@ -58,7 +58,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "buml_obj = rats.RATS(\n",
+    "model = rats.RATS(\n",
     "    n_components=2,\n",
     "    n_neighbors=28,\n",
     "    min_cluster_size=5,\n",
@@ -66,7 +66,7 @@
     "    cost_function='alignment'\n",
     ")\n",
     "\n",
-    "y = buml_obj.fit_transform(X=X)\n",
+    "y = model.fit_transform(X=X)\n",
     "y"
    ]
   },
@@ -78,7 +78,7 @@
    "outputs": [],
    "source": [
     "tear_color_eig_inds = [0, 1, 2] # interior has green color so assigned smallest value of i to g-channel\n",
-    "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n",
+    "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n",
     "    y,\n",
     "    tear_color_eig_inds, \n",
     ")\n",
diff --git a/examples/small_spherewithhole_example.ipynb b/examples/small_spherewithhole_example.ipynb
index 255464f..2deb028 100644
--- a/examples/small_spherewithhole_example.ipynb
+++ b/examples/small_spherewithhole_example.ipynb
@@ -58,14 +58,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "buml_obj = rats.RATS(\n",
+    "model = rats.RATS(\n",
     "    n_components=2,\n",
     "    n_neighbors=28,\n",
     "    min_cluster_size=5,\n",
     "    tear=True,\n",
     ")\n",
     "\n",
-    "y = buml_obj.fit_transform(X=X)\n",
+    "y = model.fit_transform(X=X)\n",
     "y"
    ]
   },
@@ -87,7 +87,7 @@
     "figsize = (3,3)\n",
     "\n",
     "tear_color_eig_inds = [7, 1, 2]\n",
-    "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n",
+    "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n",
     "    y,\n",
     "    tear_color_eig_inds, \n",
     ")\n",
diff --git a/examples/small_swissrollwithhole.ipynb b/examples/small_swissrollwithhole.ipynb
index f2ff33a..2b95103 100644
--- a/examples/small_swissrollwithhole.ipynb
+++ b/examples/small_swissrollwithhole.ipynb
@@ -58,14 +58,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "buml_obj = rats.RATS(\n",
+    "model = rats.RATS(\n",
     "    n_components=2,\n",
     "    n_neighbors=28,\n",
     "    min_cluster_size=5,\n",
     "    tear=True,\n",
     ")\n",
     "\n",
-    "y = buml_obj.fit_transform(X=X)\n",
+    "y = model.fit_transform(X=X)\n",
     "y"
    ]
   },
@@ -85,7 +85,7 @@
    "outputs": [],
    "source": [
     "tear_color_eig_inds = [0, 1, 2] # interior has green color so assigned smallest value of i to g-channel\n",
-    "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n",
+    "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n",
     "    y,\n",
     "    tear_color_eig_inds, \n",
     ")\n",
diff --git a/examples/squarerwithtwoholes.ipynb b/examples/squarerwithtwoholes.ipynb
index 1b90e81..f18f1f7 100644
--- a/examples/squarerwithtwoholes.ipynb
+++ b/examples/squarerwithtwoholes.ipynb
@@ -58,14 +58,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "buml_obj = rats.RATS(\n",
+    "model = rats.RATS(\n",
     "    n_components=2,\n",
     "    n_neighbors=28,\n",
     "    min_cluster_size=5,\n",
     "    tear=True,\n",
     ")\n",
     "\n",
-    "y = buml_obj.fit_transform(X=X)\n",
+    "y = model.fit_transform(X=X)\n",
     "y"
    ]
   },
@@ -86,7 +86,7 @@
    "source": [
     "figsize = (3,3)\n",
     "tear_color_eig_inds = [0, 1, 2] # interior has green color so assigned smallest value of i to g-channel\n",
-    "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n",
+    "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n",
     "    y,\n",
     "    tear_color_eig_inds, \n",
     ")\n",

From d7fcb1040a68a8aa92ba310dc1c877052805ccc5 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 25 Mar 2026 17:46:55 +0000
Subject: [PATCH 25/44] feat: New method for dynamic allocation of memory to
 enable large datasets without crashes

---
 src/pyRATS/_utils.py | 57 +++++++++++++++++++++++++++++++++++++++++---
 src/pyRATS/rats.py   |  4 +++-
 2 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index 1d0a0fd..ad0f3b1 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -16,6 +16,33 @@
     breadth_first_order,
 )
 
+import os
+
+def _get_available_memory():
+    """Returns the available memory in bytes.
+    
+    Checks for PYRATS_MEMORY_LIMIT environment variable, then tries to use psutil,
+    and finally falls back to a 4GB default.
+    """
+    # 1. Environment variable override
+    limit = os.environ.get("PYRATS_MEMORY_LIMIT")
+    if limit:
+        try:
+            return int(limit)
+        except ValueError:
+            pass
+            
+    # 2. Try psutil for dynamic discovery
+    try:
+        import psutil
+        # Use 75% of available RAM as a safe upper bound for our buffer
+        return int(psutil.virtual_memory().available * 0.75)
+    except (ImportError, AttributeError):
+        pass
+        
+    # 3. Fallback to 2GB
+    return 2 * 1024 * 1024 * 1024
+
 
 def lpca(X, d, U, n_jobs, verbose=False):
     """Fit PCA model on the data X.
@@ -1570,9 +1597,33 @@ def batched_eval_(self, view_index, data_mask):
         masks = data_mask
 
         if self.algo == "lpca":
-            temp = np.matmul(
-                self.X[masks] - self.mu[ks][:, np.newaxis, :], self.Psi[ks]
-            )
+            # Dynamic chunking to prevent memory spikes on high-dimensional data
+            n_eval = len(ks)
+            n_neighbors = masks.shape[1]
+            n_features = self.X.shape[1]
+            
+            # Calculate chunk size based on available memory
+            memory_limit = _get_available_memory()
+            itemsize = self.X.dtype.itemsize
+            # Target buffer size per chunk: (chunk_size * n_neighbors * n_features * itemsize)
+            chunk_size = max(1, memory_limit // (n_neighbors * n_features * itemsize))
+            
+            if n_eval <= chunk_size:
+                # fits in memory, process all at once
+                temp = np.matmul(
+                    self.X[masks] - self.mu[ks][:, np.newaxis, :], self.Psi[ks]
+                )
+            else:
+                # split into chunks
+                temp = np.zeros((n_eval, n_neighbors, self.Psi.shape[2]))
+                for start in range(0, n_eval, chunk_size):
+                    end = min(start + chunk_size, n_eval)
+                    ks_chunk = ks[start:end]
+                    masks_chunk = masks[start:end]
+                    temp[start:end] = np.matmul(
+                        self.X[masks_chunk] - self.mu[ks_chunk][:, np.newaxis, :], 
+                        self.Psi[ks_chunk]
+                    )
             n = self.X.shape[0]
         else:
             temp = []
diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index 1de93d6..f2649dc 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -670,7 +670,9 @@ def _postprocess(self):
         future_use_phi_of = np.arange(n, dtype=int)
 
         while len(param_changed) > 0:
-            reconsider_mask = np.isin(connectivity_matrix, param_changed) & (
+            changed_mask = np.zeros(n, dtype=bool)
+            changed_mask[param_changed] = True
+            reconsider_mask = changed_mask[connectivity_matrix] & (
                 connectivity_matrix != np.arange(n)[:, None]
             )
 

From d8f85a12e94219f8e083bbd1f25679868dbc25ca Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 25 Mar 2026 18:10:49 +0000
Subject: [PATCH 26/44] Fix: Dynamic memory allocation fix

---
 README.md            |  9 +++++++++
 src/pyRATS/_utils.py | 31 ++++++++++++++++++-------------
 src/pyRATS/rats.py   |  4 +++-
 3 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 6d0391e..dbccff6 100644
--- a/README.md
+++ b/README.md
@@ -61,6 +61,15 @@ python -m http.server 8000
 ```
 and opening http://localhost:8000 in your browser.
 
+# Memory Management
+`pyRATS` implements dynamic chunking to prevent memory spikes on large or high-dimensional datasets. By default, it uses up to 75% of available system RAM. If you need to explicitly limit memory usage, set the `PYRATS_MEMORY_LIMIT` environment variable (in bytes):
+
+```bash
+# Limit pyRATS to 4GB
+export PYRATS_MEMORY_LIMIT=4294967296
+python your_script.py
+```
+
 
 Citation
 ----------
diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index ad0f3b1..0eb93e6 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -18,11 +18,11 @@
 
 import os
 
-def _get_available_memory():
-    """Returns the available memory in bytes.
+def _get_available_memory(n_jobs=1):
+    """Returns the available memory in bytes, scaled by n_jobs.
     
     Checks for PYRATS_MEMORY_LIMIT environment variable, then tries to use psutil,
-    and finally falls back to a 4GB default.
+    and finally falls back to a 4GB default (divided by n_jobs).
     """
     # 1. Environment variable override
     limit = os.environ.get("PYRATS_MEMORY_LIMIT")
@@ -35,13 +35,14 @@ def _get_available_memory():
     # 2. Try psutil for dynamic discovery
     try:
         import psutil
-        # Use 75% of available RAM as a safe upper bound for our buffer
-        return int(psutil.virtual_memory().available * 0.75)
+        # Use a fraction of available RAM as a safe upper bound for our buffer,
+        # divided by the number of parallel workers.
+        return int(psutil.virtual_memory().available * 0.75 / n_jobs)
     except (ImportError, AttributeError):
         pass
         
-    # 3. Fallback to 2GB
-    return 2 * 1024 * 1024 * 1024
+    # 3. Fallback to 4GB total (divided by workers)
+    return (4 * 1024 * 1024 * 1024) // n_jobs
 
 
 def lpca(X, d, U, n_jobs, verbose=False):
@@ -387,7 +388,7 @@ def nearest_neighbors(X, k, metric, sort_results=True, n_jobs=-1):
 
 
 def cost_of_moving_distortion(
-    k, d_e, neigh_ind_k, U_k, local_param, c, n_C, Utilde, eta_min, eta_max
+    k, d_e, neigh_ind_k, U_k, local_param, c, n_C, Utilde, eta_min, eta_max, n_jobs=1
 ):
     """Computes the minimum cost and destination cluster possible
     when merging k with its neighboring clusters.
@@ -513,7 +514,7 @@ def compute_zeta(d_e_mask0, Psi_k_mask):
 
 
 def cost_of_moving_alignment_error(
-    k, d_e, neigh_ind_k, U_k, param, c, n_C, Utilde, eta_min, eta_max
+    k, d_e, neigh_ind_k, U_k, param, c, n_C, Utilde, eta_min, eta_max, n_jobs=1
 ):
     """Computes the minimum cost and destination cluster possible
     when merging k with its neighboring clusters.
@@ -588,6 +589,7 @@ def cost_of_moving_alignment_error(
     evals = param.batched_eval_(
         c_U_k_uniq_k,
         np.broadcast_to(U_k_list, [len(c_U_k_uniq_k), len(U_k_list)]),
+        n_jobs=n_jobs
     )
 
     m = c_U_k_uniq_k == k
@@ -704,7 +706,7 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
             dest_ = np.zeros(end_ind - start_ind, dtype="int") - 1
             for i, k in enumerate(range(start_ind, end_ind)):
                 cost_[i], dest_[i] = cost_of_moving(
-                    k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max
+                    k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max, n_jobs=n_jobs
                 )
             return start_ind, end_ind, cost_, dest_
 
@@ -758,7 +760,7 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
 
             for k in S:
                 cost[k], dest[k] = cost_of_moving(
-                    k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max
+                    k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max, n_jobs=1
                 )
 
             k = np.argmin(cost)
@@ -1581,7 +1583,7 @@ def __init__(self, algo="lpca", **kwargs):
         self.add_dim = False
         self.standardize = False
 
-    def batched_eval_(self, view_index, data_mask):
+    def batched_eval_(self, view_index, data_mask, n_jobs=1):
         """Maps multiple points to the new space through their local (K-)PCA prameterizations.
 
         Parameters
@@ -1591,6 +1593,9 @@ def batched_eval_(self, view_index, data_mask):
 
         data_mask : array-like, shape (n_points, n_neighbors)
             List of points to map to the embedding dimension for each point in 'view_index'
+
+        n_jobs : int, default=1
+            Number of parallel jobs. Used to scale memory limits.
         """
 
         ks = view_index
@@ -1603,7 +1608,7 @@ def batched_eval_(self, view_index, data_mask):
             n_features = self.X.shape[1]
             
             # Calculate chunk size based on available memory
-            memory_limit = _get_available_memory()
+            memory_limit = _get_available_memory(n_jobs=n_jobs)
             itemsize = self.X.dtype.itemsize
             # Target buffer size per chunk: (chunk_size * n_neighbors * n_features * itemsize)
             chunk_size = max(1, memory_limit // (n_neighbors * n_features * itemsize))
diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index f2649dc..c2370e9 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -28,6 +28,7 @@ def _postprocess_col_range(
     future_use_phi_of,
     original_dists,
     n,
+    n_jobs=1,
 ):
     """Process a contiguous range of neighbor-columns [start, end) for _postprocess.
 
@@ -47,7 +48,7 @@ def _postprocess_col_range(
         points_to_reconsider = np.where(col_mask)[0]
 
         batch_embedded_datapoints = param.batched_eval_(
-            future_use_phi_of[use_phi_of], neighbors_to_reconsider
+            future_use_phi_of[use_phi_of], neighbors_to_reconsider, n_jobs=n_jobs
         )
         batch_embedded_dists = batched_pdist(batch_embedded_datapoints)
         batch_original_dists = original_dists[points_to_reconsider]
@@ -697,6 +698,7 @@ def _postprocess(self):
                     future_use_phi_of,
                     original_dists,
                     n,
+                    n_jobs=self.n_jobs,
                 )
                 for s, e in ranges
             )

From 87b2e81d0610cfa3f41a6ea6f363eb5aacef75aa Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 25 Mar 2026 18:25:12 +0000
Subject: [PATCH 27/44] style: progress bar update and verbose cleanup

---
 src/pyRATS/_tear_coloring.py |  27 +++++-----
 src/pyRATS/_utils.py         | 100 +++++++++++++++++++----------------
 src/pyRATS/rats.py           |  44 +++++++++------
 3 files changed, 95 insertions(+), 76 deletions(-)

diff --git a/src/pyRATS/_tear_coloring.py b/src/pyRATS/_tear_coloring.py
index 04a0131..bd43b33 100644
--- a/src/pyRATS/_tear_coloring.py
+++ b/src/pyRATS/_tear_coloring.py
@@ -4,6 +4,7 @@
 import scipy
 
 from joblib import delayed, Parallel
+from tqdm import tqdm
 
 from pyRATS import _utils
 
@@ -67,14 +68,15 @@ def compute_spectral_color_of_pts_on_tear(
         i_mat_in_emb=i_mat_in_emb,
     )
     if pts_across_tear is None:
-        print("No tear detected.")
+        if verbose:
+            tqdm.write("No tear detected.")
         return None
     n_pts_across_tear = len(pts_across_tear)
     n_comp, labels = connected_components(
         tear_graph, directed=False, return_labels=True
     )
     if verbose:
-        print("Number of components in the tear graph:", n_comp, flush=True)
+        tqdm.write(f"Number of components in the tear graph: {n_comp}")
 
     n_points_in_comp = []
     for i in range(n_comp):
@@ -92,7 +94,7 @@ def compute_spectral_color_of_pts_on_tear(
         n_comp_i = np.sum(comp_i)
         scale = n_comp_i / n_pts_across_tear
         if verbose:
-            print("#points in the tear component no.", i, "are:", n_comp_i, flush=True)
+            tqdm.write(f"#points in the tear component no. {i} are: {n_comp_i}")
 
         # If the component is very small then assign the constant color
         if n_comp_i <= max(3, int(color_cutoff_frac * n)):
@@ -155,7 +157,7 @@ def compute_tear_graph(
     # If i_mat_in_emb if not provided
     if i_mat_in_emb is None:
         if verbose:
-            print("Metric used for computing incidence matrix in embedding:", metric)
+            tqdm.write(f"Metric used for computing incidence matrix in embedding: {metric}")
         i_mat_in_emb = _utils.compute_incidence_matrix_in_embedding(
             y, partition, k, nu, metric
         )
@@ -171,14 +173,13 @@ def compute_tear_graph(
     views_across_tear_graph.eliminate_zeros()
     # If no partitions/views are across the tear then there is no tear
     if len(views_across_tear_graph.data) == 0:
-        print("No views across the tear detected.")
+        if verbose:
+            tqdm.write("No views across the tear detected.")
         return None, None
 
     if verbose:
-        print(
-            "total #pairs of overlapping partitions/views:",
-            overlap.count_nonzero(),
-            flush=True,
+        tqdm.write(
+            f"total #pairs of overlapping partitions/views: {overlap.count_nonzero()}"
         )
     pts_across_tear, tear_graph = compute_points_across_tear_graph(
         views_across_tear_graph,
@@ -188,8 +189,8 @@ def compute_tear_graph(
         n_jobs=n_jobs,
     )
     if verbose:
-        print("#vertices in tear graph =", tear_graph.shape[0], flush=True)
-        print("#edges in tear graph =", len(tear_graph.data), flush=True)
+        tqdm.write(f"#vertices in tear graph = {tear_graph.shape[0]}")
+        tqdm.write(f"#edges in tear graph = {len(tear_graph.data)}")
     return pts_across_tear, tear_graph
 
 
@@ -208,7 +209,7 @@ def compute_points_across_tear_graph(
     )
     n_pairs = len(views_across_tear_graph_row)
     if verbose:
-        print("#pairs of partitions/views across tear:", n_pairs, flush=True)
+        tqdm.write(f"#pairs of partitions/views across tear: {n_pairs}")
 
     # Define per-pair processing function
     def process_pairs(ij_pairs):
@@ -274,7 +275,7 @@ def process_pairs(ij_pairs):
     for _, _, pts in results:
         pts_across_tear_mask[pts] = True
     if verbose:
-        print("Computing tear graph.", flush=True)
+        tqdm.write("Computing tear graph.")
     # Build sparse tear graph
     tear_graph = csr_matrix(
         (
diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index 0eb93e6..2f76341 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -15,6 +15,7 @@
     connected_components,
     breadth_first_order,
 )
+from tqdm import tqdm
 
 import os
 
@@ -125,7 +126,10 @@ def target_proc(p_num, chunk_sz):
 
     chunk_sz = int(n / n_jobs)
     results = Parallel(n_jobs=n_jobs)(
-        delayed(target_proc)(i, chunk_sz) for i in range(n_jobs)
+        delayed(target_proc)(i, chunk_sz)
+        for i in tqdm(
+            range(n_jobs), desc="PCA", unit="chunk", leave=False, disable=not verbose
+        )
     )
 
     for i in range(len(results)):
@@ -136,7 +140,7 @@ def target_proc(p_num, chunk_sz):
         param.n_pc_dir_chosen[start_ind:end_ind] = n_pc_dir_chosen_
 
     if verbose:
-        print("local_param: all %d points processed..." % n)
+        tqdm.write("local_param: all %d points processed..." % n)
 
     return param
 
@@ -209,14 +213,17 @@ def target_proc(p_num, chunk_sz):
 
     chunk_sz = int(n / n_jobs)
     results = Parallel(n_jobs=n_jobs)(
-        delayed(target_proc)(p_num, chunk_sz) for p_num in range(n_jobs)
+        delayed(target_proc)(p_num, chunk_sz)
+        for p_num in tqdm(
+            range(n_jobs), desc="KPCA", unit="chunk", leave=False, disable=not verbose
+        )
     )
 
     for start_ind, end_ind, model_ in results:
         local_param.model[start_ind:end_ind] = model_
 
     if verbose:
-        print("local_param: all %d points processed..." % n)
+        tqdm.write("local_param: all %d points processed..." % n)
     return local_param
 
 
@@ -684,16 +691,16 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs):
     U_csc = U.tocsc()
 
     # Vary eta from 2 to eta_{min}
-    if verbose:
-        print("Constructing intermediate views.")
-    for eta in range(2, eta_min + 1):
+    for eta in tqdm(
+        range(2, eta_min + 1), desc="Intermediate views", disable=not verbose
+    ):
         if verbose:
             print("eta = %d." % eta)
-            print(
-                "#non-empty views with sz < %d = %d"
-                % (eta, np.sum((n_C > 0) * (n_C < eta)))
-            )
-            print("#nodes in views with sz < %d = %d" % (eta, np.sum(n_C[c] < eta)))
+            # tqdm.write(
+            #     "#non-empty views with sz < %d = %d"
+            #     % (eta, np.sum((n_C > 0) * (n_C < eta)))
+            # )
+            # tqdm.write("#nodes in views with sz < %d = %d" % (eta, np.sum(n_C[c] < eta)))
 
         def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
             start_ind = p_num * chunk_sz
@@ -713,7 +720,13 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
         chunk_sz = int(n / n_jobs)
         results = Parallel(n_jobs=n_jobs)(
             delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c)
-            for p_num in range(n_jobs)
+            for p_num in tqdm(
+                range(n_jobs),
+                desc=f"eta={eta}",
+                unit="chunk",
+                leave=False,
+                disable=not verbose,
+            )
         )
         for start_ind, end_ind, cost_, dest_ in results:
             cost[start_ind:end_ind] = cost_
@@ -724,10 +737,19 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
         k = np.argmin(cost)
         cost_star = cost[k]
 
-        if verbose:
-            print("Costs computed when eta = %d." % eta)
+        # if verbose:
+        #     tqdm.write("Costs computed when eta = %d." % eta)
 
         # Loop until minimum cost is inf
+        pbar_merge = None
+        if verbose:
+            pbar_merge = tqdm(
+                total=np.sum(n_C[c] < eta),
+                desc="Merging",
+                unit="pts",
+                leave=False,
+            )
+
         total_len_S = 0
         ctr = 0
         while cost_star < np.inf:
@@ -763,19 +785,23 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
                     k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max, n_jobs=1
                 )
 
+            if verbose:
+                pbar_merge.update(1)
+
             k = np.argmin(cost)
             cost_star = cost[k]
 
         if verbose:
-            print(
-                "ctr=%d, total_len_S=%d, avg_len_S=%0.3f"
-                % (ctr, total_len_S, total_len_S / (ctr + 1e-12))
-            )
-            print(
-                "Remaining #nodes in views with sz < %d = %d"
-                % (eta, np.sum(n_C[c] < eta))
-            )
-            print("Done with eta = %d." % eta)
+            pbar_merge.close()
+            # tqdm.write(
+            #     "ctr=%d, total_len_S=%d, avg_len_S=%0.3f"
+            #     % (ctr, total_len_S, total_len_S / (ctr + 1e-12))
+            # )
+            # tqdm.write(
+            #     "Remaining #nodes in views with sz < %d = %d"
+            #     % (eta, np.sum(n_C[c] < eta))
+            # )
+            # tqdm.write("Done with eta = %d." % eta)
 
     return c, n_C
 
@@ -866,16 +892,6 @@ def target_proc(p_num):
     W_data = overlap_svals[:, -1]
     for i in range(overlap_svals.shape[1] - 2, -1, -1):
         mask = W_data == 0
-        if verbose:
-            print(
-                "Iter:",
-                i,
-                ":: Updating scores of",
-                np.sum(mask),
-                "edges out of",
-                W_data.shape[0],
-                "edges.",
-            )
         n_zero_elem = np.sum(mask)
         if n_zero_elem == 0:
             break
@@ -1136,7 +1152,9 @@ def compute_final_embedding(
     Utilde_t = Utilde.copy()
 
     # Refine global embedding y
-    for it0 in range(max_iter):
+    for it0 in tqdm(
+        range(max_iter), desc="RGD alignment", unit="iter", disable=not verbose
+    ):
 
         if to_tear:
             Utildeg = compute_incidence_matrix_in_embedding(y, C, k, nu, metric)
@@ -1150,8 +1168,6 @@ def compute_final_embedding(
             err = compute_alignment_err(d, Utilde_t, param, verbose)
             E_Gamma_t = Utilde_t.nnz
             err = err / E_Gamma_t
-            if verbose:
-                print("Alignment error: %0.6f" % (err), flush=True)
             if prev_err is not None:
                 if (np.abs(err - prev_err) / (prev_err + 1e-12) < tol) and (
                     np.abs(E_Gamma_t - prev_edges) / (prev_edges + 1e-12) < tol
@@ -1222,9 +1238,6 @@ def update(alpha, max_iter, O, CC, M, d):
     CC, Lpinv_BT, _, _ = build_ortho_optim(d, Utilde, param, verbose)
     M, n = Utilde.shape
 
-    if verbose:
-        print("Descent starts", flush=True)
-
     Tstar = update(
         alpha, max_internal_iter, np.tile(np.eye(d), (1, M)), CC, M, d
     )  # At each iteration compute a new S starting with S = I
@@ -1434,13 +1447,6 @@ def build_ortho_optim(d, Utilde, param, verbose):
     
     B = csr_matrix((B_val_all, (B_row_all, B_col_all)), shape=(M * d, n + M))
 
-    if verbose:
-        print("min and max weights:", np.array(W_vals_all).min(), np.array(W_vals_all).max())
-        print(
-            f"Computing Pseudoinverse of a matrix of L of size {n} + {M} multiplied with B",
-            flush=True,
-        )
-
     Lpinv_helpers = compute_Lpinv_helpers(W)
     Lpinv_BT = compute_Lpinv_MT(Lpinv_helpers, B)
     CC = compute_CC(D, B, Lpinv_BT)
diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index c2370e9..b491ef4 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -4,6 +4,7 @@
 
 from multiprocessing import cpu_count
 from joblib import Parallel, delayed
+from tqdm import tqdm
 
 from pyRATS._utils import (
     nearest_neighbors,
@@ -313,18 +314,35 @@ def fit_transform(self, X):
         y : array-like, shape (n_samples, d)
             X transformed in the new space.
         """
+        n_steps = 5 if self.to_postprocess else 4
+        current_step = 1
 
+        if self.verbose:
+            print(f"[{current_step}/{n_steps}] Fitting neighborhood graph...")
         self._fit_nbrhd_graph(X)
+        current_step += 1
 
         # Construct low dimensional local views
+        if self.verbose:
+            print(f"[{current_step}/{n_steps}] Fitting local views...")
         self._fit_local_views(X)
+        current_step += 1
+
         if self.to_postprocess:
+            if self.verbose:
+                print(f"[{current_step}/{n_steps}] Postprocessing local parameters...")
             self._postprocess()
+            current_step += 1
 
         # Construct intermediate views
+        if self.verbose:
+            print(f"[{current_step}/{n_steps}] Clustering intermediate views...")
         c, n_C = self._fit_intermediate_views()
+        current_step += 1
 
         # Construct Global views
+        if self.verbose:
+            print(f"[{current_step}/{n_steps}] Aligning global views...")
         y = self._fit_global_views(c, n_C)
 
         return y
@@ -660,21 +678,21 @@ def _postprocess(self):
 
         self.param.zeta = zeta
 
-        if self.verbose:
-            print(
-                r"Maximum local distortion before postprocessing is: %0.4f"
-                % np.max(zeta)
-            )
 
         connectivity_matrix = self.U.indices.reshape(n, -1)
-        param_changed = np.arange(n)
         future_use_phi_of = np.arange(n, dtype=int)
 
+        pbar = None
+        if self.verbose:
+            pbar = tqdm(total=n, desc="Refining parameters", unit="pts", leave=False)
+
+        param_changed = np.arange(n)
         while len(param_changed) > 0:
             changed_mask = np.zeros(n, dtype=bool)
             changed_mask[param_changed] = True
-            reconsider_mask = changed_mask[connectivity_matrix] & (
-                connectivity_matrix != np.arange(n)[:, None]
+            reconsider_mask = (
+                changed_mask[connectivity_matrix]
+                & (connectivity_matrix != np.arange(n)[:, None])
             )
 
             # Split the k columns into n_jobs chunks and process each chunk in
@@ -718,16 +736,10 @@ def _postprocess(self):
             zeta = np.minimum(zeta, zeta_min)
 
             if self.verbose:
-                print(
-                    "#Param replaced: %d, max distortion: %f"
-                    % (len(param_changed), np.max(zeta))
-                )
+                pbar.update(n - len(param_changed) - pbar.n)
 
         self.param.zeta = zeta.copy()
 
         self.param.replace_(future_use_phi_of)
         if self.verbose:
-            print(
-                f"Maximum local distortion after postprocessing is: %0.4f"
-                % (np.max(self.param.zeta))
-            )
+            pbar.close()

From 1648a790c65f014e67158de7eda9c6bb9beca3f3 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 25 Mar 2026 18:27:48 +0000
Subject: [PATCH 28/44] Update to readme and pyproject (tqdm addition)

---
 README.md      | 6 +++---
 pyproject.toml | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index dbccff6..18ff892 100644
--- a/README.md
+++ b/README.md
@@ -62,11 +62,11 @@ python -m http.server 8000
 and opening http://localhost:8000 in your browser.
 
 # Memory Management
-`pyRATS` implements dynamic chunking to prevent memory spikes on large or high-dimensional datasets. By default, it uses up to 75% of available system RAM. If you need to explicitly limit memory usage, set the `PYRATS_MEMORY_LIMIT` environment variable (in bytes):
+`pyRATS` implements dynamic chunking to prevent memory spikes on large or high-dimensional datasets. By default, it attempts to use up to 75% of available system RAM. This measurement can fail and defaults to 4GB otherwise. If you need to explicitly set memory usage, set the `PYRATS_MEMORY_LIMIT` environment variable (in bytes):
 
 ```bash
-# Limit pyRATS to 4GB
-export PYRATS_MEMORY_LIMIT=4294967296
+# Set pyRATS memory usage to 40GB
+export PYRATS_MEMORY_LIMIT=42949672960
 python your_script.py
 ```
 
diff --git a/pyproject.toml b/pyproject.toml
index 4b63c93..1cc94a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,7 @@ dependencies = [
     "scipy",
     "joblib",
     "multiprocess",
+    "tqdm",
 ]
 
 [project.optional-dependencies]

From a074cc9dd6be045f6a807c46cb5d6516eadf1616 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 25 Mar 2026 18:38:23 +0000
Subject: [PATCH 29/44] style: Cleaner tqdms

---
 src/pyRATS/_utils.py | 2 --
 src/pyRATS/rats.py   | 4 ----
 2 files changed, 6 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index 2f76341..a334552 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -694,8 +694,6 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs):
     for eta in tqdm(
         range(2, eta_min + 1), desc="Intermediate views", disable=not verbose
     ):
-        if verbose:
-            print("eta = %d." % eta)
             # tqdm.write(
             #     "#non-empty views with sz < %d = %d"
             #     % (eta, np.sum((n_C > 0) * (n_C < eta)))
diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index b491ef4..aa24bae 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -534,8 +534,6 @@ def _fit_intermediate_views(self):
                 self.verbose,
                 self.n_jobs,
             )
-            if self.verbose:
-                print("Pruning and cleaning up.")
             # Prune empty clusters
             non_empty_C = n_C > 0
             M = np.sum(non_empty_C)
@@ -561,8 +559,6 @@ def _fit_intermediate_views(self):
 
             np.random.seed(42)
             self.param.noise_seed = np.random.randint(M * M, size=M)
-            if self.verbose:
-                print("Done.")
         else:
             c = np.arange(n, dtype=int)
             C = csr_matrix((np.ones(n), (c, np.arange(n))), shape=(n, n), dtype=bool)

From 9203f56045258644edbe3993c19ffe3d12929938 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Wed, 25 Mar 2026 18:45:19 +0000
Subject: [PATCH 30/44] speedup test: bottlnecks re-introduced. Testing shift
 back.

---
 src/pyRATS/_utils.py |  95 +++++++++++++++++++--------------
 src/pyRATS/rats.py   | 123 +++++++++++++++----------------------------
 2 files changed, 98 insertions(+), 120 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index a334552..8305818 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -34,16 +34,16 @@ def _get_available_memory(n_jobs=1):
             pass
             
     # 2. Try psutil for dynamic discovery
-    try:
-        import psutil
-        # Use a fraction of available RAM as a safe upper bound for our buffer,
-        # divided by the number of parallel workers.
-        return int(psutil.virtual_memory().available * 0.75 / n_jobs)
-    except (ImportError, AttributeError):
-        pass
-        
-    # 3. Fallback to 4GB total (divided by workers)
-    return (4 * 1024 * 1024 * 1024) // n_jobs
+    # Cache the result to avoid repeated system calls in tight loops
+    if not hasattr(_get_available_memory, "_cached_val"):
+        try:
+            import psutil
+            # Use a fraction of available RAM as a safe upper bound for our buffer.
+            _get_available_memory._cached_val = int(psutil.virtual_memory().available * 0.75)
+        except (ImportError, AttributeError):
+            _get_available_memory._cached_val = 4 * 1024 * 1024 * 1024 # 4GB fallback
+
+    return _get_available_memory._cached_val // n_jobs
 
 
 def lpca(X, d, U, n_jobs, verbose=False):
@@ -125,12 +125,15 @@ def target_proc(p_num, chunk_sz):
         return start_ind, end_ind, Psi, mu, var_explained, n_pc_dir_chosen
 
     chunk_sz = int(n / n_jobs)
-    results = Parallel(n_jobs=n_jobs)(
-        delayed(target_proc)(i, chunk_sz)
-        for i in tqdm(
-            range(n_jobs), desc="PCA", unit="chunk", leave=False, disable=not verbose
+    if n_jobs == 1 or n < 1000: # Threshold for Parallel overhead
+        results = [target_proc(0, n)]
+    else:
+        results = Parallel(n_jobs=n_jobs)(
+            delayed(target_proc)(i, chunk_sz)
+            for i in tqdm(
+                range(n_jobs), desc="PCA", unit="chunk", leave=False, disable=not verbose
+            )
         )
-    )
 
     for i in range(len(results)):
         start_ind, end_ind, Psi_, mu_, var_explained_, n_pc_dir_chosen_ = results[i]
@@ -139,9 +142,6 @@ def target_proc(p_num, chunk_sz):
         param.var_explained[start_ind:end_ind, :] = var_explained_
         param.n_pc_dir_chosen[start_ind:end_ind] = n_pc_dir_chosen_
 
-    if verbose:
-        tqdm.write("local_param: all %d points processed..." % n)
-
     return param
 
 
@@ -222,8 +222,6 @@ def target_proc(p_num, chunk_sz):
     for start_ind, end_ind, model_ in results:
         local_param.model[start_ind:end_ind] = model_
 
-    if verbose:
-        tqdm.write("local_param: all %d points processed..." % n)
     return local_param
 
 
@@ -505,16 +503,29 @@ def cost_of_moving_distortion(
 
 
 def compute_zeta(d_e_mask0, Psi_k_mask):
-    if d_e_mask0.shape[0] == 1:
+    """Computes the local distortion zeta for a given parameterization and neighborhood distances.
+    
+    Parameters
+    ----------
+    d_e_mask0 : sparse matrix or array-like
+        Pairwise distances between neighbors in the original space.
+    Psi_k_mask : array-like
+        Embedded coordinates of the neighbors.
+    """
+    # For small k (typical in clustering), dense operations are MUCH faster than
+    # sparse triu/nonzero lookups.
+    d_e_mask = d_e_mask0.toarray() if hasattr(d_e_mask0, "toarray") else d_e_mask0
+    if d_e_mask.shape[0] <= 1:
         return 1
-    d_e_upper = triu(d_e_mask0, k=1)
-    row, col = d_e_upper.nonzero()
-    if len(row) == 0:
+    
+    # Use squareform/pdist for efficient pair extraction on small dense matrices
+    d_e_mask_ = squareform(d_e_mask)
+    mask = d_e_mask_ != 0
+    if not np.any(mask):
         return 1
     
-    d_e_vals = d_e_upper.data
-    diff = Psi_k_mask[row] - Psi_k_mask[col]
-    dist_embedded = np.sqrt(np.sum(diff * diff, axis=1))
+    d_e_vals = d_e_mask_[mask]
+    dist_embedded = pdist(Psi_k_mask)[mask]
     
     disc_lip_const = dist_embedded / d_e_vals
     return np.max(disc_lip_const) / (np.min(disc_lip_const) + 1e-12)
@@ -716,16 +727,19 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
             return start_ind, end_ind, cost_, dest_
 
         chunk_sz = int(n / n_jobs)
-        results = Parallel(n_jobs=n_jobs)(
-            delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c)
-            for p_num in tqdm(
-                range(n_jobs),
-                desc=f"eta={eta}",
-                unit="chunk",
-                leave=False,
-                disable=not verbose,
+        if n_jobs == 1 or n < 1000:
+            results = [target_proc(0, n, n, Utilde, n_C, c)]
+        else:
+            results = Parallel(n_jobs=n_jobs)(
+                delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c)
+                for p_num in tqdm(
+                    range(n_jobs),
+                    desc=f"eta={eta}",
+                    unit="chunk",
+                    leave=False,
+                    disable=not verbose,
+                )
             )
-        )
         for start_ind, end_ind, cost_, dest_ in results:
             cost[start_ind:end_ind] = cost_
             dest[start_ind:end_ind] = dest_
@@ -879,9 +893,12 @@ def target_proc(p_num):
             overlap_svals_[i - start_ind, :] = svdvals_
         return (start_ind, end_ind, overlap_svals_)
 
-    res = Parallel(n_jobs=n_jobs, return_as="generator_unordered")(
-        delayed(target_proc)(p_num) for p_num in range(n_jobs)
-    )
+    if n_jobs == 1 or n_elem < 1000:
+        res = [target_proc(p_num) for p_num in range(n_jobs)]
+    else:
+        res = Parallel(n_jobs=n_jobs, return_as="generator_unordered")(
+            delayed(target_proc)(p_num) for p_num in range(n_jobs)
+        )
     for value in res:
         start_ind, end_ind, overlap_svals_ = value
         overlap_svals[start_ind:end_ind, :] = overlap_svals_
diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index aa24bae..886bb9c 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -20,56 +20,7 @@
 from pyRATS._tear_coloring import compute_color_of_pts_on_tear
 
 
-def _postprocess_col_range(
-    start,
-    end,
-    connectivity_matrix,
-    reconsider_mask,
-    param,
-    future_use_phi_of,
-    original_dists,
-    n,
-    n_jobs=1,
-):
-    """Process a contiguous range of neighbor-columns [start, end) for _postprocess.
-
-    Returns the element-wise minimum distortion and the corresponding column
-    index across the processed range, both as length-n arrays.
-    """
-    local_zeta_min = np.full(n, np.inf)
-    local_best_col = np.zeros(n, dtype=int)
-
-    for i in range(start, end):
-        col_mask = reconsider_mask[:, i]
-        use_phi_of = connectivity_matrix[:, i][col_mask]
-        if len(use_phi_of) == 0:
-            continue
-
-        neighbors_to_reconsider = connectivity_matrix[col_mask]
-        points_to_reconsider = np.where(col_mask)[0]
-
-        batch_embedded_datapoints = param.batched_eval_(
-            future_use_phi_of[use_phi_of], neighbors_to_reconsider, n_jobs=n_jobs
-        )
-        batch_embedded_dists = batched_pdist(batch_embedded_datapoints)
-        batch_original_dists = original_dists[points_to_reconsider]
-
-        disc_lip_const = np.divide(
-            batch_embedded_dists,
-            batch_original_dists,
-            out=np.full_like(batch_embedded_dists, np.nan),
-            where=batch_original_dists != 0,
-        )
-        col_zeta = np.full(n, np.inf)
-        col_zeta[points_to_reconsider] = np.nanmax(disc_lip_const, axis=1) / (
-            np.nanmin(disc_lip_const, axis=1) + 1e-12
-        )
-
-        improved = col_zeta < local_zeta_min
-        local_best_col[improved] = i
-        local_zeta_min = np.minimum(local_zeta_min, col_zeta)
-
-    return local_zeta_min, local_best_col
+# _postprocess_col_range removed to reduce Parallel overhead in tight loops.
 
 
 class RATS:
@@ -691,39 +642,49 @@ def _postprocess(self):
                 & (connectivity_matrix != np.arange(n)[:, None])
             )
 
-            # Split the k columns into n_jobs chunks and process each chunk in
-            # a separate worker.  Each worker returns its local (zeta_min,
-            # best_col) for the columns it owns; we then do a final
-            # min-reduction here.  This mirrors the target_proc pattern used
-            # throughout _utils.py (lpca, best, …) while keeping memory
-            # bounded to one chunk of columns per worker at a time.
-            k_cols = connectivity_matrix.shape[1]
-            chunk_sz = max(1, k_cols // self.n_jobs)
-            ranges = [(p * chunk_sz, (p + 1) * chunk_sz) for p in range(self.n_jobs)]
-            ranges[-1] = (ranges[-1][0], k_cols)  # extend last chunk to cover remainder
-            ranges = [(s, e) for s, e in ranges if s < k_cols]  # drop empty ranges
-
-            results = Parallel(n_jobs=self.n_jobs)(
-                delayed(_postprocess_col_range)(
-                    s, e,
-                    connectivity_matrix,
-                    reconsider_mask,
-                    self.param,
-                    future_use_phi_of,
-                    original_dists,
-                    n,
-                    n_jobs=self.n_jobs,
-                )
-                for s, e in ranges
-            )
-
-            # Reduce chunk results into a single (zeta_min, best_col).
+            # Revert to serial loop for column checks.
+            # Local empirical tests show that Parallel overhead here outweighs the
+            # benefit for small k (neighbors), especially since this is called
+            # inside a convergence loop.
             zeta_min = np.full(n, np.inf)
             best_col = np.zeros(n, dtype=int)
-            for chunk_zeta_min, chunk_best_col in results:
-                improved = chunk_zeta_min < zeta_min
-                best_col[improved] = chunk_best_col[improved]
-                zeta_min = np.minimum(zeta_min, chunk_zeta_min)
+            
+            # Cache the number of jobs for batched_eval_ memory scaling
+            n_jobs = self.n_jobs if self.n_jobs > 0 else 1
+
+            for i in range(connectivity_matrix.shape[1]):
+                col_mask = reconsider_mask[:, i]
+                points_to_reconsider = np.where(col_mask)[0]
+                if len(points_to_reconsider) == 0:
+                    continue
+
+                use_phi_of = connectivity_matrix[points_to_reconsider, i]
+                neighbors_of_points = connectivity_matrix[points_to_reconsider]
+
+                # batched_eval_ now uses cached memory limits to reduce overhead
+                batch_embedded_datapoints = self.param.batched_eval_(
+                    future_use_phi_of[use_phi_of], 
+                    neighbors_of_points, 
+                    n_jobs=n_jobs
+                )
+                batch_embedded_dists = batched_pdist(batch_embedded_datapoints)
+                batch_original_dists = original_dists[points_to_reconsider]
+
+                disc_lip_const = np.divide(
+                    batch_embedded_dists,
+                    batch_original_dists,
+                    out=np.full_like(batch_embedded_dists, np.nan),
+                    where=batch_original_dists != 0,
+                )
+                
+                col_zeta = np.nanmax(disc_lip_const, axis=1) / (
+                    np.nanmin(disc_lip_const, axis=1) + 1e-12
+                )
+
+                improved = col_zeta < zeta_min[points_to_reconsider]
+                improved_pts = points_to_reconsider[improved]
+                best_col[improved_pts] = i
+                zeta_min[improved_pts] = col_zeta[improved]
 
             param_changed = np.where(zeta > zeta_min)[0]
             future_use_phi_of[param_changed] = future_use_phi_of[

From a46dceb5e7ea8690ca28072434bfaab2be823d9e Mon Sep 17 00:00:00 2001
From: Joshua Offergeld <joshua.offergeld@ru.nl>
Date: Sun, 12 Apr 2026 10:13:28 +0200
Subject: [PATCH 31/44] feat: add global distortion measurement

---
 examples/kleinbottle_example.ipynb |  34 +++-
 src/pyRATS/global_distortion.py    | 283 +++++++++++++++++++++++++++++
 2 files changed, 307 insertions(+), 10 deletions(-)
 create mode 100644 src/pyRATS/global_distortion.py

diff --git a/examples/kleinbottle_example.ipynb b/examples/kleinbottle_example.ipynb
index 8723479..6b34a91 100644
--- a/examples/kleinbottle_example.ipynb
+++ b/examples/kleinbottle_example.ipynb
@@ -7,7 +7,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install git+https://github.com/Mishne-Lab/pyRATS\n",
+    "# %pip install git+https://github.com/Mishne-Lab/pyRATS\n",
     "# if not already installed on the system:\n",
     "# %pip install pandas imageio seaborn matplotlib"
    ]
@@ -19,9 +19,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import sys\n",
+    "sys.path.insert(0, '../src/')\n",
+    "\n",
     "from matplotlib import pyplot as plt\n",
     "\n",
-    "from pyRATS import rats\n",
+    "from pyRATS import rats, global_distortion\n",
     "import datasets, vis"
    ]
   },
@@ -69,6 +72,25 @@
     "y"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0fdf51f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_nbrs = global_distortion.get_n_nbrs_for_distortion_analysis(X)\n",
+    "\n",
+    "global_distortion.compute_global_distortion(\n",
+    "    X,\n",
+    "    y,\n",
+    "    n_nbrs,\n",
+    "    buml_obj,\n",
+    "    metric=\"euclidean\",\n",
+    "    max_crossings=20,\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -90,14 +112,6 @@
     ")\n",
     "plt.show()"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c9b56299",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/src/pyRATS/global_distortion.py b/src/pyRATS/global_distortion.py
new file mode 100644
index 0000000..8f715a4
--- /dev/null
+++ b/src/pyRATS/global_distortion.py
@@ -0,0 +1,283 @@
+from scipy.sparse import csr_matrix
+from scipy.sparse.csgraph import shortest_path
+from sklearn.neighbors import NearestNeighbors
+from pyRATS._tear_coloring import compute_tear_graph
+import numpy as np
+from joblib import Parallel, delayed
+
+
+def _shortest_paths(
+    X,
+    k,
+    metric="euclidean",
+):
+    nbrs = NearestNeighbors(n_neighbors=k, metric=metric).fit(X)
+    knn_graph = nbrs.kneighbors_graph(mode="distance")
+
+    return shortest_path(
+        knn_graph,
+        return_predecessors=False,
+        directed=False,
+    )
+
+
+def _compute_tear_aware_shortest_path_distances(
+    param,
+    y,
+    Utilde,
+    n_Utilde_Utilde,
+    Utildeg,
+    C,
+    c,
+    k,
+    nu,
+    metric="euclidean",
+    max_crossings=5,
+    tol=1e-6,
+    dist=None,
+    n_batches=None,
+    debug=False,
+    n_jobs=-1,
+):
+    if n_batches is None:
+        n_batches = n_jobs
+    if dist is None:
+        dist = _shortest_paths(y, n_nbrs=k, metric=metric, return_predqecessors=False)
+
+    output = compute_tear_graph(
+        y,
+        Utilde,
+        C,
+        n_Utilde_Utilde,
+        k,
+        nu,
+        metric,
+        False,
+        n_jobs,
+        i_mat_in_emb=Utildeg,
+    )
+    if output is None:
+        return dist
+
+    pts_across_tear, tear_graph = output
+    tear_graph = tear_graph.tocoo()
+    tear_graph_row_inds = tear_graph.row
+    tear_graph_col_inds = tear_graph.col
+    tear_graph_with_weights = _add_weights_to_tear_graph(
+        param, tear_graph, pts_across_tear, c
+    )
+    dist_of_pts_across_tear = tear_graph_with_weights.data
+    tear_graph_row_inds = pts_across_tear[tear_graph_row_inds]
+    tear_graph_col_inds = pts_across_tear[tear_graph_col_inds]
+
+    # make shortcuts in dist
+    for k in range(len(tear_graph_row_inds)):
+        dist[tear_graph_row_inds[k], tear_graph_col_inds[k]] = dist_of_pts_across_tear[
+            k
+        ]
+
+    final_dist = _recompute_dist_using_tear(
+        dist,
+        pts_across_tear,
+        max_crossings=max_crossings,
+        n_batches=n_batches,
+        tol=tol,
+        debug=debug,
+    )
+
+    return final_dist
+
+
+def _add_weights_to_tear_graph(
+    param, tear_graph, pts_across_tear, cluster_label, n_batches=8
+):
+    tear_graph = tear_graph.copy().tocoo()
+    tear_graph_row_inds = tear_graph.row
+    tear_graph_col_inds = tear_graph.col
+    n_edges = len(tear_graph_row_inds)
+    dist_of_pts_across_tear = np.zeros(n_edges)
+
+    def process_(start_ind, end_ind):
+        temp = np.zeros(end_ind - start_ind) + np.inf
+        for k in range(start_ind, end_ind):
+            r_i = tear_graph_row_inds[k]
+            c_i = tear_graph_col_inds[k]
+            edge_i = pts_across_tear[r_i]
+            edge_j = pts_across_tear[c_i]
+            # for view_ind in view_cont_pts_across_tear[(edge_i, edge_j)]:
+            for view_ind in [cluster_label[edge_i], cluster_label[edge_j]]:
+                local_coords = param.eval_(
+                    {"data_mask": [edge_i, edge_j], "view_index": view_ind}
+                )
+                temp[k - start_ind] = min(
+                    temp[k - start_ind],
+                    np.linalg.norm(local_coords[0, :] - local_coords[1, :]),
+                )
+        return temp
+
+    chunk_sz = n_edges // n_batches
+    start_end_inds = []
+    for i in range(n_batches):
+        start_ind = i * chunk_sz
+        if i < n_batches - 1:
+            end_ind = start_ind + chunk_sz
+        else:
+            end_ind = n_edges
+        start_end_inds.append((start_ind, end_ind))
+
+    results = Parallel(n_jobs=n_batches)(
+        delayed(process_)(a, b) for a, b in start_end_inds
+    )
+    dist_of_pts_across_tear = np.concatenate(results)
+
+    return csr_matrix(
+        (dist_of_pts_across_tear, (tear_graph_row_inds, tear_graph_col_inds)),
+        shape=tear_graph.shape,
+    )
+
+
+def _recompute_dist_using_tear(
+    dist, pts_across_tear, max_crossings=20, n_batches=32, tol=1e-6, debug=False
+):
+    if n_batches < 1:
+        n_batches = 5
+
+    n = dist.shape[0]
+    if debug:
+        dists = [dist.copy()]
+    for i_crossing in range(max_crossings):
+        old_dist = dist.copy()
+        dist_from_pts_across_tear = dist[pts_across_tear, :].T  # n x n_pts_across_tear
+
+        def process_(dist, start_ind, end_ind):
+            dist_ = np.zeros((end_ind - start_ind, n))
+            for i_ in range(start_ind, end_ind):
+                i = i_ - start_ind
+                dist_[i, :] = np.minimum(
+                    dist[i, :],
+                    np.minimum(
+                        dist[i, :],
+                        np.min(
+                            dist_from_pts_across_tear[i_, :][:, None]
+                            + dist_from_pts_across_tear.T,
+                            axis=0,
+                        ),
+                    ),
+                )
+            return dist_
+
+        start_end_inds = []
+        chunk_sz = n // n_batches
+        for j in range(n_batches):
+            start_ind = j * chunk_sz
+            if j < n_batches - 1:
+                end_ind = start_ind + chunk_sz
+            else:
+                end_ind = n
+            start_end_inds.append((start_ind, end_ind))
+        results = Parallel(n_jobs=n_batches)(
+            delayed(process_)(dist.copy()[a:b, :], a, b) for a, b in start_end_inds
+        )
+
+        for i in range(len(results)):
+            start_ind, end_ind = start_end_inds[i]
+            dist[start_ind:end_ind, :] = results[i]
+
+        if debug:
+            dists.append(dist.copy())
+
+        mean_abs_rel_diff = np.ma.masked_invalid(
+            np.abs(dist - old_dist) / (old_dist + 1e-12)
+        ).mean()
+
+        if mean_abs_rel_diff < tol:
+            break
+
+    return dist
+
+
+def _compute_distortion_at(y_d_e, s_d_e):
+    scale_factors = (y_d_e + 1e-12) / (s_d_e + 1e-12)
+    mask = np.ones(scale_factors.shape, dtype=bool)
+    np.fill_diagonal(mask, 0)
+    max_distortion = np.max(scale_factors[mask]) / np.min(scale_factors[mask])
+    n = y_d_e.shape[0]
+    distortion_at = np.zeros(n)
+    mask = np.ones(n, dtype=bool)
+    for i in range(n):
+        mask[i] = 0
+        distortion_at[i] = np.max(scale_factors[i, mask]) / np.min(
+            scale_factors[i, mask]
+        )
+        mask[i] = 1
+    return distortion_at, max_distortion
+
+
+def get_n_nbrs_for_distortion_analysis(
+    X,
+    n_nbrs_list=list(range(5, 16)),
+    tol=1e-3,
+    persistence=3,
+    metric="euclidean",
+):
+
+    gt_dist_list = [_shortest_paths(X, k, metric=metric) for k in n_nbrs_list]
+
+    spd_rel_mean_abs_diff = []
+    for i in range(1, len(gt_dist_list)):
+        temp = np.mean(
+            np.abs(
+                (gt_dist_list[i] - gt_dist_list[i - 1]) / (gt_dist_list[i - 1] + 1e-12)
+            )
+        )
+        spd_rel_mean_abs_diff.append(temp)
+
+    while True:
+        temp = np.array(spd_rel_mean_abs_diff) < tol
+        if np.sum(temp) == 0:
+            tol = 2 * tol
+        else:
+            temp2 = np.zeros(len(temp) - persistence + 1, dtype=bool)
+            for i in range(len(temp) - persistence + 1):
+                if np.prod(temp[i : i + persistence]):
+                    temp2[i] = 1
+            if np.sum(temp2) == 0:
+                tol = 2 * tol
+            else:
+                n_nbrs_ind = np.argmax(temp2)
+                break
+
+    return n_nbrs_list[n_nbrs_ind]
+
+
+def compute_global_distortion(
+    X,
+    y,
+    n_nbrs,
+    model,
+    metric="euclidean",
+    max_crossings=20,
+):
+
+    gt_dist = _shortest_paths(X, n_nbrs, metric=metric)
+
+    emb_dist = _shortest_paths(y, n_nbrs)
+
+    if model.to_tear:
+        emb_dist = _compute_tear_aware_shortest_path_distances(
+            model.param,
+            y,
+            model.Utilde,
+            model.n_Utilde_Utilde,
+            model.Utildeg,
+            model.C,
+            model.c,
+            n_nbrs,
+            model.nu,
+            metric,
+            max_crossings=max_crossings,
+            dist=emb_dist,
+            n_jobs=model.n_jobs,
+        )
+
+    return _compute_distortion_at(emb_dist, gt_dist)

From 293c317e7a8c224dbd6625b13a39061a20c84efd Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Sun, 3 May 2026 14:28:39 +0200
Subject: [PATCH 32/44] Fix for the breakages of parallelization in batched
 calls

---
 src/pyRATS/_utils.py | 95 ++++++++++++++++++++++++++------------------
 1 file changed, 57 insertions(+), 38 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index 8305818..19b0181 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -6,7 +6,7 @@
 from scipy.sparse import csr_matrix, triu, block_diag, diags
 import itertools
 
-from joblib import delayed, Parallel
+from joblib import delayed, Parallel, parallel_backend
 
 from scipy.spatial.distance import pdist, squareform
 from sklearn.utils.extmath import svd_flip
@@ -20,30 +20,33 @@
 import os
 
 def _get_available_memory(n_jobs=1):
-    """Returns the available memory in bytes, scaled by n_jobs.
-    
-    Checks for PYRATS_MEMORY_LIMIT environment variable, then tries to use psutil,
-    and finally falls back to a 4GB default (divided by n_jobs).
+    """Returns the per-worker memory budget in bytes.
+
+    Used by chunking code paths to size buffers safely. When the caller is
+    running inside ``n_jobs`` concurrent workers each holding their own
+    buffers, pass ``n_jobs`` so the budget is divided accordingly. When
+    called from a serial code path, leave ``n_jobs=1``.
+
+    Sources, in priority order:
+      1. PYRATS_MEMORY_LIMIT environment variable (total available bytes)
+      2. psutil's available system RAM, scaled by 0.75
+      3. 4GB fallback
     """
-    # 1. Environment variable override
     limit = os.environ.get("PYRATS_MEMORY_LIMIT")
     if limit:
         try:
-            return int(limit)
+            return int(limit) // max(1, n_jobs)
         except ValueError:
             pass
-            
-    # 2. Try psutil for dynamic discovery
-    # Cache the result to avoid repeated system calls in tight loops
+
     if not hasattr(_get_available_memory, "_cached_val"):
         try:
             import psutil
-            # Use a fraction of available RAM as a safe upper bound for our buffer.
             _get_available_memory._cached_val = int(psutil.virtual_memory().available * 0.75)
         except (ImportError, AttributeError):
-            _get_available_memory._cached_val = 4 * 1024 * 1024 * 1024 # 4GB fallback
+            _get_available_memory._cached_val = 4 * 1024 * 1024 * 1024
 
-    return _get_available_memory._cached_val // n_jobs
+    return _get_available_memory._cached_val // max(1, n_jobs)
 
 
 def lpca(X, d, U, n_jobs, verbose=False):
@@ -128,12 +131,17 @@ def target_proc(p_num, chunk_sz):
     if n_jobs == 1 or n < 1000: # Threshold for Parallel overhead
         results = [target_proc(0, n)]
     else:
-        results = Parallel(n_jobs=n_jobs)(
-            delayed(target_proc)(i, chunk_sz)
-            for i in tqdm(
-                range(n_jobs), desc="PCA", unit="chunk", leave=False, disable=not verbose
+        # inner_max_num_threads=1: each loky worker would otherwise fork its
+        # own BLAS pool sized to all cores, causing n_jobs**2 thread contention
+        # which produces a slowdown past n_jobs ~ ncores/2. Capping the inner
+        # pool to 1 keeps scaling monotonic.
+        with parallel_backend("loky", inner_max_num_threads=1):
+            results = Parallel(n_jobs=n_jobs)(
+                delayed(target_proc)(i, chunk_sz)
+                for i in tqdm(
+                    range(n_jobs), desc="PCA", unit="chunk", leave=False, disable=not verbose
+                )
             )
-        )
 
     for i in range(len(results)):
         start_ind, end_ind, Psi_, mu_, var_explained_, n_pc_dir_chosen_ = results[i]
@@ -212,12 +220,13 @@ def target_proc(p_num, chunk_sz):
         return start_ind, end_ind, model_
 
     chunk_sz = int(n / n_jobs)
-    results = Parallel(n_jobs=n_jobs)(
-        delayed(target_proc)(p_num, chunk_sz)
-        for p_num in tqdm(
-            range(n_jobs), desc="KPCA", unit="chunk", leave=False, disable=not verbose
+    with parallel_backend("loky", inner_max_num_threads=1):
+        results = Parallel(n_jobs=n_jobs)(
+            delayed(target_proc)(p_num, chunk_sz)
+            for p_num in tqdm(
+                range(n_jobs), desc="KPCA", unit="chunk", leave=False, disable=not verbose
+            )
         )
-    )
 
     for start_ind, end_ind, model_ in results:
         local_param.model[start_ind:end_ind] = model_
@@ -730,16 +739,23 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
         if n_jobs == 1 or n < 1000:
             results = [target_proc(0, n, n, Utilde, n_C, c)]
         else:
-            results = Parallel(n_jobs=n_jobs)(
-                delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c)
-                for p_num in tqdm(
-                    range(n_jobs),
-                    desc=f"eta={eta}",
-                    unit="chunk",
-                    leave=False,
-                    disable=not verbose,
+            # inner_max_num_threads=1: each loky worker would otherwise fork its
+            # own BLAS pool. With Apple Accelerate / OpenBLAS sized to all cores,
+            # n_jobs workers x n_cores BLAS threads = n_jobs**2 software threads
+            # contending for n_cores hardware threads, which produces the
+            # 4 -> 8 jobs slowdown. Capping the inner pool to 1 keeps scaling
+            # monotonic.
+            with parallel_backend("loky", inner_max_num_threads=1):
+                results = Parallel(n_jobs=n_jobs)(
+                    delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c)
+                    for p_num in tqdm(
+                        range(n_jobs),
+                        desc=f"eta={eta}",
+                        unit="chunk",
+                        leave=False,
+                        disable=not verbose,
+                    )
                 )
-            )
         for start_ind, end_ind, cost_, dest_ in results:
             cost[start_ind:end_ind] = cost_
             dest[start_ind:end_ind] = dest_
@@ -896,9 +912,11 @@ def target_proc(p_num):
     if n_jobs == 1 or n_elem < 1000:
         res = [target_proc(p_num) for p_num in range(n_jobs)]
     else:
-        res = Parallel(n_jobs=n_jobs, return_as="generator_unordered")(
-            delayed(target_proc)(p_num) for p_num in range(n_jobs)
-        )
+        with parallel_backend("loky", inner_max_num_threads=1):
+            res = list(Parallel(n_jobs=n_jobs,
+                                return_as="generator_unordered")(
+                delayed(target_proc)(p_num) for p_num in range(n_jobs)
+            ))
     for value in res:
         start_ind, end_ind, overlap_svals_ = value
         overlap_svals[start_ind:end_ind, :] = overlap_svals_
@@ -1616,7 +1634,9 @@ def batched_eval_(self, view_index, data_mask, n_jobs=1):
             List of points to map to the embedding dimension for each point in 'view_index'
 
         n_jobs : int, default=1
-            Number of parallel jobs. Used to scale memory limits.
+            Number of concurrent workers calling this method. Used only to
+            divide the chunking memory budget so each worker stays within
+            its safe share of available RAM. Pass 1 from serial code paths.
         """
 
         ks = view_index
@@ -1627,8 +1647,7 @@ def batched_eval_(self, view_index, data_mask, n_jobs=1):
             n_eval = len(ks)
             n_neighbors = masks.shape[1]
             n_features = self.X.shape[1]
-            
-            # Calculate chunk size based on available memory
+
             memory_limit = _get_available_memory(n_jobs=n_jobs)
             itemsize = self.X.dtype.itemsize
             # Target buffer size per chunk: (chunk_size * n_neighbors * n_features * itemsize)

From 5369ea34b665adcc7c530879358fbd586f02c74b Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Sun, 3 May 2026 14:49:26 +0200
Subject: [PATCH 33/44] Memory Limiting Overhaul Check

---
 src/pyRATS/_tear_coloring.py    |  13 +-
 src/pyRATS/_utils.py            | 225 +++++++++++++++++++++--------
 src/pyRATS/rats.py              |  74 +++++-----
 tests/scripts/thread_scaling.py |  71 +++++++++
 tests/test_memory.py            | 248 ++++++++++++++++++++++++++++++++
 5 files changed, 526 insertions(+), 105 deletions(-)
 create mode 100644 tests/scripts/thread_scaling.py
 create mode 100644 tests/test_memory.py

diff --git a/src/pyRATS/_tear_coloring.py b/src/pyRATS/_tear_coloring.py
index bd43b33..ddd3661 100644
--- a/src/pyRATS/_tear_coloring.py
+++ b/src/pyRATS/_tear_coloring.py
@@ -3,7 +3,7 @@
 from scipy.sparse.csgraph import connected_components, laplacian
 import scipy
 
-from joblib import delayed, Parallel
+from joblib import delayed, Parallel, parallel_backend
 from tqdm import tqdm
 
 from pyRATS import _utils
@@ -262,10 +262,13 @@ def process_pairs(ij_pairs):
     else:
         ij_pairs_batches = [(views_across_tear_graph_row, views_across_tear_graph_col)]
 
-    # Run in parallel
-    results = Parallel(n_jobs=n_jobs)(
-        delayed(process_pairs)(ij_pairs) for ij_pairs in ij_pairs_batches
-    )
+    # Run in parallel; cap inner BLAS threads so each loky worker doesn't
+    # spawn its own multi-threaded BLAS pool, which would cause n_jobs**2
+    # thread oversubscription against ncores hardware threads.
+    with parallel_backend("loky", inner_max_num_threads=1):
+        results = Parallel(n_jobs=n_jobs)(
+            delayed(process_pairs)(ij_pairs) for ij_pairs in ij_pairs_batches
+        )
 
     # Aggregate results
     tear_graph_row_inds = np.concatenate([r for r, _, _ in results if len(r)])
diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index 19b0181..a103ce7 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -18,35 +18,80 @@
 from tqdm import tqdm
 
 import os
+import sys
+import warnings
+
+_MIN_MEMORY_FLOOR = 64 * 1024 * 1024  # 64 MB — always make some progress
+_FALLBACK_MEMORY = 4 * 1024 * 1024 * 1024  # 4 GB if psutil is missing
+
+
+def _cgroup_available_bytes():
+    """Linux-only: bytes still available inside the current cgroup, or None.
+
+    Reads cgroup v2 first, then falls back to v1. Returns None on any other
+    OS, when no limit is set, or when the files can't be parsed.
+    """
+    if not sys.platform.startswith("linux"):
+        return None
+    pairs = (
+        ("/sys/fs/cgroup/memory.max", "/sys/fs/cgroup/memory.current"),
+        ("/sys/fs/cgroup/memory/memory.limit_in_bytes",
+         "/sys/fs/cgroup/memory/memory.usage_in_bytes"),
+    )
+    for max_path, cur_path in pairs:
+        try:
+            with open(max_path) as f:
+                raw = f.read().strip()
+            if not raw or raw == "max":
+                return None
+            limit = int(raw)
+            # cgroup v1 uses a huge sentinel when unconstrained.
+            if limit >= (1 << 62):
+                return None
+            with open(cur_path) as f:
+                used = int(f.read().strip())
+            return max(0, limit - used)
+        except (OSError, ValueError):
+            continue
+    return None
 
-def _get_available_memory(n_jobs=1):
-    """Returns the per-worker memory budget in bytes.
 
-    Used by chunking code paths to size buffers safely. When the caller is
-    running inside ``n_jobs`` concurrent workers each holding their own
-    buffers, pass ``n_jobs`` so the budget is divided accordingly. When
-    called from a serial code path, leave ``n_jobs=1``.
+def _available_memory_bytes():
+    """Live probe of usable memory in bytes.
 
-    Sources, in priority order:
-      1. PYRATS_MEMORY_LIMIT environment variable (total available bytes)
-      2. psutil's available system RAM, scaled by 0.75
-      3. 4GB fallback
+    Sources, all combined via min():
+      * psutil.virtual_memory().available scaled by 0.75 (or 4GB fallback)
+      * cgroup v2/v1 remaining quota on Linux containers (avoids OOM-kill
+        before psutil sees pressure)
+      * PYRATS_MEMORY_LIMIT env var as a hard cap
+
+    Floored at 64MB so chunking can always make at least one row of progress.
+    Called per request; do not cache — pyRATS allocates large internal state
+    during a fit, and a stale snapshot is the root cause of the prior OOMs.
     """
-    limit = os.environ.get("PYRATS_MEMORY_LIMIT")
-    if limit:
+    try:
+        import psutil
+        avail = int(psutil.virtual_memory().available * 0.75)
+    except (ImportError, AttributeError):
+        avail = _FALLBACK_MEMORY
+
+    cgroup = _cgroup_available_bytes()
+    if cgroup is not None:
+        avail = min(avail, int(cgroup * 0.75))
+
+    user_cap = os.environ.get("PYRATS_MEMORY_LIMIT")
+    if user_cap:
         try:
-            return int(limit) // max(1, n_jobs)
+            avail = min(avail, int(user_cap))
         except ValueError:
             pass
 
-    if not hasattr(_get_available_memory, "_cached_val"):
-        try:
-            import psutil
-            _get_available_memory._cached_val = int(psutil.virtual_memory().available * 0.75)
-        except (ImportError, AttributeError):
-            _get_available_memory._cached_val = 4 * 1024 * 1024 * 1024
+    return max(avail, _MIN_MEMORY_FLOOR)
 
-    return _get_available_memory._cached_val // max(1, n_jobs)
+
+# Backward-compatible alias for any external caller pinned to the old name.
+def _get_available_memory(n_jobs=1):
+    return _available_memory_bytes() // max(1, n_jobs)
 
 
 def lpca(X, d, U, n_jobs, verbose=False):
@@ -1622,74 +1667,104 @@ def __init__(self, algo="lpca", **kwargs):
         self.add_dim = False
         self.standardize = False
 
-    def batched_eval_(self, view_index, data_mask, n_jobs=1):
-        """Maps multiple points to the new space through their local (K-)PCA prameterizations.
+    def iter_eval_(self, view_index, data_mask, peak_bytes_per_row=0):
+        """Stream the local-(K-)PCA embedding in memory-bounded chunks.
+
+        Yields ``(slice, chunk)`` pairs where ``chunk`` is the embedded
+        sub-array for ``view_index[slice]`` / ``data_mask[slice]``. Lets
+        callers reduce per chunk (e.g. compute pdists, take a max) without
+        ever materializing the full ``(n_eval, k, d)`` output.
 
         Parameters
         ------
-        view_index : array-like, shape (n_points)
-            The indices of the parameterization to use.
-
+        view_index : array-like, shape (n_points,)
         data_mask : array-like, shape (n_points, n_neighbors)
-            List of points to map to the embedding dimension for each point in 'view_index'
-
-        n_jobs : int, default=1
-            Number of concurrent workers calling this method. Used only to
-            divide the chunking memory budget so each worker stays within
-            its safe share of available RAM. Pass 1 from serial code paths.
+        peak_bytes_per_row : int, default=0
+            Caller-provided estimate of additional bytes the downstream
+            pipeline will allocate per row (e.g. batched_pdist's diff
+            buffer). Folded into the chunk-size budget so the *whole*
+            pipeline stays within available memory, not just this method.
+
+        On MemoryError the chunk size is halved and the slice is retried;
+        a one-time RuntimeWarning is emitted so the user knows they're at
+        the limit.
         """
-
-        ks = view_index
+        ks = np.asarray(view_index) if not isinstance(view_index, np.ndarray) else view_index
         masks = data_mask
+        n_eval = len(ks)
+        if n_eval == 0:
+            return
 
         if self.algo == "lpca":
-            # Dynamic chunking to prevent memory spikes on high-dimensional data
-            n_eval = len(ks)
             n_neighbors = masks.shape[1]
             n_features = self.X.shape[1]
-
-            memory_limit = _get_available_memory(n_jobs=n_jobs)
+            d = self.Psi.shape[2]
             itemsize = self.X.dtype.itemsize
-            # Target buffer size per chunk: (chunk_size * n_neighbors * n_features * itemsize)
-            chunk_size = max(1, memory_limit // (n_neighbors * n_features * itemsize))
-            
-            if n_eval <= chunk_size:
-                # fits in memory, process all at once
-                temp = np.matmul(
-                    self.X[masks] - self.mu[ks][:, np.newaxis, :], self.Psi[ks]
-                )
-            else:
-                # split into chunks
-                temp = np.zeros((n_eval, n_neighbors, self.Psi.shape[2]))
-                for start in range(0, n_eval, chunk_size):
-                    end = min(start + chunk_size, n_eval)
-                    ks_chunk = ks[start:end]
-                    masks_chunk = masks[start:end]
-                    temp[start:end] = np.matmul(
-                        self.X[masks_chunk] - self.mu[ks_chunk][:, np.newaxis, :], 
-                        self.Psi[ks_chunk]
+            # Per-row peak inside this method: input gather + output buffer.
+            own_per_row = (n_neighbors * n_features + n_neighbors * d) * itemsize
+            per_row = max(1, own_per_row + max(0, int(peak_bytes_per_row)))
+
+            chunk_size = max(1, _available_memory_bytes() // per_row)
+            chunk_size = min(chunk_size, n_eval)
+
+            warned = False
+            start = 0
+            while start < n_eval:
+                end = min(start + chunk_size, n_eval)
+                sl = slice(start, end)
+                ks_chunk = ks[sl]
+                masks_chunk = masks[sl]
+                try:
+                    chunk = np.matmul(
+                        self.X[masks_chunk] - self.mu[ks_chunk][:, np.newaxis, :],
+                        self.Psi[ks_chunk],
                     )
-            n = self.X.shape[0]
+                except MemoryError:
+                    if chunk_size <= 1:
+                        raise
+                    chunk_size = max(1, chunk_size // 2)
+                    if not warned:
+                        warnings.warn(
+                            "pyRATS: hit MemoryError during local embedding; "
+                            f"halving chunk size to {chunk_size} and retrying. "
+                            "Consider lowering n_neighbors or setting "
+                            "PYRATS_MEMORY_LIMIT.",
+                            RuntimeWarning,
+                            stacklevel=2,
+                        )
+                        warned = True
+                    continue
+                chunk = self._apply_post_(ks_chunk, masks_chunk, chunk)
+                yield sl, chunk
+                start = end
         else:
-            temp = []
+            # KPCA path is already row-by-row; yield singletons so callers
+            # have a uniform streaming interface.
             for i, k in enumerate(ks):
                 X_ = self.X[masks[i], :]
                 if self.standardize:
                     X_ = X_ - np.mean(X_, axis=0)[None, :]
                     X_ = X_ / (np.std(X_, axis=0, ddof=1)[None, :] + 1e-12)
-                temp.append(self.model[k].transform(X_))
-            temp = np.array(temp)
+                chunk = self.model[k].transform(X_)[None, ...]
+                ks_chunk = ks[i:i + 1]
+                masks_chunk = masks[i:i + 1]
+                chunk = self._apply_post_(ks_chunk, masks_chunk, chunk)
+                yield slice(i, i + 1), chunk
+
+    def _apply_post_(self, ks, masks, temp):
+        """Apply noise / add_dim / b / T / v to a chunk of embedded points.
 
+        Factored out of batched_eval_ so iter_eval_ can apply the same
+        post-processing per chunk. Semantics match the original inline code.
+        """
         if self.noise_var:
             np.random.seed(self.noise_seed[0])
-            temp2 = np.random.normal(0, self.noise_var, (n, temp.shape[2]))
+            temp2 = np.random.normal(0, self.noise_var, (self.X.shape[0], temp.shape[2]))
             temp = temp + temp2[masks, :]
         if self.noise is not None:
             temp = temp + self.noise[masks]
-
         if self.add_dim:
             temp = np.concatenate([temp, np.zeros((*temp.shape[:2], 1))], axis=2)
-
         if self.b is not None:
             temp = temp * self.b[ks][:, None, None]
             if self.T is not None:
@@ -1698,6 +1773,32 @@ def batched_eval_(self, view_index, data_mask, n_jobs=1):
                 temp = temp + self.v[[ks], :]
         return temp
 
+    def batched_eval_(self, view_index, data_mask, n_jobs=1):
+        """Materialize the full local-(K-)PCA embedding.
+
+        Thin wrapper around iter_eval_ for callers that need the whole
+        ``(n_eval, n_neighbors, d)`` array (e.g. clustering's procrustes
+        cost). Memory-bounded internally via iter_eval_'s chunking and
+        MemoryError backoff.
+
+        ``n_jobs`` is accepted for backward compatibility and ignored;
+        the live memory probe in iter_eval_ already reflects whatever
+        sibling workers have allocated.
+        """
+        ks = view_index
+        masks = data_mask
+        n_eval = len(ks)
+        if n_eval == 0:
+            d = self.Psi.shape[2] if self.algo == "lpca" else 0
+            return np.zeros((0, masks.shape[1] if hasattr(masks, "shape") else 0, d))
+
+        out = None
+        for sl, chunk in self.iter_eval_(ks, masks):
+            if out is None:
+                out = np.empty((n_eval, chunk.shape[1], chunk.shape[2]), dtype=chunk.dtype)
+            out[sl] = chunk
+        return out
+
     def eval_(self, view_index, data_mask, apply_b=True):
         """Maps points to the new space through their local (K-)PCA prameterizations.
 
diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index 886bb9c..bef7e1c 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -609,19 +609,26 @@ def _postprocess(self):
             self.neighborhood_graph_sym[row_idx, col_idx]
         ).reshape(n, num_pairs)
 
-        embedded_datapoints = self.param.batched_eval_(
-            range(n), self.U.indices.reshape(n, self.k)
-        )  # tested and they are equivalent
-        embedded_dists = batched_pdist(embedded_datapoints)
-        disc_lip_const = np.divide(
-            embedded_dists,
-            original_dists,
-            out=np.full_like(embedded_dists, np.nan),
-            where=original_dists != 0,
-        )
-        zeta = np.nanmax(disc_lip_const, axis=1) / (
-            np.nanmin(disc_lip_const, axis=1) + 1e-12
-        )
+        # Per-row peak for the downstream batched_pdist: a (chunk, num_pairs, d)
+        # diff buffer plus its (chunk, num_pairs) result. Folded into the
+        # iter_eval_ chunk-size budget so the whole pipeline stays bounded.
+        d_embed = self.param.Psi.shape[2] if self.param.algo == "lpca" else self.d
+        itemsize = self.param.X.dtype.itemsize
+        pdist_per_row = num_pairs * (d_embed + 1) * itemsize
+
+        zeta = np.empty(n)
+        view_idx = np.arange(n, dtype=int)
+        masks_init = self.U.indices.reshape(n, self.k)
+        for sl, chunk in self.param.iter_eval_(view_idx, masks_init,
+                                               peak_bytes_per_row=pdist_per_row):
+            chunk_dists = batched_pdist(chunk)
+            chunk_orig = original_dists[sl]
+            dlc = np.divide(
+                chunk_dists, chunk_orig,
+                out=np.full_like(chunk_dists, np.nan),
+                where=chunk_orig != 0,
+            )
+            zeta[sl] = np.nanmax(dlc, axis=1) / (np.nanmin(dlc, axis=1) + 1e-12)
 
         self.param.zeta = zeta
 
@@ -642,15 +649,8 @@ def _postprocess(self):
                 & (connectivity_matrix != np.arange(n)[:, None])
             )
 
-            # Revert to serial loop for column checks.
-            # Local empirical tests show that Parallel overhead here outweighs the
-            # benefit for small k (neighbors), especially since this is called
-            # inside a convergence loop.
             zeta_min = np.full(n, np.inf)
             best_col = np.zeros(n, dtype=int)
-            
-            # Cache the number of jobs for batched_eval_ memory scaling
-            n_jobs = self.n_jobs if self.n_jobs > 0 else 1
 
             for i in range(connectivity_matrix.shape[1]):
                 col_mask = reconsider_mask[:, i]
@@ -660,26 +660,24 @@ def _postprocess(self):
 
                 use_phi_of = connectivity_matrix[points_to_reconsider, i]
                 neighbors_of_points = connectivity_matrix[points_to_reconsider]
-
-                # batched_eval_ now uses cached memory limits to reduce overhead
-                batch_embedded_datapoints = self.param.batched_eval_(
-                    future_use_phi_of[use_phi_of], 
-                    neighbors_of_points, 
-                    n_jobs=n_jobs
-                )
-                batch_embedded_dists = batched_pdist(batch_embedded_datapoints)
                 batch_original_dists = original_dists[points_to_reconsider]
 
-                disc_lip_const = np.divide(
-                    batch_embedded_dists,
-                    batch_original_dists,
-                    out=np.full_like(batch_embedded_dists, np.nan),
-                    where=batch_original_dists != 0,
-                )
-                
-                col_zeta = np.nanmax(disc_lip_const, axis=1) / (
-                    np.nanmin(disc_lip_const, axis=1) + 1e-12
-                )
+                col_zeta = np.empty(len(points_to_reconsider))
+                for sl, chunk in self.param.iter_eval_(
+                    future_use_phi_of[use_phi_of],
+                    neighbors_of_points,
+                    peak_bytes_per_row=pdist_per_row,
+                ):
+                    chunk_dists = batched_pdist(chunk)
+                    chunk_orig = batch_original_dists[sl]
+                    dlc = np.divide(
+                        chunk_dists, chunk_orig,
+                        out=np.full_like(chunk_dists, np.nan),
+                        where=chunk_orig != 0,
+                    )
+                    col_zeta[sl] = np.nanmax(dlc, axis=1) / (
+                        np.nanmin(dlc, axis=1) + 1e-12
+                    )
 
                 improved = col_zeta < zeta_min[points_to_reconsider]
                 improved_pts = points_to_reconsider[improved]
diff --git a/tests/scripts/thread_scaling.py b/tests/scripts/thread_scaling.py
new file mode 100644
index 0000000..425e186
--- /dev/null
+++ b/tests/scripts/thread_scaling.py
@@ -0,0 +1,71 @@
+"""Measure how RATS.fit_transform scales with n_jobs on a kleinbottle.
+
+Usage:
+    python tests/scripts/thread_scaling.py [--n N] [--jobs 1 2 4 8] [--repeats R]
+
+Prints a small table of wall times and speedup vs. the n_jobs=1 baseline.
+"""
+
+import argparse
+import os
+import sys
+import time
+
+# Allow importing src and examples
+repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
+if repo_root not in sys.path:
+    sys.path.insert(0, repo_root)
+
+from pyRATS import rats
+from examples import datasets
+
+
+def run_once(X, n_jobs):
+    model = rats.RATS(
+        n_components=2,
+        n_neighbors=14,
+        cost_function="distortion",
+        min_cluster_size=5,
+        n_iter=3,
+        nu=4,
+        verbose=False,
+        n_jobs=n_jobs,
+    )
+    t0 = time.perf_counter()
+    model.fit_transform(X=X)
+    return time.perf_counter() - t0
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--n", type=int, default=2000,
+                        help="Klein bottle sample count (default 2000).")
+    parser.add_argument("--jobs", type=int, nargs="+", default=[1, 2, 4, 8],
+                        help="List of n_jobs values to benchmark.")
+    parser.add_argument("--repeats", type=int, default=2,
+                        help="Repeats per n_jobs (best time kept).")
+    args = parser.parse_args()
+
+    print(f"Loading kleinbottle4d with n={args.n}...")
+    ds = datasets.Datasets()
+    X, _, _ = ds.kleinbottle4d(n=args.n)
+
+    # Warmup so JIT / first-touch costs don't pollute the first measurement.
+    print("Warmup (n_jobs=1)...")
+    run_once(X, 1)
+
+    print(f"\n| n_jobs | best (s) | speedup | all runs (s)            |")
+    print(f"|--------|----------|---------|-------------------------|")
+    baseline = None
+    for nj in args.jobs:
+        runs = [run_once(X, nj) for _ in range(args.repeats)]
+        best = min(runs)
+        if baseline is None:
+            baseline = best
+        speedup = baseline / best
+        runs_str = ", ".join(f"{r:.2f}" for r in runs)
+        print(f"| {nj:6d} | {best:8.3f} | {speedup:7.2f} | {runs_str:23s} |")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_memory.py b/tests/test_memory.py
new file mode 100644
index 0000000..99d3dd9
--- /dev/null
+++ b/tests/test_memory.py
@@ -0,0 +1,248 @@
+"""Memory-management tests for pyRATS.
+
+Covers:
+  * The live ``_available_memory_bytes`` probe and PYRATS_MEMORY_LIMIT cap.
+  * ``iter_eval_`` chunks the workload when the budget is tight, and the
+    streamed output matches the fully-materialized ``batched_eval_`` output.
+  * A simulated MemoryError causes chunk-size backoff (with a warning).
+  * Peak Python-tracked allocations during a streamed run stay within the
+    declared budget.
+"""
+
+import os
+import warnings
+import tracemalloc
+
+import numpy as np
+import pytest
+
+from pyRATS import _utils
+from pyRATS._utils import Param, _available_memory_bytes, batched_pdist
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+def _make_lpca_param(n=400, n_features=32, k=12, d=2, seed=0):
+    """Build a self-contained lpca-style Param for testing iter_eval_.
+
+    Skips the real lpca() fit (which we don't need) and just stuffs in
+    well-shaped Psi/mu/X arrays so iter_eval_ has something to chew on.
+    """
+    rng = np.random.default_rng(seed)
+    p = Param(algo="lpca")
+    p.X = rng.standard_normal((n, n_features)).astype(np.float64)
+    p.mu = rng.standard_normal((n, n_features))
+    p.Psi = rng.standard_normal((n, n_features, d))
+    p.b = np.ones(n)
+    p.noise_var = 0
+    p.noise = None
+    return p, k
+
+
+@pytest.fixture
+def clean_env(monkeypatch):
+    """Ensure PYRATS_MEMORY_LIMIT doesn't leak between tests."""
+    monkeypatch.delenv("PYRATS_MEMORY_LIMIT", raising=False)
+    yield monkeypatch
+
+
+# ---------------------------------------------------------------------------
+# Probe
+# ---------------------------------------------------------------------------
+
+def test_available_memory_honors_env_var(clean_env):
+    cap = 128 * 1024 * 1024  # 128 MB
+    clean_env.setenv("PYRATS_MEMORY_LIMIT", str(cap))
+    assert _available_memory_bytes() <= cap
+
+
+def test_available_memory_floor(clean_env):
+    # A ridiculously small cap still gets clamped to the 64MB floor so
+    # chunking can always make progress.
+    clean_env.setenv("PYRATS_MEMORY_LIMIT", "1")
+    assert _available_memory_bytes() >= 64 * 1024 * 1024
+
+
+def test_available_memory_is_live(clean_env):
+    """Probe must not cache: changing the env var must change the result."""
+    clean_env.setenv("PYRATS_MEMORY_LIMIT", str(256 * 1024 * 1024))
+    first = _available_memory_bytes()
+    clean_env.setenv("PYRATS_MEMORY_LIMIT", str(128 * 1024 * 1024))
+    second = _available_memory_bytes()
+    assert second < first
+
+
+# ---------------------------------------------------------------------------
+# iter_eval_ chunking + streaming correctness
+# ---------------------------------------------------------------------------
+
+def test_iter_eval_yields_multiple_chunks_when_budget_tight(clean_env):
+    p, k = _make_lpca_param(n=300, k=12)
+    masks = np.tile(np.arange(k), (300, 1))
+    view_idx = np.arange(300)
+
+    # Force a tiny budget so we get many chunks. Floor is 64MB so we go
+    # under it via monkeypatch to actually exercise the path.
+    clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024)
+    clean_env.setenv("PYRATS_MEMORY_LIMIT", "8192")  # 8 KB
+
+    chunks = list(p.iter_eval_(view_idx, masks))
+    assert len(chunks) > 1, "expected chunking with a tight budget"
+
+    # Sum of chunk lengths must cover the input exactly once.
+    covered = sum(sl.stop - sl.start for sl, _ in chunks)
+    assert covered == 300
+
+
+def test_iter_eval_output_matches_batched_eval(clean_env):
+    """Chunked stream must reconstruct the same array as the unchunked path."""
+    p, k = _make_lpca_param(n=200, k=10)
+    masks = np.tile(np.arange(k), (200, 1))
+    view_idx = np.arange(200)
+
+    # Reference: large budget => one chunk.
+    clean_env.setenv("PYRATS_MEMORY_LIMIT", str(1 << 30))
+    full = p.batched_eval_(view_idx, masks)
+
+    # Streamed: tight budget => many chunks. Sum back and compare.
+    clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024)
+    clean_env.setenv("PYRATS_MEMORY_LIMIT", "16384")
+    streamed = np.empty_like(full)
+    nchunks = 0
+    for sl, chunk in p.iter_eval_(view_idx, masks):
+        streamed[sl] = chunk
+        nchunks += 1
+    assert nchunks > 1
+    np.testing.assert_allclose(streamed, full, rtol=0, atol=0)
+
+
+def test_peak_bytes_per_row_shrinks_chunks(clean_env):
+    """Caller-supplied downstream-cost estimate must shrink chunk size."""
+    p, k = _make_lpca_param(n=500, k=10)
+    masks = np.tile(np.arange(k), (500, 1))
+    view_idx = np.arange(500)
+
+    clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024)
+    clean_env.setenv("PYRATS_MEMORY_LIMIT", "65536")
+
+    no_hint = list(p.iter_eval_(view_idx, masks, peak_bytes_per_row=0))
+    with_hint = list(
+        p.iter_eval_(view_idx, masks, peak_bytes_per_row=10 * 1024)
+    )
+    assert len(with_hint) >= len(no_hint), (
+        "declaring downstream allocation should produce >= chunks"
+    )
+
+
+# ---------------------------------------------------------------------------
+# MemoryError backoff
+# ---------------------------------------------------------------------------
+
+def test_memory_error_triggers_backoff_and_warns(clean_env, monkeypatch):
+    p, k = _make_lpca_param(n=64, k=8)
+    masks = np.tile(np.arange(k), (64, 1))
+    view_idx = np.arange(64)
+
+    real_matmul = np.matmul
+    state = {"raised": False}
+
+    def fake_matmul(a, b, *args, **kwargs):
+        # Raise MemoryError exactly once on the first call, then defer to
+        # real matmul. Mimics an OS-level transient pressure event.
+        if not state["raised"] and a.shape[0] > 1:
+            state["raised"] = True
+            raise MemoryError("simulated pressure")
+        return real_matmul(a, b, *args, **kwargs)
+
+    monkeypatch.setattr(np, "matmul", fake_matmul)
+
+    with warnings.catch_warnings(record=True) as caught:
+        warnings.simplefilter("always")
+        out = p.batched_eval_(view_idx, masks)
+
+    assert state["raised"], "fake_matmul was never invoked"
+    assert out.shape == (64, k, p.Psi.shape[2])
+    assert any(
+        issubclass(w.category, RuntimeWarning) and "MemoryError" in str(w.message)
+        for w in caught
+    ), "expected a RuntimeWarning about MemoryError backoff"
+
+
+def test_memory_error_propagates_when_chunk_already_one(clean_env, monkeypatch):
+    """If chunk size is already 1, backoff has nowhere to go — must raise."""
+    p, k = _make_lpca_param(n=4, k=4)
+    masks = np.tile(np.arange(k), (4, 1))
+    view_idx = np.arange(4)
+
+    clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 16)
+    clean_env.setenv("PYRATS_MEMORY_LIMIT", "16")  # forces chunk_size = 1
+
+    def always_fail(a, b, *args, **kwargs):
+        raise MemoryError("cannot recover")
+
+    monkeypatch.setattr(np, "matmul", always_fail)
+
+    with pytest.raises(MemoryError):
+        list(p.iter_eval_(view_idx, masks))
+
+
+# ---------------------------------------------------------------------------
+# Measured peak (Python-tracked allocations)
+# ---------------------------------------------------------------------------
+
+def test_streamed_postprocess_peak_under_budget(clean_env):
+    """End-to-end peak check: a streamed _postprocess-style reduction must
+    keep peak Python-tracked allocations near the per-chunk working set,
+    not the full (n, num_pairs, d) materialized intermediate.
+
+    We measure peak with tracemalloc, which on CPython tracks NumPy buffers
+    via the default PyMem allocator. This is portable across Linux/macOS/
+    Windows and avoids relying on RSS (which depends on OS overcommit).
+    """
+    n, k, d = 1500, 16, 3
+    p, _ = _make_lpca_param(n=n, k=k, d=d, n_features=24)
+    masks = np.tile(np.arange(k), (n, 1))
+    view_idx = np.arange(n)
+
+    num_pairs = k * (k - 1) // 2
+    itemsize = p.X.dtype.itemsize
+
+    # --- Baseline: materialize the full pipeline (no streaming) ----------
+    clean_env.setenv("PYRATS_MEMORY_LIMIT", str(1 << 32))  # effectively unlimited
+    tracemalloc.start()
+    full_eval = p.batched_eval_(view_idx, masks)
+    full_dists = batched_pdist(full_eval)
+    _, materialized_peak = tracemalloc.get_traced_memory()
+    tracemalloc.stop()
+    del full_eval, full_dists
+
+    # --- Streamed: chunked _postprocess-style reduction ------------------
+    # Cap at ~10x the per-row downstream peak so chunk_size lands at ~10
+    # rows. The streamed peak should be a small fraction of the
+    # materialized peak. We also lower the floor so the cap actually bites.
+    pdist_per_row = num_pairs * (d + 1) * itemsize
+    budget = 10 * pdist_per_row * 8
+    clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024)
+    clean_env.setenv("PYRATS_MEMORY_LIMIT", str(budget))
+
+    tracemalloc.start()
+    zeta = np.empty(n)
+    nchunks = 0
+    for sl, chunk in p.iter_eval_(view_idx, masks, peak_bytes_per_row=pdist_per_row):
+        chunk_dists = batched_pdist(chunk)
+        zeta[sl] = np.nanmax(chunk_dists, axis=1)
+        nchunks += 1
+    _, streamed_peak = tracemalloc.get_traced_memory()
+    tracemalloc.stop()
+
+    assert nchunks > 1, "tight budget should produce multiple chunks"
+    # Streamed peak should be meaningfully smaller. Allow generous slack
+    # (0.5x) since tracemalloc captures fixed Param state too; the goal is
+    # to catch regressions where streaming silently degrades to full
+    # materialization.
+    assert streamed_peak < 0.5 * materialized_peak, (
+        f"streamed peak {streamed_peak} not meaningfully below "
+        f"materialized peak {materialized_peak}"
+    )

From 20697620d8fcd206ec03b301d5b536e43bafc74f Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Sun, 3 May 2026 15:02:40 +0200
Subject: [PATCH 34/44] Allowing multiithreads within blas workers

---
 src/pyRATS/_utils.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index a103ce7..95b3817 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -94,6 +94,19 @@ def _get_available_memory(n_jobs=1):
     return _available_memory_bytes() // max(1, n_jobs)
 
 
+def _inner_blas_threads(n_jobs):
+    """BLAS threads per outer Parallel worker, sized to avoid oversubscription.
+
+    Apple Accelerate / OpenBLAS otherwise spawn a pool sized to all cores
+    inside each outer worker, giving n_jobs * cpu_count software threads on
+    cpu_count hardware threads — the n_jobs ~ cpu_count/2 cliff. Pinning to 1
+    fixes that but leaves cores idle when n_jobs < cpu_count. cpu_count //
+    n_jobs strikes the balance: full utilization, no contention. Floored at 1.
+    """
+    cores = os.cpu_count() or 1
+    return max(1, cores // max(1, n_jobs))
+
+
 def lpca(X, d, U, n_jobs, verbose=False):
     """Fit PCA model on the data X.
 
@@ -176,11 +189,7 @@ def target_proc(p_num, chunk_sz):
     if n_jobs == 1 or n < 1000: # Threshold for Parallel overhead
         results = [target_proc(0, n)]
     else:
-        # inner_max_num_threads=1: each loky worker would otherwise fork its
-        # own BLAS pool sized to all cores, causing n_jobs**2 thread contention
-        # which produces a slowdown past n_jobs ~ ncores/2. Capping the inner
-        # pool to 1 keeps scaling monotonic.
-        with parallel_backend("loky", inner_max_num_threads=1):
+        with parallel_backend("loky", inner_max_num_threads=_inner_blas_threads(n_jobs)):
             results = Parallel(n_jobs=n_jobs)(
                 delayed(target_proc)(i, chunk_sz)
                 for i in tqdm(
@@ -265,7 +274,7 @@ def target_proc(p_num, chunk_sz):
         return start_ind, end_ind, model_
 
     chunk_sz = int(n / n_jobs)
-    with parallel_backend("loky", inner_max_num_threads=1):
+    with parallel_backend("loky", inner_max_num_threads=_inner_blas_threads(n_jobs)):
         results = Parallel(n_jobs=n_jobs)(
             delayed(target_proc)(p_num, chunk_sz)
             for p_num in tqdm(
@@ -784,13 +793,7 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
         if n_jobs == 1 or n < 1000:
             results = [target_proc(0, n, n, Utilde, n_C, c)]
         else:
-            # inner_max_num_threads=1: each loky worker would otherwise fork its
-            # own BLAS pool. With Apple Accelerate / OpenBLAS sized to all cores,
-            # n_jobs workers x n_cores BLAS threads = n_jobs**2 software threads
-            # contending for n_cores hardware threads, which produces the
-            # 4 -> 8 jobs slowdown. Capping the inner pool to 1 keeps scaling
-            # monotonic.
-            with parallel_backend("loky", inner_max_num_threads=1):
+            with parallel_backend("loky", inner_max_num_threads=_inner_blas_threads(n_jobs)):
                 results = Parallel(n_jobs=n_jobs)(
                     delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c)
                     for p_num in tqdm(
@@ -957,7 +960,7 @@ def target_proc(p_num):
     if n_jobs == 1 or n_elem < 1000:
         res = [target_proc(p_num) for p_num in range(n_jobs)]
     else:
-        with parallel_backend("loky", inner_max_num_threads=1):
+        with parallel_backend("loky", inner_max_num_threads=_inner_blas_threads(n_jobs)):
             res = list(Parallel(n_jobs=n_jobs,
                                 return_as="generator_unordered")(
                 delayed(target_proc)(p_num) for p_num in range(n_jobs)

From f01c358a393f264938cbcc365213e774c59fedbc Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Sun, 3 May 2026 15:05:08 +0200
Subject: [PATCH 35/44] ci fixes

---
 .github/workflows/benchmark.yml | 4 ++--
 .github/workflows/ci.yml        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 4fe417d..3364965 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -47,8 +47,8 @@ jobs:
 
       - name: Install Dependencies
         run: |
-          pip install --upgrade pip
-          pip install -e .[test]
+          python -m pip install --upgrade pip
+          python -m pip install -e .[test]
 
       - name: Run Benchmark
         run: |
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a0cc7e6..6dfe788 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -29,8 +29,8 @@ jobs:
 
     - name: Install Dependencies
       run: |
-        pip install --upgrade pip
-        pip install -e .[test]
+        python -m pip install --upgrade pip
+        python -m pip install -e .[test]
 
     - name: Set Fast Mode Flag
       id: set-flag

From a2e7add0641bce8ab96edda7322e93c19e6fbe51 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Sun, 3 May 2026 18:44:32 +0200
Subject: [PATCH 36/44] Testing more efficient allocation and memory checks

---
 src/pyRATS/_utils.py | 38 ++++++++++++++++++++++++++++++--------
 1 file changed, 30 insertions(+), 8 deletions(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index 95b3817..7f16c80 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -19,6 +19,7 @@
 
 import os
 import sys
+import time
 import warnings
 
 _MIN_MEMORY_FLOOR = 64 * 1024 * 1024  # 64 MB — always make some progress
@@ -56,8 +57,12 @@ def _cgroup_available_bytes():
     return None
 
 
+_MEMORY_CACHE_TTL = 0.25  # seconds — fresh enough to track fit-phase allocations
+_memory_cache: tuple[float, int] | None = None  # (timestamp, bytes)
+
+
 def _available_memory_bytes():
-    """Live probe of usable memory in bytes.
+    """Live probe of usable memory in bytes, cached with a short TTL.
 
     Sources, all combined via min():
       * psutil.virtual_memory().available scaled by 0.75 (or 4GB fallback)
@@ -65,10 +70,17 @@ def _available_memory_bytes():
         before psutil sees pressure)
       * PYRATS_MEMORY_LIMIT env var as a hard cap
 
+    Result is cached for _MEMORY_CACHE_TTL seconds (default 250ms). This
+    eliminates psutil and cgroup file-read overhead on hot call sites (e.g.
+    iter_eval_ inside best()'s per-point loop) while still reacting to
+    large allocations between fit phases — the original "cache forever" bug.
     Floored at 64MB so chunking can always make at least one row of progress.
-    Called per request; do not cache — pyRATS allocates large internal state
-    during a fit, and a stale snapshot is the root cause of the prior OOMs.
     """
+    global _memory_cache
+    now = time.monotonic()
+    if _memory_cache is not None and now - _memory_cache[0] < _MEMORY_CACHE_TTL:
+        return _memory_cache[1]
+
     try:
         import psutil
         avail = int(psutil.virtual_memory().available * 0.75)
@@ -86,7 +98,9 @@ def _available_memory_bytes():
         except ValueError:
             pass
 
-    return max(avail, _MIN_MEMORY_FLOOR)
+    result = max(avail, _MIN_MEMORY_FLOOR)
+    _memory_cache = (now, result)
+    return result
 
 
 # Backward-compatible alias for any external caller pinned to the old name.
@@ -1795,10 +1809,18 @@ def batched_eval_(self, view_index, data_mask, n_jobs=1):
             d = self.Psi.shape[2] if self.algo == "lpca" else 0
             return np.zeros((0, masks.shape[1] if hasattr(masks, "shape") else 0, d))
 
-        out = None
-        for sl, chunk in self.iter_eval_(ks, masks):
-            if out is None:
-                out = np.empty((n_eval, chunk.shape[1], chunk.shape[2]), dtype=chunk.dtype)
+        it = self.iter_eval_(ks, masks)
+        first_sl, first_chunk = next(it)
+        if first_sl.stop == n_eval:
+            # Common path: everything fit in one chunk — return directly,
+            # no extra allocation or copy.
+            return first_chunk
+
+        # Multi-chunk path: accumulate into a pre-allocated output array.
+        out = np.empty((n_eval, first_chunk.shape[1], first_chunk.shape[2]),
+                       dtype=first_chunk.dtype)
+        out[first_sl] = first_chunk
+        for sl, chunk in it:
             out[sl] = chunk
         return out
 

From 84a46f46f7b8f8d81b9ed3c8fa6079bd52a4def0 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Sun, 3 May 2026 18:53:07 +0200
Subject: [PATCH 37/44] Setting cpu core checker and warning

---
 src/pyRATS/rats.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index bef7e1c..5230592 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -252,6 +252,17 @@ def __init__(
         if n_jobs == -1:
             self.n_jobs = cpu_count()
 
+        cores = cpu_count()
+        if self.n_jobs > cores:
+            warnings.warn(
+                f"n_jobs={self.n_jobs} exceeds the number of available CPU cores "
+                f"({cores}). This causes oversubscription and will slow down the "
+                f"computation. n_jobs has been clamped to {cores}.",
+                UserWarning,
+                stacklevel=2,
+            )
+            self.n_jobs = cores
+
     def fit_transform(self, X):
         """Fit the model on the data in X, and transform X.
 

From 7db1ddf1a1dd3eade845e43cda3d6445836f3975 Mon Sep 17 00:00:00 2001
From: Dhruv Kohli <dhruvkohli99@gmail.com>
Date: Mon, 4 May 2026 05:33:04 -0700
Subject: [PATCH 38/44] Fix param.v update

when the embedding dimension is 1, .squeeze will remove the axis corresponding to the embedding dimension.
---
 src/pyRATS/_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index fd2dd9c..caa12df 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -1258,7 +1258,7 @@ def update(alpha, max_iter, O, CC, M, d):
 
     Tstar_ = Tstar.reshape((len(Tstar), d, -1), order="F").transpose(2, 1, 0)
     param.T = np.matmul(param.T, Tstar_)  # update transformations of individual points
-    param.v = np.matmul(param.v[:, np.newaxis, :], Tstar_).squeeze() + Zstar[:, n:].T
+    param.v = np.matmul(param.v[:, np.newaxis, :], Tstar_)[:,0,:] + Zstar[:, n:].T
 
     return Zstar[:, :n].T
 

From 39534f8fa0abb566028c956c65c4da28214bb23e Mon Sep 17 00:00:00 2001
From: Dhruv Kohli <dhruvkohli99@gmail.com>
Date: Mon, 4 May 2026 05:37:57 -0700
Subject: [PATCH 39/44] Cast Utildeg to boolean after matrix multiplication

C and Ug are boolean sparse matrices. On my end, scipy is resulting a matrix with int data type with summed values (treating True as ones) instead of OR. Casting into boolean dtype gives back OR-ed values.
---
 src/pyRATS/_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index fd2dd9c..e9a71fc 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -1072,7 +1072,7 @@ def compute_incidence_matrix_in_embedding(y, C, k, nu, metric="euclidean"):
     k_ = min(int(k * nu), n - 1)
     _, neigh_indg = nearest_neighbors(y, k_, metric)
     Ug = sparse_matrix(neigh_indg, np.ones(neigh_indg.shape, dtype=bool))
-    Utildeg = C.dot(Ug)
+    Utildeg = C.dot(Ug).astype(bool)
     return Utildeg
 
 
From cf7eced5fde5043128e2546745afa562e414acc7 Mon Sep 17 00:00:00 2001
From: Dhruv Kohli <dhruvkohli99@gmail.com>
Date: Mon, 4 May 2026 05:43:28 -0700
Subject: [PATCH 40/44] Fix output collection

When there is no tear, compute_tear_graph returns a tuple of None. Comparing it with None results in False resulting in unnecessary subsequent processing.
---
 src/pyRATS/global_distortion.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/pyRATS/global_distortion.py b/src/pyRATS/global_distortion.py
index 8f715a4..c96db6c 100644
--- a/src/pyRATS/global_distortion.py
+++ b/src/pyRATS/global_distortion.py
@@ -44,7 +44,7 @@ def _compute_tear_aware_shortest_path_distances(
     if dist is None:
         dist = _shortest_paths(y, n_nbrs=k, metric=metric, return_predqecessors=False)
 
-    output = compute_tear_graph(
+    pts_across_tear, tear_graph = compute_tear_graph(
         y,
         Utilde,
         C,
@@ -56,10 +56,9 @@ def _compute_tear_aware_shortest_path_distances(
         n_jobs,
         i_mat_in_emb=Utildeg,
     )
-    if output is None:
+    if tear_graph is None:
         return dist
-
-    pts_across_tear, tear_graph = output
+        
     tear_graph = tear_graph.tocoo()
     tear_graph_row_inds = tear_graph.row
     tear_graph_col_inds = tear_graph.col

From 07bb35e452b566aab628bd3c2513892e3f930216 Mon Sep 17 00:00:00 2001
From: Dhruv Kohli <dhruvkohli99@gmail.com>
Date: Mon, 4 May 2026 05:46:29 -0700
Subject: [PATCH 41/44] Enhance comments on singular value handling in overlaps

Added comments to clarify the significance of singular values in overlap calculations.
---
 src/pyRATS/_utils.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index fd2dd9c..121213f 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -887,8 +887,15 @@ def target_proc(p_num):
         start_ind, end_ind, overlap_svals_ = value
         overlap_svals[start_ind:end_ind, :] = overlap_svals_
 
+    # In most cases, when almost all overlaps have rank d, 
+    # W_data = overlap_svals[:,-1], the d-th singular value of the overlap
+    # is sufficient to determine the priority of the overlaps.
     # overlap_svals[overlap_svals<tol] = 0
     W_data = overlap_svals[:, -1]
+    # But in cases when a low-dimensional object is embedded in a high-dimensional space,
+    # or when d is higher than the local intrinsic dimension at several points,
+    # the d-th singular value will be zero at all the points with less than d local intrinsic dimension.
+    # In such cases, we look at the (d-1)-th singular value (and so on) to determine the priority of the overlaps.
     for i in range(overlap_svals.shape[1] - 2, -1, -1):
         mask = W_data == 0
         if verbose:

From 8fcf4788f07766fdda865c277277771252e41289 Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Thu, 21 May 2026 14:19:10 +0200
Subject: [PATCH 42/44] Reverting to old parallel lib for exact matching to
 Dhruv's initial codebase

---
 .github/workflows/ci.yml |  2 +-
 src/pyRATS/_utils.py     | 49 +++++++++++++++++++++++++++++-----------
 2 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6dfe788..fa5db79 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -50,7 +50,7 @@ jobs:
     # Step 2: Generate Expected Baselines (using baseline source code but current scripts)
     - name: Generate Expected Baselines
       run: |
-        git checkout dda054369e8a8ae9b8d166a7116ab983edfeaf8b -- src/
+        git checkout 8e27b77b3247e1ac7447e7ea429e0f0c103a3fb0 -- src/
         python tests/scripts/generate_snapshots.py --path-to-results-dir tests/data/expected ${{ steps.set-flag.outputs.fast_flag }}
         git checkout HEAD -- src/
 
diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index 7f16c80..d687544 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -7,6 +7,7 @@
 import itertools
 
 from joblib import delayed, Parallel, parallel_backend
+import multiprocess as mp_lib
 
 from scipy.spatial.distance import pdist, squareform
 from sklearn.utils.extmath import svd_flip
@@ -534,7 +535,12 @@ def cost_of_moving_distortion(
     c_U_k_uniq = np.unique(c_U_k).tolist()
     cost_x_k_to = np.zeros(len(c_U_k_uniq)) + np.inf
 
-    U_k_list = list(U_k)
+    # Reconstruct the set from neigh_ind_k inside this process: pickling and
+    # unpickling a set rebuilds the hash table with a different bucket layout,
+    # so list(U_k) iterates in a different order in the parent vs. a loky
+    # worker. Building the set from the same numpy array in each process gives
+    # the same iteration order everywhere.
+    U_k_list = list(set(neigh_ind_k))
 
     # Iterate over all m in c_{U_k}
     i = 0
@@ -673,7 +679,12 @@ def cost_of_moving_alignment_error(
     c_U_k_uniq = np.unique(c_U_k)
     cost_x_k_to = np.zeros(len(c_U_k_uniq)) + np.inf
 
-    U_k_list = list(U_k)
+    # Reconstruct the set from neigh_ind_k inside this process: pickling and
+    # unpickling a set rebuilds the hash table with a different bucket layout,
+    # so list(U_k) iterates in a different order in the parent vs. a loky
+    # worker. Building the set from the same numpy array in each process gives
+    # the same iteration order everywhere.
+    U_k_list = list(set(neigh_ind_k))
 
     neighbor_mask = (
         (n_C[c_U_k_uniq] < eta_max) & (n_C[c_U_k_uniq] >= n_C_c_k) & (c_U_k_uniq != c_k)
@@ -807,17 +818,29 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c):
         if n_jobs == 1 or n < 1000:
             results = [target_proc(0, n, n, Utilde, n_C, c)]
         else:
-            with parallel_backend("loky", inner_max_num_threads=_inner_blas_threads(n_jobs)):
-                results = Parallel(n_jobs=n_jobs)(
-                    delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c)
-                    for p_num in tqdm(
-                        range(n_jobs),
-                        desc=f"eta={eta}",
-                        unit="chunk",
-                        leave=False,
-                        disable=not verbose,
-                    )
-                )
+            # Use multiprocess.Process (fork) rather than joblib/loky (spawn).
+            # cost_of_moving reads Utilde[m], a Python set whose iteration
+            # order depends on its hash-table bucket layout. Spawn re-pickles
+            # the set and rebuilds the table with a different layout in each
+            # worker, so list(Utilde[m]) iterates in a different order than
+            # in the parent — permuting the rows/cols passed to
+            # local_param.eval_ and d_e, producing float-LSB cost diffs that
+            # flip argmin tie-breaks and yield divergent cluster
+            # assignments. Fork shares the parent's address space via
+            # copy-on-write, so the children see the exact same set layout
+            # and produce bit-identical costs to a serial run.
+            q = mp_lib.Queue()
+            def _worker(p_num):
+                q.put(target_proc(p_num, chunk_sz, n, Utilde, n_C, c))
+            procs = [
+                mp_lib.Process(target=_worker, args=(p_num,), daemon=True)
+                for p_num in range(n_jobs)
+            ]
+            for p in procs:
+                p.start()
+            results = [q.get() for _ in range(n_jobs)]
+            for p in procs:
+                p.join()
         for start_ind, end_ind, cost_, dest_ in results:
             cost[start_ind:end_ind] = cost_
             dest[start_ind:end_ind] = dest_

From 02ca601e893a71b4f3b19f204333a1e61170e60d Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Thu, 21 May 2026 14:20:11 +0200
Subject: [PATCH 43/44] Memory cache update for memory testing

---
 tests/test_memory.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/test_memory.py b/tests/test_memory.py
index 99d3dd9..66f5893 100644
--- a/tests/test_memory.py
+++ b/tests/test_memory.py
@@ -43,8 +43,9 @@ def _make_lpca_param(n=400, n_features=32, k=12, d=2, seed=0):
 
 @pytest.fixture
 def clean_env(monkeypatch):
-    """Ensure PYRATS_MEMORY_LIMIT doesn't leak between tests."""
+    """Ensure PYRATS_MEMORY_LIMIT and the TTL cache don't leak between tests."""
     monkeypatch.delenv("PYRATS_MEMORY_LIMIT", raising=False)
+    monkeypatch.setattr(_utils, "_memory_cache", None)
     yield monkeypatch
 
 
@@ -66,9 +67,11 @@ def test_available_memory_floor(clean_env):
 
 
 def test_available_memory_is_live(clean_env):
-    """Probe must not cache: changing the env var must change the result."""
+    """Env-var changes are picked up after the TTL expires (not cached forever)."""
     clean_env.setenv("PYRATS_MEMORY_LIMIT", str(256 * 1024 * 1024))
     first = _available_memory_bytes()
+    # Expire the cache so the next call re-reads the env var.
+    clean_env.setattr(_utils, "_memory_cache", None)
     clean_env.setenv("PYRATS_MEMORY_LIMIT", str(128 * 1024 * 1024))
     second = _available_memory_bytes()
     assert second < first
@@ -109,6 +112,7 @@ def test_iter_eval_output_matches_batched_eval(clean_env):
     # Streamed: tight budget => many chunks. Sum back and compare.
     clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024)
     clean_env.setenv("PYRATS_MEMORY_LIMIT", "16384")
+    clean_env.setattr(_utils, "_memory_cache", None)  # expire cache so tight cap takes effect
     streamed = np.empty_like(full)
     nchunks = 0
     for sl, chunk in p.iter_eval_(view_idx, masks):
@@ -226,6 +230,7 @@ def test_streamed_postprocess_peak_under_budget(clean_env):
     budget = 10 * pdist_per_row * 8
     clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024)
     clean_env.setenv("PYRATS_MEMORY_LIMIT", str(budget))
+    clean_env.setattr(_utils, "_memory_cache", None)  # expire cache so tight cap takes effect
 
     tracemalloc.start()
     zeta = np.empty(n)

From 101ba26fab351ff856c054ef28cd1066cd34487c Mon Sep 17 00:00:00 2001
From: Nasir Ahmad <nasiryahm@gmail.com>
Date: Thu, 21 May 2026 20:11:32 +0200
Subject: [PATCH 44/44] tqdm cleanup

---
 src/pyRATS/_tear_coloring.py | 4 ----
 src/pyRATS/_utils.py         | 5 +++--
 src/pyRATS/rats.py           | 2 +-
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/pyRATS/_tear_coloring.py b/src/pyRATS/_tear_coloring.py
index ddd3661..3f0e8b7 100644
--- a/src/pyRATS/_tear_coloring.py
+++ b/src/pyRATS/_tear_coloring.py
@@ -75,8 +75,6 @@ def compute_spectral_color_of_pts_on_tear(
     n_comp, labels = connected_components(
         tear_graph, directed=False, return_labels=True
     )
-    if verbose:
-        tqdm.write(f"Number of components in the tear graph: {n_comp}")
 
     n_points_in_comp = []
     for i in range(n_comp):
@@ -93,8 +91,6 @@ def compute_spectral_color_of_pts_on_tear(
         comp_i = labels == i
         n_comp_i = np.sum(comp_i)
         scale = n_comp_i / n_pts_across_tear
-        if verbose:
-            tqdm.write(f"#points in the tear component no. {i} are: {n_comp_i}")
 
         # If the component is very small then assign the constant color
         if n_comp_i <= max(3, int(color_cutoff_frac * n)):
diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py
index 6dcd3cf..f4a8bb5 100644
--- a/src/pyRATS/_utils.py
+++ b/src/pyRATS/_utils.py
@@ -16,7 +16,7 @@
     connected_components,
     breadth_first_order,
 )
-from tqdm import tqdm
+from tqdm.auto import tqdm
 
 import os
 import sys
@@ -791,7 +791,7 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs):
 
     # Vary eta from 2 to eta_{min}
     for eta in tqdm(
-        range(2, eta_min + 1), desc="Intermediate views", disable=not verbose
+        range(2, eta_min + 1), desc="Intermediate views", disable=not verbose, position=0, leave=True
     ):
             # tqdm.write(
             #     "#non-empty views with sz < %d = %d"
@@ -861,6 +861,7 @@ def _worker(p_num):
                 desc="Merging",
                 unit="pts",
                 leave=False,
+                position=1,
             )
 
         total_len_S = 0
diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py
index 15f22c0..7895a3b 100644
--- a/src/pyRATS/rats.py
+++ b/src/pyRATS/rats.py
@@ -4,7 +4,7 @@
 
 from multiprocessing import cpu_count
 from joblib import Parallel, delayed
-from tqdm import tqdm
+from tqdm.auto import tqdm
 
 from pyRATS._utils import (
     nearest_neighbors,