From dda054369e8a8ae9b8d166a7116ab983edfeaf8b Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 18 Mar 2026 18:22:03 +0100 Subject: [PATCH 01/44] fix: Parallelization broken in lpca and different parallelism backends unified. Removed duplicated imports. --- src/pyRATS/_utils.py | 110 ++++++++++++------------------------------- 1 file changed, 29 insertions(+), 81 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index fd2dd9c..fa9fd0c 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -3,18 +3,12 @@ from scipy.linalg import svd, svdvals from scipy.sparse.linalg import svds from sklearn.decomposition import KernelPCA -from scipy.sparse import csr_matrix, triu +from scipy.sparse import csr_matrix, triu, block_diag import itertools from joblib import delayed, Parallel -import multiprocess as mp -from multiprocess import shared_memory - -from scipy.sparse.linalg import svds -from scipy.linalg import svd from scipy.spatial.distance import pdist, squareform -from scipy.sparse import csr_matrix, block_diag from sklearn.utils.extmath import svd_flip from scipy.sparse.csgraph import ( minimum_spanning_tree, @@ -105,7 +99,6 @@ def target_proc(p_num, chunk_sz): results = Parallel(n_jobs=n_jobs)( delayed(target_proc)(i, chunk_sz) for i in range(n_jobs) ) - results = [target_proc(0, n)] for i in range(len(results)): start_ind, end_ind, Psi_, mu_, var_explained_, n_pc_dir_chosen_ = results[i] @@ -165,7 +158,7 @@ def kpca(X, d, U, kernel, fit_inverse_transform, n_jobs, verbose=False): local_param.model = np.empty(n, dtype=object) local_param.zeta = np.zeros(n) - def target_proc(p_num, chunk_sz, q_): + def target_proc(p_num, chunk_sz): start_ind = p_num * chunk_sz if p_num == (n_jobs - 1): end_ind = n @@ -184,26 +177,16 @@ def target_proc(p_num, chunk_sz, q_): U_k = U[k] X_k = X[U_k, :] model_[k - start_ind].fit(X_k) - q_.put((start_ind, end_ind, model_)) + return start_ind, end_ind, model_ - q_ = mp.Queue() chunk_sz = int(n / n_jobs) - proc = [] - for p_num in range(n_jobs): - proc.append( - mp.Process(target=target_proc, args=(p_num, chunk_sz, q_), daemon=True) - ) - proc[-1].start() + results = Parallel(n_jobs=n_jobs)( + delayed(target_proc)(p_num, chunk_sz) for p_num in range(n_jobs) + ) - for p_num in range(n_jobs): - start_ind, end_ind, model_ = q_.get() + for start_ind, end_ind, model_ in results: local_param.model[start_ind:end_ind] = model_ - q_.close() - - for p_num in range(n_jobs): - proc[p_num].join() - if verbose: print("local_param: all %d points processed..." % n) return local_param @@ -673,20 +656,6 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs): cost = np.zeros(n) + np.inf dest = np.zeros(n, dtype="int") - 1 - shm_cost = shared_memory.SharedMemory(create=True, size=cost.nbytes) - np_cost = np.ndarray(cost.shape, dtype=cost.dtype, buffer=shm_cost.buf) - np_cost[:] = cost[:] - shm_dest = shared_memory.SharedMemory(create=True, size=dest.nbytes) - np_dest = np.ndarray(dest.shape, dtype=dest.dtype, buffer=shm_dest.buf) - np_dest[:] = dest[:] - - shm_cost_name = shm_cost.name - cost_shape = cost.shape - cost_dtype = cost.dtype - shm_dest_name = shm_dest.name - dest_shape = dest.shape - dest_dtype = dest.dtype - # Vary eta from 2 to eta_{min} if verbose: print("Constructing intermediate views.") @@ -699,50 +668,34 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs): ) print("#nodes in views with sz < %d = %d" % (eta, np.sum(n_C[c] < eta))) - def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c, S): - existing_shm_cost = shared_memory.SharedMemory(name=shm_cost_name) - cost_ = np.ndarray( - cost_shape, dtype=cost_dtype, buffer=existing_shm_cost.buf - ) - existing_shm_dest = shared_memory.SharedMemory(name=shm_dest_name) - dest_ = np.ndarray( - dest_shape, dtype=dest_dtype, buffer=existing_shm_dest.buf - ) - + def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): start_ind = p_num * chunk_sz if p_num == (n_jobs - 1): end_ind = n_ else: end_ind = (p_num + 1) * chunk_sz - for k in range(start_ind, end_ind): - if S is None: - k1 = k - else: - k1 = S[k] - cost_[k1], dest_[k1] = cost_of_moving( - k1, d_e, neigh_ind[k1], U_[k1], param, c, n_C, Utilde, eta, eta_max + cost_ = np.zeros(end_ind - start_ind) + np.inf + dest_ = np.zeros(end_ind - start_ind, dtype="int") - 1 + for i, k in enumerate(range(start_ind, end_ind)): + cost_[i], dest_[i] = cost_of_moving( + k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max ) + return start_ind, end_ind, cost_, dest_ - proc = [] chunk_sz = int(n / n_jobs) - for p_num in range(n_jobs): - proc.append( - mp.Process( - target=target_proc, - args=(p_num, chunk_sz, n, Utilde, n_C, c, None), - daemon=True, - ) - ) - proc[-1].start() - - for p_num in range(n_jobs): - proc[p_num].join() + results = Parallel(n_jobs=n_jobs)( + delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c) + for p_num in range(n_jobs) + ) + for start_ind, end_ind, cost_, dest_ in results: + cost[start_ind:end_ind] = cost_ + dest[start_ind:end_ind] = dest_ # Compute point with minimum cost # Compute k and cost^* - k = np.argmin(np_cost) - cost_star = np_cost[k] + k = np.argmin(cost) + cost_star = cost[k] if verbose: print("Costs computed when eta = %d." % eta) @@ -751,11 +704,10 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c, S): total_len_S = 0 ctr = 0 while cost_star < np.inf: - # print(k, cost_star, flush=True) # Move x_k from cluster s to # dest_k and update variables s = c[k] - dest_k = np_dest[k] + dest_k = dest[k] c[k] = dest_k n_C[s] -= 1 n_C[dest_k] += 1 @@ -769,23 +721,23 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c, S): if n_C[s] > 0: S_ = ( (c == dest_k) - | (np_dest == dest_k) + | (dest == dest_k) | np.array(U[:, list(Clstr[s])].sum(1), dtype=bool).flatten() ) else: - S_ = (c == dest_k) | (np_dest == dest_k) | (np_dest == s) + S_ = (c == dest_k) | (dest == dest_k) | (dest == s) S = np.where(S_)[0].tolist() len_S = len(S) total_len_S += len_S ctr += 1 for k in S: - np_cost[k], np_dest[k] = cost_of_moving( + cost[k], dest[k] = cost_of_moving( k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max ) - k = np.argmin(np_cost) - cost_star = np_cost[k] + k = np.argmin(cost) + cost_star = cost[k] if verbose: print( @@ -798,10 +750,6 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c, S): ) print("Done with eta = %d." % eta) - shm_cost.close() - shm_cost.unlink() - shm_dest.close() - shm_dest.unlink() return c, n_C From a4e1851b734746ef60b6fd5327d7d76d37496184 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Tue, 24 Mar 2026 19:12:29 +0000 Subject: [PATCH 02/44] Tests: Update to CI for new parallel library changes plus fix for randomization issue --- .github/workflows/ci.yml | 2 +- .gitignore | 1 + tests/scripts/generate_snapshots.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 34c61d6..a0cc7e6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,7 +50,7 @@ jobs: # Step 2: Generate Expected Baselines (using baseline source code but current scripts) - name: Generate Expected Baselines run: | - git checkout 8e27b77b3247e1ac7447e7ea429e0f0c103a3fb0 -- src/ + git checkout dda054369e8a8ae9b8d166a7116ab983edfeaf8b -- src/ python tests/scripts/generate_snapshots.py --path-to-results-dir tests/data/expected ${{ steps.set-flag.outputs.fast_flag }} git checkout HEAD -- src/ diff --git a/.gitignore b/.gitignore index b9017f9..1cfc639 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ src/pyRATS/__pycache__/ src/pyRATS.egg-info +tests/data .vscode .DS_Store __pycache__ \ No newline at end of file diff --git a/tests/scripts/generate_snapshots.py b/tests/scripts/generate_snapshots.py index 21cbf4a..31ad21d 100644 --- a/tests/scripts/generate_snapshots.py +++ b/tests/scripts/generate_snapshots.py @@ -127,6 +127,6 @@ def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=Fal argslist = list(itertools.product(ks, eta_mins, cost_fn_names, dataset_names)) - Parallel(n_jobs=-1)( + Parallel(n_jobs=1)( delayed(run_model)(*args, dirpath, force_compute) for args in argslist ) From 6aac4b186b47f4f349c76c5c5275b45f617dbf59 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 18 Mar 2026 18:25:28 +0100 Subject: [PATCH 03/44] speedup: Parallelization of post-processing neighbour distance measures using smart triu and neighbourhood graph checks --- src/pyRATS/rats.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index 78c7063..e6be6f9 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -1,6 +1,5 @@ import numpy as np from scipy.sparse import csr_matrix -from scipy.spatial.distance import squareform from multiprocessing import cpu_count @@ -429,12 +428,17 @@ def _postprocess(self): n = self.U.shape[0] - original_dists = np.empty((n, int((self.k * (self.k - 1) // 2)))) - for i in range(n): - U_i = self.U[i, :].indices - d_e_i = self.neighborhood_graph_sym[np.ix_(U_i, U_i)] - d_e_mask_ = squareform(d_e_i.toarray()) - original_dists[i] = d_e_mask_ # These can be stored and reused + # Vectorised: gather all pairwise neighbour distances at once. + # neigh_inds[i] are the k neighbour indices of point i (same as + # connectivity_matrix used below, extracted once here for clarity). + neigh_inds = self.U.indices.reshape(n, self.k) # (n, k) + pair_i, pair_j = np.triu_indices(self.k, k=1) # all upper-triangle pairs + row_idx = neigh_inds[:, pair_i].ravel() # (n * num_pairs,) + col_idx = neigh_inds[:, pair_j].ravel() # (n * num_pairs,) + num_pairs = len(pair_i) + original_dists = np.asarray( + self.neighborhood_graph_sym[row_idx, col_idx] + ).reshape(n, num_pairs) embedded_datapoints = self.param.batched_eval_( { From 803570166a6ca631e2eeb833b09219f2dd104a04 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 18 Mar 2026 18:30:24 +0100 Subject: [PATCH 04/44] speedup: Vectorizing pdist --- src/pyRATS/_utils.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index fa9fd0c..b9e9114 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -243,16 +243,10 @@ def batched_pdist(x): x = np.array(x) n, k, _ = x.shape - res = np.empty((n, k * (k - 1) // 2)) - p = 0 - for i in range(k): - a = x[:, i] - bs = x[:, i + 1 :] - - cs = bs - a[:, None, :] - res[:, p : p + k - i - 1] = np.sum(cs * cs, axis=-1) - p += k - i - 1 - return np.sqrt(res) + # Precompute all upper-triangle pair indices (same order as the original loop) + pair_i, pair_j = np.triu_indices(k, k=1) # each shape (num_pairs,) + diff = x[:, pair_i, :] - x[:, pair_j, :] # (n, num_pairs, d) + return np.sqrt(np.sum(diff * diff, axis=-1)) # (n, num_pairs) def batched_procrustes_cost(X, Y): From 01ab3c88fa66bc182b15fcbcda06c56071c8664b Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 18 Mar 2026 18:33:17 +0100 Subject: [PATCH 05/44] fix: Improving error handling and variable descriptions --- src/pyRATS/rats.py | 89 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 81 insertions(+), 8 deletions(-) diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index e6be6f9..4041ab4 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -1,4 +1,5 @@ import numpy as np +import warnings from scipy.sparse import csr_matrix from multiprocessing import cpu_count @@ -32,8 +33,11 @@ class RATS: If True, searches for local embeddings that minimize the local distortion. Useful for noisy manifolds. - kpca_kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'} or Callable, default='linear' - Kernel used for PCA. See https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html for more information. + kpca_kernel : {None, 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'} or Callable, default=None + Kernel used for PCA. None uses standard linear PCA (lpca). Any other value + uses sklearn.decomposition.KernelPCA. See + https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html + for more information. cost_fn_name : {'alignment', 'distortion'}, default='alignment' 'alignment': Alignment error should be prefered when runtime matters or dealing with noisy manifolds. @@ -108,18 +112,40 @@ def __init__( eps=1e-8, patience=5, tol=1e-2, - n_connected_manifolds=1, + n_forced_clusters=1, verbose=False, n_jobs=-1, ): - assert cost_fn_name in ["alignment", "distortion"] + if cost_fn_name not in ["alignment", "distortion"]: + raise ValueError( + f"cost_fn_name must be 'alignment' or 'distortion', got {cost_fn_name!r}." + ) + if eta_max <= eta_min: + raise ValueError( + f"eta_max ({eta_max}) must be greater than eta_min ({eta_min})." + ) + if metric != "euclidean": + warnings.warn( + f"metric={metric!r} is not yet fully supported. Only 'euclidean' is " + "used in the embedding step. This parameter will be extended in a " + "future release.", + FutureWarning, + stacklevel=2, + ) + if kpca_fit_inverse_transform: + warnings.warn( + "kpca_fit_inverse_transform=True is not yet supported and will be " + "ignored. Inverse transform support will be added in a future release.", + FutureWarning, + stacklevel=2, + ) self.verbose = verbose self.n_jobs = n_jobs self.d = d - self.kpca_kernel = None if kpca_kernel == "linear" else kpca_kernel + self.kpca_kernel = kpca_kernel self.kpca_fit_inverse_transform = kpca_fit_inverse_transform self.k = k if cost_fn_name == "distortion": @@ -134,9 +160,8 @@ def __init__( self.max_iter, self.max_internal_iter = max_iter, max_internal_iter self.alpha, self.eps = alpha, eps self.patience, self.tol = patience, tol - self.verbose = verbose self.metric = metric - self.n_forced_clusters = n_connected_manifolds + self.n_forced_clusters = n_forced_clusters if self.patience is None: self.patience = self.max_iter @@ -208,8 +233,56 @@ def compute_color_of_pts_on_tear(self, y, tear_color_eig_inds=[0, 1, 2]): self.Utildeg, ) + def fit(self, X): + """Fit the model on X without returning the embedding. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Sample data. + + Returns + ------- + self + """ + raise NotImplementedError( + "fit() is not yet implemented. Use fit_transform() instead." + ) + + def transform(self, X): + """Apply the fitted embedding to new data X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + New data to embed. + + Returns + ------- + y : array-like, shape (n_samples, d) + X transformed in the new space. + """ + raise NotImplementedError( + "transform() is not yet implemented. Use fit_transform() instead." + ) + def inverse_transform(self, y): - NotImplementedError() + """Map points from the embedding space back to the original space. + + Parameters + ---------- + y : array-like, shape (n_samples, d) + Points in the embedding space. + + Returns + ------- + X : array-like, shape (n_samples, n_features) + Points in the original space. + """ + raise NotImplementedError( + "inverse_transform() is not yet implemented." + ) + def _fit_nbrhd_graph(self, X, sort_results=True): """Fitting the neighborhood graph. From 0e2313bc18e73f348a000fbd8b66102152515d5c Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 18 Mar 2026 18:44:58 +0100 Subject: [PATCH 06/44] style: Code cleanups for neatness and readability. Removed unused procrustes code. --- pyproject.toml | 2 +- src/pyRATS/_utils.py | 120 ++++++++++--------------------------------- src/pyRATS/rats.py | 14 ++--- 3 files changed, 33 insertions(+), 103 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index aae6249..4b63c93 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ where = ["src"] [project] name = "pyRATS" version = "0.0.1" -description = "" +description = "Riemannian Alignment of Tangent Spaces (RATS) for dimensionality reduction" readme = "README.md" license-files = ["License"] requires-python = ">=3.9" diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index b9e9114..36ba659 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -452,7 +452,7 @@ def cost_of_moving_distortion( # that is cost_{x_k \rightarrow m} cost_x_k_to[i] = compute_zeta( d_e[np.ix_(U_k_U_Utilde_m, U_k_U_Utilde_m)], - local_param.eval_({"view_index": m, "data_mask": U_k_U_Utilde_m}), + local_param.eval_(m, U_k_U_Utilde_m), ) i += 1 @@ -812,8 +812,8 @@ def target_proc(p_num): m = W_rows[i] mpp = W_cols[i] mask = i_mat[m, :].multiply(i_mat[mpp, :]).nonzero()[1] - V_mmp = param.eval_({"view_index": m, "data_mask": mask}) - V_mpm = param.eval_({"view_index": mpp, "data_mask": mask}) + V_mmp = param.eval_(m, mask) + V_mpm = param.eval_(mpp, mask) Vbar_mmp = V_mmp - np.mean(V_mmp, 0)[np.newaxis, :] Vbar_mpm = V_mpm - np.mean(V_mpm, 0)[np.newaxis, :] # Compute ambiguity of the overlaps captured by singular values @@ -971,7 +971,7 @@ def compute_init_embedding( seq_0 = seq[0] is_visited_view[seq_0] = True y[C[seq_0, :].indices, :] = param.eval_( - {"view_index": seq_0, "data_mask": C[seq_0, :].indices} + seq_0, C[seq_0, :].indices ) y, is_visited_view = procrustes_init( seq, rho, y, is_visited_view, d, Utilde, C, param @@ -1286,7 +1286,7 @@ def build_ortho_optim(d, Utilde, param, verbose): for i in range(M): Utilde_i = Utilde[i, :].indices - X_ = param.eval_({"view_index": i, "data_mask": Utilde_i}) + X_ = param.eval_(i, Utilde_i) sqrt_p_ki = np.sqrt(np.array(W[i, :].data).flatten()[:, None]) X_ = sqrt_p_ki * X_ D.append(np.matmul(X_.T, X_)) @@ -1392,14 +1392,14 @@ def procrustes_init(seq, rho, y, is_visited_view, d, Utilde, C, param): Utilde_s_mp = Utilde_s.multiply(Utilde[mp, :]).nonzero()[1] n_Utilde_s_Z_s[Utilde_s_mp] += 1 mu_s[Utilde_s_mp, :] += param.eval_( - {"view_index": mp, "data_mask": Utilde_s_mp} + mp, Utilde_s_mp ) # Compute T_s and v_s by aligning the embedding of the overlap # between sth view and the views in Z_s, with the centroid mu_s temp = n_Utilde_s_Z_s > 0 mu_s = mu_s[temp, :] / n_Utilde_s_Z_s[temp, np.newaxis] - V_s_Z_s = param.eval_({"view_index": s, "data_mask": temp}) + V_s_Z_s = param.eval_(s, temp) T_s, v_s = procrustes(V_s_Z_s, mu_s) @@ -1412,72 +1412,11 @@ def procrustes_init(seq, rho, y, is_visited_view, d, Utilde, C, param): # Compute global embedding of point in sth cluster C_s = C[s, :].indices - y[C_s, :] = param.eval_({"view_index": s, "data_mask": C_s}) + y[C_s, :] = param.eval_(s, C_s) return y, is_visited_view -def procrustes_final( - y, d, Utilde, C, intermed_param, seq_of_intermed_views_in_cluster, global_opts -): - M, n = Utilde.shape - # Traverse over intermediate views in a random order - seq = np.random.permutation(M) - is_first_view_in_cluster = np.zeros(M, dtype=bool) - # is_first_view_in_cluster[i] = True if the ith view is the first - # view in some cluster of views - for i in range(len(seq_of_intermed_views_in_cluster)): - is_first_view_in_cluster[seq_of_intermed_views_in_cluster[i][0]] = True - - y_due_to_all_views = [] - for k in range(n): - y_due_to_all_views.append({}) - - for s in range(M): - Utilde_s = Utilde[s, :].nonzero()[1] - y_Utilde_s = intermed_param.eval_({"view_index": s, "data_mask": Utilde_s}) - for k in range(len(Utilde_s)): - y_due_to_all_views[Utilde_s[k]][s] = y_Utilde_s[k, :] - - # For a given seq, refine the global embedding - for _ in range(global_opts["max_internal_iter"]): - for s in seq.tolist(): - # # Never refine s_0th intermediate view - # if is_first_view_in_cluster[s]: - # continue - - Utilde_s = Utilde[s, :].nonzero()[1] - mu_s = [] - Utilde_s_ = [] - for k_ in range(len(Utilde_s)): - k = Utilde_s[k_] - if len(y_due_to_all_views[k]) == 1: - continue - Utilde_s_.append(k) - mu = np.array(list(y_due_to_all_views[k].values())).sum(axis=0) - mu = (mu - y_due_to_all_views[k][s]) / (len(y_due_to_all_views[k]) - 1) - mu_s.append(mu) - mu_s = np.array(mu_s) - Utilde_s_ = np.array(Utilde_s_) - - # Compute T_s and v_s by aligning the embedding of the overlap - # between sth view and the views in Z_s, with the centroid mu_s - V_s_Z_s = intermed_param.eval_({"view_index": s, "data_mask": Utilde_s_}) - T_s, v_s = procrustes(V_s_Z_s, mu_s) - - # Update T_s, v_s - intermed_param.T[s, :, :] = np.matmul(intermed_param.T[s, :, :], T_s) - intermed_param.v[s, :] = ( - np.matmul(intermed_param.v[s, :][np.newaxis, :], T_s) + v_s - ) - y_Utilde_s = intermed_param.eval_({"view_index": s, "data_mask": Utilde_s}) - for k in range(len(Utilde_s)): - y_due_to_all_views[Utilde_s[k]][s] = y_Utilde_s[k, :] - - y = np.zeros((n, d)) - for k in range(n): - y[k, :] = np.array(list(y_due_to_all_views[k].values())).mean(axis=0) - return y class Param: @@ -1512,28 +1451,24 @@ def __init__(self, algo="lpca", **kwargs): # For KPCA etc self.model = None - self.X = None - self.y = None self.add_dim = False self.standardize = False - def batched_eval_(self, opts): + def batched_eval_(self, view_index, data_mask): """Maps multiple points to the new space through their local (K-)PCA prameterizations. Parameters ------ - opts : dict - Must have keys - view_index : array-like, shape (n_points) - The indices of the parameterization to use. + view_index : array-like, shape (n_points) + The indices of the parameterization to use. - data_mask : array-like, shape (n_points, n_neighbors) - List of points to map to the embedding dimension for each point in 'view_index' + data_mask : array-like, shape (n_points, n_neighbors) + List of points to map to the embedding dimension for each point in 'view_index' """ - ks = opts["view_index"] - masks = opts["data_mask"] + ks = view_index + masks = data_mask if self.algo == "lpca": temp = np.matmul( @@ -1568,22 +1503,23 @@ def batched_eval_(self, opts): temp = temp + self.v[[ks], :] return temp - def eval_(self, opts, apply_b=True): + def eval_(self, view_index, data_mask, apply_b=True): """Maps points to the new space through their local (K-)PCA prameterizations. Parameters ------ - opts : dict - Must have keys - 'view_index : int - The index of the parameterization to use. + view_index : int + The index of the parameterization to use. + + data_mask : array-like, shape (n_neighbors) + List of points to map to the embedding dimension. - data_mask : array-like, shape (n_neighbors) - List of points to map to the embedding dimension. + apply_b : bool, default=True + Whether to apply the scale transformation b[k]. """ - k = opts["view_index"] - mask = opts["data_mask"] + k = view_index + mask = data_mask if self.algo == "lpca": temp = np.dot( @@ -1633,9 +1569,9 @@ def replace_(self, new_param_ind): else: # ISOMAP, LKPCA self.model = self.model[new_param_ind] - def reconstruct_(self, opts): - k = opts["view_index"] - y_ = opts["embeddings"] + def reconstruct_(self, view_index, embeddings): + k = view_index + y_ = embeddings if self.algo == "LPCA": temp = ( np.dot( diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index 4041ab4..aaa8dcc 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -514,10 +514,7 @@ def _postprocess(self): ).reshape(n, num_pairs) embedded_datapoints = self.param.batched_eval_( - { - "view_index": range(n), - "data_mask": self.U.indices.reshape(n, self.k), - } + range(n), self.U.indices.reshape(n, self.k) ) # tested and they are equivalent embedded_dists = batched_pdist(embedded_datapoints) disc_lip_const = np.divide( @@ -534,7 +531,7 @@ def _postprocess(self): if self.verbose: print( - r"Maximum local distoriton before postptocessing is: %0.4f" + r"Maximum local distortion before postprocessing is: %0.4f" % np.max(zeta) ) @@ -564,10 +561,7 @@ def _postprocess(self): ] # list of indices batch_embedded_datapoints = self.param.batched_eval_( - { - "view_index": future_use_phi_of[use_phi_of], - "data_mask": neighbors_to_reconsider, - } + future_use_phi_of[use_phi_of], neighbors_to_reconsider ) batch_embedded_dists = batched_pdist(batch_embedded_datapoints) batch_original_dists = original_dists[points_to_reconsider] @@ -609,6 +603,6 @@ def _postprocess(self): self.param.replace_(future_use_phi_of) if self.verbose: print( - f"Maximum local distoriton after post-processing is: %0.4f" + f"Maximum local distortion after postprocessing is: %0.4f" % (np.max(self.param.zeta)) ) From e425db7570fba4ffd9ead46625388a82e0590230 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 18 Mar 2026 18:53:58 +0100 Subject: [PATCH 07/44] CI: New continuous integration testing for speed. Tests speed across a range of datapoint numbers --- .github/workflows/benchmark.yml | 35 +++++++++++++++ tests/scripts/benchmark.py | 69 +++++++++++++++++++++++++++++ tests/scripts/generate_snapshots.py | 2 +- 3 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/benchmark.yml create mode 100644 tests/scripts/benchmark.py diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 0000000..30f15f4 --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,35 @@ +name: Performance Benchmark + +on: + push: + branches: [ '**' ] + pull_request: + branches: [ 'main' ] + workflow_dispatch: + +permissions: + contents: write + +jobs: + benchmark: + name: RATS Performance Benchmark + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install Dependencies + run: | + pip install --upgrade pip + pip install -e .[test] + + - name: Run Benchmark + run: | + echo "## RATS Performance Benchmarks" >> $GITHUB_STEP_SUMMARY + echo "Dataset: Klein Bottle (4D)" >> $GITHUB_STEP_SUMMARY + echo "Parameters: k=14, eta_min=5" >> $GITHUB_STEP_SUMMARY + python tests/scripts/benchmark.py >> $GITHUB_STEP_SUMMARY diff --git a/tests/scripts/benchmark.py b/tests/scripts/benchmark.py new file mode 100644 index 0000000..335e8d3 --- /dev/null +++ b/tests/scripts/benchmark.py @@ -0,0 +1,69 @@ +import os +import sys +import time +import json +import argparse + +# Allow importing src and examples +repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +if repo_root not in sys.path: + sys.path.insert(0, repo_root) + +from pyRATS import rats +from examples import datasets + +def run_benchmark(n_samples, k=14, eta_min=5, cost_fn_name="distortion"): + sys.stderr.write(f"Running benchmark for n={n_samples}, k={k}, eta_min={eta_min}...\n") + sys.stderr.flush() + + # Load dataset + ds = datasets.Datasets() + X, _, _ = ds.kleinbottle4d(n=n_samples) + + model = rats.RATS( + d=2, + k=k, + cost_fn_name=cost_fn_name, + eta_min=eta_min, + max_iter=3, + nu=4, + verbose=False, # Set to False for cleaner benchmark output + ) + + start_time = time.perf_counter() + model.fit_transform(X=X) + end_time = time.perf_counter() + + duration = end_time - start_time + sys.stderr.write(f"Completed in {duration:.4f}s\n") + sys.stderr.flush() + return duration + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run RATS performance benchmarks.") + parser.add_argument("--output", type=str, help="Output JSON file path") + parser.add_argument("--fast", action="store_true", help="Run only a small subset for testing") + args = parser.parse_args() + + if args.fast: + sample_sizes = [400, 400, 500] + else: + sample_sizes = [400, 400 , 500, 1000, 2000, 5000, 10000] + + results = [] + + print("\n| Sample Size | Duration (s) |") + print("|-------------|--------------|") + for n in sample_sizes: + duration = run_benchmark(n) + results.append({ + "name": f"Klein Bottle {n} samples", + "unit": "s", + "value": duration + }) + print(f"| {n:11} | {duration:12.4f} |") + + if args.output: + with open(args.output, "w") as f: + json.dump(results, f, indent=2) + print(f"\nResults saved to {args.output}") diff --git a/tests/scripts/generate_snapshots.py b/tests/scripts/generate_snapshots.py index 31ad21d..9588d12 100644 --- a/tests/scripts/generate_snapshots.py +++ b/tests/scripts/generate_snapshots.py @@ -121,7 +121,7 @@ def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=Fal "swissrollwithhole", "squarewithtwoholes", ] - ks = [14, 21, 28] + ks = [14, 21] eta_mins = [5, 10] cost_fn_names = ["distortion", "alignment"] From cc84dd7ee6332f14e51d4bdaaa2c73233ce25355 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 18 Mar 2026 19:14:07 +0100 Subject: [PATCH 08/44] speedup: Dhruv's eigh implementation --- src/pyRATS/_utils.py | 59 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index 36ba659..533bf30 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -1206,14 +1206,59 @@ def update(alpha, max_iter, O, CC, M, d): def compute_Lpinv_helpers(W): - B_ = W.copy().transpose().astype("float") + M, n = W.shape + B_ = W.transpose().tocsr().astype("float") D_1 = np.asarray(B_.sum(axis=1)) D_2 = np.asarray(B_.sum(axis=0)) - D_1_inv_sqrt = np.sqrt(1 / D_1) - D_2_inv_sqrt = np.sqrt(1 / D_2) - B_tilde = B_.multiply(D_2_inv_sqrt).multiply(D_1_inv_sqrt) - - U12, SS, VT = svd(B_tilde.todense(), full_matrices=False) + D_1_inv_sqrt = np.sqrt(1 / D_1).flatten() + D_2_inv_sqrt = np.sqrt(1 / D_2).flatten() + + #B_tilde = B_.multiply(D_2_inv_sqrt).multiply(D_1_inv_sqrt) + # Create sparse diagonal matrices + D1_diag = diags(D_1_inv_sqrt) + D2_diag = diags(D_2_inv_sqrt) + + # Sparse matrix multiplication is MUCH faster than .multiply() + B_tilde = D1_diag @ B_ @ D2_diag + + # U12, SS, VT = svd(B_tilde.todense(), full_matrices=False) + # --- OPTIMIZED SVD: Gram Matrix Approach --- + if n >= M: + # B_tilde is (n, M). C becomes a tiny (M, M) dense matrix. + # Sparse matrix multiplication here is incredibly fast. + C = (B_tilde.T @ B_tilde).todense() + S2, V = eigh(C) + + # eigh returns ascending order; reverse to match standard SVD output + idx = np.argsort(S2)[::-1] + S2 = S2[idx] + V = V[:, idx] + + # Recover singular values and right singular vectors + SS = np.sqrt(np.maximum(S2, 0)) + VT = V.T + + # Recover left singular vectors: U = B_tilde @ V @ diag(1/SS) + with np.errstate(divide='ignore', invalid='ignore'): + SS_inv = np.where(SS > 1e-10, 1.0 / SS, 0.0) + + U12 = (B_tilde @ V) * SS_inv[np.newaxis, :] + else: + # Fallback if M > n: Compute (n, n) Gram matrix instead + C = (B_tilde @ B_tilde.T).todense() + S2, U12 = eigh(C) + + idx = np.argsort(S2)[::-1] + S2 = S2[idx] + U12 = U12[:, idx] + + SS = np.sqrt(np.maximum(S2, 0)) + + with np.errstate(divide='ignore', invalid='ignore'): + SS_inv = np.where(SS > 1e-10, 1.0 / SS, 0.0) + + VT = (U12.T @ B_tilde) * SS_inv[:, np.newaxis] + # ------------------------------------------- U12, VT = svd_flip(U12, VT) V = VT.T @@ -1226,7 +1271,7 @@ def compute_Lpinv_helpers(W): U2 = U12[:, m_1:] V1 = V[:, :m_1] V2 = V[:, m_1:] - return [D_1_inv_sqrt, D_2_inv_sqrt, U1, U2, V1, V2, Sigma_1, Sigma_2] + return [D_1_inv_sqrt[:,None], D_2_inv_sqrt[None,:], U1, U2, V1, V2, Sigma_1, Sigma_2] # Ngoc-Diep Ho, Paul Van Dooren, On the pseudo-inverse of the Laplacian of a bipartite graph From 40db85ae42c5668cec203abf1d131eba5a66db49 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 18 Mar 2026 20:55:44 +0000 Subject: [PATCH 09/44] fix: Imports fix --- src/pyRATS/_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index 533bf30..ffaef76 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -1,9 +1,9 @@ import numpy as np from sklearn.neighbors import NearestNeighbors -from scipy.linalg import svd, svdvals +from scipy.linalg import svd, svdvals, eigh from scipy.sparse.linalg import svds from sklearn.decomposition import KernelPCA -from scipy.sparse import csr_matrix, triu, block_diag +from scipy.sparse import csr_matrix, triu, block_diag, diags import itertools from joblib import delayed, Parallel From 4641346545b1272a0a2f908cb6eefddcab34d1d3 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 18 Mar 2026 21:39:17 +0000 Subject: [PATCH 10/44] speedup: Vectorizing B construction and avoiding sparse to dense conversions --- src/pyRATS/_utils.py | 140 +++++++++++++++++++++++++++++-------------- 1 file changed, 94 insertions(+), 46 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index ffaef76..772e066 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -554,10 +554,8 @@ def cost_of_moving_alignment_error( # c_U_k_uniq_k = np.union1d(c_U_k_uniq[neighbor_mask], c_k) c_U_k_uniq_k = np.union1d(c_U_k_uniq[neighbor_mask], k) evals = param.batched_eval_( - { - "view_index": c_U_k_uniq_k, - "data_mask": np.broadcast_to(U_k_list, [len(c_U_k_uniq_k), len(U_k_list)]), - } + c_U_k_uniq_k, + np.broadcast_to(U_k_list, [len(c_U_k_uniq_k), len(U_k_list)]), ) m = c_U_k_uniq_k == k @@ -1278,33 +1276,52 @@ def compute_Lpinv_helpers(W): def compute_Lpinv_MT(Lpinv_helpers, B): D_1_inv_sqrt, D_2_inv_sqrt, U1, U2, V1, V2, Sigma_1, Sigma_2 = Lpinv_helpers n = D_1_inv_sqrt.shape[0] - B_mean = B.mean(axis=1) + Md = B.shape[0] + M = B.shape[1] - n + + B_mean = np.array(B.mean(axis=1)) if len(B_mean.shape) == 1: B_mean = B_mean[:, None] - B_n = B - B_mean - B_n = np.asarray(B_n) - B1T = D_1_inv_sqrt * (B_n[:, :n].T) - B2T = D_2_inv_sqrt.T * (B_n[:, n:].T) - U1TB1T = np.matmul(U1.T, B1T) - U2TB1T = np.matmul(U2.T, B1T) - V1TB2T = np.matmul(V1.T, B2T) - V2TB2T = np.matmul(V2.T, B2T) + B1 = B[:, :n] + B2 = B[:, n:] + + # Optimized matrix-vector products using identities to avoid full dense B_n + # Identity: U^T (diag(D) (B - mu 1^T))^T = (U^T diag(D)) B^T - (U^T diag(D) 1) mu^T + + # Compute U^T * B1T terms + U1T_D1 = (U1 * D_1_inv_sqrt).T # (m1, n) + U1TB1T = (U1T_D1 @ B1.T) - (U1T_D1.sum(axis=1)[:, None] @ B_mean.T) + + U2T_D1 = (U2 * D_1_inv_sqrt).T # (M-m1, n) + U2TB1T = (U2T_D1 @ B1.T) - (U2T_D1.sum(axis=1)[:, None] @ B_mean.T) + + # Compute V^T * B2T terms + # D_2_inv_sqrt is (1, M) + V1T_D2 = (V1 * D_2_inv_sqrt.T).T # (m1, M) + V1TB2T = (V1T_D2 @ B2.T) - (V1T_D2.sum(axis=1)[:, None] @ B_mean.T) + + V2T_D2 = (V2 * D_2_inv_sqrt.T).T # (M-m1, M) + V2TB2T = (V2T_D2 @ B2.T) - (V2T_D2.sum(axis=1)[:, None] @ B_mean.T) + + # B1T and B2T are needed for the final sum, but we only materialize them once + # B1T = D_1_inv_sqrt * (B - B_mean)[:, :n].T + B1T = (B1.T.multiply(D_1_inv_sqrt)).toarray() - (D_1_inv_sqrt @ B_mean.T) temp1 = ( - -0.75 * np.matmul(U1, U1TB1T) - - 0.25 * np.matmul(U1, V1TB2T) - + np.matmul(U2, ((Sigma_1 - 1)) * (U2TB1T)) - + np.matmul(U2, Sigma_2 * (V2TB2T)) + -0.75 * (U1 @ U1TB1T) + - 0.25 * (U1 @ V1TB2T) + + (U2 @ ((Sigma_1 - 1) * U2TB1T)) + + (U2 @ (Sigma_2 * V2TB2T)) + B1T ) temp1 = temp1 * D_1_inv_sqrt temp2 = ( - -0.25 * np.matmul(V1, U1TB1T) - + 0.25 * np.matmul(V1, V1TB2T) - + np.matmul(V2, Sigma_2 * (U2TB1T)) - + np.matmul(V2, Sigma_1 * (V2TB2T)) + -0.25 * (V1 @ U1TB1T) + + 0.25 * (V1 @ V1TB2T) + + (V2 @ (Sigma_2 * U2TB1T)) + + (V2 @ (Sigma_1 * V2TB2T)) ) temp2 = temp2 * D_2_inv_sqrt.T @@ -1321,48 +1338,79 @@ def compute_CC(D, B, Lpinv_BT): def build_ortho_optim(d, Utilde, param, verbose): """Compute the Graph-Laplacian's inverse times B^\top.""" M, n = Utilde.shape - B_row_inds = [] - B_col_inds = [] - B_vals = [] - D = [] - W = Utilde.astype(float) - W_vals = W.data - + W_vals_all = W.data + + # Vectorized construction of D and B components + D_list = [] + + # We still loop over M to build B values, but we use pre-allocated arrays + # or better, compute B values in blocks. + B_data_vals = [] + B_cluster_vals = [] + B_cols = [] + for i in range(M): Utilde_i = Utilde[i, :].indices X_ = param.eval_(i, Utilde_i) - sqrt_p_ki = np.sqrt(np.array(W[i, :].data).flatten()[:, None]) - X_ = sqrt_p_ki * X_ - D.append(np.matmul(X_.T, X_)) - - row_inds = list(range(d * i, d * (i + 1))) - col_inds = Utilde_i.tolist() + weights_i = W_vals_all[W.indptr[i]:W.indptr[i+1]] + sqrt_p_ki = np.sqrt(weights_i[:, None]) + + # Weighted embeddings for D: sqrt(W) * X + X_weighted = sqrt_p_ki * X_ + D_list.append(X_weighted.T @ X_weighted) - B_row_inds += row_inds + np.repeat(row_inds, len(col_inds)).tolist() - B_col_inds += np.repeat([n + i], d).tolist() + np.tile(col_inds, d).tolist() + # Weighted embeddings for B: W * X + X_B = weights_i[:, None] * X_ + B_data_vals.append(X_B.T.flatten()) + B_cluster_vals.append(np.sum(-X_B.T, axis=1)) + B_cols.append(Utilde_i) - X_ = sqrt_p_ki * X_ - B_vals += np.sum(-X_.T, axis=1).tolist() + X_.T.flatten().tolist() + D = block_diag(D_list, format="csr") + + # Efficiently construct B + B_data_vals = np.concatenate(B_data_vals) + B_cluster_vals = np.concatenate(B_cluster_vals) + + B_cols_data = [] + for i in range(M): + B_cols_data.append(np.tile(B_cols[i], d)) + B_cols_data = np.concatenate(B_cols_data) + + # Row indices for data values + counts = np.diff(W.indptr) + B_rows_data = np.repeat(np.arange(M) * d, counts * d) + np.tile(np.arange(d), len(B_cols_data) // d) + # Wait, the above tiling is not quite right if counts vary. + # Correct rows: repeat each row index (i*d + offset) + B_rows_data = [] + for i in range(M): + r = np.arange(i * d, (i + 1) * d) + B_rows_data.append(np.repeat(r, counts[i])) + B_rows_data = np.concatenate(B_rows_data) - D = block_diag(D, format="csr") - B = csr_matrix((B_vals, (B_row_inds, B_col_inds)), shape=(M * d, n + M)) + # B indices for cluster nodes + B_rows_cluster = np.arange(M * d) + B_cols_cluster = np.repeat(np.arange(n, n + M), d) + + # Combine everything + B_row_all = np.concatenate([B_rows_cluster, B_rows_data]) + B_col_all = np.concatenate([B_cols_cluster, B_cols_data]) + B_val_all = np.concatenate([B_cluster_vals, B_data_vals]) + + B = csr_matrix((B_val_all, (B_row_all, B_col_all)), shape=(M * d, n + M)) if verbose: - print("min and max weights:", np.array(W_vals).min(), np.array(W_vals).max()) - + print("min and max weights:", np.array(W_vals_all).min(), np.array(W_vals_all).max()) print( f"Computing Pseudoinverse of a matrix of L of size {n} + {M} multiplied with B", flush=True, ) - Lpinv_helpers = compute_Lpinv_helpers(W) + Lpinv_helpers = compute_Lpinv_helpers(W) Lpinv_BT = compute_Lpinv_MT(Lpinv_helpers, B) CC = compute_CC(D, B, Lpinv_BT) - CC_net = CC - - return CC_net, Lpinv_BT, D, B + return CC, Lpinv_BT, D, B # unscaled alignment error From 26528b3106b835e89f5b76950ccdc652a9fc769b Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 18 Mar 2026 23:27:10 +0000 Subject: [PATCH 11/44] speedup: Speedup of zeta function by replacing pdist with measurement of row and col via nonzero and dist measure. Plus better csc usage --- src/pyRATS/_utils.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index 772e066..1d0a0fd 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -470,13 +470,18 @@ def cost_of_moving_distortion( def compute_zeta(d_e_mask0, Psi_k_mask): - d_e_mask = d_e_mask0.toarray() - if d_e_mask.shape[0] == 1: + if d_e_mask0.shape[0] == 1: return 1 - d_e_mask_ = squareform(d_e_mask) - mask = d_e_mask_ != 0 - d_e_mask_ = d_e_mask_[mask] - disc_lip_const = pdist(Psi_k_mask)[mask] / d_e_mask_ + d_e_upper = triu(d_e_mask0, k=1) + row, col = d_e_upper.nonzero() + if len(row) == 0: + return 1 + + d_e_vals = d_e_upper.data + diff = Psi_k_mask[row] - Psi_k_mask[col] + dist_embedded = np.sqrt(np.sum(diff * diff, axis=1)) + + disc_lip_const = dist_embedded / d_e_vals return np.max(disc_lip_const) / (np.min(disc_lip_const) + 1e-12) @@ -647,6 +652,7 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs): cost = np.zeros(n) + np.inf dest = np.zeros(n, dtype="int") - 1 + U_csc = U.tocsc() # Vary eta from 2 to eta_{min} if verbose: @@ -714,7 +720,7 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): S_ = ( (c == dest_k) | (dest == dest_k) - | np.array(U[:, list(Clstr[s])].sum(1), dtype=bool).flatten() + | np.array(U_csc[:, list(Clstr[s])].sum(1), dtype=bool).flatten() ) else: S_ = (c == dest_k) | (dest == dest_k) | (dest == s) From 5bdc3d701222a5f145f9d4b8fef88c068fdc9876 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Thu, 19 Mar 2026 00:05:12 +0000 Subject: [PATCH 12/44] Merging zeta change --- .github/workflows/benchmark.yml | 12 +++++++++--- tests/scripts/benchmark.py | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 30f15f4..656d991 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -12,8 +12,14 @@ permissions: jobs: benchmark: - name: RATS Performance Benchmark - runs-on: ubuntu-latest + name: RATS Performance Benchmark (${{ matrix.os }}) + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash steps: - uses: actions/checkout@v4 @@ -29,7 +35,7 @@ jobs: - name: Run Benchmark run: | - echo "## RATS Performance Benchmarks" >> $GITHUB_STEP_SUMMARY + echo "## RATS Performance Benchmarks (${{ matrix.os }})" >> $GITHUB_STEP_SUMMARY echo "Dataset: Klein Bottle (4D)" >> $GITHUB_STEP_SUMMARY echo "Parameters: k=14, eta_min=5" >> $GITHUB_STEP_SUMMARY python tests/scripts/benchmark.py >> $GITHUB_STEP_SUMMARY diff --git a/tests/scripts/benchmark.py b/tests/scripts/benchmark.py index 335e8d3..6824787 100644 --- a/tests/scripts/benchmark.py +++ b/tests/scripts/benchmark.py @@ -48,7 +48,7 @@ def run_benchmark(n_samples, k=14, eta_min=5, cost_fn_name="distortion"): if args.fast: sample_sizes = [400, 400, 500] else: - sample_sizes = [400, 400 , 500, 1000, 2000, 5000, 10000] + sample_sizes = [500 , 500, 1000, 2000, 5000, 10_000, 25_000] results = [] From cb6fcd1f99a6dfbc324708ccd039e8a9ddf6c69f Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Thu, 19 Mar 2026 00:05:12 +0000 Subject: [PATCH 13/44] tests: Updating benchmarking tests to include windows, mac, and linux test From a1a2806d71df5221e7f9656c65e606c26f37fa68 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Thu, 19 Mar 2026 16:48:05 +0000 Subject: [PATCH 14/44] Test: Adding a test benchmark script to give timings across different commits --- .github/workflows/benchmark.yml | 22 +++++++++++++++++++--- tests/scripts/benchmark.py | 2 +- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 656d991..226ad86 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -12,16 +12,32 @@ permissions: jobs: benchmark: - name: RATS Performance Benchmark (${{ matrix.os }}) + name: RATS Benchmark (${{ matrix.os }}, ${{ matrix.ref_name }}) strategy: + fail-fast: false matrix: os: [ubuntu-latest, macos-latest, windows-latest] + include: + - ref: "${{ github.sha }}" + ref_name: "Current HEAD" + - ref: "f1a3f2233e8248ca9610af9a8498122fc557657a" + ref_name: "f1a3f22 (Optimized MT)" + - ref: "e1d13b7b1d2446a2499add3ed78f3a38e41103bc" + ref_name: "e1d13b7 (Merge Unit Testing V1)" runs-on: ${{ matrix.os }} defaults: run: shell: bash steps: - - uses: actions/checkout@v4 + - name: Checkout Code + uses: actions/checkout@v4 + with: + ref: ${{ matrix.ref }} + fetch-depth: 0 + + - name: Overlay Latest Benchmark Script + run: | + git checkout ${{ github.sha }} -- tests/scripts/benchmark.py - name: Set up Python uses: actions/setup-python@v4 @@ -36,6 +52,6 @@ jobs: - name: Run Benchmark run: | echo "## RATS Performance Benchmarks (${{ matrix.os }})" >> $GITHUB_STEP_SUMMARY + echo "Commit: ${{ matrix.ref_name }} (${{ matrix.ref }})" >> $GITHUB_STEP_SUMMARY echo "Dataset: Klein Bottle (4D)" >> $GITHUB_STEP_SUMMARY - echo "Parameters: k=14, eta_min=5" >> $GITHUB_STEP_SUMMARY python tests/scripts/benchmark.py >> $GITHUB_STEP_SUMMARY diff --git a/tests/scripts/benchmark.py b/tests/scripts/benchmark.py index 6824787..4297b05 100644 --- a/tests/scripts/benchmark.py +++ b/tests/scripts/benchmark.py @@ -48,7 +48,7 @@ def run_benchmark(n_samples, k=14, eta_min=5, cost_fn_name="distortion"): if args.fast: sample_sizes = [400, 400, 500] else: - sample_sizes = [500 , 500, 1000, 2000, 5000, 10_000, 25_000] + sample_sizes = [500 , 500, 1000, 2000, 5000, 10_000, 20_000] results = [] From ef234188572e251444008ad93e34338504748ab7 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Thu, 19 Mar 2026 16:51:39 +0000 Subject: [PATCH 15/44] Fix: Issue with sha code --- .github/workflows/benchmark.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 226ad86..0e04ebf 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -37,7 +37,8 @@ jobs: - name: Overlay Latest Benchmark Script run: | - git checkout ${{ github.sha }} -- tests/scripts/benchmark.py + git fetch origin ${{ github.sha }} --depth=1 + git checkout FETCH_HEAD -- tests/scripts/benchmark.py - name: Set up Python uses: actions/setup-python@v4 From bfca42936278dbffd85606ad339dc1e7913f3ae8 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Thu, 19 Mar 2026 17:10:48 +0000 Subject: [PATCH 16/44] Matrixifying refs for benchmarks and removing Dhruv's --- .github/workflows/benchmark.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 0e04ebf..617c04b 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -17,13 +17,12 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest, windows-latest] + ref: ["${{ github.sha }}", "f1a3f2233e8248ca9610af9a8498122fc557657a"] include: - ref: "${{ github.sha }}" ref_name: "Current HEAD" - ref: "f1a3f2233e8248ca9610af9a8498122fc557657a" - ref_name: "f1a3f22 (Optimized MT)" - - ref: "e1d13b7b1d2446a2499add3ed78f3a38e41103bc" - ref_name: "e1d13b7 (Merge Unit Testing V1)" + ref_name: "f1a3f22 (Dhruv Optimized MT)" runs-on: ${{ matrix.os }} defaults: run: From 274f8413e8e7a7d9c3bc3131833c1980f24a7b63 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Tue, 24 Mar 2026 19:41:36 +0000 Subject: [PATCH 17/44] Efficiency: Memefficiency via Joshua code and and updated sha for comparisons --- .github/workflows/benchmark.yml | 6 +- src/pyRATS/rats.py | 126 ++++++++++++++++++++------------ 2 files changed, 81 insertions(+), 51 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 617c04b..093c24e 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -17,12 +17,12 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest, windows-latest] - ref: ["${{ github.sha }}", "f1a3f2233e8248ca9610af9a8498122fc557657a"] + ref: ["${{ github.sha }}", "5eaa1c6a82eb0a316ff2a6911e0ef7379add0bad"] include: - ref: "${{ github.sha }}" ref_name: "Current HEAD" - - ref: "f1a3f2233e8248ca9610af9a8498122fc557657a" - ref_name: "f1a3f22 (Dhruv Optimized MT)" + - ref: "5eaa1c6a82eb0a316ff2a6911e0ef7379add0bad" + ref_name: "5eaa1c6 (Joshua V1)" runs-on: ${{ matrix.os }} defaults: run: diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index aaa8dcc..e066696 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -505,16 +505,16 @@ def _postprocess(self): # neigh_inds[i] are the k neighbour indices of point i (same as # connectivity_matrix used below, extracted once here for clarity). neigh_inds = self.U.indices.reshape(n, self.k) # (n, k) - pair_i, pair_j = np.triu_indices(self.k, k=1) # all upper-triangle pairs - row_idx = neigh_inds[:, pair_i].ravel() # (n * num_pairs,) - col_idx = neigh_inds[:, pair_j].ravel() # (n * num_pairs,) + pair_i, pair_j = np.triu_indices(self.k, k=1) # all upper-triangle pairs + row_idx = neigh_inds[:, pair_i].ravel() # (n * num_pairs,) + col_idx = neigh_inds[:, pair_j].ravel() # (n * num_pairs,) num_pairs = len(pair_i) original_dists = np.asarray( self.neighborhood_graph_sym[row_idx, col_idx] ).reshape(n, num_pairs) embedded_datapoints = self.param.batched_eval_( - range(n), self.U.indices.reshape(n, self.k) + {"view_index": range(n), "data_mask": self.U.indices.reshape(n, self.k)} ) # tested and they are equivalent embedded_dists = batched_pdist(embedded_datapoints) disc_lip_const = np.divide( @@ -538,60 +538,44 @@ def _postprocess(self): connectivity_matrix = self.U.indices.reshape(n, -1) param_changed = np.arange(n) future_use_phi_of = np.arange(n, dtype=int) - future_use_phi_of_ = np.arange(n, dtype=int) while len(param_changed) > 0: reconsider_mask = np.isin(connectivity_matrix, param_changed) & ( connectivity_matrix != np.arange(n)[:, None] ) - param_changed = np.array([]) - - for i in range(connectivity_matrix.shape[1]): - - use_phi_of = connectivity_matrix[:, i][ - reconsider_mask[:, i] - ] # select neighborhood indices of the Phi's to use - if len(use_phi_of) == 0: - continue - neighbors_to_reconsider = connectivity_matrix[ - reconsider_mask[:, i] - ] # select datapoint and neighborhood indices who's neighborhood must be re-evaluated - points_to_reconsider = np.where(reconsider_mask[:, i])[ - 0 - ] # list of indices - - batch_embedded_datapoints = self.param.batched_eval_( - future_use_phi_of[use_phi_of], neighbors_to_reconsider - ) - batch_embedded_dists = batched_pdist(batch_embedded_datapoints) - batch_original_dists = original_dists[points_to_reconsider] - - disc_lip_const = np.divide( - batch_embedded_dists, - batch_original_dists, - out=np.full_like(batch_embedded_dists, np.nan), - where=batch_original_dists != 0, - ) - zeta_ = np.nanmax(disc_lip_const, axis=1) / ( - np.nanmin(disc_lip_const, axis=1) + 1e-12 - ) - swap_has_improved_point = points_to_reconsider[ - np.where(zeta_ < zeta[points_to_reconsider])[0] - ] # indices of points where swapping by the i'th neighbor improves the local distortion + def work_range( + start_end, + ): + start, end = start_end + return [ + self._process_column( + i, + connectivity_matrix, + reconsider_mask[:, i], + self.param, + future_use_phi_of, + original_dists, + ) + for i in range(start, end) + ] - param_changed = np.union1d(param_changed, swap_has_improved_point) - zeta[points_to_reconsider] = np.minimum( - zeta_, zeta[points_to_reconsider] - ) + chunk_sz = connectivity_matrix.shape[1] // self.n_jobs + ranges = [(i * chunk_sz, (i + 1) * chunk_sz) for i in range(self.n_jobs)] + ranges[-1] = (ranges[-1][0], connectivity_matrix.shape[1]) + zetas = Parallel(n_jobs=self.n_jobs)(delayed(work_range)(r) for r in ranges) - future_use_phi_of_[swap_has_improved_point] = future_use_phi_of[ - connectivity_matrix[:, i][swap_has_improved_point] + zetas = np.vstack(zetas).T + + zeta_ = np.min(zetas, axis=1) + param_changed = np.where(zeta > zeta_)[0] + future_use_phi_of[param_changed] = future_use_phi_of[ + connectivity_matrix[ + param_changed, np.argmin(zetas, axis=1)[param_changed] ] + ] + zeta = np.where(zeta < zeta_, zeta, zeta_) - future_use_phi_of = ( - future_use_phi_of_.copy() - ) # this is necessary for some reason if self.verbose: print( "#Param replaced: %d, max distortion: %f" @@ -606,3 +590,49 @@ def _postprocess(self): f"Maximum local distortion after postprocessing is: %0.4f" % (np.max(self.param.zeta)) ) + + def _process_column( + self, + i, + connectivity_matrix, + reconsider_mask, + param, + future_use_phi_of, + original_dists, + ): + zeta = np.zeros(len(connectivity_matrix)) + np.inf + + use_phi_of = connectivity_matrix[:, i][ + reconsider_mask + ] # select neighborhood indices of the Phi's to use + + if len(use_phi_of) == 0: + return zeta + + neighbors_to_reconsider = connectivity_matrix[ + reconsider_mask + ] # select datapoint and neighborhood indices who's neighborhood must be re-evaluated + points_to_reconsider = np.where(reconsider_mask)[0] # list of indices + + batch_embedded_datapoints = param.batched_eval_( + { + "view_index": future_use_phi_of[use_phi_of], + "data_mask": neighbors_to_reconsider, + } + ) + batch_embedded_dists = batched_pdist(batch_embedded_datapoints) + batch_original_dists = original_dists[points_to_reconsider] + + disc_lip_const = np.divide( + batch_embedded_dists, + batch_original_dists, + out=np.full_like(batch_embedded_dists, np.nan), + where=batch_original_dists != 0, + ) + + zeta_ = np.nanmax(disc_lip_const, axis=1) / ( + np.nanmin(disc_lip_const, axis=1) + 1e-12 + ) + zeta[points_to_reconsider] = zeta_ + return zeta + \ No newline at end of file From 12541995f3debed95161e93d8032a7eedfd334c0 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 18 Mar 2026 16:51:01 +0100 Subject: [PATCH 18/44] Fix: new mode of batch all and importing --- examples/__init__.py | 0 src/pyRATS/rats.py | 9 ++++----- 2 files changed, 4 insertions(+), 5 deletions(-) create mode 100644 examples/__init__.py diff --git a/examples/__init__.py b/examples/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index e066696..a56d64d 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -3,6 +3,7 @@ from scipy.sparse import csr_matrix from multiprocessing import cpu_count +from joblib import Parallel, delayed from pyRATS._utils import ( nearest_neighbors, @@ -514,7 +515,7 @@ def _postprocess(self): ).reshape(n, num_pairs) embedded_datapoints = self.param.batched_eval_( - {"view_index": range(n), "data_mask": self.U.indices.reshape(n, self.k)} + range(n), self.U.indices.reshape(n, self.k) ) # tested and they are equivalent embedded_dists = batched_pdist(embedded_datapoints) disc_lip_const = np.divide( @@ -615,10 +616,8 @@ def _process_column( points_to_reconsider = np.where(reconsider_mask)[0] # list of indices batch_embedded_datapoints = param.batched_eval_( - { - "view_index": future_use_phi_of[use_phi_of], - "data_mask": neighbors_to_reconsider, - } + future_use_phi_of[use_phi_of], + neighbors_to_reconsider, ) batch_embedded_dists = batched_pdist(batch_embedded_datapoints) batch_original_dists = original_dists[points_to_reconsider] From a82f82793464170d53f1c93b45966979b0621800 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Tue, 24 Mar 2026 19:55:27 +0000 Subject: [PATCH 19/44] Fix: Past git repo commit needed init --- .github/workflows/benchmark.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 093c24e..bd18821 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -38,6 +38,7 @@ jobs: run: | git fetch origin ${{ github.sha }} --depth=1 git checkout FETCH_HEAD -- tests/scripts/benchmark.py + git checkout FETCH_HEAD -- examples/__init__.py - name: Set up Python uses: actions/setup-python@v4 From d1f3123ebdc6ab9eaa43a50aa4238dadd0691455 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Tue, 24 Mar 2026 19:58:10 +0000 Subject: [PATCH 20/44] Update: Cleaner method for src reconstruction for commit comparisons --- .github/workflows/benchmark.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index bd18821..4fe417d 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -31,14 +31,14 @@ jobs: - name: Checkout Code uses: actions/checkout@v4 with: - ref: ${{ matrix.ref }} + ref: ${{ github.sha }} fetch-depth: 0 - - name: Overlay Latest Benchmark Script + - name: Overlay src/ from Comparison Ref + if: matrix.ref != github.sha run: | - git fetch origin ${{ github.sha }} --depth=1 - git checkout FETCH_HEAD -- tests/scripts/benchmark.py - git checkout FETCH_HEAD -- examples/__init__.py + git fetch origin ${{ matrix.ref }} --depth=1 + git checkout FETCH_HEAD -- src/ - name: Set up Python uses: actions/setup-python@v4 From 81018d527cab1d13f37ce2ea7ed5e0158c323662 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 25 Mar 2026 15:54:20 +0000 Subject: [PATCH 21/44] Test: Alternative memory efficient, with parallelization over num threads, for postprocessing --- src/pyRATS/rats.py | 160 ++++++++++++++++++++++++--------------------- 1 file changed, 87 insertions(+), 73 deletions(-) diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index a56d64d..9ca2c53 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -19,6 +19,57 @@ from pyRATS._tear_coloring import compute_color_of_pts_on_tear +def _postprocess_col_range( + start, + end, + connectivity_matrix, + reconsider_mask, + param, + future_use_phi_of, + original_dists, + n, +): + """Process a contiguous range of neighbor-columns [start, end) for _postprocess. + + Returns the element-wise minimum distortion and the corresponding column + index across the processed range, both as length-n arrays. + """ + local_zeta_min = np.full(n, np.inf) + local_best_col = np.zeros(n, dtype=int) + + for i in range(start, end): + col_mask = reconsider_mask[:, i] + use_phi_of = connectivity_matrix[:, i][col_mask] + if len(use_phi_of) == 0: + continue + + neighbors_to_reconsider = connectivity_matrix[col_mask] + points_to_reconsider = np.where(col_mask)[0] + + batch_embedded_datapoints = param.batched_eval_( + future_use_phi_of[use_phi_of], neighbors_to_reconsider + ) + batch_embedded_dists = batched_pdist(batch_embedded_datapoints) + batch_original_dists = original_dists[points_to_reconsider] + + disc_lip_const = np.divide( + batch_embedded_dists, + batch_original_dists, + out=np.full_like(batch_embedded_dists, np.nan), + where=batch_original_dists != 0, + ) + col_zeta = np.full(n, np.inf) + col_zeta[points_to_reconsider] = np.nanmax(disc_lip_const, axis=1) / ( + np.nanmin(disc_lip_const, axis=1) + 1e-12 + ) + + improved = col_zeta < local_zeta_min + local_best_col[improved] = i + local_zeta_min = np.minimum(local_zeta_min, col_zeta) + + return local_zeta_min, local_best_col + + class RATS: """Riemannian Alignment of Tangent Spaces @@ -545,37 +596,44 @@ def _postprocess(self): connectivity_matrix != np.arange(n)[:, None] ) - def work_range( - start_end, - ): - start, end = start_end - return [ - self._process_column( - i, - connectivity_matrix, - reconsider_mask[:, i], - self.param, - future_use_phi_of, - original_dists, - ) - for i in range(start, end) - ] - - chunk_sz = connectivity_matrix.shape[1] // self.n_jobs - ranges = [(i * chunk_sz, (i + 1) * chunk_sz) for i in range(self.n_jobs)] - ranges[-1] = (ranges[-1][0], connectivity_matrix.shape[1]) - zetas = Parallel(n_jobs=self.n_jobs)(delayed(work_range)(r) for r in ranges) - - zetas = np.vstack(zetas).T - - zeta_ = np.min(zetas, axis=1) - param_changed = np.where(zeta > zeta_)[0] + # Split the k columns into n_jobs chunks and process each chunk in + # a separate worker. Each worker returns its local (zeta_min, + # best_col) for the columns it owns; we then do a final + # min-reduction here. This mirrors the target_proc pattern used + # throughout _utils.py (lpca, best, …) while keeping memory + # bounded to one chunk of columns per worker at a time. + k_cols = connectivity_matrix.shape[1] + chunk_sz = max(1, k_cols // self.n_jobs) + ranges = [(p * chunk_sz, (p + 1) * chunk_sz) for p in range(self.n_jobs)] + ranges[-1] = (ranges[-1][0], k_cols) # extend last chunk to cover remainder + ranges = [(s, e) for s, e in ranges if s < k_cols] # drop empty ranges + + results = Parallel(n_jobs=self.n_jobs)( + delayed(_postprocess_col_range)( + s, e, + connectivity_matrix, + reconsider_mask, + self.param, + future_use_phi_of, + original_dists, + n, + ) + for s, e in ranges + ) + + # Reduce chunk results into a single (zeta_min, best_col). + zeta_min = np.full(n, np.inf) + best_col = np.zeros(n, dtype=int) + for chunk_zeta_min, chunk_best_col in results: + improved = chunk_zeta_min < zeta_min + best_col[improved] = chunk_best_col[improved] + zeta_min = np.minimum(zeta_min, chunk_zeta_min) + + param_changed = np.where(zeta > zeta_min)[0] future_use_phi_of[param_changed] = future_use_phi_of[ - connectivity_matrix[ - param_changed, np.argmin(zetas, axis=1)[param_changed] - ] + connectivity_matrix[param_changed, best_col[param_changed]] ] - zeta = np.where(zeta < zeta_, zeta, zeta_) + zeta = np.minimum(zeta, zeta_min) if self.verbose: print( @@ -591,47 +649,3 @@ def work_range( f"Maximum local distortion after postprocessing is: %0.4f" % (np.max(self.param.zeta)) ) - - def _process_column( - self, - i, - connectivity_matrix, - reconsider_mask, - param, - future_use_phi_of, - original_dists, - ): - zeta = np.zeros(len(connectivity_matrix)) + np.inf - - use_phi_of = connectivity_matrix[:, i][ - reconsider_mask - ] # select neighborhood indices of the Phi's to use - - if len(use_phi_of) == 0: - return zeta - - neighbors_to_reconsider = connectivity_matrix[ - reconsider_mask - ] # select datapoint and neighborhood indices who's neighborhood must be re-evaluated - points_to_reconsider = np.where(reconsider_mask)[0] # list of indices - - batch_embedded_datapoints = param.batched_eval_( - future_use_phi_of[use_phi_of], - neighbors_to_reconsider, - ) - batch_embedded_dists = batched_pdist(batch_embedded_datapoints) - batch_original_dists = original_dists[points_to_reconsider] - - disc_lip_const = np.divide( - batch_embedded_dists, - batch_original_dists, - out=np.full_like(batch_embedded_dists, np.nan), - where=batch_original_dists != 0, - ) - - zeta_ = np.nanmax(disc_lip_const, axis=1) / ( - np.nanmin(disc_lip_const, axis=1) + 1e-12 - ) - zeta[points_to_reconsider] = zeta_ - return zeta - \ No newline at end of file From 3325634537958454dab2f97ade0953f6bd8f7db7 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 25 Mar 2026 16:43:21 +0000 Subject: [PATCH 22/44] Style: Updating parameter names to match sklearn --- README.md | 2 +- examples/barbell_example.ipynb | 10 +- examples/curvedtorus_example.ipynb | 14 +- examples/kleinbottle_example.ipynb | 10 +- examples/small_noisyswissroll3_example.ipynb | 12 +- examples/small_spherewithhole_example.ipynb | 10 +- examples/small_swissrollwithhole.ipynb | 10 +- examples/squarerwithtwoholes.ipynb | 10 +- src/pyRATS/rats.py | 208 +++++++++++++------ tests/scripts/benchmark.py | 14 +- tests/scripts/generate_snapshots.py | 28 +-- tests/test_e2e.py | 14 +- 12 files changed, 210 insertions(+), 132 deletions(-) diff --git a/README.md b/README.md index 064fe03..6d0391e 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,7 @@ import datasets, vis X, labels, _ = datasets.Datasets().kleinbottle4d(n=5000) # create a RATS object projecting the data to 2d while tearing the manifold -model = rats.RATS(d=2, k=28, eta_min=5, to_tear=True) +model = rats.RATS(n_components=2, n_neighbors=28, min_cluster_size=5, tear=True) y = model.fit_transform(X) # compute the gluing instructions along the tear diff --git a/examples/barbell_example.ipynb b/examples/barbell_example.ipynb index 910cddb..3072280 100644 --- a/examples/barbell_example.ipynb +++ b/examples/barbell_example.ipynb @@ -59,10 +59,10 @@ "outputs": [], "source": [ "buml_obj = rats.RATS(\n", - " d=2,\n", - " k=28,\n", - " eta_min=5,\n", - " to_tear=True,\n", + " n_components=2,\n", + " n_neighbors=28,\n", + " min_cluster_size=5,\n", + " tear=True,\n", ")\n", "\n", "y = buml_obj.fit_transform(X=X)\n", @@ -129,4 +129,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/examples/curvedtorus_example.ipynb b/examples/curvedtorus_example.ipynb index ece11d1..94ef672 100644 --- a/examples/curvedtorus_example.ipynb +++ b/examples/curvedtorus_example.ipynb @@ -59,13 +59,13 @@ "outputs": [], "source": [ "buml_obj = rats.RATS(\n", - " d=2,\n", - " k=21,\n", - " eta_min=5,\n", + " n_components=2,\n", + " n_neighbors=21,\n", + " min_cluster_size=5,\n", " nu=3,\n", - " max_iter=40,\n", - " to_tear=True,\n", - " cost_fn_name='distortion'\n", + " n_iter=40,\n", + " tear=True,\n", + " cost_function='distortion'\n", ")\n", "\n", "y = buml_obj.fit_transform(X=X)\n", @@ -132,4 +132,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/examples/kleinbottle_example.ipynb b/examples/kleinbottle_example.ipynb index 8723479..1c23740 100644 --- a/examples/kleinbottle_example.ipynb +++ b/examples/kleinbottle_example.ipynb @@ -59,10 +59,10 @@ "outputs": [], "source": [ "buml_obj = rats.RATS(\n", - " d=2,\n", - " k=28,\n", - " eta_min=5,\n", - " to_tear=True,\n", + " n_components=2,\n", + " n_neighbors=28,\n", + " min_cluster_size=5,\n", + " tear=True,\n", ")\n", "\n", "y = buml_obj.fit_transform(X=X)\n", @@ -121,4 +121,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/examples/small_noisyswissroll3_example.ipynb b/examples/small_noisyswissroll3_example.ipynb index f25db13..db22477 100644 --- a/examples/small_noisyswissroll3_example.ipynb +++ b/examples/small_noisyswissroll3_example.ipynb @@ -59,11 +59,11 @@ "outputs": [], "source": [ "buml_obj = rats.RATS(\n", - " d=2,\n", - " k=28,\n", - " eta_min=5,\n", - " to_tear=True,\n", - " cost_fn_name='alignment'\n", + " n_components=2,\n", + " n_neighbors=28,\n", + " min_cluster_size=5,\n", + " tear=True,\n", + " cost_function='alignment'\n", ")\n", "\n", "y = buml_obj.fit_transform(X=X)\n", @@ -122,4 +122,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/examples/small_spherewithhole_example.ipynb b/examples/small_spherewithhole_example.ipynb index 2836f35..255464f 100644 --- a/examples/small_spherewithhole_example.ipynb +++ b/examples/small_spherewithhole_example.ipynb @@ -59,10 +59,10 @@ "outputs": [], "source": [ "buml_obj = rats.RATS(\n", - " d=2,\n", - " k=28,\n", - " eta_min=5,\n", - " to_tear=True,\n", + " n_components=2,\n", + " n_neighbors=28,\n", + " min_cluster_size=5,\n", + " tear=True,\n", ")\n", "\n", "y = buml_obj.fit_transform(X=X)\n", @@ -131,4 +131,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/examples/small_swissrollwithhole.ipynb b/examples/small_swissrollwithhole.ipynb index 1cb8856..f2ff33a 100644 --- a/examples/small_swissrollwithhole.ipynb +++ b/examples/small_swissrollwithhole.ipynb @@ -59,10 +59,10 @@ "outputs": [], "source": [ "buml_obj = rats.RATS(\n", - " d=2,\n", - " k=28,\n", - " eta_min=5,\n", - " to_tear=True,\n", + " n_components=2,\n", + " n_neighbors=28,\n", + " min_cluster_size=5,\n", + " tear=True,\n", ")\n", "\n", "y = buml_obj.fit_transform(X=X)\n", @@ -129,4 +129,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/examples/squarerwithtwoholes.ipynb b/examples/squarerwithtwoholes.ipynb index dbd0cdb..1b90e81 100644 --- a/examples/squarerwithtwoholes.ipynb +++ b/examples/squarerwithtwoholes.ipynb @@ -59,10 +59,10 @@ "outputs": [], "source": [ "buml_obj = rats.RATS(\n", - " d=2,\n", - " k=28,\n", - " eta_min=5,\n", - " to_tear=True,\n", + " n_components=2,\n", + " n_neighbors=28,\n", + " min_cluster_size=5,\n", + " tear=True,\n", ")\n", "\n", "y = buml_obj.fit_transform(X=X)\n", @@ -130,4 +130,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index 9ca2c53..1de93d6 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -75,69 +75,70 @@ class RATS: Parameters ---------- - d : int - Intrinsic dimension of the manifold. + n_components : int, default=2 + Intrinsic dimension of the manifold (target embedding dimension). - k : int + n_neighbors : int, default=28 Neighborhood size for local view fitting. - to_postprocess : bool, default=True + postprocess : bool, default=True If True, searches for local embeddings that minimize the local distortion. Useful for noisy manifolds. - kpca_kernel : {None, 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'} or Callable, default=None + kernel : {None, 'poly', 'rbf', 'sigmoid', 'cosine', 'precomputed'} or Callable, default=None Kernel used for PCA. None uses standard linear PCA (lpca). Any other value uses sklearn.decomposition.KernelPCA. See https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html for more information. - cost_fn_name : {'alignment', 'distortion'}, default='alignment' - 'alignment': Alignment error should be prefered when runtime matters or dealing with noisy manifolds. + cost_function : {'alignment', 'distortion'}, default='alignment' + 'alignment': Alignment error should be preferred when runtime matters or dealing with noisy manifolds. 'distortion': Distortion is slow, but works well on low noise manifolds. - eta_min : int, default=5 - Minimum number of points per cluster. A cluster is formed by merging local embeddings that minimize cost_fn_name. - Larger clusters improve the runtime, and regularise noisy manifolds. - The value must be >= 1. + min_cluster_size : int, default=5 + Minimum number of points per cluster. A cluster is formed by merging local + embeddings that minimise cost_function. Larger clusters improve runtime and + regularise noisy manifolds. The value must be >= 1. - eta_max : int, default=25 - Maximum allowed size of the clusters. - The value must be > eta_min. + max_cluster_size : int, default=25 + Maximum allowed size of the clusters. The value must be > min_cluster_size. - to_tear : bool, default=True + tear : bool, default=True Whether to tear the manifold. nu : int, default=3 The ratio of the size of local views in the embedding against those in the data. - max_iter : int - Number of iterations to refine the global embedding. - In total Riemannian gradient descent is run for max_iter * max_internal_iter iterations. - For every iteration in max_iter, the alignment of points in the embedding is recomputed and the tear is re-evaluated if to_tear=True. + n_iter : int, default=20 + Number of outer iterations to refine the global embedding. + Riemannian gradient descent runs for n_iter * n_iter_inner total steps. + For every iteration the alignment of points in the embedding is recomputed + and the tear is re-evaluated if tear=True. - max_internal_iter : int - The number of internal iterations used by Riemannian Gradient Descent. + n_iter_inner : int, default=100 + Number of internal iterations used by Riemannian Gradient Descent per outer step. - alpha : float - The step size used in the Riemannian gradient descent. + alpha : float, default=0.3 + Step size used in the Riemannian gradient descent. - patience : int - The number of iterations to wait for error below tolerance - to persist before stopping the refinement. + n_iter_without_progress : int, default=5 + The number of outer iterations to wait after the relative change in alignment + error and tear size both drop below *tol* before stopping early. - tol : float - The tolerance level for the relative change in the alignment error and the + tol : float, default=1e-2 + Tolerance level for the relative change in the alignment error and the relative change in the size of the tear. metric : str, default='euclidean' - To be added in future releases. Metric assumed on the embedding. Currently only 'euclidean' is supported. + To be added in future releases. Metric assumed on the embedding. + Currently only 'euclidean' is supported. - kpca_fit_inverse_transform : bool, default=False - To be added in future releases. If True, computes the inverse of the embedding transformation. - This flag must be only be enabled when running RATS with using the kpca_kernel. - If running pca with kpca_kernel=None, kpca_fit_inverse_transform is always True. + fit_inverse_transform : bool, default=False + To be added in future releases. If True, computes the inverse of the embedding + transformation. Must only be enabled together with a non-None kernel. + When kernel=None (linear PCA) the inverse is always available. - verbose : bool + verbose : bool, default=False If True, print logs to stdout. n_jobs : int, default=-1 @@ -145,37 +146,114 @@ class RATS: """ + # Mapping from deprecated parameter names to their new equivalents. + _DEPRECATED_PARAMS = { + "d": "n_components", + "k": "n_neighbors", + "max_iter": "n_iter", + "max_internal_iter": "n_iter_inner", + "patience": "n_iter_without_progress", + "cost_fn_name": "cost_function", + "eta_min": "min_cluster_size", + "eta_max": "max_cluster_size", + "kpca_kernel": "kernel", + "kpca_fit_inverse_transform": "fit_inverse_transform", + "to_postprocess": "postprocess", + "to_tear": "tear", + } + def __init__( self, - d=2, - kpca_kernel=None, - kpca_fit_inverse_transform=False, - k=28, - cost_fn_name="alignment", + n_components=2, + kernel=None, + fit_inverse_transform=False, + n_neighbors=28, + cost_function="alignment", metric="euclidean", - to_postprocess=True, - eta_min=5, - eta_max=25, - to_tear=True, + postprocess=True, + min_cluster_size=5, + max_cluster_size=25, + tear=True, nu=3, - max_iter=20, - max_internal_iter=100, + n_iter=20, + n_iter_inner=100, alpha=0.3, eps=1e-8, - patience=5, + n_iter_without_progress=5, tol=1e-2, n_forced_clusters=1, verbose=False, n_jobs=-1, + **kwargs, ): + # ------------------------------------------------------------------ + # Backward-compatibility: accept deprecated parameter names and emit + # a DeprecationWarning so old code keeps working for one release. + # ------------------------------------------------------------------ + for old, new in self._DEPRECATED_PARAMS.items(): + if old in kwargs: + warnings.warn( + f"The parameter '{old}' is deprecated and will be removed in a " + f"future release. Use '{new}' instead.", + DeprecationWarning, + stacklevel=2, + ) + # Only override the new-name value if the caller did not also + # supply the new name explicitly. + locals_snapshot = { + "n_components": n_components, + "kernel": kernel, + "fit_inverse_transform": fit_inverse_transform, + "n_neighbors": n_neighbors, + "cost_function": cost_function, + "postprocess": postprocess, + "min_cluster_size": min_cluster_size, + "max_cluster_size": max_cluster_size, + "tear": tear, + "n_iter": n_iter, + "n_iter_inner": n_iter_inner, + "n_iter_without_progress": n_iter_without_progress, + } + if old == "d": + n_components = kwargs.pop(old) + elif old == "k": + n_neighbors = kwargs.pop(old) + elif old == "max_iter": + n_iter = kwargs.pop(old) + elif old == "max_internal_iter": + n_iter_inner = kwargs.pop(old) + elif old == "patience": + n_iter_without_progress = kwargs.pop(old) + elif old == "cost_fn_name": + cost_function = kwargs.pop(old) + elif old == "eta_min": + min_cluster_size = kwargs.pop(old) + elif old == "eta_max": + max_cluster_size = kwargs.pop(old) + elif old == "kpca_kernel": + kernel = kwargs.pop(old) + elif old == "kpca_fit_inverse_transform": + fit_inverse_transform = kwargs.pop(old) + elif old == "to_postprocess": + postprocess = kwargs.pop(old) + elif old == "to_tear": + tear = kwargs.pop(old) + else: + kwargs.pop(old, None) + + if kwargs: + raise TypeError( + f"RATS.__init__() got unexpected keyword arguments: {list(kwargs.keys())}" + ) - if cost_fn_name not in ["alignment", "distortion"]: + if cost_function not in ["alignment", "distortion"]: raise ValueError( - f"cost_fn_name must be 'alignment' or 'distortion', got {cost_fn_name!r}." + f"cost_function must be 'alignment' or 'distortion', got {cost_function!r}." ) - if eta_max <= eta_min: + if max_cluster_size <= min_cluster_size: raise ValueError( - f"eta_max ({eta_max}) must be greater than eta_min ({eta_min})." + f"max_cluster_size ({max_cluster_size}) must be greater than " + f"min_cluster_size ({min_cluster_size})." ) if metric != "euclidean": warnings.warn( @@ -185,9 +263,9 @@ def __init__( FutureWarning, stacklevel=2, ) - if kpca_fit_inverse_transform: + if fit_inverse_transform: warnings.warn( - "kpca_fit_inverse_transform=True is not yet supported and will be " + "fit_inverse_transform=True is not yet supported and will be " "ignored. Inverse transform support will be added in a future release.", FutureWarning, stacklevel=2, @@ -196,22 +274,22 @@ def __init__( self.verbose = verbose self.n_jobs = n_jobs - self.d = d - self.kpca_kernel = kpca_kernel - self.kpca_fit_inverse_transform = kpca_fit_inverse_transform - self.k = k - if cost_fn_name == "distortion": - self.k_nn0 = max(k, eta_max * k) + self.d = n_components + self.kpca_kernel = kernel + self.kpca_fit_inverse_transform = fit_inverse_transform + self.k = n_neighbors + if cost_function == "distortion": + self.k_nn0 = max(n_neighbors, max_cluster_size * n_neighbors) else: - self.k_nn0 = k - self.cost_fn = cost_fn_name - self.to_postprocess = to_postprocess - self.eta_min, self.eta_max = eta_min, eta_max - self.to_tear = to_tear + self.k_nn0 = n_neighbors + self.cost_fn = cost_function + self.to_postprocess = postprocess + self.eta_min, self.eta_max = min_cluster_size, max_cluster_size + self.to_tear = tear self.nu = nu - self.max_iter, self.max_internal_iter = max_iter, max_internal_iter + self.max_iter, self.max_internal_iter = n_iter, n_iter_inner self.alpha, self.eps = alpha, eps - self.patience, self.tol = patience, tol + self.patience, self.tol = n_iter_without_progress, tol self.metric = metric self.n_forced_clusters = n_forced_clusters diff --git a/tests/scripts/benchmark.py b/tests/scripts/benchmark.py index 4297b05..59a2adc 100644 --- a/tests/scripts/benchmark.py +++ b/tests/scripts/benchmark.py @@ -12,8 +12,8 @@ from pyRATS import rats from examples import datasets -def run_benchmark(n_samples, k=14, eta_min=5, cost_fn_name="distortion"): - sys.stderr.write(f"Running benchmark for n={n_samples}, k={k}, eta_min={eta_min}...\n") +def run_benchmark(n_samples, n_neighbors=14, min_cluster_size=5, cost_function="distortion"): + sys.stderr.write(f"Running benchmark for n={n_samples}, n_neighbors={n_neighbors}, min_cluster_size={min_cluster_size}...\n") sys.stderr.flush() # Load dataset @@ -21,11 +21,11 @@ def run_benchmark(n_samples, k=14, eta_min=5, cost_fn_name="distortion"): X, _, _ = ds.kleinbottle4d(n=n_samples) model = rats.RATS( - d=2, - k=k, - cost_fn_name=cost_fn_name, - eta_min=eta_min, - max_iter=3, + n_components=2, + n_neighbors=n_neighbors, + cost_function=cost_function, + min_cluster_size=min_cluster_size, + n_iter=3, nu=4, verbose=False, # Set to False for cleaner benchmark output ) diff --git a/tests/scripts/generate_snapshots.py b/tests/scripts/generate_snapshots.py index 9588d12..4fdcb1b 100644 --- a/tests/scripts/generate_snapshots.py +++ b/tests/scripts/generate_snapshots.py @@ -46,8 +46,8 @@ def load_dataset(name): return X, labels -def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=False): - fname = f"dataset={dataset_name}_k={k}_eta_min={eta_min}_cost-fn={cost_fn_name}.res" +def run_model(n_neighbors, min_cluster_size, cost_function, dataset_name, dirpath, force_compute=False): + fname = f"dataset={dataset_name}_k={n_neighbors}_eta_min={min_cluster_size}_cost-fn={cost_function}.res" path = os.path.join(dirpath, fname) if os.path.exists(path) and not force_compute: @@ -56,11 +56,11 @@ def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=Fal X, _ = load_dataset(dataset_name) model = rats.RATS( - d=2, - k=k, - cost_fn_name=cost_fn_name, - eta_min=eta_min, - max_iter=3, + n_components=2, + n_neighbors=n_neighbors, + cost_function=cost_function, + min_cluster_size=min_cluster_size, + n_iter=3, nu=4, verbose=True, ) @@ -108,9 +108,9 @@ def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=Fal if args.fast_mode: dataset_names = ["small_spherewithhole"] - ks = [14] - eta_mins = [5] - cost_fn_names = ["distortion"] + n_neighbors_list = [14] + min_cluster_sizes = [5] + cost_functions = ["distortion"] else: dataset_names = [ "curvedtorus3d", @@ -121,11 +121,11 @@ def run_model(k, eta_min, cost_fn_name, dataset_name, dirpath, force_compute=Fal "swissrollwithhole", "squarewithtwoholes", ] - ks = [14, 21] - eta_mins = [5, 10] - cost_fn_names = ["distortion", "alignment"] + n_neighbors_list = [14, 21] + min_cluster_sizes = [5, 10] + cost_functions = ["distortion", "alignment"] - argslist = list(itertools.product(ks, eta_mins, cost_fn_names, dataset_names)) + argslist = list(itertools.product(n_neighbors_list, min_cluster_sizes, cost_functions, dataset_names)) Parallel(n_jobs=1)( delayed(run_model)(*args, dirpath, force_compute) for args in argslist diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 616b947..480e3a3 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -22,10 +22,10 @@ def get_test_cases(): # Sample name: dataset=small_spherewithhole_k=14_eta_min=5_cost-fn=distortion.res try: dname = fname.split("dataset=")[1].split("_k=")[0] - k = int(fname.split("_k=")[1].split("_eta_min=")[0]) - eta = int(fname.split("_eta_min=")[1].split("_cost-fn=")[0]) - cfn = fname.split("_cost-fn=")[1].replace(".res", "") - cases.append((k, eta, cfn, dname)) + n_neighbors = int(fname.split("_k=")[1].split("_eta_min=")[0]) + min_cluster_size = int(fname.split("_eta_min=")[1].split("_cost-fn=")[0]) + cost_function = fname.split("_cost-fn=")[1].replace(".res", "") + cases.append((n_neighbors, min_cluster_size, cost_function, dname)) except Exception: continue return cases @@ -36,9 +36,9 @@ def get_test_cases(): if not TEST_CASES: TEST_CASES = [(14, 5, "distortion", "dummy_no_baseline")] -@pytest.mark.parametrize("k,eta_min,cost_fn_name,dataset_name", TEST_CASES) -def test_end_to_end(k, eta_min, cost_fn_name, dataset_name): - fname = f"dataset={dataset_name}_k={k}_eta_min={eta_min}_cost-fn={cost_fn_name}.res" +@pytest.mark.parametrize("n_neighbors,min_cluster_size,cost_function,dataset_name", TEST_CASES) +def test_end_to_end(n_neighbors, min_cluster_size, cost_function, dataset_name): + fname = f"dataset={dataset_name}_k={n_neighbors}_eta_min={min_cluster_size}_cost-fn={cost_function}.res" expected_path = os.path.join(EXPECTED_DIR, fname) actual_path = os.path.join(ACTUAL_DIR, fname) From d7e10d62e74fd2b80a315dfb070e08cccfbd52dc Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 25 Mar 2026 16:54:22 +0000 Subject: [PATCH 23/44] Fix: Test var names --- tests/scripts/benchmark.py | 35 +++++++++++++++++++++++++++- tests/scripts/generate_snapshots.py | 36 ++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/tests/scripts/benchmark.py b/tests/scripts/benchmark.py index 59a2adc..25708c3 100644 --- a/tests/scripts/benchmark.py +++ b/tests/scripts/benchmark.py @@ -12,6 +12,39 @@ from pyRATS import rats from examples import datasets +def construct_rats( + n_components=2, + n_neighbors=14, + min_cluster_size=5, + cost_function="distortion", + n_iter=3, + nu=4, + verbose=False, +): + """Helper to instantiate RATS with fallback for older versions of the code.""" + try: + # Try new sklearn-style names + return rats.RATS( + n_components=n_components, + n_neighbors=n_neighbors, + min_cluster_size=min_cluster_size, + cost_function=cost_function, + n_iter=n_iter, + nu=nu, + verbose=verbose, + ) + except TypeError: + # Fallback to old names + return rats.RATS( + d=n_components, + k=n_neighbors, + eta_min=min_cluster_size, + cost_fn_name=cost_function, + max_iter=n_iter, + nu=nu, + verbose=verbose, + ) + def run_benchmark(n_samples, n_neighbors=14, min_cluster_size=5, cost_function="distortion"): sys.stderr.write(f"Running benchmark for n={n_samples}, n_neighbors={n_neighbors}, min_cluster_size={min_cluster_size}...\n") sys.stderr.flush() @@ -20,7 +53,7 @@ def run_benchmark(n_samples, n_neighbors=14, min_cluster_size=5, cost_function=" ds = datasets.Datasets() X, _, _ = ds.kleinbottle4d(n=n_samples) - model = rats.RATS( + model = construct_rats( n_components=2, n_neighbors=n_neighbors, cost_function=cost_function, diff --git a/tests/scripts/generate_snapshots.py b/tests/scripts/generate_snapshots.py index 4fdcb1b..1c89122 100644 --- a/tests/scripts/generate_snapshots.py +++ b/tests/scripts/generate_snapshots.py @@ -46,6 +46,40 @@ def load_dataset(name): return X, labels +def construct_rats( + n_components=2, + n_neighbors=28, + min_cluster_size=5, + cost_function="alignment", + n_iter=20, + nu=3, + verbose=False, +): + """Helper to instantiate RATS with fallback for older versions of the code.""" + try: + # Try new sklearn-style names + return rats.RATS( + n_components=n_components, + n_neighbors=n_neighbors, + min_cluster_size=min_cluster_size, + cost_function=cost_function, + n_iter=n_iter, + nu=nu, + verbose=verbose, + ) + except TypeError: + # Fallback to old names + return rats.RATS( + d=n_components, + k=n_neighbors, + eta_min=min_cluster_size, + cost_fn_name=cost_function, + max_iter=n_iter, + nu=nu, + verbose=verbose, + ) + + def run_model(n_neighbors, min_cluster_size, cost_function, dataset_name, dirpath, force_compute=False): fname = f"dataset={dataset_name}_k={n_neighbors}_eta_min={min_cluster_size}_cost-fn={cost_function}.res" path = os.path.join(dirpath, fname) @@ -55,7 +89,7 @@ def run_model(n_neighbors, min_cluster_size, cost_function, dataset_name, dirpat X, _ = load_dataset(dataset_name) - model = rats.RATS( + model = construct_rats( n_components=2, n_neighbors=n_neighbors, cost_function=cost_function, From 2d7695b04791ecc69d7c90ac886f439af2ec3963 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 25 Mar 2026 16:57:31 +0000 Subject: [PATCH 24/44] style: Naming change of buml_obj to model --- examples/barbell_example.ipynb | 6 +++--- examples/curvedtorus_example.ipynb | 6 +++--- examples/kleinbottle_example.ipynb | 6 +++--- examples/small_noisyswissroll3_example.ipynb | 6 +++--- examples/small_spherewithhole_example.ipynb | 6 +++--- examples/small_swissrollwithhole.ipynb | 6 +++--- examples/squarerwithtwoholes.ipynb | 6 +++--- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/examples/barbell_example.ipynb b/examples/barbell_example.ipynb index 3072280..7a967ce 100644 --- a/examples/barbell_example.ipynb +++ b/examples/barbell_example.ipynb @@ -58,14 +58,14 @@ "metadata": {}, "outputs": [], "source": [ - "buml_obj = rats.RATS(\n", + "model = rats.RATS(\n", " n_components=2,\n", " n_neighbors=28,\n", " min_cluster_size=5,\n", " tear=True,\n", ")\n", "\n", - "y = buml_obj.fit_transform(X=X)\n", + "y = model.fit_transform(X=X)\n", "y" ] }, @@ -85,7 +85,7 @@ "outputs": [], "source": [ "tear_color_eig_inds = [0,1,2] # interior has green color so assigned smallest value of i to g-channel\n", - "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n", + "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n", " y,\n", " tear_color_eig_inds\n", ")\n", diff --git a/examples/curvedtorus_example.ipynb b/examples/curvedtorus_example.ipynb index 94ef672..eb2a0b3 100644 --- a/examples/curvedtorus_example.ipynb +++ b/examples/curvedtorus_example.ipynb @@ -58,7 +58,7 @@ "metadata": {}, "outputs": [], "source": [ - "buml_obj = rats.RATS(\n", + "model = rats.RATS(\n", " n_components=2,\n", " n_neighbors=21,\n", " min_cluster_size=5,\n", @@ -68,7 +68,7 @@ " cost_function='distortion'\n", ")\n", "\n", - "y = buml_obj.fit_transform(X=X)\n", + "y = model.fit_transform(X=X)\n", "y" ] }, @@ -88,7 +88,7 @@ "outputs": [], "source": [ "tear_color_eig_inds = [6, 1, 4] # interior has green color so assigned smallest value of i to g-channel\n", - "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n", + "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n", " y,\n", " tear_color_eig_inds\n", ")\n", diff --git a/examples/kleinbottle_example.ipynb b/examples/kleinbottle_example.ipynb index 1c23740..7684183 100644 --- a/examples/kleinbottle_example.ipynb +++ b/examples/kleinbottle_example.ipynb @@ -58,14 +58,14 @@ "metadata": {}, "outputs": [], "source": [ - "buml_obj = rats.RATS(\n", + "model = rats.RATS(\n", " n_components=2,\n", " n_neighbors=28,\n", " min_cluster_size=5,\n", " tear=True,\n", ")\n", "\n", - "y = buml_obj.fit_transform(X=X)\n", + "y = model.fit_transform(X=X)\n", "y" ] }, @@ -77,7 +77,7 @@ "outputs": [], "source": [ "tear_color_eig_inds = [7, 2, 4]\n", - "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n", + "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n", " y,\n", " tear_color_eig_inds, \n", ")\n", diff --git a/examples/small_noisyswissroll3_example.ipynb b/examples/small_noisyswissroll3_example.ipynb index db22477..f21b196 100644 --- a/examples/small_noisyswissroll3_example.ipynb +++ b/examples/small_noisyswissroll3_example.ipynb @@ -58,7 +58,7 @@ "metadata": {}, "outputs": [], "source": [ - "buml_obj = rats.RATS(\n", + "model = rats.RATS(\n", " n_components=2,\n", " n_neighbors=28,\n", " min_cluster_size=5,\n", @@ -66,7 +66,7 @@ " cost_function='alignment'\n", ")\n", "\n", - "y = buml_obj.fit_transform(X=X)\n", + "y = model.fit_transform(X=X)\n", "y" ] }, @@ -78,7 +78,7 @@ "outputs": [], "source": [ "tear_color_eig_inds = [0, 1, 2] # interior has green color so assigned smallest value of i to g-channel\n", - "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n", + "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n", " y,\n", " tear_color_eig_inds, \n", ")\n", diff --git a/examples/small_spherewithhole_example.ipynb b/examples/small_spherewithhole_example.ipynb index 255464f..2deb028 100644 --- a/examples/small_spherewithhole_example.ipynb +++ b/examples/small_spherewithhole_example.ipynb @@ -58,14 +58,14 @@ "metadata": {}, "outputs": [], "source": [ - "buml_obj = rats.RATS(\n", + "model = rats.RATS(\n", " n_components=2,\n", " n_neighbors=28,\n", " min_cluster_size=5,\n", " tear=True,\n", ")\n", "\n", - "y = buml_obj.fit_transform(X=X)\n", + "y = model.fit_transform(X=X)\n", "y" ] }, @@ -87,7 +87,7 @@ "figsize = (3,3)\n", "\n", "tear_color_eig_inds = [7, 1, 2]\n", - "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n", + "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n", " y,\n", " tear_color_eig_inds, \n", ")\n", diff --git a/examples/small_swissrollwithhole.ipynb b/examples/small_swissrollwithhole.ipynb index f2ff33a..2b95103 100644 --- a/examples/small_swissrollwithhole.ipynb +++ b/examples/small_swissrollwithhole.ipynb @@ -58,14 +58,14 @@ "metadata": {}, "outputs": [], "source": [ - "buml_obj = rats.RATS(\n", + "model = rats.RATS(\n", " n_components=2,\n", " n_neighbors=28,\n", " min_cluster_size=5,\n", " tear=True,\n", ")\n", "\n", - "y = buml_obj.fit_transform(X=X)\n", + "y = model.fit_transform(X=X)\n", "y" ] }, @@ -85,7 +85,7 @@ "outputs": [], "source": [ "tear_color_eig_inds = [0, 1, 2] # interior has green color so assigned smallest value of i to g-channel\n", - "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n", + "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n", " y,\n", " tear_color_eig_inds, \n", ")\n", diff --git a/examples/squarerwithtwoholes.ipynb b/examples/squarerwithtwoholes.ipynb index 1b90e81..f18f1f7 100644 --- a/examples/squarerwithtwoholes.ipynb +++ b/examples/squarerwithtwoholes.ipynb @@ -58,14 +58,14 @@ "metadata": {}, "outputs": [], "source": [ - "buml_obj = rats.RATS(\n", + "model = rats.RATS(\n", " n_components=2,\n", " n_neighbors=28,\n", " min_cluster_size=5,\n", " tear=True,\n", ")\n", "\n", - "y = buml_obj.fit_transform(X=X)\n", + "y = model.fit_transform(X=X)\n", "y" ] }, @@ -86,7 +86,7 @@ "source": [ "figsize = (3,3)\n", "tear_color_eig_inds = [0, 1, 2] # interior has green color so assigned smallest value of i to g-channel\n", - "color_of_pts_on_tear = buml_obj.compute_color_of_pts_on_tear(\n", + "color_of_pts_on_tear = model.compute_color_of_pts_on_tear(\n", " y,\n", " tear_color_eig_inds, \n", ")\n", From d7fcb1040a68a8aa92ba310dc1c877052805ccc5 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 25 Mar 2026 17:46:55 +0000 Subject: [PATCH 25/44] feat: New method for dynamic allocation of memory to enable large datasets without crashes --- src/pyRATS/_utils.py | 57 +++++++++++++++++++++++++++++++++++++++++--- src/pyRATS/rats.py | 4 +++- 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index 1d0a0fd..ad0f3b1 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -16,6 +16,33 @@ breadth_first_order, ) +import os + +def _get_available_memory(): + """Returns the available memory in bytes. + + Checks for PYRATS_MEMORY_LIMIT environment variable, then tries to use psutil, + and finally falls back to a 4GB default. + """ + # 1. Environment variable override + limit = os.environ.get("PYRATS_MEMORY_LIMIT") + if limit: + try: + return int(limit) + except ValueError: + pass + + # 2. Try psutil for dynamic discovery + try: + import psutil + # Use 75% of available RAM as a safe upper bound for our buffer + return int(psutil.virtual_memory().available * 0.75) + except (ImportError, AttributeError): + pass + + # 3. Fallback to 2GB + return 2 * 1024 * 1024 * 1024 + def lpca(X, d, U, n_jobs, verbose=False): """Fit PCA model on the data X. @@ -1570,9 +1597,33 @@ def batched_eval_(self, view_index, data_mask): masks = data_mask if self.algo == "lpca": - temp = np.matmul( - self.X[masks] - self.mu[ks][:, np.newaxis, :], self.Psi[ks] - ) + # Dynamic chunking to prevent memory spikes on high-dimensional data + n_eval = len(ks) + n_neighbors = masks.shape[1] + n_features = self.X.shape[1] + + # Calculate chunk size based on available memory + memory_limit = _get_available_memory() + itemsize = self.X.dtype.itemsize + # Target buffer size per chunk: (chunk_size * n_neighbors * n_features * itemsize) + chunk_size = max(1, memory_limit // (n_neighbors * n_features * itemsize)) + + if n_eval <= chunk_size: + # fits in memory, process all at once + temp = np.matmul( + self.X[masks] - self.mu[ks][:, np.newaxis, :], self.Psi[ks] + ) + else: + # split into chunks + temp = np.zeros((n_eval, n_neighbors, self.Psi.shape[2])) + for start in range(0, n_eval, chunk_size): + end = min(start + chunk_size, n_eval) + ks_chunk = ks[start:end] + masks_chunk = masks[start:end] + temp[start:end] = np.matmul( + self.X[masks_chunk] - self.mu[ks_chunk][:, np.newaxis, :], + self.Psi[ks_chunk] + ) n = self.X.shape[0] else: temp = [] diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index 1de93d6..f2649dc 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -670,7 +670,9 @@ def _postprocess(self): future_use_phi_of = np.arange(n, dtype=int) while len(param_changed) > 0: - reconsider_mask = np.isin(connectivity_matrix, param_changed) & ( + changed_mask = np.zeros(n, dtype=bool) + changed_mask[param_changed] = True + reconsider_mask = changed_mask[connectivity_matrix] & ( connectivity_matrix != np.arange(n)[:, None] ) From d8f85a12e94219f8e083bbd1f25679868dbc25ca Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 25 Mar 2026 18:10:49 +0000 Subject: [PATCH 26/44] Fix: Dynamic memory allocation fix --- README.md | 9 +++++++++ src/pyRATS/_utils.py | 31 ++++++++++++++++++------------- src/pyRATS/rats.py | 4 +++- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 6d0391e..dbccff6 100644 --- a/README.md +++ b/README.md @@ -61,6 +61,15 @@ python -m http.server 8000 ``` and opening http://localhost:8000 in your browser. +# Memory Management +`pyRATS` implements dynamic chunking to prevent memory spikes on large or high-dimensional datasets. By default, it uses up to 75% of available system RAM. If you need to explicitly limit memory usage, set the `PYRATS_MEMORY_LIMIT` environment variable (in bytes): + +```bash +# Limit pyRATS to 4GB +export PYRATS_MEMORY_LIMIT=4294967296 +python your_script.py +``` + Citation ---------- diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index ad0f3b1..0eb93e6 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -18,11 +18,11 @@ import os -def _get_available_memory(): - """Returns the available memory in bytes. +def _get_available_memory(n_jobs=1): + """Returns the available memory in bytes, scaled by n_jobs. Checks for PYRATS_MEMORY_LIMIT environment variable, then tries to use psutil, - and finally falls back to a 4GB default. + and finally falls back to a 4GB default (divided by n_jobs). """ # 1. Environment variable override limit = os.environ.get("PYRATS_MEMORY_LIMIT") @@ -35,13 +35,14 @@ def _get_available_memory(): # 2. Try psutil for dynamic discovery try: import psutil - # Use 75% of available RAM as a safe upper bound for our buffer - return int(psutil.virtual_memory().available * 0.75) + # Use a fraction of available RAM as a safe upper bound for our buffer, + # divided by the number of parallel workers. + return int(psutil.virtual_memory().available * 0.75 / n_jobs) except (ImportError, AttributeError): pass - # 3. Fallback to 2GB - return 2 * 1024 * 1024 * 1024 + # 3. Fallback to 4GB total (divided by workers) + return (4 * 1024 * 1024 * 1024) // n_jobs def lpca(X, d, U, n_jobs, verbose=False): @@ -387,7 +388,7 @@ def nearest_neighbors(X, k, metric, sort_results=True, n_jobs=-1): def cost_of_moving_distortion( - k, d_e, neigh_ind_k, U_k, local_param, c, n_C, Utilde, eta_min, eta_max + k, d_e, neigh_ind_k, U_k, local_param, c, n_C, Utilde, eta_min, eta_max, n_jobs=1 ): """Computes the minimum cost and destination cluster possible when merging k with its neighboring clusters. @@ -513,7 +514,7 @@ def compute_zeta(d_e_mask0, Psi_k_mask): def cost_of_moving_alignment_error( - k, d_e, neigh_ind_k, U_k, param, c, n_C, Utilde, eta_min, eta_max + k, d_e, neigh_ind_k, U_k, param, c, n_C, Utilde, eta_min, eta_max, n_jobs=1 ): """Computes the minimum cost and destination cluster possible when merging k with its neighboring clusters. @@ -588,6 +589,7 @@ def cost_of_moving_alignment_error( evals = param.batched_eval_( c_U_k_uniq_k, np.broadcast_to(U_k_list, [len(c_U_k_uniq_k), len(U_k_list)]), + n_jobs=n_jobs ) m = c_U_k_uniq_k == k @@ -704,7 +706,7 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): dest_ = np.zeros(end_ind - start_ind, dtype="int") - 1 for i, k in enumerate(range(start_ind, end_ind)): cost_[i], dest_[i] = cost_of_moving( - k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max + k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max, n_jobs=n_jobs ) return start_ind, end_ind, cost_, dest_ @@ -758,7 +760,7 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): for k in S: cost[k], dest[k] = cost_of_moving( - k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max + k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max, n_jobs=1 ) k = np.argmin(cost) @@ -1581,7 +1583,7 @@ def __init__(self, algo="lpca", **kwargs): self.add_dim = False self.standardize = False - def batched_eval_(self, view_index, data_mask): + def batched_eval_(self, view_index, data_mask, n_jobs=1): """Maps multiple points to the new space through their local (K-)PCA prameterizations. Parameters @@ -1591,6 +1593,9 @@ def batched_eval_(self, view_index, data_mask): data_mask : array-like, shape (n_points, n_neighbors) List of points to map to the embedding dimension for each point in 'view_index' + + n_jobs : int, default=1 + Number of parallel jobs. Used to scale memory limits. """ ks = view_index @@ -1603,7 +1608,7 @@ def batched_eval_(self, view_index, data_mask): n_features = self.X.shape[1] # Calculate chunk size based on available memory - memory_limit = _get_available_memory() + memory_limit = _get_available_memory(n_jobs=n_jobs) itemsize = self.X.dtype.itemsize # Target buffer size per chunk: (chunk_size * n_neighbors * n_features * itemsize) chunk_size = max(1, memory_limit // (n_neighbors * n_features * itemsize)) diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index f2649dc..c2370e9 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -28,6 +28,7 @@ def _postprocess_col_range( future_use_phi_of, original_dists, n, + n_jobs=1, ): """Process a contiguous range of neighbor-columns [start, end) for _postprocess. @@ -47,7 +48,7 @@ def _postprocess_col_range( points_to_reconsider = np.where(col_mask)[0] batch_embedded_datapoints = param.batched_eval_( - future_use_phi_of[use_phi_of], neighbors_to_reconsider + future_use_phi_of[use_phi_of], neighbors_to_reconsider, n_jobs=n_jobs ) batch_embedded_dists = batched_pdist(batch_embedded_datapoints) batch_original_dists = original_dists[points_to_reconsider] @@ -697,6 +698,7 @@ def _postprocess(self): future_use_phi_of, original_dists, n, + n_jobs=self.n_jobs, ) for s, e in ranges ) From 87b2e81d0610cfa3f41a6ea6f363eb5aacef75aa Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 25 Mar 2026 18:25:12 +0000 Subject: [PATCH 27/44] style: progress bar update and verbose cleanup --- src/pyRATS/_tear_coloring.py | 27 +++++----- src/pyRATS/_utils.py | 100 +++++++++++++++++++---------------- src/pyRATS/rats.py | 44 +++++++++------ 3 files changed, 95 insertions(+), 76 deletions(-) diff --git a/src/pyRATS/_tear_coloring.py b/src/pyRATS/_tear_coloring.py index 04a0131..bd43b33 100644 --- a/src/pyRATS/_tear_coloring.py +++ b/src/pyRATS/_tear_coloring.py @@ -4,6 +4,7 @@ import scipy from joblib import delayed, Parallel +from tqdm import tqdm from pyRATS import _utils @@ -67,14 +68,15 @@ def compute_spectral_color_of_pts_on_tear( i_mat_in_emb=i_mat_in_emb, ) if pts_across_tear is None: - print("No tear detected.") + if verbose: + tqdm.write("No tear detected.") return None n_pts_across_tear = len(pts_across_tear) n_comp, labels = connected_components( tear_graph, directed=False, return_labels=True ) if verbose: - print("Number of components in the tear graph:", n_comp, flush=True) + tqdm.write(f"Number of components in the tear graph: {n_comp}") n_points_in_comp = [] for i in range(n_comp): @@ -92,7 +94,7 @@ def compute_spectral_color_of_pts_on_tear( n_comp_i = np.sum(comp_i) scale = n_comp_i / n_pts_across_tear if verbose: - print("#points in the tear component no.", i, "are:", n_comp_i, flush=True) + tqdm.write(f"#points in the tear component no. {i} are: {n_comp_i}") # If the component is very small then assign the constant color if n_comp_i <= max(3, int(color_cutoff_frac * n)): @@ -155,7 +157,7 @@ def compute_tear_graph( # If i_mat_in_emb if not provided if i_mat_in_emb is None: if verbose: - print("Metric used for computing incidence matrix in embedding:", metric) + tqdm.write(f"Metric used for computing incidence matrix in embedding: {metric}") i_mat_in_emb = _utils.compute_incidence_matrix_in_embedding( y, partition, k, nu, metric ) @@ -171,14 +173,13 @@ def compute_tear_graph( views_across_tear_graph.eliminate_zeros() # If no partitions/views are across the tear then there is no tear if len(views_across_tear_graph.data) == 0: - print("No views across the tear detected.") + if verbose: + tqdm.write("No views across the tear detected.") return None, None if verbose: - print( - "total #pairs of overlapping partitions/views:", - overlap.count_nonzero(), - flush=True, + tqdm.write( + f"total #pairs of overlapping partitions/views: {overlap.count_nonzero()}" ) pts_across_tear, tear_graph = compute_points_across_tear_graph( views_across_tear_graph, @@ -188,8 +189,8 @@ def compute_tear_graph( n_jobs=n_jobs, ) if verbose: - print("#vertices in tear graph =", tear_graph.shape[0], flush=True) - print("#edges in tear graph =", len(tear_graph.data), flush=True) + tqdm.write(f"#vertices in tear graph = {tear_graph.shape[0]}") + tqdm.write(f"#edges in tear graph = {len(tear_graph.data)}") return pts_across_tear, tear_graph @@ -208,7 +209,7 @@ def compute_points_across_tear_graph( ) n_pairs = len(views_across_tear_graph_row) if verbose: - print("#pairs of partitions/views across tear:", n_pairs, flush=True) + tqdm.write(f"#pairs of partitions/views across tear: {n_pairs}") # Define per-pair processing function def process_pairs(ij_pairs): @@ -274,7 +275,7 @@ def process_pairs(ij_pairs): for _, _, pts in results: pts_across_tear_mask[pts] = True if verbose: - print("Computing tear graph.", flush=True) + tqdm.write("Computing tear graph.") # Build sparse tear graph tear_graph = csr_matrix( ( diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index 0eb93e6..2f76341 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -15,6 +15,7 @@ connected_components, breadth_first_order, ) +from tqdm import tqdm import os @@ -125,7 +126,10 @@ def target_proc(p_num, chunk_sz): chunk_sz = int(n / n_jobs) results = Parallel(n_jobs=n_jobs)( - delayed(target_proc)(i, chunk_sz) for i in range(n_jobs) + delayed(target_proc)(i, chunk_sz) + for i in tqdm( + range(n_jobs), desc="PCA", unit="chunk", leave=False, disable=not verbose + ) ) for i in range(len(results)): @@ -136,7 +140,7 @@ def target_proc(p_num, chunk_sz): param.n_pc_dir_chosen[start_ind:end_ind] = n_pc_dir_chosen_ if verbose: - print("local_param: all %d points processed..." % n) + tqdm.write("local_param: all %d points processed..." % n) return param @@ -209,14 +213,17 @@ def target_proc(p_num, chunk_sz): chunk_sz = int(n / n_jobs) results = Parallel(n_jobs=n_jobs)( - delayed(target_proc)(p_num, chunk_sz) for p_num in range(n_jobs) + delayed(target_proc)(p_num, chunk_sz) + for p_num in tqdm( + range(n_jobs), desc="KPCA", unit="chunk", leave=False, disable=not verbose + ) ) for start_ind, end_ind, model_ in results: local_param.model[start_ind:end_ind] = model_ if verbose: - print("local_param: all %d points processed..." % n) + tqdm.write("local_param: all %d points processed..." % n) return local_param @@ -684,16 +691,16 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs): U_csc = U.tocsc() # Vary eta from 2 to eta_{min} - if verbose: - print("Constructing intermediate views.") - for eta in range(2, eta_min + 1): + for eta in tqdm( + range(2, eta_min + 1), desc="Intermediate views", disable=not verbose + ): if verbose: print("eta = %d." % eta) - print( - "#non-empty views with sz < %d = %d" - % (eta, np.sum((n_C > 0) * (n_C < eta))) - ) - print("#nodes in views with sz < %d = %d" % (eta, np.sum(n_C[c] < eta))) + # tqdm.write( + # "#non-empty views with sz < %d = %d" + # % (eta, np.sum((n_C > 0) * (n_C < eta))) + # ) + # tqdm.write("#nodes in views with sz < %d = %d" % (eta, np.sum(n_C[c] < eta))) def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): start_ind = p_num * chunk_sz @@ -713,7 +720,13 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): chunk_sz = int(n / n_jobs) results = Parallel(n_jobs=n_jobs)( delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c) - for p_num in range(n_jobs) + for p_num in tqdm( + range(n_jobs), + desc=f"eta={eta}", + unit="chunk", + leave=False, + disable=not verbose, + ) ) for start_ind, end_ind, cost_, dest_ in results: cost[start_ind:end_ind] = cost_ @@ -724,10 +737,19 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): k = np.argmin(cost) cost_star = cost[k] - if verbose: - print("Costs computed when eta = %d." % eta) + # if verbose: + # tqdm.write("Costs computed when eta = %d." % eta) # Loop until minimum cost is inf + pbar_merge = None + if verbose: + pbar_merge = tqdm( + total=np.sum(n_C[c] < eta), + desc="Merging", + unit="pts", + leave=False, + ) + total_len_S = 0 ctr = 0 while cost_star < np.inf: @@ -763,19 +785,23 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): k, d_e, neigh_ind[k], U_[k], param, c, n_C, Utilde, eta, eta_max, n_jobs=1 ) + if verbose: + pbar_merge.update(1) + k = np.argmin(cost) cost_star = cost[k] if verbose: - print( - "ctr=%d, total_len_S=%d, avg_len_S=%0.3f" - % (ctr, total_len_S, total_len_S / (ctr + 1e-12)) - ) - print( - "Remaining #nodes in views with sz < %d = %d" - % (eta, np.sum(n_C[c] < eta)) - ) - print("Done with eta = %d." % eta) + pbar_merge.close() + # tqdm.write( + # "ctr=%d, total_len_S=%d, avg_len_S=%0.3f" + # % (ctr, total_len_S, total_len_S / (ctr + 1e-12)) + # ) + # tqdm.write( + # "Remaining #nodes in views with sz < %d = %d" + # % (eta, np.sum(n_C[c] < eta)) + # ) + # tqdm.write("Done with eta = %d." % eta) return c, n_C @@ -866,16 +892,6 @@ def target_proc(p_num): W_data = overlap_svals[:, -1] for i in range(overlap_svals.shape[1] - 2, -1, -1): mask = W_data == 0 - if verbose: - print( - "Iter:", - i, - ":: Updating scores of", - np.sum(mask), - "edges out of", - W_data.shape[0], - "edges.", - ) n_zero_elem = np.sum(mask) if n_zero_elem == 0: break @@ -1136,7 +1152,9 @@ def compute_final_embedding( Utilde_t = Utilde.copy() # Refine global embedding y - for it0 in range(max_iter): + for it0 in tqdm( + range(max_iter), desc="RGD alignment", unit="iter", disable=not verbose + ): if to_tear: Utildeg = compute_incidence_matrix_in_embedding(y, C, k, nu, metric) @@ -1150,8 +1168,6 @@ def compute_final_embedding( err = compute_alignment_err(d, Utilde_t, param, verbose) E_Gamma_t = Utilde_t.nnz err = err / E_Gamma_t - if verbose: - print("Alignment error: %0.6f" % (err), flush=True) if prev_err is not None: if (np.abs(err - prev_err) / (prev_err + 1e-12) < tol) and ( np.abs(E_Gamma_t - prev_edges) / (prev_edges + 1e-12) < tol @@ -1222,9 +1238,6 @@ def update(alpha, max_iter, O, CC, M, d): CC, Lpinv_BT, _, _ = build_ortho_optim(d, Utilde, param, verbose) M, n = Utilde.shape - if verbose: - print("Descent starts", flush=True) - Tstar = update( alpha, max_internal_iter, np.tile(np.eye(d), (1, M)), CC, M, d ) # At each iteration compute a new S starting with S = I @@ -1434,13 +1447,6 @@ def build_ortho_optim(d, Utilde, param, verbose): B = csr_matrix((B_val_all, (B_row_all, B_col_all)), shape=(M * d, n + M)) - if verbose: - print("min and max weights:", np.array(W_vals_all).min(), np.array(W_vals_all).max()) - print( - f"Computing Pseudoinverse of a matrix of L of size {n} + {M} multiplied with B", - flush=True, - ) - Lpinv_helpers = compute_Lpinv_helpers(W) Lpinv_BT = compute_Lpinv_MT(Lpinv_helpers, B) CC = compute_CC(D, B, Lpinv_BT) diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index c2370e9..b491ef4 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -4,6 +4,7 @@ from multiprocessing import cpu_count from joblib import Parallel, delayed +from tqdm import tqdm from pyRATS._utils import ( nearest_neighbors, @@ -313,18 +314,35 @@ def fit_transform(self, X): y : array-like, shape (n_samples, d) X transformed in the new space. """ + n_steps = 5 if self.to_postprocess else 4 + current_step = 1 + if self.verbose: + print(f"[{current_step}/{n_steps}] Fitting neighborhood graph...") self._fit_nbrhd_graph(X) + current_step += 1 # Construct low dimensional local views + if self.verbose: + print(f"[{current_step}/{n_steps}] Fitting local views...") self._fit_local_views(X) + current_step += 1 + if self.to_postprocess: + if self.verbose: + print(f"[{current_step}/{n_steps}] Postprocessing local parameters...") self._postprocess() + current_step += 1 # Construct intermediate views + if self.verbose: + print(f"[{current_step}/{n_steps}] Clustering intermediate views...") c, n_C = self._fit_intermediate_views() + current_step += 1 # Construct Global views + if self.verbose: + print(f"[{current_step}/{n_steps}] Aligning global views...") y = self._fit_global_views(c, n_C) return y @@ -660,21 +678,21 @@ def _postprocess(self): self.param.zeta = zeta - if self.verbose: - print( - r"Maximum local distortion before postprocessing is: %0.4f" - % np.max(zeta) - ) connectivity_matrix = self.U.indices.reshape(n, -1) - param_changed = np.arange(n) future_use_phi_of = np.arange(n, dtype=int) + pbar = None + if self.verbose: + pbar = tqdm(total=n, desc="Refining parameters", unit="pts", leave=False) + + param_changed = np.arange(n) while len(param_changed) > 0: changed_mask = np.zeros(n, dtype=bool) changed_mask[param_changed] = True - reconsider_mask = changed_mask[connectivity_matrix] & ( - connectivity_matrix != np.arange(n)[:, None] + reconsider_mask = ( + changed_mask[connectivity_matrix] + & (connectivity_matrix != np.arange(n)[:, None]) ) # Split the k columns into n_jobs chunks and process each chunk in @@ -718,16 +736,10 @@ def _postprocess(self): zeta = np.minimum(zeta, zeta_min) if self.verbose: - print( - "#Param replaced: %d, max distortion: %f" - % (len(param_changed), np.max(zeta)) - ) + pbar.update(n - len(param_changed) - pbar.n) self.param.zeta = zeta.copy() self.param.replace_(future_use_phi_of) if self.verbose: - print( - f"Maximum local distortion after postprocessing is: %0.4f" - % (np.max(self.param.zeta)) - ) + pbar.close() From 1648a790c65f014e67158de7eda9c6bb9beca3f3 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 25 Mar 2026 18:27:48 +0000 Subject: [PATCH 28/44] Update to readme and pyproject (tqdm addition) --- README.md | 6 +++--- pyproject.toml | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dbccff6..18ff892 100644 --- a/README.md +++ b/README.md @@ -62,11 +62,11 @@ python -m http.server 8000 and opening http://localhost:8000 in your browser. # Memory Management -`pyRATS` implements dynamic chunking to prevent memory spikes on large or high-dimensional datasets. By default, it uses up to 75% of available system RAM. If you need to explicitly limit memory usage, set the `PYRATS_MEMORY_LIMIT` environment variable (in bytes): +`pyRATS` implements dynamic chunking to prevent memory spikes on large or high-dimensional datasets. By default, it attempts to use up to 75% of available system RAM. This measurement can fail and defaults to 4GB otherwise. If you need to explicitly set memory usage, set the `PYRATS_MEMORY_LIMIT` environment variable (in bytes): ```bash -# Limit pyRATS to 4GB -export PYRATS_MEMORY_LIMIT=4294967296 +# Set pyRATS memory usage to 40GB +export PYRATS_MEMORY_LIMIT=42949672960 python your_script.py ``` diff --git a/pyproject.toml b/pyproject.toml index 4b63c93..1cc94a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "scipy", "joblib", "multiprocess", + "tqdm", ] [project.optional-dependencies] From a074cc9dd6be045f6a807c46cb5d6516eadf1616 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 25 Mar 2026 18:38:23 +0000 Subject: [PATCH 29/44] style: Cleaner tqdms --- src/pyRATS/_utils.py | 2 -- src/pyRATS/rats.py | 4 ---- 2 files changed, 6 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index 2f76341..a334552 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -694,8 +694,6 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs): for eta in tqdm( range(2, eta_min + 1), desc="Intermediate views", disable=not verbose ): - if verbose: - print("eta = %d." % eta) # tqdm.write( # "#non-empty views with sz < %d = %d" # % (eta, np.sum((n_C > 0) * (n_C < eta))) diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index b491ef4..aa24bae 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -534,8 +534,6 @@ def _fit_intermediate_views(self): self.verbose, self.n_jobs, ) - if self.verbose: - print("Pruning and cleaning up.") # Prune empty clusters non_empty_C = n_C > 0 M = np.sum(non_empty_C) @@ -561,8 +559,6 @@ def _fit_intermediate_views(self): np.random.seed(42) self.param.noise_seed = np.random.randint(M * M, size=M) - if self.verbose: - print("Done.") else: c = np.arange(n, dtype=int) C = csr_matrix((np.ones(n), (c, np.arange(n))), shape=(n, n), dtype=bool) From 9203f56045258644edbe3993c19ffe3d12929938 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Wed, 25 Mar 2026 18:45:19 +0000 Subject: [PATCH 30/44] speedup test: bottlnecks re-introduced. Testing shift back. --- src/pyRATS/_utils.py | 95 +++++++++++++++++++-------------- src/pyRATS/rats.py | 123 +++++++++++++++---------------------------- 2 files changed, 98 insertions(+), 120 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index a334552..8305818 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -34,16 +34,16 @@ def _get_available_memory(n_jobs=1): pass # 2. Try psutil for dynamic discovery - try: - import psutil - # Use a fraction of available RAM as a safe upper bound for our buffer, - # divided by the number of parallel workers. - return int(psutil.virtual_memory().available * 0.75 / n_jobs) - except (ImportError, AttributeError): - pass - - # 3. Fallback to 4GB total (divided by workers) - return (4 * 1024 * 1024 * 1024) // n_jobs + # Cache the result to avoid repeated system calls in tight loops + if not hasattr(_get_available_memory, "_cached_val"): + try: + import psutil + # Use a fraction of available RAM as a safe upper bound for our buffer. + _get_available_memory._cached_val = int(psutil.virtual_memory().available * 0.75) + except (ImportError, AttributeError): + _get_available_memory._cached_val = 4 * 1024 * 1024 * 1024 # 4GB fallback + + return _get_available_memory._cached_val // n_jobs def lpca(X, d, U, n_jobs, verbose=False): @@ -125,12 +125,15 @@ def target_proc(p_num, chunk_sz): return start_ind, end_ind, Psi, mu, var_explained, n_pc_dir_chosen chunk_sz = int(n / n_jobs) - results = Parallel(n_jobs=n_jobs)( - delayed(target_proc)(i, chunk_sz) - for i in tqdm( - range(n_jobs), desc="PCA", unit="chunk", leave=False, disable=not verbose + if n_jobs == 1 or n < 1000: # Threshold for Parallel overhead + results = [target_proc(0, n)] + else: + results = Parallel(n_jobs=n_jobs)( + delayed(target_proc)(i, chunk_sz) + for i in tqdm( + range(n_jobs), desc="PCA", unit="chunk", leave=False, disable=not verbose + ) ) - ) for i in range(len(results)): start_ind, end_ind, Psi_, mu_, var_explained_, n_pc_dir_chosen_ = results[i] @@ -139,9 +142,6 @@ def target_proc(p_num, chunk_sz): param.var_explained[start_ind:end_ind, :] = var_explained_ param.n_pc_dir_chosen[start_ind:end_ind] = n_pc_dir_chosen_ - if verbose: - tqdm.write("local_param: all %d points processed..." % n) - return param @@ -222,8 +222,6 @@ def target_proc(p_num, chunk_sz): for start_ind, end_ind, model_ in results: local_param.model[start_ind:end_ind] = model_ - if verbose: - tqdm.write("local_param: all %d points processed..." % n) return local_param @@ -505,16 +503,29 @@ def cost_of_moving_distortion( def compute_zeta(d_e_mask0, Psi_k_mask): - if d_e_mask0.shape[0] == 1: + """Computes the local distortion zeta for a given parameterization and neighborhood distances. + + Parameters + ---------- + d_e_mask0 : sparse matrix or array-like + Pairwise distances between neighbors in the original space. + Psi_k_mask : array-like + Embedded coordinates of the neighbors. + """ + # For small k (typical in clustering), dense operations are MUCH faster than + # sparse triu/nonzero lookups. + d_e_mask = d_e_mask0.toarray() if hasattr(d_e_mask0, "toarray") else d_e_mask0 + if d_e_mask.shape[0] <= 1: return 1 - d_e_upper = triu(d_e_mask0, k=1) - row, col = d_e_upper.nonzero() - if len(row) == 0: + + # Use squareform/pdist for efficient pair extraction on small dense matrices + d_e_mask_ = squareform(d_e_mask) + mask = d_e_mask_ != 0 + if not np.any(mask): return 1 - d_e_vals = d_e_upper.data - diff = Psi_k_mask[row] - Psi_k_mask[col] - dist_embedded = np.sqrt(np.sum(diff * diff, axis=1)) + d_e_vals = d_e_mask_[mask] + dist_embedded = pdist(Psi_k_mask)[mask] disc_lip_const = dist_embedded / d_e_vals return np.max(disc_lip_const) / (np.min(disc_lip_const) + 1e-12) @@ -716,16 +727,19 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): return start_ind, end_ind, cost_, dest_ chunk_sz = int(n / n_jobs) - results = Parallel(n_jobs=n_jobs)( - delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c) - for p_num in tqdm( - range(n_jobs), - desc=f"eta={eta}", - unit="chunk", - leave=False, - disable=not verbose, + if n_jobs == 1 or n < 1000: + results = [target_proc(0, n, n, Utilde, n_C, c)] + else: + results = Parallel(n_jobs=n_jobs)( + delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c) + for p_num in tqdm( + range(n_jobs), + desc=f"eta={eta}", + unit="chunk", + leave=False, + disable=not verbose, + ) ) - ) for start_ind, end_ind, cost_, dest_ in results: cost[start_ind:end_ind] = cost_ dest[start_ind:end_ind] = dest_ @@ -879,9 +893,12 @@ def target_proc(p_num): overlap_svals_[i - start_ind, :] = svdvals_ return (start_ind, end_ind, overlap_svals_) - res = Parallel(n_jobs=n_jobs, return_as="generator_unordered")( - delayed(target_proc)(p_num) for p_num in range(n_jobs) - ) + if n_jobs == 1 or n_elem < 1000: + res = [target_proc(p_num) for p_num in range(n_jobs)] + else: + res = Parallel(n_jobs=n_jobs, return_as="generator_unordered")( + delayed(target_proc)(p_num) for p_num in range(n_jobs) + ) for value in res: start_ind, end_ind, overlap_svals_ = value overlap_svals[start_ind:end_ind, :] = overlap_svals_ diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index aa24bae..886bb9c 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -20,56 +20,7 @@ from pyRATS._tear_coloring import compute_color_of_pts_on_tear -def _postprocess_col_range( - start, - end, - connectivity_matrix, - reconsider_mask, - param, - future_use_phi_of, - original_dists, - n, - n_jobs=1, -): - """Process a contiguous range of neighbor-columns [start, end) for _postprocess. - - Returns the element-wise minimum distortion and the corresponding column - index across the processed range, both as length-n arrays. - """ - local_zeta_min = np.full(n, np.inf) - local_best_col = np.zeros(n, dtype=int) - - for i in range(start, end): - col_mask = reconsider_mask[:, i] - use_phi_of = connectivity_matrix[:, i][col_mask] - if len(use_phi_of) == 0: - continue - - neighbors_to_reconsider = connectivity_matrix[col_mask] - points_to_reconsider = np.where(col_mask)[0] - - batch_embedded_datapoints = param.batched_eval_( - future_use_phi_of[use_phi_of], neighbors_to_reconsider, n_jobs=n_jobs - ) - batch_embedded_dists = batched_pdist(batch_embedded_datapoints) - batch_original_dists = original_dists[points_to_reconsider] - - disc_lip_const = np.divide( - batch_embedded_dists, - batch_original_dists, - out=np.full_like(batch_embedded_dists, np.nan), - where=batch_original_dists != 0, - ) - col_zeta = np.full(n, np.inf) - col_zeta[points_to_reconsider] = np.nanmax(disc_lip_const, axis=1) / ( - np.nanmin(disc_lip_const, axis=1) + 1e-12 - ) - - improved = col_zeta < local_zeta_min - local_best_col[improved] = i - local_zeta_min = np.minimum(local_zeta_min, col_zeta) - - return local_zeta_min, local_best_col +# _postprocess_col_range removed to reduce Parallel overhead in tight loops. class RATS: @@ -691,39 +642,49 @@ def _postprocess(self): & (connectivity_matrix != np.arange(n)[:, None]) ) - # Split the k columns into n_jobs chunks and process each chunk in - # a separate worker. Each worker returns its local (zeta_min, - # best_col) for the columns it owns; we then do a final - # min-reduction here. This mirrors the target_proc pattern used - # throughout _utils.py (lpca, best, …) while keeping memory - # bounded to one chunk of columns per worker at a time. - k_cols = connectivity_matrix.shape[1] - chunk_sz = max(1, k_cols // self.n_jobs) - ranges = [(p * chunk_sz, (p + 1) * chunk_sz) for p in range(self.n_jobs)] - ranges[-1] = (ranges[-1][0], k_cols) # extend last chunk to cover remainder - ranges = [(s, e) for s, e in ranges if s < k_cols] # drop empty ranges - - results = Parallel(n_jobs=self.n_jobs)( - delayed(_postprocess_col_range)( - s, e, - connectivity_matrix, - reconsider_mask, - self.param, - future_use_phi_of, - original_dists, - n, - n_jobs=self.n_jobs, - ) - for s, e in ranges - ) - - # Reduce chunk results into a single (zeta_min, best_col). + # Revert to serial loop for column checks. + # Local empirical tests show that Parallel overhead here outweighs the + # benefit for small k (neighbors), especially since this is called + # inside a convergence loop. zeta_min = np.full(n, np.inf) best_col = np.zeros(n, dtype=int) - for chunk_zeta_min, chunk_best_col in results: - improved = chunk_zeta_min < zeta_min - best_col[improved] = chunk_best_col[improved] - zeta_min = np.minimum(zeta_min, chunk_zeta_min) + + # Cache the number of jobs for batched_eval_ memory scaling + n_jobs = self.n_jobs if self.n_jobs > 0 else 1 + + for i in range(connectivity_matrix.shape[1]): + col_mask = reconsider_mask[:, i] + points_to_reconsider = np.where(col_mask)[0] + if len(points_to_reconsider) == 0: + continue + + use_phi_of = connectivity_matrix[points_to_reconsider, i] + neighbors_of_points = connectivity_matrix[points_to_reconsider] + + # batched_eval_ now uses cached memory limits to reduce overhead + batch_embedded_datapoints = self.param.batched_eval_( + future_use_phi_of[use_phi_of], + neighbors_of_points, + n_jobs=n_jobs + ) + batch_embedded_dists = batched_pdist(batch_embedded_datapoints) + batch_original_dists = original_dists[points_to_reconsider] + + disc_lip_const = np.divide( + batch_embedded_dists, + batch_original_dists, + out=np.full_like(batch_embedded_dists, np.nan), + where=batch_original_dists != 0, + ) + + col_zeta = np.nanmax(disc_lip_const, axis=1) / ( + np.nanmin(disc_lip_const, axis=1) + 1e-12 + ) + + improved = col_zeta < zeta_min[points_to_reconsider] + improved_pts = points_to_reconsider[improved] + best_col[improved_pts] = i + zeta_min[improved_pts] = col_zeta[improved] param_changed = np.where(zeta > zeta_min)[0] future_use_phi_of[param_changed] = future_use_phi_of[ From a46dceb5e7ea8690ca28072434bfaab2be823d9e Mon Sep 17 00:00:00 2001 From: Joshua Offergeld Date: Sun, 12 Apr 2026 10:13:28 +0200 Subject: [PATCH 31/44] feat: add global distortion measurement --- examples/kleinbottle_example.ipynb | 34 +++- src/pyRATS/global_distortion.py | 283 +++++++++++++++++++++++++++++ 2 files changed, 307 insertions(+), 10 deletions(-) create mode 100644 src/pyRATS/global_distortion.py diff --git a/examples/kleinbottle_example.ipynb b/examples/kleinbottle_example.ipynb index 8723479..6b34a91 100644 --- a/examples/kleinbottle_example.ipynb +++ b/examples/kleinbottle_example.ipynb @@ -7,7 +7,7 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install git+https://github.com/Mishne-Lab/pyRATS\n", + "# %pip install git+https://github.com/Mishne-Lab/pyRATS\n", "# if not already installed on the system:\n", "# %pip install pandas imageio seaborn matplotlib" ] @@ -19,9 +19,12 @@ "metadata": {}, "outputs": [], "source": [ + "import sys\n", + "sys.path.insert(0, '../src/')\n", + "\n", "from matplotlib import pyplot as plt\n", "\n", - "from pyRATS import rats\n", + "from pyRATS import rats, global_distortion\n", "import datasets, vis" ] }, @@ -69,6 +72,25 @@ "y" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fdf51f9", + "metadata": {}, + "outputs": [], + "source": [ + "n_nbrs = global_distortion.get_n_nbrs_for_distortion_analysis(X)\n", + "\n", + "global_distortion.compute_global_distortion(\n", + " X,\n", + " y,\n", + " n_nbrs,\n", + " buml_obj,\n", + " metric=\"euclidean\",\n", + " max_crossings=20,\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -90,14 +112,6 @@ ")\n", "plt.show()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9b56299", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/src/pyRATS/global_distortion.py b/src/pyRATS/global_distortion.py new file mode 100644 index 0000000..8f715a4 --- /dev/null +++ b/src/pyRATS/global_distortion.py @@ -0,0 +1,283 @@ +from scipy.sparse import csr_matrix +from scipy.sparse.csgraph import shortest_path +from sklearn.neighbors import NearestNeighbors +from pyRATS._tear_coloring import compute_tear_graph +import numpy as np +from joblib import Parallel, delayed + + +def _shortest_paths( + X, + k, + metric="euclidean", +): + nbrs = NearestNeighbors(n_neighbors=k, metric=metric).fit(X) + knn_graph = nbrs.kneighbors_graph(mode="distance") + + return shortest_path( + knn_graph, + return_predecessors=False, + directed=False, + ) + + +def _compute_tear_aware_shortest_path_distances( + param, + y, + Utilde, + n_Utilde_Utilde, + Utildeg, + C, + c, + k, + nu, + metric="euclidean", + max_crossings=5, + tol=1e-6, + dist=None, + n_batches=None, + debug=False, + n_jobs=-1, +): + if n_batches is None: + n_batches = n_jobs + if dist is None: + dist = _shortest_paths(y, n_nbrs=k, metric=metric, return_predqecessors=False) + + output = compute_tear_graph( + y, + Utilde, + C, + n_Utilde_Utilde, + k, + nu, + metric, + False, + n_jobs, + i_mat_in_emb=Utildeg, + ) + if output is None: + return dist + + pts_across_tear, tear_graph = output + tear_graph = tear_graph.tocoo() + tear_graph_row_inds = tear_graph.row + tear_graph_col_inds = tear_graph.col + tear_graph_with_weights = _add_weights_to_tear_graph( + param, tear_graph, pts_across_tear, c + ) + dist_of_pts_across_tear = tear_graph_with_weights.data + tear_graph_row_inds = pts_across_tear[tear_graph_row_inds] + tear_graph_col_inds = pts_across_tear[tear_graph_col_inds] + + # make shortcuts in dist + for k in range(len(tear_graph_row_inds)): + dist[tear_graph_row_inds[k], tear_graph_col_inds[k]] = dist_of_pts_across_tear[ + k + ] + + final_dist = _recompute_dist_using_tear( + dist, + pts_across_tear, + max_crossings=max_crossings, + n_batches=n_batches, + tol=tol, + debug=debug, + ) + + return final_dist + + +def _add_weights_to_tear_graph( + param, tear_graph, pts_across_tear, cluster_label, n_batches=8 +): + tear_graph = tear_graph.copy().tocoo() + tear_graph_row_inds = tear_graph.row + tear_graph_col_inds = tear_graph.col + n_edges = len(tear_graph_row_inds) + dist_of_pts_across_tear = np.zeros(n_edges) + + def process_(start_ind, end_ind): + temp = np.zeros(end_ind - start_ind) + np.inf + for k in range(start_ind, end_ind): + r_i = tear_graph_row_inds[k] + c_i = tear_graph_col_inds[k] + edge_i = pts_across_tear[r_i] + edge_j = pts_across_tear[c_i] + # for view_ind in view_cont_pts_across_tear[(edge_i, edge_j)]: + for view_ind in [cluster_label[edge_i], cluster_label[edge_j]]: + local_coords = param.eval_( + {"data_mask": [edge_i, edge_j], "view_index": view_ind} + ) + temp[k - start_ind] = min( + temp[k - start_ind], + np.linalg.norm(local_coords[0, :] - local_coords[1, :]), + ) + return temp + + chunk_sz = n_edges // n_batches + start_end_inds = [] + for i in range(n_batches): + start_ind = i * chunk_sz + if i < n_batches - 1: + end_ind = start_ind + chunk_sz + else: + end_ind = n_edges + start_end_inds.append((start_ind, end_ind)) + + results = Parallel(n_jobs=n_batches)( + delayed(process_)(a, b) for a, b in start_end_inds + ) + dist_of_pts_across_tear = np.concatenate(results) + + return csr_matrix( + (dist_of_pts_across_tear, (tear_graph_row_inds, tear_graph_col_inds)), + shape=tear_graph.shape, + ) + + +def _recompute_dist_using_tear( + dist, pts_across_tear, max_crossings=20, n_batches=32, tol=1e-6, debug=False +): + if n_batches < 1: + n_batches = 5 + + n = dist.shape[0] + if debug: + dists = [dist.copy()] + for i_crossing in range(max_crossings): + old_dist = dist.copy() + dist_from_pts_across_tear = dist[pts_across_tear, :].T # n x n_pts_across_tear + + def process_(dist, start_ind, end_ind): + dist_ = np.zeros((end_ind - start_ind, n)) + for i_ in range(start_ind, end_ind): + i = i_ - start_ind + dist_[i, :] = np.minimum( + dist[i, :], + np.minimum( + dist[i, :], + np.min( + dist_from_pts_across_tear[i_, :][:, None] + + dist_from_pts_across_tear.T, + axis=0, + ), + ), + ) + return dist_ + + start_end_inds = [] + chunk_sz = n // n_batches + for j in range(n_batches): + start_ind = j * chunk_sz + if j < n_batches - 1: + end_ind = start_ind + chunk_sz + else: + end_ind = n + start_end_inds.append((start_ind, end_ind)) + results = Parallel(n_jobs=n_batches)( + delayed(process_)(dist.copy()[a:b, :], a, b) for a, b in start_end_inds + ) + + for i in range(len(results)): + start_ind, end_ind = start_end_inds[i] + dist[start_ind:end_ind, :] = results[i] + + if debug: + dists.append(dist.copy()) + + mean_abs_rel_diff = np.ma.masked_invalid( + np.abs(dist - old_dist) / (old_dist + 1e-12) + ).mean() + + if mean_abs_rel_diff < tol: + break + + return dist + + +def _compute_distortion_at(y_d_e, s_d_e): + scale_factors = (y_d_e + 1e-12) / (s_d_e + 1e-12) + mask = np.ones(scale_factors.shape, dtype=bool) + np.fill_diagonal(mask, 0) + max_distortion = np.max(scale_factors[mask]) / np.min(scale_factors[mask]) + n = y_d_e.shape[0] + distortion_at = np.zeros(n) + mask = np.ones(n, dtype=bool) + for i in range(n): + mask[i] = 0 + distortion_at[i] = np.max(scale_factors[i, mask]) / np.min( + scale_factors[i, mask] + ) + mask[i] = 1 + return distortion_at, max_distortion + + +def get_n_nbrs_for_distortion_analysis( + X, + n_nbrs_list=list(range(5, 16)), + tol=1e-3, + persistence=3, + metric="euclidean", +): + + gt_dist_list = [_shortest_paths(X, k, metric=metric) for k in n_nbrs_list] + + spd_rel_mean_abs_diff = [] + for i in range(1, len(gt_dist_list)): + temp = np.mean( + np.abs( + (gt_dist_list[i] - gt_dist_list[i - 1]) / (gt_dist_list[i - 1] + 1e-12) + ) + ) + spd_rel_mean_abs_diff.append(temp) + + while True: + temp = np.array(spd_rel_mean_abs_diff) < tol + if np.sum(temp) == 0: + tol = 2 * tol + else: + temp2 = np.zeros(len(temp) - persistence + 1, dtype=bool) + for i in range(len(temp) - persistence + 1): + if np.prod(temp[i : i + persistence]): + temp2[i] = 1 + if np.sum(temp2) == 0: + tol = 2 * tol + else: + n_nbrs_ind = np.argmax(temp2) + break + + return n_nbrs_list[n_nbrs_ind] + + +def compute_global_distortion( + X, + y, + n_nbrs, + model, + metric="euclidean", + max_crossings=20, +): + + gt_dist = _shortest_paths(X, n_nbrs, metric=metric) + + emb_dist = _shortest_paths(y, n_nbrs) + + if model.to_tear: + emb_dist = _compute_tear_aware_shortest_path_distances( + model.param, + y, + model.Utilde, + model.n_Utilde_Utilde, + model.Utildeg, + model.C, + model.c, + n_nbrs, + model.nu, + metric, + max_crossings=max_crossings, + dist=emb_dist, + n_jobs=model.n_jobs, + ) + + return _compute_distortion_at(emb_dist, gt_dist) From 293c317e7a8c224dbd6625b13a39061a20c84efd Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Sun, 3 May 2026 14:28:39 +0200 Subject: [PATCH 32/44] Fix for the breakages of parallelization in batched calls --- src/pyRATS/_utils.py | 95 ++++++++++++++++++++++++++------------------ 1 file changed, 57 insertions(+), 38 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index 8305818..19b0181 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -6,7 +6,7 @@ from scipy.sparse import csr_matrix, triu, block_diag, diags import itertools -from joblib import delayed, Parallel +from joblib import delayed, Parallel, parallel_backend from scipy.spatial.distance import pdist, squareform from sklearn.utils.extmath import svd_flip @@ -20,30 +20,33 @@ import os def _get_available_memory(n_jobs=1): - """Returns the available memory in bytes, scaled by n_jobs. - - Checks for PYRATS_MEMORY_LIMIT environment variable, then tries to use psutil, - and finally falls back to a 4GB default (divided by n_jobs). + """Returns the per-worker memory budget in bytes. + + Used by chunking code paths to size buffers safely. When the caller is + running inside ``n_jobs`` concurrent workers each holding their own + buffers, pass ``n_jobs`` so the budget is divided accordingly. When + called from a serial code path, leave ``n_jobs=1``. + + Sources, in priority order: + 1. PYRATS_MEMORY_LIMIT environment variable (total available bytes) + 2. psutil's available system RAM, scaled by 0.75 + 3. 4GB fallback """ - # 1. Environment variable override limit = os.environ.get("PYRATS_MEMORY_LIMIT") if limit: try: - return int(limit) + return int(limit) // max(1, n_jobs) except ValueError: pass - - # 2. Try psutil for dynamic discovery - # Cache the result to avoid repeated system calls in tight loops + if not hasattr(_get_available_memory, "_cached_val"): try: import psutil - # Use a fraction of available RAM as a safe upper bound for our buffer. _get_available_memory._cached_val = int(psutil.virtual_memory().available * 0.75) except (ImportError, AttributeError): - _get_available_memory._cached_val = 4 * 1024 * 1024 * 1024 # 4GB fallback + _get_available_memory._cached_val = 4 * 1024 * 1024 * 1024 - return _get_available_memory._cached_val // n_jobs + return _get_available_memory._cached_val // max(1, n_jobs) def lpca(X, d, U, n_jobs, verbose=False): @@ -128,12 +131,17 @@ def target_proc(p_num, chunk_sz): if n_jobs == 1 or n < 1000: # Threshold for Parallel overhead results = [target_proc(0, n)] else: - results = Parallel(n_jobs=n_jobs)( - delayed(target_proc)(i, chunk_sz) - for i in tqdm( - range(n_jobs), desc="PCA", unit="chunk", leave=False, disable=not verbose + # inner_max_num_threads=1: each loky worker would otherwise fork its + # own BLAS pool sized to all cores, causing n_jobs**2 thread contention + # which produces a slowdown past n_jobs ~ ncores/2. Capping the inner + # pool to 1 keeps scaling monotonic. + with parallel_backend("loky", inner_max_num_threads=1): + results = Parallel(n_jobs=n_jobs)( + delayed(target_proc)(i, chunk_sz) + for i in tqdm( + range(n_jobs), desc="PCA", unit="chunk", leave=False, disable=not verbose + ) ) - ) for i in range(len(results)): start_ind, end_ind, Psi_, mu_, var_explained_, n_pc_dir_chosen_ = results[i] @@ -212,12 +220,13 @@ def target_proc(p_num, chunk_sz): return start_ind, end_ind, model_ chunk_sz = int(n / n_jobs) - results = Parallel(n_jobs=n_jobs)( - delayed(target_proc)(p_num, chunk_sz) - for p_num in tqdm( - range(n_jobs), desc="KPCA", unit="chunk", leave=False, disable=not verbose + with parallel_backend("loky", inner_max_num_threads=1): + results = Parallel(n_jobs=n_jobs)( + delayed(target_proc)(p_num, chunk_sz) + for p_num in tqdm( + range(n_jobs), desc="KPCA", unit="chunk", leave=False, disable=not verbose + ) ) - ) for start_ind, end_ind, model_ in results: local_param.model[start_ind:end_ind] = model_ @@ -730,16 +739,23 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): if n_jobs == 1 or n < 1000: results = [target_proc(0, n, n, Utilde, n_C, c)] else: - results = Parallel(n_jobs=n_jobs)( - delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c) - for p_num in tqdm( - range(n_jobs), - desc=f"eta={eta}", - unit="chunk", - leave=False, - disable=not verbose, + # inner_max_num_threads=1: each loky worker would otherwise fork its + # own BLAS pool. With Apple Accelerate / OpenBLAS sized to all cores, + # n_jobs workers x n_cores BLAS threads = n_jobs**2 software threads + # contending for n_cores hardware threads, which produces the + # 4 -> 8 jobs slowdown. Capping the inner pool to 1 keeps scaling + # monotonic. + with parallel_backend("loky", inner_max_num_threads=1): + results = Parallel(n_jobs=n_jobs)( + delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c) + for p_num in tqdm( + range(n_jobs), + desc=f"eta={eta}", + unit="chunk", + leave=False, + disable=not verbose, + ) ) - ) for start_ind, end_ind, cost_, dest_ in results: cost[start_ind:end_ind] = cost_ dest[start_ind:end_ind] = dest_ @@ -896,9 +912,11 @@ def target_proc(p_num): if n_jobs == 1 or n_elem < 1000: res = [target_proc(p_num) for p_num in range(n_jobs)] else: - res = Parallel(n_jobs=n_jobs, return_as="generator_unordered")( - delayed(target_proc)(p_num) for p_num in range(n_jobs) - ) + with parallel_backend("loky", inner_max_num_threads=1): + res = list(Parallel(n_jobs=n_jobs, + return_as="generator_unordered")( + delayed(target_proc)(p_num) for p_num in range(n_jobs) + )) for value in res: start_ind, end_ind, overlap_svals_ = value overlap_svals[start_ind:end_ind, :] = overlap_svals_ @@ -1616,7 +1634,9 @@ def batched_eval_(self, view_index, data_mask, n_jobs=1): List of points to map to the embedding dimension for each point in 'view_index' n_jobs : int, default=1 - Number of parallel jobs. Used to scale memory limits. + Number of concurrent workers calling this method. Used only to + divide the chunking memory budget so each worker stays within + its safe share of available RAM. Pass 1 from serial code paths. """ ks = view_index @@ -1627,8 +1647,7 @@ def batched_eval_(self, view_index, data_mask, n_jobs=1): n_eval = len(ks) n_neighbors = masks.shape[1] n_features = self.X.shape[1] - - # Calculate chunk size based on available memory + memory_limit = _get_available_memory(n_jobs=n_jobs) itemsize = self.X.dtype.itemsize # Target buffer size per chunk: (chunk_size * n_neighbors * n_features * itemsize) From 5369ea34b665adcc7c530879358fbd586f02c74b Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Sun, 3 May 2026 14:49:26 +0200 Subject: [PATCH 33/44] Memory Limiting Overhaul Check --- src/pyRATS/_tear_coloring.py | 13 +- src/pyRATS/_utils.py | 225 +++++++++++++++++++++-------- src/pyRATS/rats.py | 74 +++++----- tests/scripts/thread_scaling.py | 71 +++++++++ tests/test_memory.py | 248 ++++++++++++++++++++++++++++++++ 5 files changed, 526 insertions(+), 105 deletions(-) create mode 100644 tests/scripts/thread_scaling.py create mode 100644 tests/test_memory.py diff --git a/src/pyRATS/_tear_coloring.py b/src/pyRATS/_tear_coloring.py index bd43b33..ddd3661 100644 --- a/src/pyRATS/_tear_coloring.py +++ b/src/pyRATS/_tear_coloring.py @@ -3,7 +3,7 @@ from scipy.sparse.csgraph import connected_components, laplacian import scipy -from joblib import delayed, Parallel +from joblib import delayed, Parallel, parallel_backend from tqdm import tqdm from pyRATS import _utils @@ -262,10 +262,13 @@ def process_pairs(ij_pairs): else: ij_pairs_batches = [(views_across_tear_graph_row, views_across_tear_graph_col)] - # Run in parallel - results = Parallel(n_jobs=n_jobs)( - delayed(process_pairs)(ij_pairs) for ij_pairs in ij_pairs_batches - ) + # Run in parallel; cap inner BLAS threads so each loky worker doesn't + # spawn its own multi-threaded BLAS pool, which would cause n_jobs**2 + # thread oversubscription against ncores hardware threads. + with parallel_backend("loky", inner_max_num_threads=1): + results = Parallel(n_jobs=n_jobs)( + delayed(process_pairs)(ij_pairs) for ij_pairs in ij_pairs_batches + ) # Aggregate results tear_graph_row_inds = np.concatenate([r for r, _, _ in results if len(r)]) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index 19b0181..a103ce7 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -18,35 +18,80 @@ from tqdm import tqdm import os +import sys +import warnings + +_MIN_MEMORY_FLOOR = 64 * 1024 * 1024 # 64 MB — always make some progress +_FALLBACK_MEMORY = 4 * 1024 * 1024 * 1024 # 4 GB if psutil is missing + + +def _cgroup_available_bytes(): + """Linux-only: bytes still available inside the current cgroup, or None. + + Reads cgroup v2 first, then falls back to v1. Returns None on any other + OS, when no limit is set, or when the files can't be parsed. + """ + if not sys.platform.startswith("linux"): + return None + pairs = ( + ("/sys/fs/cgroup/memory.max", "/sys/fs/cgroup/memory.current"), + ("/sys/fs/cgroup/memory/memory.limit_in_bytes", + "/sys/fs/cgroup/memory/memory.usage_in_bytes"), + ) + for max_path, cur_path in pairs: + try: + with open(max_path) as f: + raw = f.read().strip() + if not raw or raw == "max": + return None + limit = int(raw) + # cgroup v1 uses a huge sentinel when unconstrained. + if limit >= (1 << 62): + return None + with open(cur_path) as f: + used = int(f.read().strip()) + return max(0, limit - used) + except (OSError, ValueError): + continue + return None -def _get_available_memory(n_jobs=1): - """Returns the per-worker memory budget in bytes. - Used by chunking code paths to size buffers safely. When the caller is - running inside ``n_jobs`` concurrent workers each holding their own - buffers, pass ``n_jobs`` so the budget is divided accordingly. When - called from a serial code path, leave ``n_jobs=1``. +def _available_memory_bytes(): + """Live probe of usable memory in bytes. - Sources, in priority order: - 1. PYRATS_MEMORY_LIMIT environment variable (total available bytes) - 2. psutil's available system RAM, scaled by 0.75 - 3. 4GB fallback + Sources, all combined via min(): + * psutil.virtual_memory().available scaled by 0.75 (or 4GB fallback) + * cgroup v2/v1 remaining quota on Linux containers (avoids OOM-kill + before psutil sees pressure) + * PYRATS_MEMORY_LIMIT env var as a hard cap + + Floored at 64MB so chunking can always make at least one row of progress. + Called per request; do not cache — pyRATS allocates large internal state + during a fit, and a stale snapshot is the root cause of the prior OOMs. """ - limit = os.environ.get("PYRATS_MEMORY_LIMIT") - if limit: + try: + import psutil + avail = int(psutil.virtual_memory().available * 0.75) + except (ImportError, AttributeError): + avail = _FALLBACK_MEMORY + + cgroup = _cgroup_available_bytes() + if cgroup is not None: + avail = min(avail, int(cgroup * 0.75)) + + user_cap = os.environ.get("PYRATS_MEMORY_LIMIT") + if user_cap: try: - return int(limit) // max(1, n_jobs) + avail = min(avail, int(user_cap)) except ValueError: pass - if not hasattr(_get_available_memory, "_cached_val"): - try: - import psutil - _get_available_memory._cached_val = int(psutil.virtual_memory().available * 0.75) - except (ImportError, AttributeError): - _get_available_memory._cached_val = 4 * 1024 * 1024 * 1024 + return max(avail, _MIN_MEMORY_FLOOR) - return _get_available_memory._cached_val // max(1, n_jobs) + +# Backward-compatible alias for any external caller pinned to the old name. +def _get_available_memory(n_jobs=1): + return _available_memory_bytes() // max(1, n_jobs) def lpca(X, d, U, n_jobs, verbose=False): @@ -1622,74 +1667,104 @@ def __init__(self, algo="lpca", **kwargs): self.add_dim = False self.standardize = False - def batched_eval_(self, view_index, data_mask, n_jobs=1): - """Maps multiple points to the new space through their local (K-)PCA prameterizations. + def iter_eval_(self, view_index, data_mask, peak_bytes_per_row=0): + """Stream the local-(K-)PCA embedding in memory-bounded chunks. + + Yields ``(slice, chunk)`` pairs where ``chunk`` is the embedded + sub-array for ``view_index[slice]`` / ``data_mask[slice]``. Lets + callers reduce per chunk (e.g. compute pdists, take a max) without + ever materializing the full ``(n_eval, k, d)`` output. Parameters ------ - view_index : array-like, shape (n_points) - The indices of the parameterization to use. - + view_index : array-like, shape (n_points,) data_mask : array-like, shape (n_points, n_neighbors) - List of points to map to the embedding dimension for each point in 'view_index' - - n_jobs : int, default=1 - Number of concurrent workers calling this method. Used only to - divide the chunking memory budget so each worker stays within - its safe share of available RAM. Pass 1 from serial code paths. + peak_bytes_per_row : int, default=0 + Caller-provided estimate of additional bytes the downstream + pipeline will allocate per row (e.g. batched_pdist's diff + buffer). Folded into the chunk-size budget so the *whole* + pipeline stays within available memory, not just this method. + + On MemoryError the chunk size is halved and the slice is retried; + a one-time RuntimeWarning is emitted so the user knows they're at + the limit. """ - - ks = view_index + ks = np.asarray(view_index) if not isinstance(view_index, np.ndarray) else view_index masks = data_mask + n_eval = len(ks) + if n_eval == 0: + return if self.algo == "lpca": - # Dynamic chunking to prevent memory spikes on high-dimensional data - n_eval = len(ks) n_neighbors = masks.shape[1] n_features = self.X.shape[1] - - memory_limit = _get_available_memory(n_jobs=n_jobs) + d = self.Psi.shape[2] itemsize = self.X.dtype.itemsize - # Target buffer size per chunk: (chunk_size * n_neighbors * n_features * itemsize) - chunk_size = max(1, memory_limit // (n_neighbors * n_features * itemsize)) - - if n_eval <= chunk_size: - # fits in memory, process all at once - temp = np.matmul( - self.X[masks] - self.mu[ks][:, np.newaxis, :], self.Psi[ks] - ) - else: - # split into chunks - temp = np.zeros((n_eval, n_neighbors, self.Psi.shape[2])) - for start in range(0, n_eval, chunk_size): - end = min(start + chunk_size, n_eval) - ks_chunk = ks[start:end] - masks_chunk = masks[start:end] - temp[start:end] = np.matmul( - self.X[masks_chunk] - self.mu[ks_chunk][:, np.newaxis, :], - self.Psi[ks_chunk] + # Per-row peak inside this method: input gather + output buffer. + own_per_row = (n_neighbors * n_features + n_neighbors * d) * itemsize + per_row = max(1, own_per_row + max(0, int(peak_bytes_per_row))) + + chunk_size = max(1, _available_memory_bytes() // per_row) + chunk_size = min(chunk_size, n_eval) + + warned = False + start = 0 + while start < n_eval: + end = min(start + chunk_size, n_eval) + sl = slice(start, end) + ks_chunk = ks[sl] + masks_chunk = masks[sl] + try: + chunk = np.matmul( + self.X[masks_chunk] - self.mu[ks_chunk][:, np.newaxis, :], + self.Psi[ks_chunk], ) - n = self.X.shape[0] + except MemoryError: + if chunk_size <= 1: + raise + chunk_size = max(1, chunk_size // 2) + if not warned: + warnings.warn( + "pyRATS: hit MemoryError during local embedding; " + f"halving chunk size to {chunk_size} and retrying. " + "Consider lowering n_neighbors or setting " + "PYRATS_MEMORY_LIMIT.", + RuntimeWarning, + stacklevel=2, + ) + warned = True + continue + chunk = self._apply_post_(ks_chunk, masks_chunk, chunk) + yield sl, chunk + start = end else: - temp = [] + # KPCA path is already row-by-row; yield singletons so callers + # have a uniform streaming interface. for i, k in enumerate(ks): X_ = self.X[masks[i], :] if self.standardize: X_ = X_ - np.mean(X_, axis=0)[None, :] X_ = X_ / (np.std(X_, axis=0, ddof=1)[None, :] + 1e-12) - temp.append(self.model[k].transform(X_)) - temp = np.array(temp) + chunk = self.model[k].transform(X_)[None, ...] + ks_chunk = ks[i:i + 1] + masks_chunk = masks[i:i + 1] + chunk = self._apply_post_(ks_chunk, masks_chunk, chunk) + yield slice(i, i + 1), chunk + + def _apply_post_(self, ks, masks, temp): + """Apply noise / add_dim / b / T / v to a chunk of embedded points. + Factored out of batched_eval_ so iter_eval_ can apply the same + post-processing per chunk. Semantics match the original inline code. + """ if self.noise_var: np.random.seed(self.noise_seed[0]) - temp2 = np.random.normal(0, self.noise_var, (n, temp.shape[2])) + temp2 = np.random.normal(0, self.noise_var, (self.X.shape[0], temp.shape[2])) temp = temp + temp2[masks, :] if self.noise is not None: temp = temp + self.noise[masks] - if self.add_dim: temp = np.concatenate([temp, np.zeros((*temp.shape[:2], 1))], axis=2) - if self.b is not None: temp = temp * self.b[ks][:, None, None] if self.T is not None: @@ -1698,6 +1773,32 @@ def batched_eval_(self, view_index, data_mask, n_jobs=1): temp = temp + self.v[[ks], :] return temp + def batched_eval_(self, view_index, data_mask, n_jobs=1): + """Materialize the full local-(K-)PCA embedding. + + Thin wrapper around iter_eval_ for callers that need the whole + ``(n_eval, n_neighbors, d)`` array (e.g. clustering's procrustes + cost). Memory-bounded internally via iter_eval_'s chunking and + MemoryError backoff. + + ``n_jobs`` is accepted for backward compatibility and ignored; + the live memory probe in iter_eval_ already reflects whatever + sibling workers have allocated. + """ + ks = view_index + masks = data_mask + n_eval = len(ks) + if n_eval == 0: + d = self.Psi.shape[2] if self.algo == "lpca" else 0 + return np.zeros((0, masks.shape[1] if hasattr(masks, "shape") else 0, d)) + + out = None + for sl, chunk in self.iter_eval_(ks, masks): + if out is None: + out = np.empty((n_eval, chunk.shape[1], chunk.shape[2]), dtype=chunk.dtype) + out[sl] = chunk + return out + def eval_(self, view_index, data_mask, apply_b=True): """Maps points to the new space through their local (K-)PCA prameterizations. diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index 886bb9c..bef7e1c 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -609,19 +609,26 @@ def _postprocess(self): self.neighborhood_graph_sym[row_idx, col_idx] ).reshape(n, num_pairs) - embedded_datapoints = self.param.batched_eval_( - range(n), self.U.indices.reshape(n, self.k) - ) # tested and they are equivalent - embedded_dists = batched_pdist(embedded_datapoints) - disc_lip_const = np.divide( - embedded_dists, - original_dists, - out=np.full_like(embedded_dists, np.nan), - where=original_dists != 0, - ) - zeta = np.nanmax(disc_lip_const, axis=1) / ( - np.nanmin(disc_lip_const, axis=1) + 1e-12 - ) + # Per-row peak for the downstream batched_pdist: a (chunk, num_pairs, d) + # diff buffer plus its (chunk, num_pairs) result. Folded into the + # iter_eval_ chunk-size budget so the whole pipeline stays bounded. + d_embed = self.param.Psi.shape[2] if self.param.algo == "lpca" else self.d + itemsize = self.param.X.dtype.itemsize + pdist_per_row = num_pairs * (d_embed + 1) * itemsize + + zeta = np.empty(n) + view_idx = np.arange(n, dtype=int) + masks_init = self.U.indices.reshape(n, self.k) + for sl, chunk in self.param.iter_eval_(view_idx, masks_init, + peak_bytes_per_row=pdist_per_row): + chunk_dists = batched_pdist(chunk) + chunk_orig = original_dists[sl] + dlc = np.divide( + chunk_dists, chunk_orig, + out=np.full_like(chunk_dists, np.nan), + where=chunk_orig != 0, + ) + zeta[sl] = np.nanmax(dlc, axis=1) / (np.nanmin(dlc, axis=1) + 1e-12) self.param.zeta = zeta @@ -642,15 +649,8 @@ def _postprocess(self): & (connectivity_matrix != np.arange(n)[:, None]) ) - # Revert to serial loop for column checks. - # Local empirical tests show that Parallel overhead here outweighs the - # benefit for small k (neighbors), especially since this is called - # inside a convergence loop. zeta_min = np.full(n, np.inf) best_col = np.zeros(n, dtype=int) - - # Cache the number of jobs for batched_eval_ memory scaling - n_jobs = self.n_jobs if self.n_jobs > 0 else 1 for i in range(connectivity_matrix.shape[1]): col_mask = reconsider_mask[:, i] @@ -660,26 +660,24 @@ def _postprocess(self): use_phi_of = connectivity_matrix[points_to_reconsider, i] neighbors_of_points = connectivity_matrix[points_to_reconsider] - - # batched_eval_ now uses cached memory limits to reduce overhead - batch_embedded_datapoints = self.param.batched_eval_( - future_use_phi_of[use_phi_of], - neighbors_of_points, - n_jobs=n_jobs - ) - batch_embedded_dists = batched_pdist(batch_embedded_datapoints) batch_original_dists = original_dists[points_to_reconsider] - disc_lip_const = np.divide( - batch_embedded_dists, - batch_original_dists, - out=np.full_like(batch_embedded_dists, np.nan), - where=batch_original_dists != 0, - ) - - col_zeta = np.nanmax(disc_lip_const, axis=1) / ( - np.nanmin(disc_lip_const, axis=1) + 1e-12 - ) + col_zeta = np.empty(len(points_to_reconsider)) + for sl, chunk in self.param.iter_eval_( + future_use_phi_of[use_phi_of], + neighbors_of_points, + peak_bytes_per_row=pdist_per_row, + ): + chunk_dists = batched_pdist(chunk) + chunk_orig = batch_original_dists[sl] + dlc = np.divide( + chunk_dists, chunk_orig, + out=np.full_like(chunk_dists, np.nan), + where=chunk_orig != 0, + ) + col_zeta[sl] = np.nanmax(dlc, axis=1) / ( + np.nanmin(dlc, axis=1) + 1e-12 + ) improved = col_zeta < zeta_min[points_to_reconsider] improved_pts = points_to_reconsider[improved] diff --git a/tests/scripts/thread_scaling.py b/tests/scripts/thread_scaling.py new file mode 100644 index 0000000..425e186 --- /dev/null +++ b/tests/scripts/thread_scaling.py @@ -0,0 +1,71 @@ +"""Measure how RATS.fit_transform scales with n_jobs on a kleinbottle. + +Usage: + python tests/scripts/thread_scaling.py [--n N] [--jobs 1 2 4 8] [--repeats R] + +Prints a small table of wall times and speedup vs. the n_jobs=1 baseline. +""" + +import argparse +import os +import sys +import time + +# Allow importing src and examples +repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")) +if repo_root not in sys.path: + sys.path.insert(0, repo_root) + +from pyRATS import rats +from examples import datasets + + +def run_once(X, n_jobs): + model = rats.RATS( + n_components=2, + n_neighbors=14, + cost_function="distortion", + min_cluster_size=5, + n_iter=3, + nu=4, + verbose=False, + n_jobs=n_jobs, + ) + t0 = time.perf_counter() + model.fit_transform(X=X) + return time.perf_counter() - t0 + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--n", type=int, default=2000, + help="Klein bottle sample count (default 2000).") + parser.add_argument("--jobs", type=int, nargs="+", default=[1, 2, 4, 8], + help="List of n_jobs values to benchmark.") + parser.add_argument("--repeats", type=int, default=2, + help="Repeats per n_jobs (best time kept).") + args = parser.parse_args() + + print(f"Loading kleinbottle4d with n={args.n}...") + ds = datasets.Datasets() + X, _, _ = ds.kleinbottle4d(n=args.n) + + # Warmup so JIT / first-touch costs don't pollute the first measurement. + print("Warmup (n_jobs=1)...") + run_once(X, 1) + + print(f"\n| n_jobs | best (s) | speedup | all runs (s) |") + print(f"|--------|----------|---------|-------------------------|") + baseline = None + for nj in args.jobs: + runs = [run_once(X, nj) for _ in range(args.repeats)] + best = min(runs) + if baseline is None: + baseline = best + speedup = baseline / best + runs_str = ", ".join(f"{r:.2f}" for r in runs) + print(f"| {nj:6d} | {best:8.3f} | {speedup:7.2f} | {runs_str:23s} |") + + +if __name__ == "__main__": + main() diff --git a/tests/test_memory.py b/tests/test_memory.py new file mode 100644 index 0000000..99d3dd9 --- /dev/null +++ b/tests/test_memory.py @@ -0,0 +1,248 @@ +"""Memory-management tests for pyRATS. + +Covers: + * The live ``_available_memory_bytes`` probe and PYRATS_MEMORY_LIMIT cap. + * ``iter_eval_`` chunks the workload when the budget is tight, and the + streamed output matches the fully-materialized ``batched_eval_`` output. + * A simulated MemoryError causes chunk-size backoff (with a warning). + * Peak Python-tracked allocations during a streamed run stay within the + declared budget. +""" + +import os +import warnings +import tracemalloc + +import numpy as np +import pytest + +from pyRATS import _utils +from pyRATS._utils import Param, _available_memory_bytes, batched_pdist + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +def _make_lpca_param(n=400, n_features=32, k=12, d=2, seed=0): + """Build a self-contained lpca-style Param for testing iter_eval_. + + Skips the real lpca() fit (which we don't need) and just stuffs in + well-shaped Psi/mu/X arrays so iter_eval_ has something to chew on. + """ + rng = np.random.default_rng(seed) + p = Param(algo="lpca") + p.X = rng.standard_normal((n, n_features)).astype(np.float64) + p.mu = rng.standard_normal((n, n_features)) + p.Psi = rng.standard_normal((n, n_features, d)) + p.b = np.ones(n) + p.noise_var = 0 + p.noise = None + return p, k + + +@pytest.fixture +def clean_env(monkeypatch): + """Ensure PYRATS_MEMORY_LIMIT doesn't leak between tests.""" + monkeypatch.delenv("PYRATS_MEMORY_LIMIT", raising=False) + yield monkeypatch + + +# --------------------------------------------------------------------------- +# Probe +# --------------------------------------------------------------------------- + +def test_available_memory_honors_env_var(clean_env): + cap = 128 * 1024 * 1024 # 128 MB + clean_env.setenv("PYRATS_MEMORY_LIMIT", str(cap)) + assert _available_memory_bytes() <= cap + + +def test_available_memory_floor(clean_env): + # A ridiculously small cap still gets clamped to the 64MB floor so + # chunking can always make progress. + clean_env.setenv("PYRATS_MEMORY_LIMIT", "1") + assert _available_memory_bytes() >= 64 * 1024 * 1024 + + +def test_available_memory_is_live(clean_env): + """Probe must not cache: changing the env var must change the result.""" + clean_env.setenv("PYRATS_MEMORY_LIMIT", str(256 * 1024 * 1024)) + first = _available_memory_bytes() + clean_env.setenv("PYRATS_MEMORY_LIMIT", str(128 * 1024 * 1024)) + second = _available_memory_bytes() + assert second < first + + +# --------------------------------------------------------------------------- +# iter_eval_ chunking + streaming correctness +# --------------------------------------------------------------------------- + +def test_iter_eval_yields_multiple_chunks_when_budget_tight(clean_env): + p, k = _make_lpca_param(n=300, k=12) + masks = np.tile(np.arange(k), (300, 1)) + view_idx = np.arange(300) + + # Force a tiny budget so we get many chunks. Floor is 64MB so we go + # under it via monkeypatch to actually exercise the path. + clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024) + clean_env.setenv("PYRATS_MEMORY_LIMIT", "8192") # 8 KB + + chunks = list(p.iter_eval_(view_idx, masks)) + assert len(chunks) > 1, "expected chunking with a tight budget" + + # Sum of chunk lengths must cover the input exactly once. + covered = sum(sl.stop - sl.start for sl, _ in chunks) + assert covered == 300 + + +def test_iter_eval_output_matches_batched_eval(clean_env): + """Chunked stream must reconstruct the same array as the unchunked path.""" + p, k = _make_lpca_param(n=200, k=10) + masks = np.tile(np.arange(k), (200, 1)) + view_idx = np.arange(200) + + # Reference: large budget => one chunk. + clean_env.setenv("PYRATS_MEMORY_LIMIT", str(1 << 30)) + full = p.batched_eval_(view_idx, masks) + + # Streamed: tight budget => many chunks. Sum back and compare. + clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024) + clean_env.setenv("PYRATS_MEMORY_LIMIT", "16384") + streamed = np.empty_like(full) + nchunks = 0 + for sl, chunk in p.iter_eval_(view_idx, masks): + streamed[sl] = chunk + nchunks += 1 + assert nchunks > 1 + np.testing.assert_allclose(streamed, full, rtol=0, atol=0) + + +def test_peak_bytes_per_row_shrinks_chunks(clean_env): + """Caller-supplied downstream-cost estimate must shrink chunk size.""" + p, k = _make_lpca_param(n=500, k=10) + masks = np.tile(np.arange(k), (500, 1)) + view_idx = np.arange(500) + + clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024) + clean_env.setenv("PYRATS_MEMORY_LIMIT", "65536") + + no_hint = list(p.iter_eval_(view_idx, masks, peak_bytes_per_row=0)) + with_hint = list( + p.iter_eval_(view_idx, masks, peak_bytes_per_row=10 * 1024) + ) + assert len(with_hint) >= len(no_hint), ( + "declaring downstream allocation should produce >= chunks" + ) + + +# --------------------------------------------------------------------------- +# MemoryError backoff +# --------------------------------------------------------------------------- + +def test_memory_error_triggers_backoff_and_warns(clean_env, monkeypatch): + p, k = _make_lpca_param(n=64, k=8) + masks = np.tile(np.arange(k), (64, 1)) + view_idx = np.arange(64) + + real_matmul = np.matmul + state = {"raised": False} + + def fake_matmul(a, b, *args, **kwargs): + # Raise MemoryError exactly once on the first call, then defer to + # real matmul. Mimics an OS-level transient pressure event. + if not state["raised"] and a.shape[0] > 1: + state["raised"] = True + raise MemoryError("simulated pressure") + return real_matmul(a, b, *args, **kwargs) + + monkeypatch.setattr(np, "matmul", fake_matmul) + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + out = p.batched_eval_(view_idx, masks) + + assert state["raised"], "fake_matmul was never invoked" + assert out.shape == (64, k, p.Psi.shape[2]) + assert any( + issubclass(w.category, RuntimeWarning) and "MemoryError" in str(w.message) + for w in caught + ), "expected a RuntimeWarning about MemoryError backoff" + + +def test_memory_error_propagates_when_chunk_already_one(clean_env, monkeypatch): + """If chunk size is already 1, backoff has nowhere to go — must raise.""" + p, k = _make_lpca_param(n=4, k=4) + masks = np.tile(np.arange(k), (4, 1)) + view_idx = np.arange(4) + + clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 16) + clean_env.setenv("PYRATS_MEMORY_LIMIT", "16") # forces chunk_size = 1 + + def always_fail(a, b, *args, **kwargs): + raise MemoryError("cannot recover") + + monkeypatch.setattr(np, "matmul", always_fail) + + with pytest.raises(MemoryError): + list(p.iter_eval_(view_idx, masks)) + + +# --------------------------------------------------------------------------- +# Measured peak (Python-tracked allocations) +# --------------------------------------------------------------------------- + +def test_streamed_postprocess_peak_under_budget(clean_env): + """End-to-end peak check: a streamed _postprocess-style reduction must + keep peak Python-tracked allocations near the per-chunk working set, + not the full (n, num_pairs, d) materialized intermediate. + + We measure peak with tracemalloc, which on CPython tracks NumPy buffers + via the default PyMem allocator. This is portable across Linux/macOS/ + Windows and avoids relying on RSS (which depends on OS overcommit). + """ + n, k, d = 1500, 16, 3 + p, _ = _make_lpca_param(n=n, k=k, d=d, n_features=24) + masks = np.tile(np.arange(k), (n, 1)) + view_idx = np.arange(n) + + num_pairs = k * (k - 1) // 2 + itemsize = p.X.dtype.itemsize + + # --- Baseline: materialize the full pipeline (no streaming) ---------- + clean_env.setenv("PYRATS_MEMORY_LIMIT", str(1 << 32)) # effectively unlimited + tracemalloc.start() + full_eval = p.batched_eval_(view_idx, masks) + full_dists = batched_pdist(full_eval) + _, materialized_peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + del full_eval, full_dists + + # --- Streamed: chunked _postprocess-style reduction ------------------ + # Cap at ~10x the per-row downstream peak so chunk_size lands at ~10 + # rows. The streamed peak should be a small fraction of the + # materialized peak. We also lower the floor so the cap actually bites. + pdist_per_row = num_pairs * (d + 1) * itemsize + budget = 10 * pdist_per_row * 8 + clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024) + clean_env.setenv("PYRATS_MEMORY_LIMIT", str(budget)) + + tracemalloc.start() + zeta = np.empty(n) + nchunks = 0 + for sl, chunk in p.iter_eval_(view_idx, masks, peak_bytes_per_row=pdist_per_row): + chunk_dists = batched_pdist(chunk) + zeta[sl] = np.nanmax(chunk_dists, axis=1) + nchunks += 1 + _, streamed_peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + assert nchunks > 1, "tight budget should produce multiple chunks" + # Streamed peak should be meaningfully smaller. Allow generous slack + # (0.5x) since tracemalloc captures fixed Param state too; the goal is + # to catch regressions where streaming silently degrades to full + # materialization. + assert streamed_peak < 0.5 * materialized_peak, ( + f"streamed peak {streamed_peak} not meaningfully below " + f"materialized peak {materialized_peak}" + ) From 20697620d8fcd206ec03b301d5b536e43bafc74f Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Sun, 3 May 2026 15:02:40 +0200 Subject: [PATCH 34/44] Allowing multiithreads within blas workers --- src/pyRATS/_utils.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index a103ce7..95b3817 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -94,6 +94,19 @@ def _get_available_memory(n_jobs=1): return _available_memory_bytes() // max(1, n_jobs) +def _inner_blas_threads(n_jobs): + """BLAS threads per outer Parallel worker, sized to avoid oversubscription. + + Apple Accelerate / OpenBLAS otherwise spawn a pool sized to all cores + inside each outer worker, giving n_jobs * cpu_count software threads on + cpu_count hardware threads — the n_jobs ~ cpu_count/2 cliff. Pinning to 1 + fixes that but leaves cores idle when n_jobs < cpu_count. cpu_count // + n_jobs strikes the balance: full utilization, no contention. Floored at 1. + """ + cores = os.cpu_count() or 1 + return max(1, cores // max(1, n_jobs)) + + def lpca(X, d, U, n_jobs, verbose=False): """Fit PCA model on the data X. @@ -176,11 +189,7 @@ def target_proc(p_num, chunk_sz): if n_jobs == 1 or n < 1000: # Threshold for Parallel overhead results = [target_proc(0, n)] else: - # inner_max_num_threads=1: each loky worker would otherwise fork its - # own BLAS pool sized to all cores, causing n_jobs**2 thread contention - # which produces a slowdown past n_jobs ~ ncores/2. Capping the inner - # pool to 1 keeps scaling monotonic. - with parallel_backend("loky", inner_max_num_threads=1): + with parallel_backend("loky", inner_max_num_threads=_inner_blas_threads(n_jobs)): results = Parallel(n_jobs=n_jobs)( delayed(target_proc)(i, chunk_sz) for i in tqdm( @@ -265,7 +274,7 @@ def target_proc(p_num, chunk_sz): return start_ind, end_ind, model_ chunk_sz = int(n / n_jobs) - with parallel_backend("loky", inner_max_num_threads=1): + with parallel_backend("loky", inner_max_num_threads=_inner_blas_threads(n_jobs)): results = Parallel(n_jobs=n_jobs)( delayed(target_proc)(p_num, chunk_sz) for p_num in tqdm( @@ -784,13 +793,7 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): if n_jobs == 1 or n < 1000: results = [target_proc(0, n, n, Utilde, n_C, c)] else: - # inner_max_num_threads=1: each loky worker would otherwise fork its - # own BLAS pool. With Apple Accelerate / OpenBLAS sized to all cores, - # n_jobs workers x n_cores BLAS threads = n_jobs**2 software threads - # contending for n_cores hardware threads, which produces the - # 4 -> 8 jobs slowdown. Capping the inner pool to 1 keeps scaling - # monotonic. - with parallel_backend("loky", inner_max_num_threads=1): + with parallel_backend("loky", inner_max_num_threads=_inner_blas_threads(n_jobs)): results = Parallel(n_jobs=n_jobs)( delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c) for p_num in tqdm( @@ -957,7 +960,7 @@ def target_proc(p_num): if n_jobs == 1 or n_elem < 1000: res = [target_proc(p_num) for p_num in range(n_jobs)] else: - with parallel_backend("loky", inner_max_num_threads=1): + with parallel_backend("loky", inner_max_num_threads=_inner_blas_threads(n_jobs)): res = list(Parallel(n_jobs=n_jobs, return_as="generator_unordered")( delayed(target_proc)(p_num) for p_num in range(n_jobs) From f01c358a393f264938cbcc365213e774c59fedbc Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Sun, 3 May 2026 15:05:08 +0200 Subject: [PATCH 35/44] ci fixes --- .github/workflows/benchmark.yml | 4 ++-- .github/workflows/ci.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 4fe417d..3364965 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -47,8 +47,8 @@ jobs: - name: Install Dependencies run: | - pip install --upgrade pip - pip install -e .[test] + python -m pip install --upgrade pip + python -m pip install -e .[test] - name: Run Benchmark run: | diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a0cc7e6..6dfe788 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,8 +29,8 @@ jobs: - name: Install Dependencies run: | - pip install --upgrade pip - pip install -e .[test] + python -m pip install --upgrade pip + python -m pip install -e .[test] - name: Set Fast Mode Flag id: set-flag From a2e7add0641bce8ab96edda7322e93c19e6fbe51 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Sun, 3 May 2026 18:44:32 +0200 Subject: [PATCH 36/44] Testing more efficient allocation and memory checks --- src/pyRATS/_utils.py | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index 95b3817..7f16c80 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -19,6 +19,7 @@ import os import sys +import time import warnings _MIN_MEMORY_FLOOR = 64 * 1024 * 1024 # 64 MB — always make some progress @@ -56,8 +57,12 @@ def _cgroup_available_bytes(): return None +_MEMORY_CACHE_TTL = 0.25 # seconds — fresh enough to track fit-phase allocations +_memory_cache: tuple[float, int] | None = None # (timestamp, bytes) + + def _available_memory_bytes(): - """Live probe of usable memory in bytes. + """Live probe of usable memory in bytes, cached with a short TTL. Sources, all combined via min(): * psutil.virtual_memory().available scaled by 0.75 (or 4GB fallback) @@ -65,10 +70,17 @@ def _available_memory_bytes(): before psutil sees pressure) * PYRATS_MEMORY_LIMIT env var as a hard cap + Result is cached for _MEMORY_CACHE_TTL seconds (default 250ms). This + eliminates psutil and cgroup file-read overhead on hot call sites (e.g. + iter_eval_ inside best()'s per-point loop) while still reacting to + large allocations between fit phases — the original "cache forever" bug. Floored at 64MB so chunking can always make at least one row of progress. - Called per request; do not cache — pyRATS allocates large internal state - during a fit, and a stale snapshot is the root cause of the prior OOMs. """ + global _memory_cache + now = time.monotonic() + if _memory_cache is not None and now - _memory_cache[0] < _MEMORY_CACHE_TTL: + return _memory_cache[1] + try: import psutil avail = int(psutil.virtual_memory().available * 0.75) @@ -86,7 +98,9 @@ def _available_memory_bytes(): except ValueError: pass - return max(avail, _MIN_MEMORY_FLOOR) + result = max(avail, _MIN_MEMORY_FLOOR) + _memory_cache = (now, result) + return result # Backward-compatible alias for any external caller pinned to the old name. @@ -1795,10 +1809,18 @@ def batched_eval_(self, view_index, data_mask, n_jobs=1): d = self.Psi.shape[2] if self.algo == "lpca" else 0 return np.zeros((0, masks.shape[1] if hasattr(masks, "shape") else 0, d)) - out = None - for sl, chunk in self.iter_eval_(ks, masks): - if out is None: - out = np.empty((n_eval, chunk.shape[1], chunk.shape[2]), dtype=chunk.dtype) + it = self.iter_eval_(ks, masks) + first_sl, first_chunk = next(it) + if first_sl.stop == n_eval: + # Common path: everything fit in one chunk — return directly, + # no extra allocation or copy. + return first_chunk + + # Multi-chunk path: accumulate into a pre-allocated output array. + out = np.empty((n_eval, first_chunk.shape[1], first_chunk.shape[2]), + dtype=first_chunk.dtype) + out[first_sl] = first_chunk + for sl, chunk in it: out[sl] = chunk return out From 84a46f46f7b8f8d81b9ed3c8fa6079bd52a4def0 Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Sun, 3 May 2026 18:53:07 +0200 Subject: [PATCH 37/44] Setting cpu core checker and warning --- src/pyRATS/rats.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index bef7e1c..5230592 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -252,6 +252,17 @@ def __init__( if n_jobs == -1: self.n_jobs = cpu_count() + cores = cpu_count() + if self.n_jobs > cores: + warnings.warn( + f"n_jobs={self.n_jobs} exceeds the number of available CPU cores " + f"({cores}). This causes oversubscription and will slow down the " + f"computation. n_jobs has been clamped to {cores}.", + UserWarning, + stacklevel=2, + ) + self.n_jobs = cores + def fit_transform(self, X): """Fit the model on the data in X, and transform X. From 7db1ddf1a1dd3eade845e43cda3d6445836f3975 Mon Sep 17 00:00:00 2001 From: Dhruv Kohli Date: Mon, 4 May 2026 05:33:04 -0700 Subject: [PATCH 38/44] Fix param.v update when the embedding dimension is 1, .squeeze will remove the axis corresponding to the embedding dimension. --- src/pyRATS/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index fd2dd9c..caa12df 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -1258,7 +1258,7 @@ def update(alpha, max_iter, O, CC, M, d): Tstar_ = Tstar.reshape((len(Tstar), d, -1), order="F").transpose(2, 1, 0) param.T = np.matmul(param.T, Tstar_) # update transformations of individual points - param.v = np.matmul(param.v[:, np.newaxis, :], Tstar_).squeeze() + Zstar[:, n:].T + param.v = np.matmul(param.v[:, np.newaxis, :], Tstar_)[:,0,:] + Zstar[:, n:].T return Zstar[:, :n].T From 39534f8fa0abb566028c956c65c4da28214bb23e Mon Sep 17 00:00:00 2001 From: Dhruv Kohli Date: Mon, 4 May 2026 05:37:57 -0700 Subject: [PATCH 39/44] Cast Utildeg to boolean after matrix multiplication C and Ug are boolean sparse matrices. On my end, scipy is resulting a matrix with int data type with summed values (treating True as ones) instead of OR. Casting into boolean dtype gives back OR-ed values. --- src/pyRATS/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index fd2dd9c..e9a71fc 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -1072,7 +1072,7 @@ def compute_incidence_matrix_in_embedding(y, C, k, nu, metric="euclidean"): k_ = min(int(k * nu), n - 1) _, neigh_indg = nearest_neighbors(y, k_, metric) Ug = sparse_matrix(neigh_indg, np.ones(neigh_indg.shape, dtype=bool)) - Utildeg = C.dot(Ug) + Utildeg = C.dot(Ug).astype(bool) return Utildeg From cf7eced5fde5043128e2546745afa562e414acc7 Mon Sep 17 00:00:00 2001 From: Dhruv Kohli Date: Mon, 4 May 2026 05:43:28 -0700 Subject: [PATCH 40/44] Fix output collection When there is no tear, compute_tear_graph returns a tuple of None. Comparing it with None results in False resulting in unnecessary subsequent processing. --- src/pyRATS/global_distortion.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/pyRATS/global_distortion.py b/src/pyRATS/global_distortion.py index 8f715a4..c96db6c 100644 --- a/src/pyRATS/global_distortion.py +++ b/src/pyRATS/global_distortion.py @@ -44,7 +44,7 @@ def _compute_tear_aware_shortest_path_distances( if dist is None: dist = _shortest_paths(y, n_nbrs=k, metric=metric, return_predqecessors=False) - output = compute_tear_graph( + pts_across_tear, tear_graph = compute_tear_graph( y, Utilde, C, @@ -56,10 +56,9 @@ def _compute_tear_aware_shortest_path_distances( n_jobs, i_mat_in_emb=Utildeg, ) - if output is None: + if tear_graph is None: return dist - - pts_across_tear, tear_graph = output + tear_graph = tear_graph.tocoo() tear_graph_row_inds = tear_graph.row tear_graph_col_inds = tear_graph.col From 07bb35e452b566aab628bd3c2513892e3f930216 Mon Sep 17 00:00:00 2001 From: Dhruv Kohli Date: Mon, 4 May 2026 05:46:29 -0700 Subject: [PATCH 41/44] Enhance comments on singular value handling in overlaps Added comments to clarify the significance of singular values in overlap calculations. --- src/pyRATS/_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index fd2dd9c..121213f 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -887,8 +887,15 @@ def target_proc(p_num): start_ind, end_ind, overlap_svals_ = value overlap_svals[start_ind:end_ind, :] = overlap_svals_ + # In most cases, when almost all overlaps have rank d, + # W_data = overlap_svals[:,-1], the d-th singular value of the overlap + # is sufficient to determine the priority of the overlaps. # overlap_svals[overlap_svals Date: Thu, 21 May 2026 14:19:10 +0200 Subject: [PATCH 42/44] Reverting to old parallel lib for exact matching to Dhruv's initial codebase --- .github/workflows/ci.yml | 2 +- src/pyRATS/_utils.py | 49 +++++++++++++++++++++++++++++----------- 2 files changed, 37 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6dfe788..fa5db79 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -50,7 +50,7 @@ jobs: # Step 2: Generate Expected Baselines (using baseline source code but current scripts) - name: Generate Expected Baselines run: | - git checkout dda054369e8a8ae9b8d166a7116ab983edfeaf8b -- src/ + git checkout 8e27b77b3247e1ac7447e7ea429e0f0c103a3fb0 -- src/ python tests/scripts/generate_snapshots.py --path-to-results-dir tests/data/expected ${{ steps.set-flag.outputs.fast_flag }} git checkout HEAD -- src/ diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index 7f16c80..d687544 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -7,6 +7,7 @@ import itertools from joblib import delayed, Parallel, parallel_backend +import multiprocess as mp_lib from scipy.spatial.distance import pdist, squareform from sklearn.utils.extmath import svd_flip @@ -534,7 +535,12 @@ def cost_of_moving_distortion( c_U_k_uniq = np.unique(c_U_k).tolist() cost_x_k_to = np.zeros(len(c_U_k_uniq)) + np.inf - U_k_list = list(U_k) + # Reconstruct the set from neigh_ind_k inside this process: pickling and + # unpickling a set rebuilds the hash table with a different bucket layout, + # so list(U_k) iterates in a different order in the parent vs. a loky + # worker. Building the set from the same numpy array in each process gives + # the same iteration order everywhere. + U_k_list = list(set(neigh_ind_k)) # Iterate over all m in c_{U_k} i = 0 @@ -673,7 +679,12 @@ def cost_of_moving_alignment_error( c_U_k_uniq = np.unique(c_U_k) cost_x_k_to = np.zeros(len(c_U_k_uniq)) + np.inf - U_k_list = list(U_k) + # Reconstruct the set from neigh_ind_k inside this process: pickling and + # unpickling a set rebuilds the hash table with a different bucket layout, + # so list(U_k) iterates in a different order in the parent vs. a loky + # worker. Building the set from the same numpy array in each process gives + # the same iteration order everywhere. + U_k_list = list(set(neigh_ind_k)) neighbor_mask = ( (n_C[c_U_k_uniq] < eta_max) & (n_C[c_U_k_uniq] >= n_C_c_k) & (c_U_k_uniq != c_k) @@ -807,17 +818,29 @@ def target_proc(p_num, chunk_sz, n_, Utilde, n_C, c): if n_jobs == 1 or n < 1000: results = [target_proc(0, n, n, Utilde, n_C, c)] else: - with parallel_backend("loky", inner_max_num_threads=_inner_blas_threads(n_jobs)): - results = Parallel(n_jobs=n_jobs)( - delayed(target_proc)(p_num, chunk_sz, n, Utilde, n_C, c) - for p_num in tqdm( - range(n_jobs), - desc=f"eta={eta}", - unit="chunk", - leave=False, - disable=not verbose, - ) - ) + # Use multiprocess.Process (fork) rather than joblib/loky (spawn). + # cost_of_moving reads Utilde[m], a Python set whose iteration + # order depends on its hash-table bucket layout. Spawn re-pickles + # the set and rebuilds the table with a different layout in each + # worker, so list(Utilde[m]) iterates in a different order than + # in the parent — permuting the rows/cols passed to + # local_param.eval_ and d_e, producing float-LSB cost diffs that + # flip argmin tie-breaks and yield divergent cluster + # assignments. Fork shares the parent's address space via + # copy-on-write, so the children see the exact same set layout + # and produce bit-identical costs to a serial run. + q = mp_lib.Queue() + def _worker(p_num): + q.put(target_proc(p_num, chunk_sz, n, Utilde, n_C, c)) + procs = [ + mp_lib.Process(target=_worker, args=(p_num,), daemon=True) + for p_num in range(n_jobs) + ] + for p in procs: + p.start() + results = [q.get() for _ in range(n_jobs)] + for p in procs: + p.join() for start_ind, end_ind, cost_, dest_ in results: cost[start_ind:end_ind] = cost_ dest[start_ind:end_ind] = dest_ From 02ca601e893a71b4f3b19f204333a1e61170e60d Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Thu, 21 May 2026 14:20:11 +0200 Subject: [PATCH 43/44] Memory cache update for memory testing --- tests/test_memory.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/test_memory.py b/tests/test_memory.py index 99d3dd9..66f5893 100644 --- a/tests/test_memory.py +++ b/tests/test_memory.py @@ -43,8 +43,9 @@ def _make_lpca_param(n=400, n_features=32, k=12, d=2, seed=0): @pytest.fixture def clean_env(monkeypatch): - """Ensure PYRATS_MEMORY_LIMIT doesn't leak between tests.""" + """Ensure PYRATS_MEMORY_LIMIT and the TTL cache don't leak between tests.""" monkeypatch.delenv("PYRATS_MEMORY_LIMIT", raising=False) + monkeypatch.setattr(_utils, "_memory_cache", None) yield monkeypatch @@ -66,9 +67,11 @@ def test_available_memory_floor(clean_env): def test_available_memory_is_live(clean_env): - """Probe must not cache: changing the env var must change the result.""" + """Env-var changes are picked up after the TTL expires (not cached forever).""" clean_env.setenv("PYRATS_MEMORY_LIMIT", str(256 * 1024 * 1024)) first = _available_memory_bytes() + # Expire the cache so the next call re-reads the env var. + clean_env.setattr(_utils, "_memory_cache", None) clean_env.setenv("PYRATS_MEMORY_LIMIT", str(128 * 1024 * 1024)) second = _available_memory_bytes() assert second < first @@ -109,6 +112,7 @@ def test_iter_eval_output_matches_batched_eval(clean_env): # Streamed: tight budget => many chunks. Sum back and compare. clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024) clean_env.setenv("PYRATS_MEMORY_LIMIT", "16384") + clean_env.setattr(_utils, "_memory_cache", None) # expire cache so tight cap takes effect streamed = np.empty_like(full) nchunks = 0 for sl, chunk in p.iter_eval_(view_idx, masks): @@ -226,6 +230,7 @@ def test_streamed_postprocess_peak_under_budget(clean_env): budget = 10 * pdist_per_row * 8 clean_env.setattr(_utils, "_MIN_MEMORY_FLOOR", 1024) clean_env.setenv("PYRATS_MEMORY_LIMIT", str(budget)) + clean_env.setattr(_utils, "_memory_cache", None) # expire cache so tight cap takes effect tracemalloc.start() zeta = np.empty(n) From 101ba26fab351ff856c054ef28cd1066cd34487c Mon Sep 17 00:00:00 2001 From: Nasir Ahmad Date: Thu, 21 May 2026 20:11:32 +0200 Subject: [PATCH 44/44] tqdm cleanup --- src/pyRATS/_tear_coloring.py | 4 ---- src/pyRATS/_utils.py | 5 +++-- src/pyRATS/rats.py | 2 +- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/pyRATS/_tear_coloring.py b/src/pyRATS/_tear_coloring.py index ddd3661..3f0e8b7 100644 --- a/src/pyRATS/_tear_coloring.py +++ b/src/pyRATS/_tear_coloring.py @@ -75,8 +75,6 @@ def compute_spectral_color_of_pts_on_tear( n_comp, labels = connected_components( tear_graph, directed=False, return_labels=True ) - if verbose: - tqdm.write(f"Number of components in the tear graph: {n_comp}") n_points_in_comp = [] for i in range(n_comp): @@ -93,8 +91,6 @@ def compute_spectral_color_of_pts_on_tear( comp_i = labels == i n_comp_i = np.sum(comp_i) scale = n_comp_i / n_pts_across_tear - if verbose: - tqdm.write(f"#points in the tear component no. {i} are: {n_comp_i}") # If the component is very small then assign the constant color if n_comp_i <= max(3, int(color_cutoff_frac * n)): diff --git a/src/pyRATS/_utils.py b/src/pyRATS/_utils.py index 6dcd3cf..f4a8bb5 100644 --- a/src/pyRATS/_utils.py +++ b/src/pyRATS/_utils.py @@ -16,7 +16,7 @@ connected_components, breadth_first_order, ) -from tqdm import tqdm +from tqdm.auto import tqdm import os import sys @@ -791,7 +791,7 @@ def best(d_e, U, param, eta_min, eta_max, cost_fn, verbose, n_jobs): # Vary eta from 2 to eta_{min} for eta in tqdm( - range(2, eta_min + 1), desc="Intermediate views", disable=not verbose + range(2, eta_min + 1), desc="Intermediate views", disable=not verbose, position=0, leave=True ): # tqdm.write( # "#non-empty views with sz < %d = %d" @@ -861,6 +861,7 @@ def _worker(p_num): desc="Merging", unit="pts", leave=False, + position=1, ) total_len_S = 0 diff --git a/src/pyRATS/rats.py b/src/pyRATS/rats.py index 15f22c0..7895a3b 100644 --- a/src/pyRATS/rats.py +++ b/src/pyRATS/rats.py @@ -4,7 +4,7 @@ from multiprocessing import cpu_count from joblib import Parallel, delayed -from tqdm import tqdm +from tqdm.auto import tqdm from pyRATS._utils import ( nearest_neighbors,