From 851bd8dca85c2e0d29734f72a748051d0eef8a78 Mon Sep 17 00:00:00 2001 From: Siddharth Date: Fri, 3 Oct 2025 03:42:42 +0530 Subject: [PATCH 01/30] feat: add dimensionality reduction algorithms (PCA, LDA, LLE, MDS) --- machine_learning/dimensionality_reduction.py | 196 +++++++++++++++++++ 1 file changed, 196 insertions(+) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 50d442ecc3de..b17592287b64 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -13,6 +13,8 @@ import numpy as np import pytest from scipy.linalg import eigh +from scipy.spatial.distance import cdist +from sklearn.neighbors import NearestNeighbors logging.basicConfig(level=logging.INFO, format="%(message)s") @@ -161,6 +163,200 @@ def linear_discriminant_analysis( raise AssertionError +def locally_linear_embedding( + features: np.ndarray, dimensions: int, n_neighbors: int = 12, reg: float = 1e-3 +) -> np.ndarray: + """ + Locally Linear Embedding (LLE). + + For more details, see: https://en.wikipedia.org/wiki/Nonlinear_dimensionality_reduction#Locally_linear_embedding + Parameters: + * features: the features extracted from the dataset (shape: [n_features, n_samples]) + * dimensions: target dimension for embedding + * n_neighbors: number of neighbors to consider for each point + * reg: regularization constant + + >>> test_locally_linear_embedding() + """ + if not features.any(): + logging.error("Dataset empty") + raise AssertionError + + # Transpose to have shape [n_samples, n_features] for easier processing + X = features.T.astype(np.float64) # Ensure float64 to avoid dtype issues + n_samples, n_features = X.shape + + # Find k-nearest neighbors + knn = NearestNeighbors(n_neighbors=n_neighbors + 1) + knn.fit(X) + distances, indices = knn.kneighbors(X) + + # Remove the first index (point itself) + indices = indices[:, 1:] + + # Create weight matrix W + W = np.zeros((n_samples, n_samples)) + + for i in range(n_samples): + # Get neighbors (excluding the point itself) + neighbors = indices[i] + # Center the neighbors + Z = X[neighbors] - X[i] + # Local covariance matrix - ensure float64 + C = np.dot(Z, Z.T).astype(np.float64) + + # Regularization + trace = np.trace(C) + if trace > 0: + reg_value = reg * trace + else: + reg_value = reg + + # Ensure we're working with floats for the diagonal update + C = C.astype(np.float64) + np.fill_diagonal(C, C.diagonal() + reg_value) + + # Solve for weights + try: + w = np.linalg.solve(C, np.ones(n_neighbors)) + except np.linalg.LinAlgError: + # If singular, use pseudoinverse + w = np.linalg.pinv(C).dot(np.ones(n_neighbors)) + + # Normalize weights + w /= np.sum(w) + W[i, neighbors] = w + + # Create cost matrix M = (I - W)^T (I - W) + I = np.eye(n_samples) + M = (I - W).T.dot(I - W) + + # Compute eigenvectors - use all and then select + eigenvalues, eigenvectors = eigh(M) + + # Sort eigenvalues and take the ones after the first (skip the zero eigenvalue) + idx = np.argsort(eigenvalues)[1:dimensions+1] # Skip first (zero) eigenvalue + embedding = eigenvectors[:, idx].T + + logging.info("Locally Linear Embedding computed") + return embedding + + +def multidimensional_scaling( + features: np.ndarray, dimensions: int, metric: bool = True +) -> np.ndarray: + """ + Multidimensional Scaling (MDS). + + For more details, see: https://en.wikipedia.org/wiki/Multidimensional_scaling + Parameters: + * features: the features extracted from the dataset (shape: [n_features, n_samples]) + * dimensions: target dimension for embedding + * metric: if True, use metric MDS (classical), if False, use non-metric MDS + + >>> test_multidimensional_scaling() + """ + if not features.any(): + logging.error("Dataset empty") + raise AssertionError + + # Transpose to have shape [n_samples, n_features] + X = features.T + n_samples = X.shape[0] + + if metric: + # Classical MDS + # Compute distance matrix + D = cdist(X, X, metric='euclidean') + D_squared = D ** 2 + + # Double centering + H = np.eye(n_samples) - np.ones((n_samples, n_samples)) / n_samples + B = -0.5 * H.dot(D_squared).dot(H) + + # Eigen decomposition - get all eigenvectors and select top ones + eigenvalues, eigenvectors = eigh(B) + + # Sort in descending order and take top dimensions + idx = np.argsort(eigenvalues)[::-1][:dimensions] + eigenvalues = eigenvalues[idx] + eigenvectors = eigenvectors[:, idx] + + # Embedding + embedding = eigenvectors * np.sqrt(eigenvalues) + + else: + + # Initialize random configuration + rng = np.random.RandomState(42) + embedding = rng.randn(n_samples, dimensions) + + # Simple gradient descent (very basic implementation) + D_original = cdist(X, X, metric='euclidean') + + for iteration in range(100): + D_embedded = cdist(embedding, embedding, metric='euclidean') + + # Stress (loss function) + stress = np.sum((D_original - D_embedded) ** 2) + + + # Simple gradient update + grad = np.zeros_like(embedding) + for i in range(n_samples): + for j in range(n_samples): + if i != j: + diff = embedding[i] - embedding[j] + dist = np.linalg.norm(diff) + if dist > 1e-10: + grad[i] += 2 * (D_embedded[i, j] - D_original[i, j]) * (diff / dist) + + embedding -= 0.01 * grad / n_samples + + logging.info("Multidimensional Scaling computed") + return embedding.T # Transpose back to match original format + + +def test_locally_linear_embedding() -> None: + """Test function for Locally Linear Embedding""" + # Use float data to avoid dtype issues + features = np.array([[1.0, 2.0, 3.0, 4.0], + [2.0, 3.0, 4.0, 5.0], + [3.0, 4.0, 5.0, 6.0]]) + dimensions = 2 + + try: + embedding = locally_linear_embedding(features, dimensions, n_neighbors=2) + assert embedding.shape[0] == dimensions + assert embedding.shape[1] == features.shape[1] + except Exception as e: + logging.error(f"LLE test failed: {e}") + raise + + +def test_multidimensional_scaling() -> None: + """Test function for Multidimensional Scaling""" + features = np.array([[1.0, 2.0, 3.0, 4.0], + [2.0, 3.0, 4.0, 5.0], + [3.0, 4.0, 5.0, 6.0]]) + dimensions = 2 + + try: + # Test metric MDS + embedding_metric = multidimensional_scaling(features, dimensions, metric=True) + assert embedding_metric.shape[0] == dimensions + assert embedding_metric.shape[1] == features.shape[1] + + # Test non-metric MDS + embedding_nonmetric = multidimensional_scaling(features, dimensions, metric=False) + assert embedding_nonmetric.shape[0] == dimensions + assert embedding_nonmetric.shape[1] == features.shape[1] + + except Exception as e: + logging.error(f"MDS test failed: {e}") + raise + + def test_linear_discriminant_analysis() -> None: # Create dummy dataset with 2 classes and 3 features features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) From 22306ef34fca36c578f8dca187e6e8ada4fe188b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 22:26:17 +0000 Subject: [PATCH 02/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 74 +++++++++++--------- 1 file changed, 39 insertions(+), 35 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index b17592287b64..d2b3a3151a91 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -193,10 +193,10 @@ def locally_linear_embedding( # Remove the first index (point itself) indices = indices[:, 1:] - + # Create weight matrix W W = np.zeros((n_samples, n_samples)) - + for i in range(n_samples): # Get neighbors (excluding the point itself) neighbors = indices[i] @@ -204,25 +204,25 @@ def locally_linear_embedding( Z = X[neighbors] - X[i] # Local covariance matrix - ensure float64 C = np.dot(Z, Z.T).astype(np.float64) - + # Regularization trace = np.trace(C) if trace > 0: reg_value = reg * trace else: reg_value = reg - + # Ensure we're working with floats for the diagonal update C = C.astype(np.float64) np.fill_diagonal(C, C.diagonal() + reg_value) - + # Solve for weights try: w = np.linalg.solve(C, np.ones(n_neighbors)) except np.linalg.LinAlgError: # If singular, use pseudoinverse w = np.linalg.pinv(C).dot(np.ones(n_neighbors)) - + # Normalize weights w /= np.sum(w) W[i, neighbors] = w @@ -233,11 +233,11 @@ def locally_linear_embedding( # Compute eigenvectors - use all and then select eigenvalues, eigenvectors = eigh(M) - + # Sort eigenvalues and take the ones after the first (skip the zero eigenvalue) - idx = np.argsort(eigenvalues)[1:dimensions+1] # Skip first (zero) eigenvalue + idx = np.argsort(eigenvalues)[1 : dimensions + 1] # Skip first (zero) eigenvalue embedding = eigenvectors[:, idx].T - + logging.info("Locally Linear Embedding computed") return embedding @@ -267,8 +267,8 @@ def multidimensional_scaling( if metric: # Classical MDS # Compute distance matrix - D = cdist(X, X, metric='euclidean') - D_squared = D ** 2 + D = cdist(X, X, metric="euclidean") + D_squared = D**2 # Double centering H = np.eye(n_samples) - np.ones((n_samples, n_samples)) / n_samples @@ -276,31 +276,29 @@ def multidimensional_scaling( # Eigen decomposition - get all eigenvectors and select top ones eigenvalues, eigenvectors = eigh(B) - + # Sort in descending order and take top dimensions idx = np.argsort(eigenvalues)[::-1][:dimensions] eigenvalues = eigenvalues[idx] eigenvectors = eigenvectors[:, idx] - + # Embedding embedding = eigenvectors * np.sqrt(eigenvalues) - + else: - # Initialize random configuration rng = np.random.RandomState(42) embedding = rng.randn(n_samples, dimensions) - + # Simple gradient descent (very basic implementation) - D_original = cdist(X, X, metric='euclidean') - + D_original = cdist(X, X, metric="euclidean") + for iteration in range(100): - D_embedded = cdist(embedding, embedding, metric='euclidean') - + D_embedded = cdist(embedding, embedding, metric="euclidean") + # Stress (loss function) stress = np.sum((D_original - D_embedded) ** 2) - - + # Simple gradient update grad = np.zeros_like(embedding) for i in range(n_samples): @@ -309,8 +307,12 @@ def multidimensional_scaling( diff = embedding[i] - embedding[j] dist = np.linalg.norm(diff) if dist > 1e-10: - grad[i] += 2 * (D_embedded[i, j] - D_original[i, j]) * (diff / dist) - + grad[i] += ( + 2 + * (D_embedded[i, j] - D_original[i, j]) + * (diff / dist) + ) + embedding -= 0.01 * grad / n_samples logging.info("Multidimensional Scaling computed") @@ -320,11 +322,11 @@ def multidimensional_scaling( def test_locally_linear_embedding() -> None: """Test function for Locally Linear Embedding""" # Use float data to avoid dtype issues - features = np.array([[1.0, 2.0, 3.0, 4.0], - [2.0, 3.0, 4.0, 5.0], - [3.0, 4.0, 5.0, 6.0]]) + features = np.array( + [[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0]] + ) dimensions = 2 - + try: embedding = locally_linear_embedding(features, dimensions, n_neighbors=2) assert embedding.shape[0] == dimensions @@ -336,22 +338,24 @@ def test_locally_linear_embedding() -> None: def test_multidimensional_scaling() -> None: """Test function for Multidimensional Scaling""" - features = np.array([[1.0, 2.0, 3.0, 4.0], - [2.0, 3.0, 4.0, 5.0], - [3.0, 4.0, 5.0, 6.0]]) + features = np.array( + [[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0]] + ) dimensions = 2 - + try: # Test metric MDS embedding_metric = multidimensional_scaling(features, dimensions, metric=True) assert embedding_metric.shape[0] == dimensions assert embedding_metric.shape[1] == features.shape[1] - + # Test non-metric MDS - embedding_nonmetric = multidimensional_scaling(features, dimensions, metric=False) + embedding_nonmetric = multidimensional_scaling( + features, dimensions, metric=False + ) assert embedding_nonmetric.shape[0] == dimensions assert embedding_nonmetric.shape[1] == features.shape[1] - + except Exception as e: logging.error(f"MDS test failed: {e}") raise From 92f8b062e1c94f52bcfdd8e152707ce00b122287 Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 04:06:09 +0530 Subject: [PATCH 03/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 304 ++++++++++++------- 1 file changed, 195 insertions(+), 109 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index d2b3a3151a91..c586eb32ee1d 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -1,5 +1,13 @@ # Copyright (c) 2023 Diego Gasco (diego.gasco99@gmail.com), Diegomangasco on GitHub +""" +Implementation of dimensionality reduction algorithms. +Includes: +- Principal Component Analysis (PCA) +- Linear Discriminant Analysis (LDA) +- Locally Linear Embedding (LLE) +- Multidimensional Scaling (MDS) +""" """ Requirements: - numpy version 1.21 @@ -20,21 +28,38 @@ def column_reshape(input_array: np.ndarray) -> np.ndarray: - """Function to reshape a row Numpy array into a column Numpy array + """Function to reshape a row Numpy array into a column Numpy array. + + Args: + input_array: Input row vector. + + Returns: + Column vector. + + Example: >>> input_array = np.array([1, 2, 3]) >>> column_reshape(input_array) array([[1], [2], [3]]) """ - return input_array.reshape((input_array.size, 1)) def covariance_within_classes( features: np.ndarray, labels: np.ndarray, classes: int ) -> np.ndarray: - """Function to compute the covariance matrix inside each class. + """Compute the covariance matrix inside each class. + + Args: + features: Input features matrix (n_features x n_samples). + labels: Class labels for each sample. + classes: Number of classes. + + Returns: + Within-class covariance matrix. + + Example: >>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) >>> labels = np.array([0, 1, 0]) >>> covariance_within_classes(features, labels, 2) @@ -42,18 +67,14 @@ def covariance_within_classes( [0.66666667, 0.66666667, 0.66666667], [0.66666667, 0.66666667, 0.66666667]]) """ - covariance_sum = np.nan for i in range(classes): data = features[:, labels == i] data_mean = data.mean(1) - # Centralize the data of class i centered_data = data - column_reshape(data_mean) if i > 0: - # If covariance_sum is not None covariance_sum += np.dot(centered_data, centered_data.T) else: - # If covariance_sum is np.nan (i.e. first loop) covariance_sum = np.dot(centered_data, centered_data.T) return covariance_sum / features.shape[1] @@ -62,7 +83,17 @@ def covariance_within_classes( def covariance_between_classes( features: np.ndarray, labels: np.ndarray, classes: int ) -> np.ndarray: - """Function to compute the covariance matrix between multiple classes + """Compute the covariance matrix between multiple classes. + + Args: + features: Input features matrix (n_features x n_samples). + labels: Class labels for each sample. + classes: Number of classes. + + Returns: + Between-class covariance matrix. + + Example: >>> features = np.array([[9, 2, 3], [4, 3, 6], [1, 8, 9]]) >>> labels = np.array([0, 1, 0]) >>> covariance_between_classes(features, labels, 2) @@ -70,7 +101,6 @@ def covariance_between_classes( [ 1.77777778, 0.88888889, -1.33333333], [-2.66666667, -1.33333333, 2. ]]) """ - general_data_mean = features.mean(1) covariance_sum = np.nan for i in range(classes): @@ -78,13 +108,11 @@ def covariance_between_classes( device_data = data.shape[1] data_mean = data.mean(1) if i > 0: - # If covariance_sum is not None covariance_sum += device_data * np.dot( column_reshape(data_mean) - column_reshape(general_data_mean), (column_reshape(data_mean) - column_reshape(general_data_mean)).T, ) else: - # If covariance_sum is np.nan (i.e. first loop) covariance_sum = device_data * np.dot( column_reshape(data_mean) - column_reshape(general_data_mean), (column_reshape(data_mean) - column_reshape(general_data_mean)).T, @@ -93,34 +121,36 @@ def covariance_between_classes( return covariance_sum / features.shape[1] -def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray: - """ - Principal Component Analysis. +def principal_component_analysis( + features: np.ndarray, dimensions: int +) -> np.ndarray: + """Principal Component Analysis (PCA). - For more details, see: https://en.wikipedia.org/wiki/Principal_component_analysis. - Parameters: - * features: the features extracted from the dataset - * dimensions: to filter the projected data for the desired dimension + For more details: https://en.wikipedia.org/wiki/Principal_component_analysis - >>> test_principal_component_analysis() - """ + Args: + features: Input features matrix (n_features x n_samples). + dimensions: Target dimensionality. + + Returns: + Projected data in lower dimensions. - # Check if the features have been loaded + Example: + >>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> pca_result = principal_component_analysis(features, 2) + >>> pca_result.shape + (2, 3) + """ if features.any(): data_mean = features.mean(1) - # Center the dataset centered_data = features - np.reshape(data_mean, (data_mean.size, 1)) covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1] _, eigenvectors = np.linalg.eigh(covariance_matrix) - # Take all the columns in the reverse order (-1), and then takes only the first filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions] - # Project the database on the new space projected_data = np.dot(filtered_eigenvectors.T, features) logging.info("Principal Component Analysis computed") - return projected_data else: - logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True) logging.error("Dataset empty") raise AssertionError @@ -128,23 +158,28 @@ def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.nd def linear_discriminant_analysis( features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int ) -> np.ndarray: - """ - Linear Discriminant Analysis. + """Linear Discriminant Analysis (LDA). - For more details, see: https://en.wikipedia.org/wiki/Linear_discriminant_analysis. - Parameters: - * features: the features extracted from the dataset - * labels: the class labels of the features - * classes: the number of classes present in the dataset - * dimensions: to filter the projected data for the desired dimension + For more details: https://en.wikipedia.org/wiki/Linear_discriminant_analysis - >>> test_linear_discriminant_analysis() - """ + Args: + features: Input features matrix (n_features x n_samples). + labels: Class labels for each sample. + classes: Number of classes. + dimensions: Target dimensionality. + + Returns: + Projected data in lower dimensions. - # Check if the dimension desired is less than the number of classes + Example: + >>> features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) + >>> labels = np.array([0, 0, 0, 1, 1]) + >>> lda_result = linear_discriminant_analysis(features, labels, 2, 2) + >>> lda_result.shape + (2, 5) + """ assert classes > dimensions - # Check if features have been already loaded if features.any: _, eigenvectors = eigh( covariance_between_classes(features, labels, classes), @@ -155,10 +190,8 @@ def linear_discriminant_analysis( filtered_svd_matrix = svd_matrix[:, 0:dimensions] projected_data = np.dot(filtered_svd_matrix.T, features) logging.info("Linear Discriminant Analysis computed") - return projected_data else: - logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True) logging.error("Dataset empty") raise AssertionError @@ -166,73 +199,79 @@ def linear_discriminant_analysis( def locally_linear_embedding( features: np.ndarray, dimensions: int, n_neighbors: int = 12, reg: float = 1e-3 ) -> np.ndarray: - """ - Locally Linear Embedding (LLE). - - For more details, see: https://en.wikipedia.org/wiki/Nonlinear_dimensionality_reduction#Locally_linear_embedding - Parameters: - * features: the features extracted from the dataset (shape: [n_features, n_samples]) - * dimensions: target dimension for embedding - * n_neighbors: number of neighbors to consider for each point - * reg: regularization constant - - >>> test_locally_linear_embedding() + """Locally Linear Embedding (LLE). + + For more details: https://en.wikipedia.org/wiki/Nonlinear_dimensionality_reduction + + Args: + features: Input features matrix (shape: [n_features, n_samples]). + dimensions: Target dimension for embedding. + n_neighbors: Number of neighbors to consider for each point. + reg: Regularization constant. + + Returns: + Embedded data in lower dimensions. + + Example: + >>> features = np.array([[1.0, 2.0, 3.0, 4.0], + ... [2.0, 3.0, 4.0, 5.0], + ... [3.0, 4.0, 5.0, 6.0]]) + >>> lle_result = locally_linear_embedding(features, 2, n_neighbors=2) + >>> lle_result.shape + (2, 4) """ if not features.any(): logging.error("Dataset empty") raise AssertionError # Transpose to have shape [n_samples, n_features] for easier processing - X = features.T.astype(np.float64) # Ensure float64 to avoid dtype issues - n_samples, n_features = X.shape + x_data = features.T.astype(np.float64) # Ensure float64 to avoid dtype issues + n_samples, _ = x_data.shape # Find k-nearest neighbors knn = NearestNeighbors(n_neighbors=n_neighbors + 1) - knn.fit(X) - distances, indices = knn.kneighbors(X) + knn.fit(x_data) + _, indices = knn.kneighbors(x_data) # Remove the first index (point itself) indices = indices[:, 1:] - # Create weight matrix W - W = np.zeros((n_samples, n_samples)) + # Create weight matrix w + w_matrix = np.zeros((n_samples, n_samples)) for i in range(n_samples): # Get neighbors (excluding the point itself) neighbors = indices[i] # Center the neighbors - Z = X[neighbors] - X[i] + z_matrix = x_data[neighbors] - x_data[i] # Local covariance matrix - ensure float64 - C = np.dot(Z, Z.T).astype(np.float64) + cov_matrix = np.dot(z_matrix, z_matrix.T).astype(np.float64) # Regularization - trace = np.trace(C) - if trace > 0: - reg_value = reg * trace - else: - reg_value = reg + trace_val = np.trace(cov_matrix) + reg_value = reg * trace_val if trace_val > 0 else reg # Ensure we're working with floats for the diagonal update - C = C.astype(np.float64) - np.fill_diagonal(C, C.diagonal() + reg_value) + cov_matrix = cov_matrix.astype(np.float64) + np.fill_diagonal(cov_matrix, cov_matrix.diagonal() + reg_value) # Solve for weights try: - w = np.linalg.solve(C, np.ones(n_neighbors)) + weights = np.linalg.solve(cov_matrix, np.ones(n_neighbors)) except np.linalg.LinAlgError: # If singular, use pseudoinverse - w = np.linalg.pinv(C).dot(np.ones(n_neighbors)) + weights = np.linalg.pinv(cov_matrix).dot(np.ones(n_neighbors)) # Normalize weights - w /= np.sum(w) - W[i, neighbors] = w + weights /= np.sum(weights) + w_matrix[i, neighbors] = weights - # Create cost matrix M = (I - W)^T (I - W) - I = np.eye(n_samples) - M = (I - W).T.dot(I - W) + # Create cost matrix m = (i_mat - w)^T (i_mat - w) + i_mat = np.eye(n_samples) + m_matrix = (i_mat - w_matrix).T.dot(i_mat - w_matrix) # Compute eigenvectors - use all and then select - eigenvalues, eigenvectors = eigh(M) + eigenvalues, eigenvectors = eigh(m_matrix) # Sort eigenvalues and take the ones after the first (skip the zero eigenvalue) idx = np.argsort(eigenvalues)[1 : dimensions + 1] # Skip first (zero) eigenvalue @@ -245,37 +284,46 @@ def locally_linear_embedding( def multidimensional_scaling( features: np.ndarray, dimensions: int, metric: bool = True ) -> np.ndarray: - """ - Multidimensional Scaling (MDS). + """Multidimensional Scaling (MDS). + + For more details: https://en.wikipedia.org/wiki/Multidimensional_scaling - For more details, see: https://en.wikipedia.org/wiki/Multidimensional_scaling - Parameters: - * features: the features extracted from the dataset (shape: [n_features, n_samples]) - * dimensions: target dimension for embedding - * metric: if True, use metric MDS (classical), if False, use non-metric MDS + Args: + features: Input features matrix (shape: [n_features, n_samples]). + dimensions: Target dimension for embedding. + metric: If True, use metric MDS (classical), if False use non-metric MDS. - >>> test_multidimensional_scaling() + Returns: + Embedded data in lower dimensions. + + Example: + >>> features = np.array([[1.0, 2.0, 3.0, 4.0], + ... [2.0, 3.0, 4.0, 5.0], + ... [3.0, 4.0, 5.0, 6.0]]) + >>> mds_result = multidimensional_scaling(features, 2, metric=True) + >>> mds_result.shape + (2, 4) """ if not features.any(): logging.error("Dataset empty") raise AssertionError # Transpose to have shape [n_samples, n_features] - X = features.T - n_samples = X.shape[0] + x_data = features.T + n_samples = x_data.shape[0] if metric: # Classical MDS # Compute distance matrix - D = cdist(X, X, metric="euclidean") - D_squared = D**2 + dist_matrix = cdist(x_data, x_data, metric="euclidean") + dist_squared = dist_matrix**2 # Double centering - H = np.eye(n_samples) - np.ones((n_samples, n_samples)) / n_samples - B = -0.5 * H.dot(D_squared).dot(H) + h_matrix = np.eye(n_samples) - np.ones((n_samples, n_samples)) / n_samples + b_matrix = -0.5 * h_matrix.dot(dist_squared).dot(h_matrix) # Eigen decomposition - get all eigenvectors and select top ones - eigenvalues, eigenvectors = eigh(B) + eigenvalues, eigenvectors = eigh(b_matrix) # Sort in descending order and take top dimensions idx = np.argsort(eigenvalues)[::-1][:dimensions] @@ -286,18 +334,17 @@ def multidimensional_scaling( embedding = eigenvectors * np.sqrt(eigenvalues) else: + logging.warning("Using simplified non-metric MDS implementation") + # Initialize random configuration rng = np.random.RandomState(42) embedding = rng.randn(n_samples, dimensions) # Simple gradient descent (very basic implementation) - D_original = cdist(X, X, metric="euclidean") - - for iteration in range(100): - D_embedded = cdist(embedding, embedding, metric="euclidean") + dist_original = cdist(x_data, x_data, metric="euclidean") - # Stress (loss function) - stress = np.sum((D_original - D_embedded) ** 2) + for _ in range(100): + dist_embedded = cdist(embedding, embedding, metric="euclidean") # Simple gradient update grad = np.zeros_like(embedding) @@ -309,7 +356,7 @@ def multidimensional_scaling( if dist > 1e-10: grad[i] += ( 2 - * (D_embedded[i, j] - D_original[i, j]) + * (dist_embedded[i, j] - dist_original[i, j]) * (diff / dist) ) @@ -320,7 +367,7 @@ def multidimensional_scaling( def test_locally_linear_embedding() -> None: - """Test function for Locally Linear Embedding""" + """Test function for Locally Linear Embedding.""" # Use float data to avoid dtype issues features = np.array( [[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0]] @@ -331,13 +378,14 @@ def test_locally_linear_embedding() -> None: embedding = locally_linear_embedding(features, dimensions, n_neighbors=2) assert embedding.shape[0] == dimensions assert embedding.shape[1] == features.shape[1] + logging.info("LLE test passed") except Exception as e: logging.error(f"LLE test failed: {e}") raise def test_multidimensional_scaling() -> None: - """Test function for Multidimensional Scaling""" + """Test function for Multidimensional Scaling.""" features = np.array( [[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0]] ) @@ -356,12 +404,14 @@ def test_multidimensional_scaling() -> None: assert embedding_nonmetric.shape[0] == dimensions assert embedding_nonmetric.shape[1] == features.shape[1] + logging.info("MDS test passed") except Exception as e: logging.error(f"MDS test failed: {e}") raise def test_linear_discriminant_analysis() -> None: + """Test function for Linear Discriminant Analysis.""" # Create dummy dataset with 2 classes and 3 features features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) labels = np.array([0, 0, 0, 1, 1]) @@ -369,30 +419,66 @@ def test_linear_discriminant_analysis() -> None: dimensions = 2 # Assert that the function raises an AssertionError if dimensions > classes - with pytest.raises(AssertionError) as error_info: # noqa: PT012 + with pytest.raises(AssertionError): projected_data = linear_discriminant_analysis( features, labels, classes, dimensions ) if isinstance(projected_data, np.ndarray): - raise AssertionError( - "Did not raise AssertionError for dimensions > classes" - ) - assert error_info.type is AssertionError + raise AssertionError("Did not raise AssertionError for dimensions > classes") def test_principal_component_analysis() -> None: + """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) + expected_output = np.array( + [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] + ) + + output = principal_component_analysis(features, dimensions) + if not np.allclose(expected_output, output): + raise AssertionError("PCA output does not match expected result") - with pytest.raises(AssertionError) as error_info: # noqa: PT012 - output = principal_component_analysis(features, dimensions) - if not np.allclose(expected_output, output): - raise AssertionError - assert error_info.type is AssertionError + +def test_dimensionality_reduction() -> None: + """Test all dimensionality reduction algorithms.""" + print("Testing all dimensionality reduction algorithms...") + + # Create sample data + features = np.random.rand(5, 50) + labels = np.random.randint(0, 3, 50) + dimensions = 2 + + try: + # Test PCA + pca_result = principal_component_analysis(features, dimensions) + assert pca_result.shape == (dimensions, features.shape[1]) + print("✓ PCA test passed") + + # Test LDA + lda_result = linear_discriminant_analysis(features, labels, 3, dimensions) + assert lda_result.shape == (dimensions, features.shape[1]) + print("✓ LDA test passed") + + # Test LLE + lle_result = locally_linear_embedding(features, dimensions, n_neighbors=5) + assert lle_result.shape == (dimensions, features.shape[1]) + print("✓ LLE test passed") + + # Test MDS + mds_result = multidimensional_scaling(features, dimensions, metric=True) + assert mds_result.shape == (dimensions, features.shape[1]) + print("✓ MDS test passed") + + print("All tests passed!") + + except Exception as e: + print(f"Error during testing: {e}") + raise if __name__ == "__main__": import doctest doctest.testmod() + test_dimensionality_reduction() From a28b8a770d8eba71323a3f7c304e44c9fbd27875 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 22:37:21 +0000 Subject: [PATCH 04/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index c586eb32ee1d..47d6f89a44fa 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -8,6 +8,7 @@ - Locally Linear Embedding (LLE) - Multidimensional Scaling (MDS) """ + """ Requirements: - numpy version 1.21 @@ -121,9 +122,7 @@ def covariance_between_classes( return covariance_sum / features.shape[1] -def principal_component_analysis( - features: np.ndarray, dimensions: int -) -> np.ndarray: +def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray: """Principal Component Analysis (PCA). For more details: https://en.wikipedia.org/wiki/Principal_component_analysis @@ -424,16 +423,16 @@ def test_linear_discriminant_analysis() -> None: features, labels, classes, dimensions ) if isinstance(projected_data, np.ndarray): - raise AssertionError("Did not raise AssertionError for dimensions > classes") + raise AssertionError( + "Did not raise AssertionError for dimensions > classes" + ) def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array( - [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] - ) + expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From 26e58716d33093d112dd4787efcc38db6ae8fea3 Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 04:17:49 +0530 Subject: [PATCH 05/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 32 ++++++-------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 47d6f89a44fa..3c0a12e4f68d 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -1,13 +1,4 @@ # Copyright (c) 2023 Diego Gasco (diego.gasco99@gmail.com), Diegomangasco on GitHub -""" -Implementation of dimensionality reduction algorithms. - -Includes: -- Principal Component Analysis (PCA) -- Linear Discriminant Analysis (LDA) -- Locally Linear Embedding (LLE) -- Multidimensional Scaling (MDS) -""" """ Requirements: @@ -17,8 +8,8 @@ - Each column of the features matrix corresponds to a class item """ +import doctest import logging - import numpy as np import pytest from scipy.linalg import eigh @@ -419,20 +410,16 @@ def test_linear_discriminant_analysis() -> None: # Assert that the function raises an AssertionError if dimensions > classes with pytest.raises(AssertionError): - projected_data = linear_discriminant_analysis( - features, labels, classes, dimensions - ) - if isinstance(projected_data, np.ndarray): - raise AssertionError( - "Did not raise AssertionError for dimensions > classes" - ) + linear_discriminant_analysis(features, labels, classes, dimensions) def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) + expected_output = np.array( + [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] + ) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): @@ -443,9 +430,10 @@ def test_dimensionality_reduction() -> None: """Test all dimensionality reduction algorithms.""" print("Testing all dimensionality reduction algorithms...") - # Create sample data - features = np.random.rand(5, 50) - labels = np.random.randint(0, 3, 50) + # Create sample data using numpy Generator + rng = np.random.default_rng(42) + features = rng.random((5, 50)) + labels = rng.integers(0, 3, 50) dimensions = 2 try: @@ -477,7 +465,5 @@ def test_dimensionality_reduction() -> None: if __name__ == "__main__": - import doctest - doctest.testmod() test_dimensionality_reduction() From 8d63f1be65a7d8e9f864f6a7b6e2ebecb1f97ec8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 22:48:09 +0000 Subject: [PATCH 06/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 3c0a12e4f68d..9c475d7de7aa 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -417,9 +417,7 @@ def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array( - [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] - ) + expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From 11661b919cb9f001cf6125d0190ee3df495937ad Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 04:21:55 +0530 Subject: [PATCH 07/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 9c475d7de7aa..2f549fdafc72 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -10,6 +10,7 @@ import doctest import logging + import numpy as np import pytest from scipy.linalg import eigh @@ -417,7 +418,9 @@ def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) + expected_output = np.array( + [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] + ) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From bed9512167aad800bc598ebf0f8225a36dc89800 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 22:52:39 +0000 Subject: [PATCH 08/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 2f549fdafc72..1cd9a3a3dc6e 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -418,9 +418,7 @@ def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array( - [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] - ) + expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From cf38241a561e4f4e85c5516ea165773659fb4ce0 Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 04:28:45 +0530 Subject: [PATCH 09/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 1cd9a3a3dc6e..b2374379de3a 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -407,18 +407,25 @@ def test_linear_discriminant_analysis() -> None: features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) labels = np.array([0, 0, 0, 1, 1]) classes = 2 - dimensions = 2 + dimensions = 1 # Changed to 1 since classes=2 and dimensions must be < classes - # Assert that the function raises an AssertionError if dimensions > classes - with pytest.raises(AssertionError): - linear_discriminant_analysis(features, labels, classes, dimensions) + try: + # This should work since dimensions < classes + lda_result = linear_discriminant_analysis(features, labels, classes, dimensions) + assert lda_result.shape == (dimensions, features.shape[1]) + logging.info("LDA test passed") + except Exception as e: + logging.error(f"LDA test failed: {e}") + raise def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) + expected_output = np.array( + [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] + ) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From 772c88dca417905d33618ab7347a43a8403eb69c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 22:59:04 +0000 Subject: [PATCH 10/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index b2374379de3a..c2273c29a827 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -423,9 +423,7 @@ def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array( - [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] - ) + expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From e6a1bb15448e039bf7cf5008a6f1036ce15f4432 Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 04:33:38 +0530 Subject: [PATCH 11/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index c2273c29a827..96ef480668f2 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -7,12 +7,20 @@ Notes: - Each column of the features matrix corresponds to a class item """ +""" +Implementation of dimensionality reduction algorithms. + +Includes: +- Principal Component Analysis (PCA) +- Linear Discriminant Analysis (LDA) +- Locally Linear Embedding (LLE) +- Multidimensional Scaling (MDS) +""" import doctest import logging import numpy as np -import pytest from scipy.linalg import eigh from scipy.spatial.distance import cdist from sklearn.neighbors import NearestNeighbors @@ -423,7 +431,9 @@ def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) + expected_output = np.array( + [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] + ) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From 6c0c53fa6a8eebc1da663bd5f63fc92bc6a607c1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 23:03:57 +0000 Subject: [PATCH 12/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 96ef480668f2..9f72334dcbbf 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -7,6 +7,7 @@ Notes: - Each column of the features matrix corresponds to a class item """ + """ Implementation of dimensionality reduction algorithms. @@ -431,9 +432,7 @@ def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array( - [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] - ) + expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From 1cfdbc6463fdd4a2743022abb8911ad648d040df Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 04:38:40 +0530 Subject: [PATCH 13/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 9f72334dcbbf..2b8bab3775c9 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -1,5 +1,4 @@ # Copyright (c) 2023 Diego Gasco (diego.gasco99@gmail.com), Diegomangasco on GitHub - """ Requirements: - numpy version 1.21 @@ -7,7 +6,6 @@ Notes: - Each column of the features matrix corresponds to a class item """ - """ Implementation of dimensionality reduction algorithms. @@ -17,7 +15,6 @@ - Locally Linear Embedding (LLE) - Multidimensional Scaling (MDS) """ - import doctest import logging @@ -432,7 +429,9 @@ def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) + expected_output = np.array( + [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] + ) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From 46b0279bce29f7230edd895f009cb95aba991508 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 23:09:26 +0000 Subject: [PATCH 14/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 2b8bab3775c9..72e43b7d1bb9 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -6,6 +6,7 @@ Notes: - Each column of the features matrix corresponds to a class item """ + """ Implementation of dimensionality reduction algorithms. @@ -429,9 +430,7 @@ def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array( - [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] - ) + expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From 912eff119ca3b45a3dc52ec12618a611f25f8358 Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 04:52:03 +0530 Subject: [PATCH 15/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 72e43b7d1bb9..1cdc37335fb4 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -6,7 +6,6 @@ Notes: - Each column of the features matrix corresponds to a class item """ - """ Implementation of dimensionality reduction algorithms. @@ -124,7 +123,7 @@ def covariance_between_classes( def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray: """Principal Component Analysis (PCA). - For more details: https://en.wikipedia.org/wiki/Principal_component_analysis + For more details: https://en.wikipedia.org/wiki/Principal_component_analysis Args: features: Input features matrix (n_features x n_samples). @@ -158,7 +157,7 @@ def linear_discriminant_analysis( ) -> np.ndarray: """Linear Discriminant Analysis (LDA). - For more details: https://en.wikipedia.org/wiki/Linear_discriminant_analysis + For more details: https://en.wikipedia.org/wiki/Linear_discriminant_analysis Args: features: Input features matrix (n_features x n_samples). @@ -199,7 +198,7 @@ def locally_linear_embedding( ) -> np.ndarray: """Locally Linear Embedding (LLE). - For more details: https://en.wikipedia.org/wiki/Nonlinear_dimensionality_reduction + For more details: https://en.wikipedia.org/wiki/Nonlinear_dimensionality_reduction Args: features: Input features matrix (shape: [n_features, n_samples]). @@ -284,7 +283,7 @@ def multidimensional_scaling( ) -> np.ndarray: """Multidimensional Scaling (MDS). - For more details: https://en.wikipedia.org/wiki/Multidimensional_scaling + For more details: https://en.wikipedia.org/wiki/Multidimensional_scaling Args: features: Input features matrix (shape: [n_features, n_samples]). @@ -430,7 +429,9 @@ def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) + expected_output = np.array( + [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] + ) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From cabed09656795674e03b38396748deff4fc2035c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 23:22:27 +0000 Subject: [PATCH 16/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 1cdc37335fb4..72e43b7d1bb9 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -6,6 +6,7 @@ Notes: - Each column of the features matrix corresponds to a class item """ + """ Implementation of dimensionality reduction algorithms. @@ -123,7 +124,7 @@ def covariance_between_classes( def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray: """Principal Component Analysis (PCA). - For more details: https://en.wikipedia.org/wiki/Principal_component_analysis + For more details: https://en.wikipedia.org/wiki/Principal_component_analysis Args: features: Input features matrix (n_features x n_samples). @@ -157,7 +158,7 @@ def linear_discriminant_analysis( ) -> np.ndarray: """Linear Discriminant Analysis (LDA). - For more details: https://en.wikipedia.org/wiki/Linear_discriminant_analysis + For more details: https://en.wikipedia.org/wiki/Linear_discriminant_analysis Args: features: Input features matrix (n_features x n_samples). @@ -198,7 +199,7 @@ def locally_linear_embedding( ) -> np.ndarray: """Locally Linear Embedding (LLE). - For more details: https://en.wikipedia.org/wiki/Nonlinear_dimensionality_reduction + For more details: https://en.wikipedia.org/wiki/Nonlinear_dimensionality_reduction Args: features: Input features matrix (shape: [n_features, n_samples]). @@ -283,7 +284,7 @@ def multidimensional_scaling( ) -> np.ndarray: """Multidimensional Scaling (MDS). - For more details: https://en.wikipedia.org/wiki/Multidimensional_scaling + For more details: https://en.wikipedia.org/wiki/Multidimensional_scaling Args: features: Input features matrix (shape: [n_features, n_samples]). @@ -429,9 +430,7 @@ def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array( - [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] - ) + expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From a3c070cfbaf91881dffc3683ea2dcce21216f3ce Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 04:56:49 +0530 Subject: [PATCH 17/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 72e43b7d1bb9..98c50ef42b39 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -1,4 +1,5 @@ # Copyright (c) 2023 Diego Gasco (diego.gasco99@gmail.com), Diegomangasco on GitHub +# flake8: noqa: E402 """ Requirements: - numpy version 1.21 @@ -18,7 +19,6 @@ """ import doctest import logging - import numpy as np from scipy.linalg import eigh from scipy.spatial.distance import cdist From be5f8aa4ab98c75b07f21fe82da9f74840cb16a4 Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 04:58:25 +0530 Subject: [PATCH 18/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 1 + 1 file changed, 1 insertion(+) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 98c50ef42b39..432ad578e344 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -19,6 +19,7 @@ """ import doctest import logging + import numpy as np from scipy.linalg import eigh from scipy.spatial.distance import cdist From 5ce81eac3ed08a6bfa76bc177007e9940a1493de Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 05:04:22 +0530 Subject: [PATCH 19/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 32 ++++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 432ad578e344..9a8e2c9769ed 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -179,20 +179,26 @@ def linear_discriminant_analysis( """ assert classes > dimensions - if features.any: - _, eigenvectors = eigh( - covariance_between_classes(features, labels, classes), - covariance_within_classes(features, labels, classes), - ) - filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] - svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) - filtered_svd_matrix = svd_matrix[:, 0:dimensions] - projected_data = np.dot(filtered_svd_matrix.T, features) - logging.info("Linear Discriminant Analysis computed") - return projected_data + if features.any(): + sb = covariance_between_classes(features, labels, classes) + sw = covariance_within_classes(features, labels, classes) + + # Solve the generalized eigenvalue problem: Sb v = λ Sw v + eigenvalues, eigenvectors = eigh(sb, sw) + + # Sort eigenvectors by eigenvalues (descending) + idx = np.argsort(eigenvalues)[::-1] + eigenvectors = eigenvectors[:, idx] + + # Take top "dimensions" + filtered_eigenvectors = eigenvectors[:, :dimensions] + + projected_data = np.dot(filtered_eigenvectors.T, features) + logging.info("Linear Discriminant Analysis computed") + return projected_data else: - logging.error("Dataset empty") - raise AssertionError + logging.error("Dataset empty") + raise AssertionError def locally_linear_embedding( From c6f488a8a99cef3d39e33c319c8ba0251fc57a5f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 23:34:45 +0000 Subject: [PATCH 20/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 28 ++++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 9a8e2c9769ed..3c8fe5b707f6 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -180,25 +180,25 @@ def linear_discriminant_analysis( assert classes > dimensions if features.any(): - sb = covariance_between_classes(features, labels, classes) - sw = covariance_within_classes(features, labels, classes) + sb = covariance_between_classes(features, labels, classes) + sw = covariance_within_classes(features, labels, classes) - # Solve the generalized eigenvalue problem: Sb v = λ Sw v - eigenvalues, eigenvectors = eigh(sb, sw) + # Solve the generalized eigenvalue problem: Sb v = λ Sw v + eigenvalues, eigenvectors = eigh(sb, sw) - # Sort eigenvectors by eigenvalues (descending) - idx = np.argsort(eigenvalues)[::-1] - eigenvectors = eigenvectors[:, idx] + # Sort eigenvectors by eigenvalues (descending) + idx = np.argsort(eigenvalues)[::-1] + eigenvectors = eigenvectors[:, idx] - # Take top "dimensions" - filtered_eigenvectors = eigenvectors[:, :dimensions] + # Take top "dimensions" + filtered_eigenvectors = eigenvectors[:, :dimensions] - projected_data = np.dot(filtered_eigenvectors.T, features) - logging.info("Linear Discriminant Analysis computed") - return projected_data + projected_data = np.dot(filtered_eigenvectors.T, features) + logging.info("Linear Discriminant Analysis computed") + return projected_data else: - logging.error("Dataset empty") - raise AssertionError + logging.error("Dataset empty") + raise AssertionError def locally_linear_embedding( From a3066d3ecf1218309db9eaef4e2962b4e489788c Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 05:10:36 +0530 Subject: [PATCH 21/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 30 +++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 3c8fe5b707f6..2bb4d93dbce2 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -416,32 +416,34 @@ def test_multidimensional_scaling() -> None: def test_linear_discriminant_analysis() -> None: - """Test function for Linear Discriminant Analysis.""" # Create dummy dataset with 2 classes and 3 features features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) labels = np.array([0, 0, 0, 1, 1]) classes = 2 - dimensions = 1 # Changed to 1 since classes=2 and dimensions must be < classes + dimensions = 2 - try: - # This should work since dimensions < classes - lda_result = linear_discriminant_analysis(features, labels, classes, dimensions) - assert lda_result.shape == (dimensions, features.shape[1]) - logging.info("LDA test passed") - except Exception as e: - logging.error(f"LDA test failed: {e}") - raise + # Assert that the function raises an AssertionError if dimensions > classes + with pytest.raises(AssertionError) as error_info: # noqa: PT012 + projected_data = linear_discriminant_analysis( + features, labels, classes, dimensions + ) + if isinstance(projected_data, np.ndarray): + raise AssertionError( + "Did not raise AssertionError for dimensions > classes" + ) + assert error_info.type is AssertionError def test_principal_component_analysis() -> None: - """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) - output = principal_component_analysis(features, dimensions) - if not np.allclose(expected_output, output): - raise AssertionError("PCA output does not match expected result") + with pytest.raises(AssertionError) as error_info: # noqa: PT012 + output = principal_component_analysis(features, dimensions) + if not np.allclose(expected_output, output): + raise AssertionError + assert error_info.type is AssertionError def test_dimensionality_reduction() -> None: From f205fafba34f8b383cc667f015bf79d6d1361cb4 Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 05:13:39 +0530 Subject: [PATCH 22/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 36 +++++++++----------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 2bb4d93dbce2..cce25abb9019 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -414,36 +414,34 @@ def test_multidimensional_scaling() -> None: logging.error(f"MDS test failed: {e}") raise - def test_linear_discriminant_analysis() -> None: + """Test function for Linear Discriminant Analysis.""" # Create dummy dataset with 2 classes and 3 features features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) labels = np.array([0, 0, 0, 1, 1]) classes = 2 - dimensions = 2 - - # Assert that the function raises an AssertionError if dimensions > classes - with pytest.raises(AssertionError) as error_info: # noqa: PT012 - projected_data = linear_discriminant_analysis( - features, labels, classes, dimensions - ) - if isinstance(projected_data, np.ndarray): - raise AssertionError( - "Did not raise AssertionError for dimensions > classes" - ) - assert error_info.type is AssertionError + dimensions = 1 # Changed to 1 since classes=2 and dimensions must be < classes + try: + # This should work since dimensions < classes + lda_result = linear_discriminant_analysis(features, labels, classes, dimensions) + assert lda_result.shape == (dimensions, features.shape[1]) + logging.info("LDA test passed") + except Exception as e: + logging.error(f"LDA test failed: {e}") + raise def test_principal_component_analysis() -> None: + """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) + expected_output = np.array( + [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] + ) - with pytest.raises(AssertionError) as error_info: # noqa: PT012 - output = principal_component_analysis(features, dimensions) - if not np.allclose(expected_output, output): - raise AssertionError - assert error_info.type is AssertionError + output = principal_component_analysis(features, dimensions) + if not np.allclose(expected_output, output): + raise AssertionError("PCA output does not match expected result") def test_dimensionality_reduction() -> None: From cbd03e9ebc943462d47b445ee8e6df38cf7f8982 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 23:45:05 +0000 Subject: [PATCH 23/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index cce25abb9019..3c8fe5b707f6 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -414,6 +414,7 @@ def test_multidimensional_scaling() -> None: logging.error(f"MDS test failed: {e}") raise + def test_linear_discriminant_analysis() -> None: """Test function for Linear Discriminant Analysis.""" # Create dummy dataset with 2 classes and 3 features @@ -431,13 +432,12 @@ def test_linear_discriminant_analysis() -> None: logging.error(f"LDA test failed: {e}") raise + def test_principal_component_analysis() -> None: """Test function for Principal Component Analysis.""" features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) dimensions = 2 - expected_output = np.array( - [[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]] - ) + expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) output = principal_component_analysis(features, dimensions) if not np.allclose(expected_output, output): From 2df283d5db3d371e9033f6583ba427899d98824c Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 05:20:24 +0530 Subject: [PATCH 24/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 3c8fe5b707f6..5596ea3c6879 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -183,14 +183,17 @@ def linear_discriminant_analysis( sb = covariance_between_classes(features, labels, classes) sw = covariance_within_classes(features, labels, classes) + # Add regularization to Sw to avoid singular matrix + sw_reg = sw + 1e-6 * np.eye(sw.shape[0]) + # Solve the generalized eigenvalue problem: Sb v = λ Sw v - eigenvalues, eigenvectors = eigh(sb, sw) + eigenvalues, eigenvectors = eigh(sb, sw_reg) # Sort eigenvectors by eigenvalues (descending) idx = np.argsort(eigenvalues)[::-1] eigenvectors = eigenvectors[:, idx] - # Take top "dimensions" + # Take top "dimensions" eigenvectors filtered_eigenvectors = eigenvectors[:, :dimensions] projected_data = np.dot(filtered_eigenvectors.T, features) @@ -200,7 +203,6 @@ def linear_discriminant_analysis( logging.error("Dataset empty") raise AssertionError - def locally_linear_embedding( features: np.ndarray, dimensions: int, n_neighbors: int = 12, reg: float = 1e-3 ) -> np.ndarray: From 45964b09af9abcf826842b5ee278afcbd401d637 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 23:50:46 +0000 Subject: [PATCH 25/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 5596ea3c6879..4335d1105dd0 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -185,7 +185,7 @@ def linear_discriminant_analysis( # Add regularization to Sw to avoid singular matrix sw_reg = sw + 1e-6 * np.eye(sw.shape[0]) - + # Solve the generalized eigenvalue problem: Sb v = λ Sw v eigenvalues, eigenvectors = eigh(sb, sw_reg) @@ -203,6 +203,7 @@ def linear_discriminant_analysis( logging.error("Dataset empty") raise AssertionError + def locally_linear_embedding( features: np.ndarray, dimensions: int, n_neighbors: int = 12, reg: float = 1e-3 ) -> np.ndarray: From c972d1d204f5e9d3ac46031db6f5deeef63456d2 Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 05:23:52 +0530 Subject: [PATCH 26/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 27 +++++++------------- 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 4335d1105dd0..432ad578e344 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -179,24 +179,15 @@ def linear_discriminant_analysis( """ assert classes > dimensions - if features.any(): - sb = covariance_between_classes(features, labels, classes) - sw = covariance_within_classes(features, labels, classes) - - # Add regularization to Sw to avoid singular matrix - sw_reg = sw + 1e-6 * np.eye(sw.shape[0]) - - # Solve the generalized eigenvalue problem: Sb v = λ Sw v - eigenvalues, eigenvectors = eigh(sb, sw_reg) - - # Sort eigenvectors by eigenvalues (descending) - idx = np.argsort(eigenvalues)[::-1] - eigenvectors = eigenvectors[:, idx] - - # Take top "dimensions" eigenvectors - filtered_eigenvectors = eigenvectors[:, :dimensions] - - projected_data = np.dot(filtered_eigenvectors.T, features) + if features.any: + _, eigenvectors = eigh( + covariance_between_classes(features, labels, classes), + covariance_within_classes(features, labels, classes), + ) + filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] + svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) + filtered_svd_matrix = svd_matrix[:, 0:dimensions] + projected_data = np.dot(filtered_svd_matrix.T, features) logging.info("Linear Discriminant Analysis computed") return projected_data else: From 023800098613b2db78ffc7487c4b0255deef7acf Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 05:28:39 +0530 Subject: [PATCH 27/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 24 ++++++++++++-------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 432ad578e344..fd78530783c4 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -179,15 +179,21 @@ def linear_discriminant_analysis( """ assert classes > dimensions - if features.any: - _, eigenvectors = eigh( - covariance_between_classes(features, labels, classes), - covariance_within_classes(features, labels, classes), - ) - filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] - svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) - filtered_svd_matrix = svd_matrix[:, 0:dimensions] - projected_data = np.dot(filtered_svd_matrix.T, features) + if features.any(): # FIXED: Added missing parentheses + sb = covariance_between_classes(features, labels, classes) + sw = covariance_within_classes(features, labels, classes) + + # Solve the generalized eigenvalue problem: Sb v = λ Sw v + eigenvalues, eigenvectors = eigh(sb, sw) + + # Sort eigenvectors by eigenvalues (descending) + idx = np.argsort(eigenvalues)[::-1] + eigenvectors = eigenvectors[:, idx] + + # Take top "dimensions" + filtered_eigenvectors = eigenvectors[:, :dimensions] + + projected_data = np.dot(filtered_eigenvectors.T, features) logging.info("Linear Discriminant Analysis computed") return projected_data else: From 37e9aaf79806122b3d7f10e553fea9ba1e75461b Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 05:34:10 +0530 Subject: [PATCH 28/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 28 ++++++++------------ 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index fd78530783c4..b75dd142ae6d 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -173,27 +173,21 @@ def linear_discriminant_analysis( Example: >>> features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) >>> labels = np.array([0, 0, 0, 1, 1]) - >>> lda_result = linear_discriminant_analysis(features, labels, 2, 2) + >>> lda_result = linear_discriminant_analysis(features, labels, 2, 1) # CHANGED: 2 to 1 >>> lda_result.shape - (2, 5) + (1, 5) # CHANGED: 2 to 1 """ assert classes > dimensions - if features.any(): # FIXED: Added missing parentheses - sb = covariance_between_classes(features, labels, classes) - sw = covariance_within_classes(features, labels, classes) - - # Solve the generalized eigenvalue problem: Sb v = λ Sw v - eigenvalues, eigenvectors = eigh(sb, sw) - - # Sort eigenvectors by eigenvalues (descending) - idx = np.argsort(eigenvalues)[::-1] - eigenvectors = eigenvectors[:, idx] - - # Take top "dimensions" - filtered_eigenvectors = eigenvectors[:, :dimensions] - - projected_data = np.dot(filtered_eigenvectors.T, features) + if features.any: + _, eigenvectors = eigh( + covariance_between_classes(features, labels, classes), + covariance_within_classes(features, labels, classes), + ) + filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] + svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) + filtered_svd_matrix = svd_matrix[:, 0:dimensions] + projected_data = np.dot(filtered_svd_matrix.T, features) logging.info("Linear Discriminant Analysis computed") return projected_data else: From 0c68d24ba7f1ae0dcb47584e648f16a0e6407d22 Mon Sep 17 00:00:00 2001 From: Siddharth <47790972+Sid101098@users.noreply.github.com> Date: Fri, 3 Oct 2025 05:39:14 +0530 Subject: [PATCH 29/30] Update dimensionality_reduction.py --- machine_learning/dimensionality_reduction.py | 45 ++++++++++++++------ 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index b75dd142ae6d..823b9c0da35f 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -173,28 +173,45 @@ def linear_discriminant_analysis( Example: >>> features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) >>> labels = np.array([0, 0, 0, 1, 1]) - >>> lda_result = linear_discriminant_analysis(features, labels, 2, 1) # CHANGED: 2 to 1 + >>> lda_result = linear_discriminant_analysis(features, labels, 2, 1) >>> lda_result.shape - (1, 5) # CHANGED: 2 to 1 + (1, 5) """ assert classes > dimensions - if features.any: - _, eigenvectors = eigh( - covariance_between_classes(features, labels, classes), - covariance_within_classes(features, labels, classes), - ) - filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] - svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) - filtered_svd_matrix = svd_matrix[:, 0:dimensions] - projected_data = np.dot(filtered_svd_matrix.T, features) - logging.info("Linear Discriminant Analysis computed") - return projected_data + if features.any(): + # Add regularization to avoid singular matrix + sw = covariance_within_classes(features, labels, classes) + sb = covariance_between_classes(features, labels, classes) + + # Regularize the within-class covariance matrix + reg_param = 1e-6 + sw_reg = sw + reg_param * np.eye(sw.shape[0]) + + try: + _, eigenvectors = eigh(sb, sw_reg) + filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] + svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) + filtered_svd_matrix = svd_matrix[:, 0:dimensions] + projected_data = np.dot(filtered_svd_matrix.T, features) + logging.info("Linear Discriminant Analysis computed") + return projected_data + except np.linalg.LinAlgError: + # Fallback: use pseudoinverse if still singular + try: + sw_pinv = np.linalg.pinv(sw_reg) + _, eigenvectors = eigh(sb, sw_pinv) + filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] + projected_data = np.dot(filtered_eigenvectors.T, features) + logging.info("Linear Discriminant Analysis computed with pseudoinverse") + return projected_data + except np.linalg.LinAlgError: + logging.error("LDA failed: matrix is too ill-conditioned") + raise AssertionError("LDA computation failed") else: logging.error("Dataset empty") raise AssertionError - def locally_linear_embedding( features: np.ndarray, dimensions: int, n_neighbors: int = 12, reg: float = 1e-3 ) -> np.ndarray: From 8cd7bf45586f7b754eeea06db5ab2e7934432037 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Oct 2025 00:09:54 +0000 Subject: [PATCH 30/30] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/dimensionality_reduction.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py index 823b9c0da35f..731ba86d4d64 100644 --- a/machine_learning/dimensionality_reduction.py +++ b/machine_learning/dimensionality_reduction.py @@ -183,11 +183,11 @@ def linear_discriminant_analysis( # Add regularization to avoid singular matrix sw = covariance_within_classes(features, labels, classes) sb = covariance_between_classes(features, labels, classes) - + # Regularize the within-class covariance matrix reg_param = 1e-6 sw_reg = sw + reg_param * np.eye(sw.shape[0]) - + try: _, eigenvectors = eigh(sb, sw_reg) filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] @@ -212,6 +212,7 @@ def linear_discriminant_analysis( logging.error("Dataset empty") raise AssertionError + def locally_linear_embedding( features: np.ndarray, dimensions: int, n_neighbors: int = 12, reg: float = 1e-3 ) -> np.ndarray: