From 5dfd2d5a4e7429b3ea13e4be062ca78f43894a6d Mon Sep 17 00:00:00 2001 From: LuisMelendez Date: Wed, 1 Oct 2025 22:50:58 -0600 Subject: [PATCH 1/5] Create pearson_correlation.py --- machine_learning/pearson_correlation.py | 53 +++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 machine_learning/pearson_correlation.py diff --git a/machine_learning/pearson_correlation.py b/machine_learning/pearson_correlation.py new file mode 100644 index 000000000000..3f71c8398efb --- /dev/null +++ b/machine_learning/pearson_correlation.py @@ -0,0 +1,53 @@ +import numpy as np + + +def pearson_correlation(x: np.ndarray, y: np.ndarray) -> float: + """ + Calculate the Pearson correlation coefficient (PCC) between two arrays. + + Pearson correlation measures the linear relationship between two datasets, + returning a value between -1 and 1: + - 1 indicates a perfect positive linear correlation + - 0 indicates no linear correlation + - -1 indicates a perfect negative linear correlation + + Formula: + r = Σ((x - mean(x)) * (y - mean(y))) / sqrt(Σ(x - mean(x))^2 * Σ(y - mean(y))^2) + + Reference: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient + + Parameters: + - x: 1D numpy array of values + - y: 1D numpy array of values + + Returns: + - The Pearson correlation coefficient (float) + + a = np.array([1, 2, 3, 4, 5]) + b = np.array([2, 4, 6, 8, 10]) + float(np.round(pearson_correlation(a, b), 5)) + 1.0 + a = np.array([1, 2, 3, 4, 5]) + b = np.array([10, 9, 2, 6, 4]) + float(np.round(pearson_correlation(a, b), 5)) + -0.18845 + a = np.array([1, 2, 3]) + b = np.array([1, 2]) + pearson_correlation(a, b) + Traceback (most recent call last): + ... + ValueError: Input arrays must have the same length. + """ + if len(x) != len(y): + raise ValueError("Input arrays must have the same length.") + + x_mean = np.mean(x) + y_mean = np.mean(y) + + numerator = np.sum((x - x_mean) * (y - y_mean)) + denominator = np.sqrt(np.sum((x - x_mean) ** 2) * np.sum((y - y_mean) ** 2)) + + if denominator == 0: + raise ValueError("Standard deviation of input arrays must not be zero.") + + return numerator / denominator From a5fa33e47e8ff9a679c4a4d2d6d670843661ad81 Mon Sep 17 00:00:00 2001 From: LuisMelendez Date: Wed, 1 Oct 2025 23:24:25 -0600 Subject: [PATCH 2/5] Update pearson_correlation.py --- machine_learning/pearson_correlation.py | 68 +++++++++++-------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/machine_learning/pearson_correlation.py b/machine_learning/pearson_correlation.py index 3f71c8398efb..ebf1483e164f 100644 --- a/machine_learning/pearson_correlation.py +++ b/machine_learning/pearson_correlation.py @@ -1,53 +1,47 @@ import numpy as np - -def pearson_correlation(x: np.ndarray, y: np.ndarray) -> float: +def pearson_correlation(data_x: np.ndarray, data_y: np.ndarray) -> float: """ - Calculate the Pearson correlation coefficient (PCC) between two arrays. - - Pearson correlation measures the linear relationship between two datasets, - returning a value between -1 and 1: - - 1 indicates a perfect positive linear correlation - - 0 indicates no linear correlation - - -1 indicates a perfect negative linear correlation - - Formula: - r = Σ((x - mean(x)) * (y - mean(y))) / sqrt(Σ(x - mean(x))^2 * Σ(y - mean(y))^2) - - Reference: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient + Calculate the Pearson correlation coefficient between two sets of data. Parameters: - - x: 1D numpy array of values - - y: 1D numpy array of values + data_x (np.ndarray): Array of numeric values representing a column of data + that will be compared with another column to determine + how strongly the two vectors are related. + data_y (np.ndarray): Array of numeric values representing the second column + of data to compare with data_x. Returns: - - The Pearson correlation coefficient (float) + float: Pearson correlation coefficient between data_x and data_y. - a = np.array([1, 2, 3, 4, 5]) - b = np.array([2, 4, 6, 8, 10]) - float(np.round(pearson_correlation(a, b), 5)) + Reference: + https://en.wikipedia.org/wiki/Pearson_correlation_coefficient + + Example: + >>> data_x = np.array([1, 2, 3, 4, 5]) + >>> data_y = np.array([2, 4, 6, 8, 10]) + >>> round(pearson_correlation(data_x, data_y), 2) 1.0 - a = np.array([1, 2, 3, 4, 5]) - b = np.array([10, 9, 2, 6, 4]) - float(np.round(pearson_correlation(a, b), 5)) - -0.18845 - a = np.array([1, 2, 3]) - b = np.array([1, 2]) - pearson_correlation(a, b) - Traceback (most recent call last): - ... - ValueError: Input arrays must have the same length. """ - if len(x) != len(y): - raise ValueError("Input arrays must have the same length.") + if len(data_x) != len(data_y): + raise ValueError("data_x and data_y must have the same length") + + n = len(data_x) + if n == 0: + return 0.0 - x_mean = np.mean(x) - y_mean = np.mean(y) + mean_x = np.mean(data_x) + mean_y = np.mean(data_y) - numerator = np.sum((x - x_mean) * (y - y_mean)) - denominator = np.sqrt(np.sum((x - x_mean) ** 2) * np.sum((y - y_mean) ** 2)) + numerator = np.sum((data_x - mean_x) * (data_y - mean_y)) + denominator = np.sqrt(np.sum((data_x - mean_x)**2) * np.sum((data_y - mean_y)**2)) if denominator == 0: - raise ValueError("Standard deviation of input arrays must not be zero.") + return 0.0 return numerator / denominator + + +if __name__ == "__main__": + import doctest + doctest.testmod() From 86d7b1eb2b65503a621aaf840f82d3c99a5be699 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 05:24:45 +0000 Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/pearson_correlation.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/machine_learning/pearson_correlation.py b/machine_learning/pearson_correlation.py index ebf1483e164f..75a464046be0 100644 --- a/machine_learning/pearson_correlation.py +++ b/machine_learning/pearson_correlation.py @@ -1,14 +1,15 @@ import numpy as np + def pearson_correlation(data_x: np.ndarray, data_y: np.ndarray) -> float: """ Calculate the Pearson correlation coefficient between two sets of data. Parameters: - data_x (np.ndarray): Array of numeric values representing a column of data + data_x (np.ndarray): Array of numeric values representing a column of data that will be compared with another column to determine how strongly the two vectors are related. - data_y (np.ndarray): Array of numeric values representing the second column + data_y (np.ndarray): Array of numeric values representing the second column of data to compare with data_x. Returns: @@ -25,7 +26,7 @@ def pearson_correlation(data_x: np.ndarray, data_y: np.ndarray) -> float: """ if len(data_x) != len(data_y): raise ValueError("data_x and data_y must have the same length") - + n = len(data_x) if n == 0: return 0.0 @@ -34,7 +35,9 @@ def pearson_correlation(data_x: np.ndarray, data_y: np.ndarray) -> float: mean_y = np.mean(data_y) numerator = np.sum((data_x - mean_x) * (data_y - mean_y)) - denominator = np.sqrt(np.sum((data_x - mean_x)**2) * np.sum((data_y - mean_y)**2)) + denominator = np.sqrt( + np.sum((data_x - mean_x) ** 2) * np.sum((data_y - mean_y) ** 2) + ) if denominator == 0: return 0.0 @@ -44,4 +47,5 @@ def pearson_correlation(data_x: np.ndarray, data_y: np.ndarray) -> float: if __name__ == "__main__": import doctest + doctest.testmod() From 5cb554de4f37170dfdd90e9d32e22196ff332724 Mon Sep 17 00:00:00 2001 From: LuisMelendez Date: Wed, 1 Oct 2025 23:32:45 -0600 Subject: [PATCH 4/5] Update pearson_correlation.py --- machine_learning/pearson_correlation.py | 66 +++++++++++++------------ 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/machine_learning/pearson_correlation.py b/machine_learning/pearson_correlation.py index 75a464046be0..c7b00119507d 100644 --- a/machine_learning/pearson_correlation.py +++ b/machine_learning/pearson_correlation.py @@ -3,49 +3,51 @@ def pearson_correlation(data_x: np.ndarray, data_y: np.ndarray) -> float: """ - Calculate the Pearson correlation coefficient between two sets of data. + Calculate the Pearson correlation coefficient (PCC) between two arrays. + + Pearson correlation measures the linear relationship between two datasets, + returning a value between -1 and 1: + - 1 indicates a perfect positive linear correlation + - 0 indicates no linear correlation + - -1 indicates a perfect negative linear correlation + + Formula: + r = Σ((x - mean(x)) * (y - mean(y))) / sqrt(Σ(x - mean(x))^2 * Σ(y - mean(y))^2) + + Reference: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient Parameters: - data_x (np.ndarray): Array of numeric values representing a column of data - that will be compared with another column to determine - how strongly the two vectors are related. - data_y (np.ndarray): Array of numeric values representing the second column - of data to compare with data_x. + - x: 1D numpy array of values + - y: 1D numpy array of values Returns: - float: Pearson correlation coefficient between data_x and data_y. - - Reference: - https://en.wikipedia.org/wiki/Pearson_correlation_coefficient + - The Pearson correlation coefficient (float) - Example: - >>> data_x = np.array([1, 2, 3, 4, 5]) - >>> data_y = np.array([2, 4, 6, 8, 10]) - >>> round(pearson_correlation(data_x, data_y), 2) + a = np.array([1, 2, 3, 4, 5]) + b = np.array([2, 4, 6, 8, 10]) + float(np.round(pearson_correlation(a, b), 5)) 1.0 + a = np.array([1, 2, 3, 4, 5]) + b = np.array([10, 9, 2, 6, 4]) + float(np.round(pearson_correlation(a, b), 5)) + -0.18845 + a = np.array([1, 2, 3]) + b = np.array([1, 2]) + pearson_correlation(a, b) + Traceback (most recent call last): + ... + ValueError: Input arrays must have the same length. """ if len(data_x) != len(data_y): - raise ValueError("data_x and data_y must have the same length") + raise ValueError("Input arrays must have the same length.") - n = len(data_x) - if n == 0: - return 0.0 + x_mean = np.mean(data_x) + y_mean = np.mean(data_y) - mean_x = np.mean(data_x) - mean_y = np.mean(data_y) - - numerator = np.sum((data_x - mean_x) * (data_y - mean_y)) - denominator = np.sqrt( - np.sum((data_x - mean_x) ** 2) * np.sum((data_y - mean_y) ** 2) - ) + numerator = np.sum((data_x - x_mean) * (data_y - y_mean)) + denominator = np.sqrt(np.sum((data_x - x_mean) ** 2) * np.sum((data_y - y_mean) ** 2)) if denominator == 0: - return 0.0 + raise ValueError("Standard deviation of input arrays must not be zero.") return numerator / denominator - - -if __name__ == "__main__": - import doctest - - doctest.testmod() From 9a25b8312d24d0af5065bb8561604c20818ef625 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 2 Oct 2025 05:33:04 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/pearson_correlation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/machine_learning/pearson_correlation.py b/machine_learning/pearson_correlation.py index c7b00119507d..91a16285a88e 100644 --- a/machine_learning/pearson_correlation.py +++ b/machine_learning/pearson_correlation.py @@ -45,7 +45,9 @@ def pearson_correlation(data_x: np.ndarray, data_y: np.ndarray) -> float: y_mean = np.mean(data_y) numerator = np.sum((data_x - x_mean) * (data_y - y_mean)) - denominator = np.sqrt(np.sum((data_x - x_mean) ** 2) * np.sum((data_y - y_mean) ** 2)) + denominator = np.sqrt( + np.sum((data_x - x_mean) ** 2) * np.sum((data_y - y_mean) ** 2) + ) if denominator == 0: raise ValueError("Standard deviation of input arrays must not be zero.")