Skip to content

Commit 5cb554d

Browse files
authored
Update pearson_correlation.py
1 parent 86d7b1e commit 5cb554d

1 file changed

Lines changed: 34 additions & 32 deletions

File tree

machine_learning/pearson_correlation.py

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3,49 +3,51 @@
33

44
def pearson_correlation(data_x: np.ndarray, data_y: np.ndarray) -> float:
55
"""
6-
Calculate the Pearson correlation coefficient between two sets of data.
6+
Calculate the Pearson correlation coefficient (PCC) between two arrays.
7+
8+
Pearson correlation measures the linear relationship between two datasets,
9+
returning a value between -1 and 1:
10+
- 1 indicates a perfect positive linear correlation
11+
- 0 indicates no linear correlation
12+
- -1 indicates a perfect negative linear correlation
13+
14+
Formula:
15+
r = Σ((x - mean(x)) * (y - mean(y))) / sqrt(Σ(x - mean(x))^2 * Σ(y - mean(y))^2)
16+
17+
Reference: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
718
819
Parameters:
9-
data_x (np.ndarray): Array of numeric values representing a column of data
10-
that will be compared with another column to determine
11-
how strongly the two vectors are related.
12-
data_y (np.ndarray): Array of numeric values representing the second column
13-
of data to compare with data_x.
20+
- data_x: 1D numpy array of values
21+
- data_y: 1D numpy array of values
1422
1523
Returns:
16-
float: Pearson correlation coefficient between data_x and data_y.
17-
18-
Reference:
19-
https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
24+
- The Pearson correlation coefficient (float)
2025
21-
Example:
22-
>>> data_x = np.array([1, 2, 3, 4, 5])
23-
>>> data_y = np.array([2, 4, 6, 8, 10])
24-
>>> round(pearson_correlation(data_x, data_y), 2)
26+
>>> a = np.array([1, 2, 3, 4, 5])
27+
>>> b = np.array([2, 4, 6, 8, 10])
28+
>>> float(np.round(pearson_correlation(a, b), 5))
2529
1.0
30+
>>> a = np.array([1, 2, 3, 4, 5])
31+
>>> b = np.array([10, 9, 2, 6, 4])
32+
>>> float(np.round(pearson_correlation(a, b), 5))
33+
-0.70868
34+
>>> a = np.array([1, 2, 3])
35+
>>> b = np.array([1, 2])
36+
>>> pearson_correlation(a, b)
37+
Traceback (most recent call last):
38+
...
39+
ValueError: Input arrays must have the same length.
2640
"""
2741
if len(data_x) != len(data_y):
28-
raise ValueError("data_x and data_y must have the same length")
42+
raise ValueError("Input arrays must have the same length.")
2943

30-
n = len(data_x)
31-
if n == 0:
32-
return 0.0
44+
x_mean = np.mean(data_x)
45+
y_mean = np.mean(data_y)
3346

34-
mean_x = np.mean(data_x)
35-
mean_y = np.mean(data_y)
36-
37-
numerator = np.sum((data_x - mean_x) * (data_y - mean_y))
38-
denominator = np.sqrt(
39-
np.sum((data_x - mean_x) ** 2) * np.sum((data_y - mean_y) ** 2)
40-
)
47+
numerator = np.sum((data_x - x_mean) * (data_y - y_mean))
48+
denominator = np.sqrt(np.sum((data_x - x_mean) ** 2) * np.sum((data_y - y_mean) ** 2))
4149

4250
if denominator == 0:
43-
return 0.0
51+
raise ValueError("Standard deviation of input arrays must not be zero.")
4452

4553
return numerator / denominator
46-
47-
48-
if __name__ == "__main__":
49-
import doctest
50-
51-
doctest.testmod()

0 commit comments

Comments
 (0)