Update pearson_correlation.py

LuisOfL · web-flow · commit a5fa33e47e8f · 2025-10-01T23:24:25.000-06:00
diff --git a/machine_learning/pearson_correlation.py b/machine_learning/pearson_correlation.py
@@ -1,53 +1,47 @@
 import numpy as np
 
-
-def pearson_correlation(x: np.ndarray, y: np.ndarray) -> float:
+def pearson_correlation(data_x: np.ndarray, data_y: np.ndarray) -> float:
     """
-    Calculate the Pearson correlation coefficient (PCC) between two arrays.
-
-    Pearson correlation measures the linear relationship between two datasets,
-    returning a value between -1 and 1:
-      - 1   indicates a perfect positive linear correlation
-      - 0   indicates no linear correlation
-      - -1  indicates a perfect negative linear correlation
-
-    Formula:
-    r = Σ((x - mean(x)) * (y - mean(y))) / sqrt(Σ(x - mean(x))^2 * Σ(y - mean(y))^2)
-
-    Reference: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
+    Calculate the Pearson correlation coefficient between two sets of data.
 
     Parameters:
-    - x: 1D numpy array of values
-    - y: 1D numpy array of values
+    data_x (np.ndarray): Array of numeric values representing a column of data
+                         that will be compared with another column to determine
+                         how strongly the two vectors are linearly related.
+    data_y (np.ndarray): Array of numeric values representing the second column
+                         of data to compare with data_x (same length).
 
     Returns:
-    - The Pearson correlation coefficient (float)
+    float: Pearson correlation coefficient, or 0.0 for empty or constant input.
 
-      a = np.array([1, 2, 3, 4, 5])
-      b = np.array([2, 4, 6, 8, 10])
-      float(np.round(pearson_correlation(a, b), 5))
+    Reference:
+    https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
+
+    Example:
+    >>> data_x = np.array([1, 2, 3, 4, 5])
+    >>> data_y = np.array([2, 4, 6, 8, 10])
+    >>> round(float(pearson_correlation(data_x, data_y)), 2)
     1.0
-      a = np.array([1, 2, 3, 4, 5])
-      b = np.array([10, 9, 2, 6, 4])
-      float(np.round(pearson_correlation(a, b), 5))
-    -0.18845
-      a = np.array([1, 2, 3])
-      b = np.array([1, 2])
-      pearson_correlation(a, b)
-    Traceback (most recent call last):
-        ...
-    ValueError: Input arrays must have the same length.
     """
-    if len(x) != len(y):
-        raise ValueError("Input arrays must have the same length.")
+    if len(data_x) != len(data_y):
+        raise ValueError("data_x and data_y must have the same length")
+    
+    n = len(data_x)
+    if n == 0:
+        return 0.0
 
-    x_mean = np.mean(x)
-    y_mean = np.mean(y)
+    mean_x = np.mean(data_x)
+    mean_y = np.mean(data_y)
 
-    numerator = np.sum((x - x_mean) * (y - y_mean))
-    denominator = np.sqrt(np.sum((x - x_mean) ** 2) * np.sum((y - y_mean) ** 2))
+    numerator = np.sum((data_x - mean_x) * (data_y - mean_y))
+    denominator = np.sqrt(np.sum((data_x - mean_x)**2) * np.sum((data_y - mean_y)**2))
 
     if denominator == 0:
-        raise ValueError("Standard deviation of input arrays must not be zero.")
+        return 0.0
 
     return numerator / denominator
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()