 
 
 def collect_dataset():
-    """Collect dataset of CSGO
-    The dataset contains ADR vs Rating of a Player
-    :return : dataset obtained from the link, as matrix
-    """
+    """Collect dataset of CSGO (ADR vs Rating)."""
     response = httpx.get(
-        "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/"
-        "master/Week1/ADRvsRating.csv",
+        "https://raw.githubusercontent.com/yashLadha/The_Math_of_Intelligence/master/Week1/ADRvsRating.csv",
         timeout=10,
     )
-    lines = response.text.splitlines()
-    data = []
-    for item in lines:
-        item = item.split(",")
-        data.append(item)
-    data.pop(0)  # This is for removing the labels from the list
-    dataset = np.matrix(data)
-    return dataset
-
-
-def run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta):
-    """Run steep gradient descent and updates the Feature vector accordingly_
-    :param data_x : contains the dataset
-    :param data_y : contains the output associated with each data-entry
-    :param len_data : length of the data_
-    :param alpha : Learning rate of the model
-    :param theta : Feature vector (weight's for our model)
-    ;param return : Updated Feature's, using
-                    curr_features - alpha_ * gradient(w.r.t. feature)
-    >>> import numpy as np
-    >>> data_x = np.array([[1, 2], [3, 4]])
-    >>> data_y = np.array([5, 6])
-    >>> len_data = len(data_x)
-    >>> alpha = 0.01
-    >>> theta = np.array([0.1, 0.2])
-    >>> run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
-    array([0.196, 0.343])
-    """
-    n = len_data
-
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_grad = np.dot(prod, data_x)
-    theta = theta - (alpha / n) * sum_grad
+    lines = response.text.strip().splitlines()
+    data = [line.split(",") for line in lines[1:]]  # skip header
+    return np.array(data, dtype=float)
+
+
+def run_steep_gradient_descent(data_x, data_y, alpha, theta):
+    """Perform one vectorized gradient-descent step and return the updated theta.
+    n = data_x.shape[0]
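+    # theta has shape (1, n_features); data_x @ theta.T gives an (n, 1) column of predictions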
+    predictions = data_x @ theta.T
+    errors = predictions.flatten() - data_y
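+    # Gradient of the half-MSE cost 1/(2n) * sum(errors**2) with respect to theta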
+    gradient = (1 / n) * (errors @ data_x)
+    theta = theta - alpha * gradient
     return theta
 
 
-def sum_of_square_error(data_x, data_y, len_data, theta):
-    """Return sum of square error for error calculation
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :param len_data : len of the dataset
-    :param theta : contains the feature vector
-    :return : sum of square error computed from given feature's
-
-    Example:
-    >>> vc_x = np.array([[1.1], [2.1], [3.1]])
-    >>> vc_y = np.array([1.2, 2.2, 3.2])
-    >>> round(sum_of_square_error(vc_x, vc_y, 3, np.array([1])),3)
-    np.float64(0.005)
-    """
-    prod = np.dot(theta, data_x.transpose())
-    prod -= data_y.transpose()
-    sum_elem = np.sum(np.square(prod))
-    error = sum_elem / (2 * len_data)
-    return error
-
-
-def run_linear_regression(data_x, data_y):
-    """Implement Linear regression over the dataset
-    :param data_x : contains our dataset
-    :param data_y : contains the output (result vector)
-    :return : feature for line of best fit (Feature vector)
-    """
-    iterations = 100000
-    alpha = 0.0001550
-
-    no_features = data_x.shape[1]
-    len_data = data_x.shape[0] - 1
-
-    theta = np.zeros((1, no_features))
+def sum_of_square_error(data_x, data_y, theta):
+    """Return half the mean squared error (the cost minimized by gradient descent).
+    n = data_x.shape[0]
+    predictions = data_x @ theta.T
+    errors = predictions.flatten() - data_y
+    return np.sum(errors ** 2) / (2 * n)
 
-    for i in range(iterations):
-        theta = run_steep_gradient_descent(data_x, data_y, len_data, alpha, theta)
-        error = sum_of_square_error(data_x, data_y, len_data, theta)
-        print(f"At Iteration {i + 1} - Error is {error:.5f}")
 
+def run_linear_regression(data_x, data_y, iterations=100000, alpha=0.000155):
+    """Run gradient descent to learn parameters."""
+    theta = np.zeros((1, data_x.shape[1]))
+    for i in range(iterations):
+        theta = run_steep_gradient_descent(data_x, data_y, alpha, theta)
+        error = sum_of_square_error(data_x, data_y, theta)
+        print(f"Iteration {i + 1}: Error = {error:.5f}")
     return theta
 
 
 def mean_absolute_error(predicted_y, original_y):
114 | | - """Return sum of square error for error calculation |
115 | | - :param predicted_y : contains the output of prediction (result vector) |
116 | | - :param original_y : contains values of expected outcome |
117 | | - :return : mean absolute error computed from given feature's |
118 | | -
|
119 | | - >>> predicted_y = [3, -0.5, 2, 7] |
120 | | - >>> original_y = [2.5, 0.0, 2, 8] |
121 | | - >>> mean_absolute_error(predicted_y, original_y) |
122 | | - 0.5 |
123 | | - """ |
124 | | - total = sum(abs(y - predicted_y[i]) for i, y in enumerate(original_y)) |
125 | | - return total / len(original_y) |
| 63 | + """Compute MAE (fully vectorized).""" |
+    predicted_y = np.array(predicted_y)
+    original_y = np.array(original_y)
+    return np.mean(np.abs(predicted_y - original_y))
 
 
 def main():
-    """Driver function"""
     data = collect_dataset()
-
-    len_data = data.shape[0]
-    data_x = np.c_[np.ones(len_data), data[:, :-1]].astype(float)
-    data_y = data[:, -1].astype(float)
+    data_x = np.c_[np.ones(data.shape[0]), data[:, 0]]  # Add bias term
+    data_y = data[:, 1]  # Rating
 
     theta = run_linear_regression(data_x, data_y)
-    len_result = theta.shape[1]
-    print("Resultant Feature vector : ")
-    for i in range(len_result):
-        print(f"{theta[0, i]:.5f}")
 
+    print("Learned Parameters (theta):")
+    for val in theta[0]:
+        print(f"{val:.5f}")
+
+    predictions = data_x @ theta.T
+    mae = mean_absolute_error(predictions.flatten(), data_y)
+    print(f"Mean Absolute Error: {mae:.5f}")
 
-if __name__ == "__main__":
-    import doctest
 
-    doctest.testmod()
+if __name__ == "__main__":
     main()