diff --git a/machine_learning/mini_batch_gradient_descent.py b/machine_learning/mini_batch_gradient_descent.py
new file mode 100644
index 000000000000..0c93e0a0b63a
--- /dev/null
+++ b/machine_learning/mini_batch_gradient_descent.py
@@ -0,0 +1,90 @@
+"""
+Mini-Batch Gradient Descent: https://en.wikipedia.org/wiki/Stochastic_gradient_descent
+Mini-batch gradient descent is an optimization method that updates model
+parameters using small random subsets (mini-batches) of the training data,
+combining the stability of full-batch gradient descent with the speed of
+stochastic gradient descent.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+
+def mini_batch_gradient_descent(
+    feature_matrix: np.ndarray,
+    target_values: np.ndarray,
+    learning_rate: float = 0.01,
+    batch_size: int = 16,
+    n_epochs: int = 50,
+    random_seed: int | None = None,
+) -> tuple[np.ndarray, float]:
+    """
+    Mini-Batch Gradient Descent for linear regression.
+
+    Parameters
+    ----------
+    feature_matrix : np.ndarray
+        Feature matrix of shape (n_samples, n_features).
+    target_values : np.ndarray
+        Target values of shape (n_samples,).
+    learning_rate : float
+        Step size for each gradient update.
+    batch_size : int
+        Number of samples per mini-batch.
+    n_epochs : int
+        Number of full passes over the training data.
+    random_seed : int | None
+        Random seed for reproducible shuffling.
+
+    Returns
+    -------
+    weights : np.ndarray
+        Learned weights.
+    bias : float
+        Learned bias.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> X = np.array([[1], [2], [3], [4]])
+    >>> y = np.array([2, 4, 6, 8])
+    >>> w, b = mini_batch_gradient_descent(
+    ...     X, y, learning_rate=0.1, batch_size=2, n_epochs=100, random_seed=42
+    ... )
+    >>> round(float(w[0]), 1)  # slope close to 2
+    2.0
+    """
+    n_samples, n_features = feature_matrix.shape
+    weights = np.zeros(n_features)
+    bias = 0.0
+
+    rng = np.random.default_rng(random_seed)
+
+    for _ in range(n_epochs):
+        # Shuffle the data at the start of every epoch so each mini-batch
+        # sees a different random subset of the samples.
+        indices = rng.permutation(n_samples)
+        feature_matrix_shuffled = feature_matrix[indices]
+        target_values_shuffled = target_values[indices]
+
+        for start_idx in range(0, n_samples, batch_size):
+            end_idx = start_idx + batch_size
+            feature_batch = feature_matrix_shuffled[start_idx:end_idx]
+            target_batch = target_values_shuffled[start_idx:end_idx]
+
+            # Linear model prediction and error for the current mini-batch.
+            predictions = np.dot(feature_batch, weights) + bias
+            errors = predictions - target_batch
+
+            # Gradient of the mean squared error, averaged over the batch.
+            weights -= learning_rate * (feature_batch.T @ errors) / len(target_batch)
+            bias -= learning_rate * np.mean(errors)
+
+    return weights, bias


+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
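
For reviewers who want to exercise the new function beyond the doctest, here is a minimal usage sketch (not part of the diff). The noisy-line data, the hyperparameter choices, and running the snippet from the repository root so the import path resolves are all illustrative assumptions:

    import numpy as np

    from machine_learning.mini_batch_gradient_descent import mini_batch_gradient_descent

    # Synthetic data for illustration: y = 3x + 1 plus Gaussian noise.
    rng = np.random.default_rng(0)
    X = rng.uniform(0, 10, size=(200, 1))
    y = 3.0 * X[:, 0] + 1.0 + rng.normal(scale=0.5, size=200)

    w, b = mini_batch_gradient_descent(
        X, y, learning_rate=0.01, batch_size=16, n_epochs=200, random_seed=0
    )
    print(w, b)  # should land near w ~= [3.0] and b ~= 1.0

With a fixed learning rate the iterates fluctuate around the optimum rather than converging exactly, so expect the recovered slope and intercept to be close to, not identical with, the generating values.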