labmlai · Creeken-Harrans · May 1, 2026
diff --git a/labml_nn/activations/fta/__init__.py b/labml_nn/activations/fta/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
 ---
 title: Fuzzy Tiling Activations
 summary: >
@@ -68,7 +68,7 @@ class FTA(nn.Module):
     """
 
     def __init__(self, lower_limit: float, upper_limit: float, delta: float, eta: float):
-        """
+        r"""
         :param lower_limit: is the lower limit $l$
         :param upper_limit: is the upper limit $u$
         :param delta: is the bin size $\delta$
@@ -86,7 +86,7 @@ def __init__(self, lower_limit: float, upper_limit: float, delta: float, eta: fl
         self.eta = eta
 
     def fuzzy_i_plus(self, x: torch.Tensor):
-        """
+        r"""
         #### Fuzzy indicator function
 
         $$I_{\eta,+}(x) = I_+(\eta - x) x + I_+ (x - \eta)$$

diff --git a/labml_nn/adaptive_computation/ponder_net/__init__.py b/labml_nn/adaptive_computation/ponder_net/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
 ---
 title: "PonderNet: Learning to Ponder"
 summary: >
@@ -106,7 +106,7 @@ def __init__(self, n_elems: int, n_hidden: int, max_steps: int):
         self.is_halt = False
 
     def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
+        r"""
         * `x` is the input of shape `[batch_size, n_elems]`
 
         This outputs a tuple of four tensors:
@@ -177,7 +177,7 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Te
 
 
 class ReconstructionLoss(nn.Module):
-    """
+    r"""
     ## Reconstruction loss
 
     $$L_{Rec} = \sum_{n=1}^N p_n \mathcal{L}(y, \hat{y}_n)$$
@@ -186,14 +186,14 @@ class ReconstructionLoss(nn.Module):
     """
 
     def __init__(self, loss_func: nn.Module):
-        """
+        r"""
         * `loss_func` is the loss function $\mathcal{L}$
         """
         super().__init__()
         self.loss_func = loss_func
 
     def forward(self, p: torch.Tensor, y_hat: torch.Tensor, y: torch.Tensor):
-        """
+        r"""
         * `p` is $p_1 \dots p_N$ in a tensor of shape `[N, batch_size]`
         * `y_hat` is $\hat{y}_1 \dots \hat{y}_N$ in a tensor of shape `[N, batch_size, ...]`
         * `y` is the target of shape `[batch_size, ...]`
@@ -213,7 +213,7 @@ def forward(self, p: torch.Tensor, y_hat: torch.Tensor, y: torch.Tensor):
 
 
 class RegularizationLoss(nn.Module):
-    """
+    r"""
     ## Regularization loss
 
     $$L_{Reg} = \mathop{KL} \Big(p_n \Vert p_G(\lambda_p) \Big)$$
@@ -229,7 +229,7 @@ class RegularizationLoss(nn.Module):
     """
 
     def __init__(self, lambda_p: float, max_steps: int = 1_000):
-        """
+        r"""
         * `lambda_p` is $\lambda_p$ - the success probability of geometric distribution
         * `max_steps` is the highest $N$; we use this to pre-compute $p_G(\lambda_p)$
         """
@@ -253,7 +253,7 @@ def __init__(self, lambda_p: float, max_steps: int = 1_000):
         self.kl_div = nn.KLDivLoss(reduction='batchmean')
 
     def forward(self, p: torch.Tensor):
-        """
+        r"""
         * `p` is $p_1 \dots p_N$ in a tensor of shape `[N, batch_size]`
         """
         # Transpose `p` to `[batch_size, N]`

diff --git a/labml_nn/capsule_networks/__init__.py b/labml_nn/capsule_networks/__init__.py
@@ -35,7 +35,7 @@
 
 
 class Squash(nn.Module):
-    """
+    r"""
     ## Squash
 
     This is **squashing** function from paper, given by equation $(1)$.
@@ -69,7 +69,7 @@ def forward(self, s: torch.Tensor):
 
 
 class Router(nn.Module):
-    """
+    r"""
     ## Routing Algorithm
 
     This is the routing mechanism described in the paper.
@@ -132,7 +132,7 @@ def forward(self, u: torch.Tensor):
 
 
 class MarginLoss(nn.Module):
-    """
+    r"""
     ## Margin loss for class existence
 
     A separate margin loss is used for each output capsule and the total loss is the sum of them.
@@ -161,7 +161,7 @@ def __init__(self, *, n_labels: int, lambda_: float = 0.5, m_positive: float = 0
         self.n_labels = n_labels
 
     def forward(self, v: torch.Tensor, labels: torch.Tensor):
-        """
+        r"""
         `v`, $\mathbf{v}_j$ are the squashed output capsules.
         This has shape `[batch_size, n_labels, n_features]`; that is, there is a capsule for each label.
 

diff --git a/labml_nn/cfr/__init__.py b/labml_nn/cfr/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
 ---
 title: Regret Minimization in Games with Incomplete Information (CFR)
 summary: >
@@ -337,7 +337,7 @@
 
 
 class History:
-    """
+    r"""
     <a id="History"></a>
 
     ## History
@@ -349,14 +349,14 @@ class History:
     """
 
     def is_terminal(self):
-        """
+        r"""
         Whether it's a terminal history; i.e. game over.
         $h \in Z$
         """
         raise NotImplementedError()
 
     def terminal_utility(self, i: Player) -> float:
-        """
+        r"""
         <a id="terminal_utility"></a>
         Utility of player $i$ for a terminal history.
         $u_i(h)$ where $h \in Z$
@@ -485,7 +485,7 @@ def load_dict(self, data: Dict[str, any]):
         self.calculate_strategy()
 
     def calculate_strategy(self):
-        """
+        r"""
         ## Calculate strategy
 
         Calculate current strategy using [regret matching](#RegretMatching).
@@ -520,7 +520,7 @@ def calculate_strategy(self):
             self.strategy = {a: 1 / count for a, r in regret.items()}
 
     def get_average_strategy(self):
-        """
+        r"""
         ## Get average strategy
 
         $$\textcolor{cyan}{\bar{\sigma}^T_i(I)(a)} =
@@ -596,7 +596,7 @@ def _get_info_set(self, h: History):
         return self.info_sets[info_set_key]
 
     def walk_tree(self, h: History, i: Player, pi_i: float, pi_neg_i: float) -> float:
-        """
+        r"""
         ### Walk Tree
 
         This function walks the game tree.
@@ -686,7 +686,7 @@ def walk_tree(self, h: History, i: Player, pi_i: float, pi_neg_i: float) -> floa
         return v
 
     def iterate(self):
-        """
+        r"""
         ### Iteratively update $\textcolor{lightgreen}{\sigma^t(I)(a)}$
 
         This updates the strategies for $T$ iterations.

diff --git a/labml_nn/conv_mixer/__init__.py b/labml_nn/conv_mixer/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
 ---
 title: Patches Are All You Need? (ConvMixer)
 summary: >
@@ -96,7 +96,7 @@ def forward(self, x: torch.Tensor):
 
 
 class PatchEmbeddings(nn.Module):
-    """
+    r"""
     <a id="PatchEmbeddings"></a>
 
     ## Get patch embeddings

diff --git a/labml_nn/diffusion/ddpm/__init__.py b/labml_nn/diffusion/ddpm/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
 ---
 title: Denoising Diffusion Probabilistic Models (DDPM)
 summary: >
@@ -175,7 +175,7 @@ class DenoiseDiffusion:
     """
 
     def __init__(self, eps_model: nn.Module, n_steps: int, device: torch.device):
-        """
+        r"""
         * `eps_model` is $\textcolor{lightgreen}{\epsilon_\theta}(x_t, t)$ model
         * `n_steps` is $t$
         * `device` is the device to place constants on
@@ -196,7 +196,7 @@ def __init__(self, eps_model: nn.Module, n_steps: int, device: torch.device):
         self.sigma2 = self.beta
 
     def q_xt_x0(self, x0: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
+        r"""
         #### Get $q(x_t|x_0)$ distribution
 
         \begin{align}
@@ -212,7 +212,7 @@ def q_xt_x0(self, x0: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torc
         return mean, var
 
     def q_sample(self, x0: torch.Tensor, t: torch.Tensor, eps: Optional[torch.Tensor] = None):
-        """
+        r"""
         #### Sample from $q(x_t|x_0)$
 
         \begin{align}
@@ -230,7 +230,7 @@ def q_sample(self, x0: torch.Tensor, t: torch.Tensor, eps: Optional[torch.Tensor
         return mean + (var ** 0.5) * eps
 
     def p_sample(self, xt: torch.Tensor, t: torch.Tensor):
-        """
+        r"""
         #### Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$
 
         \begin{align}
@@ -262,7 +262,7 @@ def p_sample(self, xt: torch.Tensor, t: torch.Tensor):
         return mean + (var ** .5) * eps
 
     def loss(self, x0: torch.Tensor, noise: Optional[torch.Tensor] = None):
-        """
+        r"""
         #### Simplified Loss
 
         $$L_{\text{simple}}(\theta) = \mathbb{E}_{t,x_0, \epsilon} \Bigg[ \bigg\Vert