diff --git a/labml_nn/activations/fta/__init__.py b/labml_nn/activations/fta/__init__.py
index ba682a0a0..6ae143d3c 100644
--- a/labml_nn/activations/fta/__init__.py
+++ b/labml_nn/activations/fta/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Fuzzy Tiling Activations
summary: >
@@ -68,7 +68,7 @@ class FTA(nn.Module):
"""
def __init__(self, lower_limit: float, upper_limit: float, delta: float, eta: float):
- """
+ r"""
:param lower_limit: is the lower limit $l$
:param upper_limit: is the upper limit $u$
:param delta: is the bin size $\delta$
@@ -86,7 +86,7 @@ def __init__(self, lower_limit: float, upper_limit: float, delta: float, eta: fl
self.eta = eta
def fuzzy_i_plus(self, x: torch.Tensor):
- """
+ r"""
#### Fuzzy indicator function
$$I_{\eta,+}(x) = I_+(\eta - x) x + I_+ (x - \eta)$$
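The pattern above repeats throughout this patch: docstrings that embed LaTeX get the `r` prefix so backslash sequences survive. A minimal sketch of the failure mode being fixed (not part of the patch; the string contents are illustrative):

```python
# In a regular string, recognized escapes such as \f, \b and \a silently
# corrupt the LaTeX, and sequences like \u or \N can even be hard syntax
# errors; the r prefix disables escape processing entirely.
plain = "coefficient $\frac{\beta_t}{\sqrt{1 - \bar\alpha_t}}$"  # warns, corrupts
raw = r"coefficient $\frac{\beta_t}{\sqrt{1 - \bar\alpha_t}}$"   # preserved

print(plain == raw)  # False: \f, \b, \a became form-feed, backspace, bell
```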
diff --git a/labml_nn/adaptive_computation/ponder_net/__init__.py b/labml_nn/adaptive_computation/ponder_net/__init__.py
index 7dfcd2d3c..b9a56eeef 100644
--- a/labml_nn/adaptive_computation/ponder_net/__init__.py
+++ b/labml_nn/adaptive_computation/ponder_net/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: "PonderNet: Learning to Ponder"
summary: >
@@ -106,7 +106,7 @@ def __init__(self, n_elems: int, n_hidden: int, max_steps: int):
self.is_halt = False
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
- """
+ r"""
* `x` is the input of shape `[batch_size, n_elems]`
This outputs a tuple of four tensors:
@@ -177,7 +177,7 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Te
class ReconstructionLoss(nn.Module):
- """
+ r"""
## Reconstruction loss
$$L_{Rec} = \sum_{n=1}^N p_n \mathcal{L}(y, \hat{y}_n)$$
@@ -186,14 +186,14 @@ class ReconstructionLoss(nn.Module):
"""
def __init__(self, loss_func: nn.Module):
- """
+ r"""
* `loss_func` is the loss function $\mathcal{L}$
"""
super().__init__()
self.loss_func = loss_func
def forward(self, p: torch.Tensor, y_hat: torch.Tensor, y: torch.Tensor):
- """
+ r"""
* `p` is $p_1 \dots p_N$ in a tensor of shape `[N, batch_size]`
* `y_hat` is $\hat{y}_1 \dots \hat{y}_N$ in a tensor of shape `[N, batch_size, ...]`
* `y` is the target of shape `[batch_size, ...]`
@@ -213,7 +213,7 @@ def forward(self, p: torch.Tensor, y_hat: torch.Tensor, y: torch.Tensor):
class RegularizationLoss(nn.Module):
- """
+ r"""
## Regularization loss
$$L_{Reg} = \mathop{KL} \Big(p_n \Vert p_G(\lambda_p) \Big)$$
@@ -229,7 +229,7 @@ class RegularizationLoss(nn.Module):
"""
def __init__(self, lambda_p: float, max_steps: int = 1_000):
- """
+ r"""
* `lambda_p` is $\lambda_p$ - the success probability of geometric distribution
* `max_steps` is the highest $N$; we use this to pre-compute $p_G(\lambda_p)$
"""
@@ -253,7 +253,7 @@ def __init__(self, lambda_p: float, max_steps: int = 1_000):
self.kl_div = nn.KLDivLoss(reduction='batchmean')
def forward(self, p: torch.Tensor):
- """
+ r"""
* `p` is $p_1 \dots p_N$ in a tensor of shape `[N, batch_size]`
"""
# Transpose `p` to `[batch_size, N]`
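For context on what `RegularizationLoss` pre-computes: the prior $p_G(\lambda_p)$ is a truncated geometric distribution. A standalone sketch, with illustrative names that are not taken from the patch:

```python
import torch

# p_G(k) = (1 - lambda_p)^(k - 1) * lambda_p for k = 1..max_steps,
# i.e. the probability of halting exactly at step k.
def geometric_prior(lambda_p: float, max_steps: int = 1_000) -> torch.Tensor:
    k = torch.arange(1, max_steps + 1, dtype=torch.float64)
    return (1 - lambda_p) ** (k - 1) * lambda_p

print(geometric_prior(0.2, max_steps=10).sum())  # ~0.89; the tail is truncated
```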
diff --git a/labml_nn/capsule_networks/__init__.py b/labml_nn/capsule_networks/__init__.py
index 9a9dfbeae..a2dfaf6cc 100644
--- a/labml_nn/capsule_networks/__init__.py
+++ b/labml_nn/capsule_networks/__init__.py
@@ -35,7 +35,7 @@
class Squash(nn.Module):
- """
+ r"""
## Squash
-    This is **squashing** function from paper, given by equation $(1)$.
+    This is the **squashing** function from the paper, given by equation $(1)$.
@@ -69,7 +69,7 @@ def forward(self, s: torch.Tensor):
class Router(nn.Module):
- """
+ r"""
## Routing Algorithm
This is the routing mechanism described in the paper.
@@ -132,7 +132,7 @@ def forward(self, u: torch.Tensor):
class MarginLoss(nn.Module):
- """
+ r"""
## Margin loss for class existence
A separate margin loss is used for each output capsule and the total loss is the sum of them.
@@ -161,7 +161,7 @@ def __init__(self, *, n_labels: int, lambda_: float = 0.5, m_positive: float = 0
self.n_labels = n_labels
def forward(self, v: torch.Tensor, labels: torch.Tensor):
- """
+ r"""
`v`, $\mathbf{v}_j$ are the squashed output capsules.
This has shape `[batch_size, n_labels, n_features]`; that is, there is a capsule for each label.
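For reference, the squashing function the `Squash` module implements is equation $(1)$ of Sabour et al. (2017), $\mathbf{v}_j = \frac{\Vert s \Vert^2}{1 + \Vert s \Vert^2} \frac{s}{\Vert s \Vert}$. A minimal sketch, with an $\epsilon$ added for numerical stability as an assumption:

```python
import torch

def squash(s: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    # ||s||^2 along the capsule feature dimension
    s2 = (s ** 2).sum(dim=-1, keepdim=True)
    # shrink the norm into [0, 1) while keeping the direction of s
    return (s2 / (1 + s2)) * (s / torch.sqrt(s2 + eps))

v = squash(torch.randn(2, 10, 16))  # [batch, capsules, features]
print(v.norm(dim=-1).max())         # always < 1
```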
diff --git a/labml_nn/cfr/__init__.py b/labml_nn/cfr/__init__.py
index 48e496058..1e69674ea 100644
--- a/labml_nn/cfr/__init__.py
+++ b/labml_nn/cfr/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Regret Minimization in Games with Incomplete Information (CFR)
summary: >
@@ -337,7 +337,7 @@
class History:
- """
+ r"""
## History
@@ -349,14 +349,14 @@ class History:
"""
def is_terminal(self):
- """
+ r"""
Whether it's a terminal history; i.e. game over.
$h \in Z$
"""
raise NotImplementedError()
def terminal_utility(self, i: Player) -> float:
- """
+ r"""
Utility of player $i$ for a terminal history.
$u_i(h)$ where $h \in Z$
@@ -485,7 +485,7 @@ def load_dict(self, data: Dict[str, any]):
self.calculate_strategy()
def calculate_strategy(self):
- """
+ r"""
## Calculate strategy
Calculate current strategy using [regret matching](#RegretMatching).
@@ -520,7 +520,7 @@ def calculate_strategy(self):
self.strategy = {a: 1 / count for a, r in regret.items()}
def get_average_strategy(self):
- """
+ r"""
## Get average strategy
$$\textcolor{cyan}{\bar{\sigma}^T_i(I)(a)} =
@@ -596,7 +596,7 @@ def _get_info_set(self, h: History):
return self.info_sets[info_set_key]
def walk_tree(self, h: History, i: Player, pi_i: float, pi_neg_i: float) -> float:
- """
+ r"""
### Walk Tree
This function walks the game tree.
@@ -686,7 +686,7 @@ def walk_tree(self, h: History, i: Player, pi_i: float, pi_neg_i: float) -> floa
return v
def iterate(self):
- """
+ r"""
### Iteratively update $\textcolor{lightgreen}{\sigma^t(I)(a)}$
This updates the strategies for $T$ iterations.
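The `calculate_strategy` hunk above shows only the uniform fallback branch of regret matching. A self-contained sketch of the full rule (illustrative names, not the class's internals):

```python
from typing import Dict

# sigma(a) = max(R(a), 0) / sum_a' max(R(a'), 0); uniform when the sum is 0.
def regret_matching(cumulative_regret: Dict[str, float]) -> Dict[str, float]:
    positive = {a: max(r, 0.0) for a, r in cumulative_regret.items()}
    total = sum(positive.values())
    if total > 0:
        return {a: r / total for a, r in positive.items()}
    return {a: 1 / len(positive) for a in positive}

print(regret_matching({'raise': 3.0, 'call': 1.0, 'fold': -2.0}))
# {'raise': 0.75, 'call': 0.25, 'fold': 0.0}
```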
diff --git a/labml_nn/conv_mixer/__init__.py b/labml_nn/conv_mixer/__init__.py
index 42d1804ae..5b8a91f4d 100644
--- a/labml_nn/conv_mixer/__init__.py
+++ b/labml_nn/conv_mixer/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Patches Are All You Need? (ConvMixer)
summary: >
@@ -96,7 +96,7 @@ def forward(self, x: torch.Tensor):
class PatchEmbeddings(nn.Module):
- """
+ r"""
## Get patch embeddings
diff --git a/labml_nn/diffusion/ddpm/__init__.py b/labml_nn/diffusion/ddpm/__init__.py
index c89c93eeb..013b6ee63 100644
--- a/labml_nn/diffusion/ddpm/__init__.py
+++ b/labml_nn/diffusion/ddpm/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Denoising Diffusion Probabilistic Models (DDPM)
summary: >
@@ -175,7 +175,7 @@ class DenoiseDiffusion:
"""
def __init__(self, eps_model: nn.Module, n_steps: int, device: torch.device):
- """
+ r"""
* `eps_model` is $\textcolor{lightgreen}{\epsilon_\theta}(x_t, t)$ model
-        * `n_steps` is $T$
+        * `n_steps` is $T$, the number of diffusion steps
* `device` is the device to place constants on
@@ -196,7 +196,7 @@ def __init__(self, eps_model: nn.Module, n_steps: int, device: torch.device):
self.sigma2 = self.beta
def q_xt_x0(self, x0: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
- """
+ r"""
#### Get $q(x_t|x_0)$ distribution
\begin{align}
@@ -212,7 +212,7 @@ def q_xt_x0(self, x0: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torc
return mean, var
def q_sample(self, x0: torch.Tensor, t: torch.Tensor, eps: Optional[torch.Tensor] = None):
- """
+ r"""
#### Sample from $q(x_t|x_0)$
\begin{align}
@@ -230,7 +230,7 @@ def q_sample(self, x0: torch.Tensor, t: torch.Tensor, eps: Optional[torch.Tensor
return mean + (var ** 0.5) * eps
def p_sample(self, xt: torch.Tensor, t: torch.Tensor):
- """
+ r"""
#### Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$
\begin{align}
@@ -262,7 +262,7 @@ def p_sample(self, xt: torch.Tensor, t: torch.Tensor):
return mean + (var ** .5) * eps
def loss(self, x0: torch.Tensor, noise: Optional[torch.Tensor] = None):
- """
+ r"""
#### Simplified Loss
$$L_{\text{simple}}(\theta) = \mathbb{E}_{t,x_0, \epsilon} \Bigg[ \bigg\Vert
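For orientation, `q_sample` above draws $x_t = \sqrt{\bar\alpha_t}\,x_0 + \sqrt{1-\bar\alpha_t}\,\epsilon$. A standalone sketch, assuming the linear $\beta$ schedule this class is normally configured with:

```python
import torch

n_steps = 1000
beta = torch.linspace(0.0001, 0.02, n_steps)       # assumed schedule
alpha_bar = torch.cumprod(1.0 - beta, dim=0)       # cumulative product of alphas

x0 = torch.randn(4, 3, 32, 32)                     # a batch of "images"
t = torch.randint(0, n_steps, (4,))                # one timestep per sample
eps = torch.randn_like(x0)
ab = alpha_bar[t].view(-1, 1, 1, 1)                # gather and broadcast
xt = (ab ** 0.5) * x0 + ((1 - ab) ** 0.5) * eps    # x_t ~ q(x_t | x_0)
```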
diff --git a/labml_nn/diffusion/ddpm/evaluate.py b/labml_nn/diffusion/ddpm/evaluate.py
index 52251b61e..06456925f 100644
--- a/labml_nn/diffusion/ddpm/evaluate.py
+++ b/labml_nn/diffusion/ddpm/evaluate.py
@@ -26,7 +26,13 @@ class Sampler:
## Sampler class
"""
- def __init__(self, diffusion: DenoiseDiffusion, image_channels: int, image_size: int, device: torch.device):
+ def __init__(
+ self,
+ diffusion: DenoiseDiffusion,
+ image_channels: int,
+ image_size: int,
+ device: torch.device,
+ ):
"""
* `diffusion` is the `DenoiseDiffusion` instance
* `image_channels` is the number of channels in the image
@@ -63,9 +69,11 @@ def __init__(self, diffusion: DenoiseDiffusion, image_channels: int, image_size:
# $$\tilde\beta_t = \frac{1 - \bar\alpha_{t-1}}{1 - \bar\alpha_t} \beta_t$$
self.beta_tilde = self.beta * (1 - alpha_bar_tm1) / (1 - self.alpha_bar)
# $$\frac{\sqrt{\bar\alpha_{t-1}}\beta_t}{1 - \bar\alpha_t}$$
- self.mu_tilde_coef1 = self.beta * (alpha_bar_tm1 ** 0.5) / (1 - self.alpha_bar)
+ self.mu_tilde_coef1 = self.beta * (alpha_bar_tm1**0.5) / (1 - self.alpha_bar)
-        # $$\frac{\sqrt{\alpha_t}(1 - \bar\alpha_{t-1}}{1-\bar\alpha_t}$$
+        # $$\frac{\sqrt{\alpha_t}(1 - \bar\alpha_{t-1})}{1 - \bar\alpha_t}$$
- self.mu_tilde_coef2 = (self.alpha ** 0.5) * (1 - alpha_bar_tm1) / (1 - self.alpha_bar)
+ self.mu_tilde_coef2 = (
+ (self.alpha**0.5) * (1 - alpha_bar_tm1) / (1 - self.alpha_bar)
+ )
# $\sigma^2 = \beta$
self.sigma2 = self.beta
@@ -80,6 +88,7 @@ def show_image(self, img, title=""):
def make_video(self, frames, path="video.mp4"):
"""Helper function to create a video"""
import imageio
+
# 20 second video
writer = imageio.get_writer(path, fps=len(frames) // 20)
# Add each image
@@ -91,7 +100,7 @@ def make_video(self, frames, path="video.mp4"):
writer.close()
def sample_animation(self, n_frames: int = 1000, create_video: bool = True):
- """
+ r"""
#### Sample an image step-by-step using $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$
We sample an image step-by-step using $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$ and at each step
@@ -101,14 +110,17 @@ def sample_animation(self, n_frames: int = 1000, create_video: bool = True):
"""
# $x_T \sim p(x_T) = \mathcal{N}(x_T; \mathbf{0}, \mathbf{I})$
- xt = torch.randn([1, self.image_channels, self.image_size, self.image_size], device=self.device)
+ xt = torch.randn(
+ [1, self.image_channels, self.image_size, self.image_size],
+ device=self.device,
+ )
# Interval to log $\hat{x}_0$
interval = self.n_steps // n_frames
# Frames for video
frames = []
# Sample $T$ steps
- for t_inv in monit.iterate('Denoise', self.n_steps):
+ for t_inv in monit.iterate("Denoise", self.n_steps):
# $t$
t_ = self.n_steps - t_inv - 1
# $t$ in a tensor
@@ -128,8 +140,10 @@ def sample_animation(self, n_frames: int = 1000, create_video: bool = True):
if create_video:
self.make_video(frames)
- def interpolate(self, x1: torch.Tensor, x2: torch.Tensor, lambda_: float, t_: int = 100):
- """
+ def interpolate(
+ self, x1: torch.Tensor, x2: torch.Tensor, lambda_: float, t_: int = 100
+ ):
+ r"""
#### Interpolate two images $x_0$ and $x'_0$
We get $x_t \sim q(x_t|x_0)$ and $x'_t \sim q(x'_t|x_0)$.
@@ -144,20 +158,28 @@ def interpolate(self, x1: torch.Tensor, x2: torch.Tensor, lambda_: float, t_: in
* `x2` is $x'_0$
* `lambda_` is $\lambda$
* `t_` is $t$
         """
# Number of samples
n_samples = x1.shape[0]
# $t$ tensor
t = torch.full((n_samples,), t_, device=self.device)
# $$\bar{x}_t = (1 - \lambda)x_t + \lambda x'_0$$
- xt = (1 - lambda_) * self.diffusion.q_sample(x1, t) + lambda_ * self.diffusion.q_sample(x2, t)
+ xt = (1 - lambda_) * self.diffusion.q_sample(
+ x1, t
+ ) + lambda_ * self.diffusion.q_sample(x2, t)
# $$\bar{x}_0 \sim \textcolor{lightgreen}{p_\theta}(x_0|\bar{x}_t)$$
return self._sample_x0(xt, t_)
- def interpolate_animate(self, x1: torch.Tensor, x2: torch.Tensor, n_frames: int = 100, t_: int = 100,
- create_video=True):
+ def interpolate_animate(
+ self,
+ x1: torch.Tensor,
+ x2: torch.Tensor,
+ n_frames: int = 100,
+ t_: int = 100,
+ create_video=True,
+ ):
"""
#### Interpolate two images $x_0$ and $x'_0$ and make a video
@@ -166,7 +188,7 @@ def interpolate_animate(self, x1: torch.Tensor, x2: torch.Tensor, n_frames: int
* `n_frames` is the number of frames for the image
* `t_` is $t$
* `create_video` specifies whether to make a video or to show each frame
         """
# Show original images
self.show_image(x1, "x1")
@@ -183,7 +205,7 @@ def interpolate_animate(self, x1: torch.Tensor, x2: torch.Tensor, n_frames: int
frames = []
# Get frames with different $\lambda$
- for i in monit.iterate('Interpolate', n_frames + 1, is_children_silent=True):
+ for i in monit.iterate("Interpolate", n_frames + 1, is_children_silent=True):
# $\lambda$
lambda_ = i / n_frames
# $$\bar{x}_t = (1 - \lambda)x_t + \lambda x'_0$$
@@ -206,15 +228,17 @@ def _sample_x0(self, xt: torch.Tensor, n_steps: int):
* `xt` is $x_t$
* `n_steps` is $t$
         """
-        # Number of sampels
+        # Number of samples
n_samples = xt.shape[0]
# Iterate until $t$ steps
- for t_ in monit.iterate('Denoise', n_steps):
+ for t_ in monit.iterate("Denoise", n_steps):
t = n_steps - t_ - 1
# Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$
- xt = self.diffusion.p_sample(xt, xt.new_full((n_samples,), t, dtype=torch.long))
+ xt = self.diffusion.p_sample(
+ xt, xt.new_full((n_samples,), t, dtype=torch.long)
+ )
# Return $x_0$
return xt
@@ -222,9 +246,12 @@ def _sample_x0(self, xt: torch.Tensor, n_steps: int):
def sample(self, n_samples: int = 16):
"""
#### Generate images
         """
# $x_T \sim p(x_T) = \mathcal{N}(x_T; \mathbf{0}, \mathbf{I})$
- xt = torch.randn([n_samples, self.image_channels, self.image_size, self.image_size], device=self.device)
+ xt = torch.randn(
+ [n_samples, self.image_channels, self.image_size, self.image_size],
+ device=self.device,
+ )
# $$x_0 \sim \textcolor{lightgreen}{p_\theta}(x_0|x_t)$$
x0 = self._sample_x0(xt, self.n_steps)
@@ -234,7 +261,7 @@ def sample(self, n_samples: int = 16):
self.show_image(x0[i])
def p_sample(self, xt: torch.Tensor, t: torch.Tensor, eps_theta: torch.Tensor):
- """
+ r"""
#### Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$
\begin{align}
@@ -244,23 +271,23 @@ def p_sample(self, xt: torch.Tensor, t: torch.Tensor, eps_theta: torch.Tensor):
&= \frac{1}{\sqrt{\alpha_t}} \Big(x_t -
\frac{\beta_t}{\sqrt{1-\bar\alpha_t}}\textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big)
\end{align}
         """
# [gather](utils.html) $\bar\alpha_t$
alpha_bar = gather(self.alpha_bar, t)
# $\alpha_t$
alpha = gather(self.alpha, t)
# $\frac{\beta}{\sqrt{1-\bar\alpha_t}}$
- eps_coef = (1 - alpha) / (1 - alpha_bar) ** .5
+ eps_coef = (1 - alpha) / (1 - alpha_bar) ** 0.5
# $$\frac{1}{\sqrt{\alpha_t}} \Big(x_t -
# \frac{\beta_t}{\sqrt{1-\bar\alpha_t}}\textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big)$$
- mean = 1 / (alpha ** 0.5) * (xt - eps_coef * eps_theta)
+ mean = 1 / (alpha**0.5) * (xt - eps_coef * eps_theta)
# $\sigma^2$
var = gather(self.sigma2, t)
# $\epsilon \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$
eps = torch.randn(xt.shape, device=xt.device)
# Sample
- return mean + (var ** .5) * eps
+ return mean + (var**0.5) * eps
def p_x0(self, xt: torch.Tensor, t: torch.Tensor, eps: torch.Tensor):
"""
@@ -268,13 +295,13 @@ def p_x0(self, xt: torch.Tensor, t: torch.Tensor, eps: torch.Tensor):
$$x_0 \approx \hat{x}_0 = \frac{1}{\sqrt{\bar\alpha}}
\Big( x_t - \sqrt{1 - \bar\alpha_t} \textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big)$$
         """
# [gather](utils.html) $\bar\alpha_t$
alpha_bar = gather(self.alpha_bar, t)
# $$x_0 \approx \hat{x}_0 = \frac{1}{\sqrt{\bar\alpha}}
# \Big( x_t - \sqrt{1 - \bar\alpha_t} \textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big)$$
- return (xt - (1 - alpha_bar) ** 0.5 * eps) / (alpha_bar ** 0.5)
+ return (xt - (1 - alpha_bar) ** 0.5 * eps) / (alpha_bar**0.5)
def main():
@@ -297,16 +324,18 @@ def main():
configs.init()
# Set PyTorch modules for saving and loading
- experiment.add_pytorch_models({'eps_model': configs.eps_model})
+ experiment.add_pytorch_models({"eps_model": configs.eps_model})
# Load training experiment
experiment.load(run_uuid)
# Create sampler
- sampler = Sampler(diffusion=configs.diffusion,
- image_channels=configs.image_channels,
- image_size=configs.image_size,
- device=configs.device)
+ sampler = Sampler(
+ diffusion=configs.diffusion,
+ image_channels=configs.image_channels,
+ image_size=configs.image_size,
+ device=configs.device,
+ )
# Start evaluation
with experiment.start():
@@ -324,5 +353,5 @@ def main():
#
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
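A note on the two coefficients reformatted above: they are the weights of the DDPM posterior mean, $\tilde\mu_t(x_t, x_0) = \text{coef1} \cdot x_0 + \text{coef2} \cdot x_t$, with variance $\tilde\beta_t$. A sketch under the same assumed linear schedule (illustrative, not part of the patch):

```python
import torch

beta = torch.linspace(0.0001, 0.02, 1000)
alpha = 1.0 - beta
alpha_bar = torch.cumprod(alpha, dim=0)
alpha_bar_tm1 = torch.cat([alpha_bar.new_ones(1), alpha_bar[:-1]])

mu_tilde_coef1 = beta * (alpha_bar_tm1 ** 0.5) / (1 - alpha_bar)
mu_tilde_coef2 = (alpha ** 0.5) * (1 - alpha_bar_tm1) / (1 - alpha_bar)

def mu_tilde(x0: torch.Tensor, xt: torch.Tensor, t: int) -> torch.Tensor:
    # posterior mean of q(x_{t-1} | x_t, x_0)
    return mu_tilde_coef1[t] * x0 + mu_tilde_coef2[t] * xt
```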
diff --git a/labml_nn/diffusion/ddpm/unet.py b/labml_nn/diffusion/ddpm/unet.py
index f5da80901..587c619d0 100644
--- a/labml_nn/diffusion/ddpm/unet.py
+++ b/labml_nn/diffusion/ddpm/unet.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: U-Net model for Denoising Diffusion Probabilistic Models (DDPM)
summary: >
@@ -29,7 +29,7 @@
class Swish(nn.Module):
- """
+ r"""
### Swish activation function
$$x \cdot \sigma(x)$$
@@ -272,7 +272,7 @@ def forward(self, x: torch.Tensor, t: torch.Tensor):
class Upsample(nn.Module):
- """
+ r"""
### Scale up the feature map by $2 \times$
"""
@@ -288,7 +288,7 @@ def forward(self, x: torch.Tensor, t: torch.Tensor):
class Downsample(nn.Module):
- """
+ r"""
### Scale down the feature map by $\frac{1}{2} \times$
"""
diff --git a/labml_nn/diffusion/stable_diffusion/latent_diffusion.py b/labml_nn/diffusion/stable_diffusion/latent_diffusion.py
index d7f9ecd1b..330ad2e04 100644
--- a/labml_nn/diffusion/stable_diffusion/latent_diffusion.py
+++ b/labml_nn/diffusion/stable_diffusion/latent_diffusion.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Latent Diffusion Models
summary: >
@@ -70,7 +70,7 @@ def __init__(self,
linear_start: float,
linear_end: float,
):
- """
+ r"""
:param unet_model: is the [U-Net](model/unet.html) that predicts noise
$\epsilon_\text{cond}(x_t, c)$, in latent space
:param autoencoder: is the [AutoEncoder](model/autoencoder.html)
@@ -134,7 +134,7 @@ def autoencoder_decode(self, z: torch.Tensor):
return self.first_stage_model.decode(z / self.latent_scaling_factor)
def forward(self, x: torch.Tensor, t: torch.Tensor, context: torch.Tensor):
- """
+ r"""
### Predict noise
Predict noise given the latent representation $x_t$, time step $t$, and the
diff --git a/labml_nn/diffusion/stable_diffusion/model/autoencoder.py b/labml_nn/diffusion/stable_diffusion/model/autoencoder.py
index badc6cf85..ec3784060 100644
--- a/labml_nn/diffusion/stable_diffusion/model/autoencoder.py
+++ b/labml_nn/diffusion/stable_diffusion/model/autoencoder.py
@@ -416,7 +416,7 @@ def forward(self, x: torch.Tensor):
def swish(x: torch.Tensor):
- """
+ r"""
### Swish activation
$$x \cdot \sigma(x)$$
diff --git a/labml_nn/diffusion/stable_diffusion/model/unet.py b/labml_nn/diffusion/stable_diffusion/model/unet.py
index 261a4bced..4eb10afeb 100644
--- a/labml_nn/diffusion/stable_diffusion/model/unet.py
+++ b/labml_nn/diffusion/stable_diffusion/model/unet.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: U-Net for Stable Diffusion
summary: >
diff --git a/labml_nn/diffusion/stable_diffusion/model/unet_attention.py b/labml_nn/diffusion/stable_diffusion/model/unet_attention.py
index cf42efa4f..ef2044057 100644
--- a/labml_nn/diffusion/stable_diffusion/model/unet_attention.py
+++ b/labml_nn/diffusion/stable_diffusion/model/unet_attention.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Transformer for Stable Diffusion U-Net
summary: >
@@ -291,7 +291,7 @@ def forward(self, x: torch.Tensor):
class GeGLU(nn.Module):
- """
+ r"""
### GeGLU Activation
$$\text{GeGLU}(x) = (xW + b) * \text{GELU}(xV + c)$$
diff --git a/labml_nn/diffusion/stable_diffusion/sampler/__init__.py b/labml_nn/diffusion/stable_diffusion/sampler/__init__.py
index 38c063e06..75e71bbf6 100644
--- a/labml_nn/diffusion/stable_diffusion/sampler/__init__.py
+++ b/labml_nn/diffusion/stable_diffusion/sampler/__init__.py
@@ -29,7 +29,7 @@ class DiffusionSampler:
model: LatentDiffusion
def __init__(self, model: LatentDiffusion):
- """
+ r"""
:param model: is the model to predict noise $\epsilon_\text{cond}(x_t, c)$
"""
super().__init__()
@@ -40,7 +40,7 @@ def __init__(self, model: LatentDiffusion):
def get_eps(self, x: torch.Tensor, t: torch.Tensor, c: torch.Tensor, *,
uncond_scale: float, uncond_cond: Optional[torch.Tensor]):
- """
+ r"""
## Get $\epsilon(x_t, c)$
:param x: is $x_t$ of shape `[batch_size, channels, height, width]`
@@ -79,7 +79,7 @@ def sample(self,
uncond_cond: Optional[torch.Tensor] = None,
skip_steps: int = 0,
):
- """
+ r"""
### Sampling Loop
:param shape: is the shape of the generated images in the
@@ -100,7 +100,7 @@ def paint(self, x: torch.Tensor, cond: torch.Tensor, t_start: int, *,
uncond_scale: float = 1.,
uncond_cond: Optional[torch.Tensor] = None,
):
- """
+ r"""
### Painting Loop
:param x: is $x_{T'}$ of shape `[batch_size, channels, height, width]`
@@ -116,7 +116,7 @@ def paint(self, x: torch.Tensor, cond: torch.Tensor, t_start: int, *,
raise NotImplementedError()
def q_sample(self, x0: torch.Tensor, index: int, noise: Optional[torch.Tensor] = None):
- """
+ r"""
### Sample from $q(x_t|x_0)$
:param x0: is $x_0$ of shape `[batch_size, channels, height, width]`
diff --git a/labml_nn/diffusion/stable_diffusion/sampler/ddim.py b/labml_nn/diffusion/stable_diffusion/sampler/ddim.py
index 04a8837f3..fb36ab521 100644
--- a/labml_nn/diffusion/stable_diffusion/sampler/ddim.py
+++ b/labml_nn/diffusion/stable_diffusion/sampler/ddim.py
@@ -24,7 +24,7 @@
class DDIMSampler(DiffusionSampler):
- """
+ r"""
## DDIM Sampler
This extends the [`DiffusionSampler` base class](index.html).
@@ -52,7 +52,7 @@ class DDIMSampler(DiffusionSampler):
model: LatentDiffusion
def __init__(self, model: LatentDiffusion, n_steps: int, ddim_discretize: str = "uniform", ddim_eta: float = 0.):
- """
+ r"""
:param model: is the model to predict noise $\epsilon_\text{cond}(x_t, c)$
:param n_steps: is the number of DDIM sampling steps, $S$
:param ddim_discretize: specifies how to extract $\tau$ from $[1,2,\dots,T]$.
@@ -106,7 +106,7 @@ def sample(self,
uncond_cond: Optional[torch.Tensor] = None,
skip_steps: int = 0,
):
- """
+ r"""
### Sampling Loop
:param shape: is the shape of the generated images in the
@@ -153,7 +153,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int,
temperature: float = 1.,
uncond_scale: float = 1.,
uncond_cond: Optional[torch.Tensor] = None):
- """
+ r"""
### Sample $x_{\tau_{i-1}}$
:param x: is $x_{\tau_i}$ of shape `[batch_size, channels, height, width]`
@@ -184,7 +184,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int,
def get_x_prev_and_pred_x0(self, e_t: torch.Tensor, index: int, x: torch.Tensor, *,
temperature: float,
repeat_noise: bool):
- """
+ r"""
### Sample $x_{\tau_{i-1}}$ given $\epsilon_\theta(x_{\tau_i})$
"""
@@ -231,7 +231,7 @@ def get_x_prev_and_pred_x0(self, e_t: torch.Tensor, index: int, x: torch.Tensor,
@torch.no_grad()
def q_sample(self, x0: torch.Tensor, index: int, noise: Optional[torch.Tensor] = None):
- """
+ r"""
### Sample from $q_{\sigma,\tau}(x_{\tau_i}|x_0)$
$$q_{\sigma,\tau}(x_t|x_0) =
@@ -258,7 +258,7 @@ def paint(self, x: torch.Tensor, cond: torch.Tensor, t_start: int, *,
uncond_scale: float = 1.,
uncond_cond: Optional[torch.Tensor] = None,
):
- """
+ r"""
### Painting Loop
:param x: is $x_{S'}$ of shape `[batch_size, channels, height, width]`
diff --git a/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py b/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py
index f591e2b65..ffa545abf 100644
--- a/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py
+++ b/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Denoising Diffusion Probabilistic Models (DDPM) Sampling
summary: >
@@ -24,7 +24,7 @@
class DDPMSampler(DiffusionSampler):
- """
+ r"""
## DDPM Sampler
This extends the [`DiffusionSampler` base class](index.html).
@@ -49,7 +49,7 @@ class DDPMSampler(DiffusionSampler):
model: LatentDiffusion
def __init__(self, model: LatentDiffusion):
- """
+ r"""
:param model: is the model to predict noise $\epsilon_\text{cond}(x_t, c)$
"""
super().__init__(model)
@@ -94,7 +94,7 @@ def sample(self,
uncond_cond: Optional[torch.Tensor] = None,
skip_steps: int = 0,
):
- """
+ r"""
### Sampling Loop
:param shape: is the shape of the generated images in the
@@ -139,7 +139,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int,
repeat_noise: bool = False,
temperature: float = 1.,
uncond_scale: float = 1., uncond_cond: Optional[torch.Tensor] = None):
- """
+ r"""
### Sample $x_{t-1}$ from $p_\theta(x_{t-1} | x_t)$
:param x: is $x_t$ of shape `[batch_size, channels, height, width]`
@@ -208,7 +208,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int,
@torch.no_grad()
def q_sample(self, x0: torch.Tensor, index: int, noise: Optional[torch.Tensor] = None):
- """
+ r"""
### Sample from $q(x_t|x_0)$
$$q(x_t|x_0) = \mathcal{N} \Big(x_t; \sqrt{\bar\alpha_t} x_0, (1-\bar\alpha_t) \mathbf{I} \Big)$$
diff --git a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
index ef3aab4d2..8e4fec81c 100644
--- a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
+++ b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
@@ -26,7 +26,7 @@ class Img2Img:
def __init__(self, *, checkpoint_path: Path,
ddim_steps: int = 50,
ddim_eta: float = 0.0):
- """
+ r"""
:param checkpoint_path: is the path of the checkpoint
:param ddim_steps: is the number of sampling steps
:param ddim_eta: is the [DDIM sampling](../sampler/ddim.html) $\eta$ constant
@@ -54,7 +54,7 @@ def __call__(self, *,
prompt: str,
uncond_scale: float = 5.0,
):
- """
+ r"""
:param dest_path: is the path to store the generated images
:param orig_img: is the image to transform
:param strength: specifies how much of the original image should not be preserved
diff --git a/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py b/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py
index a3504ed80..cdf731dc7 100644
--- a/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py
+++ b/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py
@@ -31,7 +31,7 @@ class InPaint:
def __init__(self, *, checkpoint_path: Path,
ddim_steps: int = 50,
ddim_eta: float = 0.0):
- """
+ r"""
:param checkpoint_path: is the path of the checkpoint
:param ddim_steps: is the number of sampling steps
:param ddim_eta: is the [DDIM sampling](../sampler/ddim.html) $\eta$ constant
@@ -60,7 +60,7 @@ def __call__(self, *,
uncond_scale: float = 5.0,
mask: Optional[torch.Tensor] = None,
):
- """
+ r"""
:param dest_path: is the path to store the generated images
:param orig_img: is the image to transform
:param strength: specifies how much of the original image should not be preserved
diff --git a/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py b/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py
index aee342bbb..30ab64ffe 100644
--- a/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py
+++ b/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py
@@ -33,7 +33,7 @@ def __init__(self, *,
n_steps: int = 50,
ddim_eta: float = 0.0,
):
- """
+ r"""
:param checkpoint_path: is the path of the checkpoint
:param sampler_name: is the name of the [sampler](../sampler/index.html)
:param n_steps: is the number of sampling steps
@@ -62,7 +62,7 @@ def __call__(self, *,
h: int = 512, w: int = 512,
uncond_scale: float = 7.5,
):
- """
+ r"""
:param dest_path: is the path to store the generated images
:param batch_size: is the number of images to generate in a batch
:param prompt: is the prompt to generate images with
diff --git a/labml_nn/distillation/__init__.py b/labml_nn/distillation/__init__.py
index a8d0d11b5..72708117a 100644
--- a/labml_nn/distillation/__init__.py
+++ b/labml_nn/distillation/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Distilling the Knowledge in a Neural Network
summary: >
diff --git a/labml_nn/gan/cycle_gan/__init__.py b/labml_nn/gan/cycle_gan/__init__.py
index 0a78e2613..3bd1c6178 100644
--- a/labml_nn/gan/cycle_gan/__init__.py
+++ b/labml_nn/gan/cycle_gan/__init__.py
@@ -188,7 +188,7 @@ def forward(self, x: torch.Tensor):
def weights_init_normal(m):
- """
+ r"""
Initialize convolution layer weights to $\mathcal{N}(0, 0.2)$
"""
classname = m.__class__.__name__
@@ -436,7 +436,7 @@ def initialize(self):
)
def run(self):
- """
+ r"""
## Training
We aim to solve:
diff --git a/labml_nn/gan/original/__init__.py b/labml_nn/gan/original/__init__.py
index 27eb1c650..dff35f8de 100644
--- a/labml_nn/gan/original/__init__.py
+++ b/labml_nn/gan/original/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Generative Adversarial Networks (GAN)
summary: A simple PyTorch implementation/tutorial of Generative Adversarial Networks (GAN) loss functions.
@@ -38,7 +38,7 @@
class DiscriminatorLogitsLoss(nn.Module):
- """
+ r"""
## Discriminator Loss
Discriminator should **ascend** on the gradient,
@@ -75,7 +75,7 @@ def __init__(self, smoothing: float = 0.2):
self.register_buffer('labels_false', _create_labels(256, 0.0, smoothing), False)
def forward(self, logits_true: torch.Tensor, logits_false: torch.Tensor):
- """
+ r"""
`logits_true` are logits from $D(\pmb{x}^{(i)})$ and
`logits_false` are logits from $D(G(\pmb{z}^{(i)}))$
"""
@@ -91,7 +91,7 @@ def forward(self, logits_true: torch.Tensor, logits_false: torch.Tensor):
class GeneratorLogitsLoss(nn.Module):
- """
+ r"""
## Generator Loss
Generator should **descend** on the gradient,
diff --git a/labml_nn/gan/original/experiment.py b/labml_nn/gan/original/experiment.py
index 71789df71..dbb621795 100644
--- a/labml_nn/gan/original/experiment.py
+++ b/labml_nn/gan/original/experiment.py
@@ -115,7 +115,7 @@ def init(self):
tracker.set_image("generated", True, 1 / 100)
def sample_z(self, batch_size: int):
- """
+ r"""
$$z \sim p(z)$$
"""
return torch.randn(batch_size, 100, device=self.device)
diff --git a/labml_nn/gan/stylegan/__init__.py b/labml_nn/gan/stylegan/__init__.py
index c1c36bbe0..528cee11b 100644
--- a/labml_nn/gan/stylegan/__init__.py
+++ b/labml_nn/gan/stylegan/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: StyleGAN 2
summary: >
@@ -156,7 +156,7 @@
class MappingNetwork(nn.Module):
- """
+ r"""
## Mapping Network
@@ -212,7 +212,7 @@ class Generator(nn.Module):
"""
def __init__(self, log_resolution: int, d_latent: int, n_features: int = 32, max_features: int = 512):
- """
+ r"""
* `log_resolution` is the $\log_2$ of image resolution
* `d_latent` is the dimensionality of $w$
* `n_features` number of features in the convolution layer at the highest resolution (final block)
@@ -276,7 +276,7 @@ def forward(self, w: torch.Tensor, input_noise: List[Tuple[Optional[torch.Tensor
class GeneratorBlock(nn.Module):
- """
+ r"""
### Generator Block
@@ -379,7 +379,7 @@ def forward(self, x: torch.Tensor, w: torch.Tensor, noise: Optional[torch.Tensor
class ToRGB(nn.Module):
- """
+ r"""
### To RGB
@@ -430,7 +430,7 @@ class Conv2dWeightModulate(nn.Module):
def __init__(self, in_features: int, out_features: int, kernel_size: int,
demodulate: float = True, eps: float = 1e-8):
- """
+ r"""
* `in_features` is the number of features in the input feature map
* `out_features` is the number of features in the output feature map
* `kernel_size` is the size of the convolution kernel
@@ -492,7 +492,7 @@ def forward(self, x: torch.Tensor, s: torch.Tensor):
class Discriminator(nn.Module):
- """
+ r"""
## StyleGAN 2 Discriminator
@@ -506,7 +506,7 @@ class Discriminator(nn.Module):
"""
def __init__(self, log_resolution: int, n_features: int = 64, max_features: int = 512):
- """
+ r"""
* `log_resolution` is the $\log_2$ of image resolution
* `n_features` number of features in the convolution layer at the highest resolution (first block)
* `max_features` maximum number of features in any generator block
@@ -561,7 +561,7 @@ def forward(self, x: torch.Tensor):
class DiscriminatorBlock(nn.Module):
- """
+ r"""
### Discriminator Block
@@ -653,7 +653,7 @@ def forward(self, x: torch.Tensor):
class DownSample(nn.Module):
- """
+ r"""
### Down-sample
@@ -677,7 +677,7 @@ def forward(self, x: torch.Tensor):
class UpSample(nn.Module):
- """
+ r"""
### Up-sample
@@ -797,7 +797,7 @@ def forward(self, x: torch.Tensor):
class EqualizedWeight(nn.Module):
- """
+ r"""
## Learning-rate Equalized Weights Parameter
@@ -835,7 +835,7 @@ def forward(self):
class GradientPenalty(nn.Module):
- """
+ r"""
## Gradient Penalty
@@ -851,7 +851,7 @@ class GradientPenalty(nn.Module):
"""
def forward(self, x: torch.Tensor, d: torch.Tensor):
- """
+ r"""
* `x` is $x \sim \mathcal{D}$
* `d` is $D(x)$
"""
@@ -877,7 +877,7 @@ def forward(self, x: torch.Tensor, d: torch.Tensor):
class PathLengthPenalty(nn.Module):
- """
+ r"""
## Path Length Penalty
@@ -901,7 +901,7 @@ class PathLengthPenalty(nn.Module):
"""
def __init__(self, beta: float):
- """
+ r"""
* `beta` is the constant $\beta$ used to calculate the exponential moving average $a$
"""
super().__init__()
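On `EqualizedWeight` above: StyleGAN 2 keeps the stored parameter at $\mathcal{N}(0, 1)$ and multiplies it by the He constant $c = 1 / \sqrt{\text{fan in}}$ on every forward pass, so all layers see gradients at the same scale. A sketch with illustrative shapes:

```python
import math

import torch

out_c, in_c, k = 64, 32, 3
weight = torch.randn(out_c, in_c, k, k)   # stored at N(0, 1)
c = 1 / math.sqrt(in_c * k * k)           # He initialization constant
w_effective = weight * c                  # applied at every forward pass
```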
diff --git a/labml_nn/gan/stylegan/experiment.py b/labml_nn/gan/stylegan/experiment.py
index 7a33aba9b..621e8f859 100644
--- a/labml_nn/gan/stylegan/experiment.py
+++ b/labml_nn/gan/stylegan/experiment.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: StyleGAN 2 Model Training
summary: >
diff --git a/labml_nn/gan/wasserstein/__init__.py b/labml_nn/gan/wasserstein/__init__.py
index d28c6283f..8f115b439 100644
--- a/labml_nn/gan/wasserstein/__init__.py
+++ b/labml_nn/gan/wasserstein/__init__.py
@@ -108,7 +108,7 @@ def forward(self, f_real: torch.Tensor, f_fake: torch.Tensor):
-        This returns the a tuple with losses for $f_w(x)$ and $f_w(g_\theta(z))$,
+        This returns a tuple with losses for $f_w(x)$ and $f_w(g_\theta(z))$,
which are later added.
They are kept separate for logging.
         """
# We use ReLUs to clip the loss to keep $f \in [-1, +1]$ range.
return F.relu(1 - f_real).mean(), F.relu(1 + f_fake).mean()
diff --git a/labml_nn/graphs/gat/__init__.py b/labml_nn/graphs/gat/__init__.py
index 81ae9eaca..b8eb71ce2 100644
--- a/labml_nn/graphs/gat/__init__.py
+++ b/labml_nn/graphs/gat/__init__.py
@@ -30,7 +30,7 @@
class GraphAttentionLayer(nn.Module):
- """
+ r"""
## Graph attention layer
This is a single graph attention layer.
@@ -82,7 +82,7 @@ def __init__(self, in_features: int, out_features: int, n_heads: int,
self.dropout = nn.Dropout(dropout)
def forward(self, h: torch.Tensor, adj_mat: torch.Tensor):
- """
+ r"""
* `h`, $\mathbf{h}$ is the input node embeddings of shape `[n_nodes, in_features]`.
* `adj_mat` is the adjacency matrix of shape `[n_nodes, n_nodes, n_heads]`.
We use shape `[n_nodes, n_nodes, 1]` since the adjacency is the same for each head.
diff --git a/labml_nn/graphs/gatv2/__init__.py b/labml_nn/graphs/gatv2/__init__.py
index f306bb0be..60efa9656 100644
--- a/labml_nn/graphs/gatv2/__init__.py
+++ b/labml_nn/graphs/gatv2/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Graph Attention Networks v2 (GATv2)
summary: >
@@ -60,7 +60,7 @@
class GraphAttentionV2Layer(nn.Module):
- """
+ r"""
## Graph attention v2 layer
This is a single graph attention v2 layer.
A GATv2 is made up of multiple such layers.
@@ -119,7 +119,7 @@ def __init__(self, in_features: int, out_features: int, n_heads: int,
self.dropout = nn.Dropout(dropout)
def forward(self, h: torch.Tensor, adj_mat: torch.Tensor):
- """
+ r"""
* `h`, $\mathbf{h}$ is the input node embeddings of shape `[n_nodes, in_features]`.
* `adj_mat` is the adjacency matrix of shape `[n_nodes, n_nodes, n_heads]`.
We use shape `[n_nodes, n_nodes, 1]` since the adjacency is the same for each head.
diff --git a/labml_nn/hypernetworks/hyper_lstm.py b/labml_nn/hypernetworks/hyper_lstm.py
index 917baf4d1..ccd84ed42 100644
--- a/labml_nn/hypernetworks/hyper_lstm.py
+++ b/labml_nn/hypernetworks/hyper_lstm.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: HyperNetworks - HyperLSTM
summary: A PyTorch implementation/tutorial of HyperLSTM introduced in paper HyperNetworks.
@@ -223,7 +223,7 @@ def __init__(self, input_size: int, hidden_size: int, hyper_size: int, n_z: int,
def forward(self, x: torch.Tensor,
state: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = None):
- """
+ r"""
* `x` has shape `[n_steps, batch_size, input_size]` and
* `state` is a tuple of $h, c, \hat{h}, \hat{c}$.
$h, c$ have shape `[batch_size, hidden_size]` and
diff --git a/labml_nn/lora/__init__.py b/labml_nn/lora/__init__.py
index bd3c42c7b..355707337 100644
--- a/labml_nn/lora/__init__.py
+++ b/labml_nn/lora/__init__.py
@@ -26,7 +26,7 @@
class Linear(nn.Module):
- """
+ r"""
## LoRA Linear Layer
LoRA linear layer adds a low-rank decomposition to the pre-trained
@@ -48,7 +48,7 @@ class Linear(nn.Module):
def __init__(self, in_features: int, out_features: int, bias: bool,
r: int, alpha: int = None):
- """
+ r"""
:param in_features: is the number of input features of the linear layer
:param out_features: is the number of output features of the linear layer
:param bias: is a flag indicating if there is a bias parameter
@@ -99,7 +99,7 @@ def forward(self, x: torch.Tensor):
class Embedding(nn.Module):
- """
+ r"""
## LoRA Embedding Layer
Similar to LoRA linear layer this adds a low-rank decomposition to the pre-trained
@@ -110,7 +110,7 @@ class Embedding(nn.Module):
def __init__(self, num_embeddings: int, embedding_dim: int,
r: int, alpha: int = None):
- """
+ r"""
:param num_embeddings: is the number of embeddings
-    :param embedding_dim: is the number embedding dimensions
+    :param embedding_dim: is the number of embedding dimensions
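For readers of the LoRA hunks: the update both layers add is the paper's $W_0 x + \frac{\alpha}{r} B A x$, with $B$ zero-initialized so training starts from the pre-trained behaviour. A minimal sketch with illustrative shapes (not the module's internals):

```python
import torch

d_in, d_out, r, alpha = 64, 64, 4, 4
w0 = torch.randn(d_out, d_in)       # frozen pre-trained weight
a = torch.randn(r, d_in)            # trainable low-rank factor A
b = torch.zeros(d_out, r)           # trainable factor B, zero-initialized

x = torch.randn(8, d_in)
h = x @ w0.T + (alpha / r) * (x @ a.T @ b.T)  # == x @ w0.T at initialization
print(h.shape)  # torch.Size([8, 64])
```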
diff --git a/labml_nn/lstm/__init__.py b/labml_nn/lstm/__init__.py
index 29edba75c..6d76b6378 100644
--- a/labml_nn/lstm/__init__.py
+++ b/labml_nn/lstm/__init__.py
@@ -17,7 +17,7 @@
class LSTMCell(nn.Module):
- """
+ r"""
## Long Short-Term Memory Cell
LSTM Cell computes $c$, and $h$. $c$ is like the long-term memory,
diff --git a/labml_nn/neox/model.py b/labml_nn/neox/model.py
index 295afd05d..1e813a89c 100644
--- a/labml_nn/neox/model.py
+++ b/labml_nn/neox/model.py
@@ -73,7 +73,7 @@ class RoPE(nn.Module):
"""
def __init__(self, d_rope: int, base: float = 10_000.):
- """
+ r"""
:param d_rope: is the number of features for RoPE embeddings
:param base: is the base for $\theta_i = 10000^{\frac{2(i-1)}{d}}$, which defaults to $10000$
"""
@@ -92,7 +92,7 @@ def __init__(self, d_rope: int, base: float = 10_000.):
@staticmethod
def rotate_half(x: torch.Tensor):
- """
+ r"""
### Rotate the features
-        $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., -x^{(\frac{d}{2})}]$
+        $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., x^{(\frac{d}{2})}]$
@@ -101,7 +101,7 @@ def rotate_half(x: torch.Tensor):
return torch.cat((-x2, x1), dim=-1)
def forward(self, x: torch.Tensor, offset: int = 0):
- """
+ r"""
:param x: has shape `[..., seq, n_heads, d_k]`
:param offset: is the starting position of `x`. This is $\gt 0$ when we have
cached the keys and queries of previous positions
@@ -513,7 +513,7 @@ def __init__(self, *, n_vocab: int = 50_432, n_hidden: int = 6_144,
llm_int8_threshold: float = 6.0,
is_flash_attention: bool = False
):
- """
+ r"""
### Generator to create layers
The layers are generated in the same order as checkpoints.
@@ -571,7 +571,7 @@ def post_load_prepare(self, layer: NeoXModule, *,
device: torch.device = None,
llm_int8_threshold: float = None,
):
- """
+ r"""
### Layer transformations after loading the checkpoint
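A self-contained reconstruction of `rotate_half` above, since the hunk only shows its return statement; the split into halves is assumed from the docstring:

```python
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x.chunk(2, dim=-1)          # x^(1..d/2) and x^(d/2+1..d)
    return torch.cat((-x2, x1), dim=-1)

print(rotate_half(torch.tensor([1., 2., 3., 4.])))  # tensor([-3., -4., 1., 2.])
```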
diff --git a/labml_nn/neox/utils/llm_int8.py b/labml_nn/neox/utils/llm_int8.py
index cd8420855..47c349233 100644
--- a/labml_nn/neox/utils/llm_int8.py
+++ b/labml_nn/neox/utils/llm_int8.py
@@ -41,7 +41,7 @@
def make_llm_int8_linear(linear_module: nn.Linear, device: torch.device, threshold: float = 6.0):
- """
+ r"""
## Transform a `nn.Linear` layer to LLM.int8() linear layer
:param linear_module: is the `nn.Linear` layer to transform
diff --git a/labml_nn/normalization/batch_channel_norm/__init__.py b/labml_nn/normalization/batch_channel_norm/__init__.py
index 846361087..4a16dfe2f 100644
--- a/labml_nn/normalization/batch_channel_norm/__init__.py
+++ b/labml_nn/normalization/batch_channel_norm/__init__.py
@@ -40,7 +40,7 @@ class BatchChannelNorm(nn.Module):
def __init__(self, channels: int, groups: int,
eps: float = 1e-5, momentum: float = 0.1, estimate: bool = True):
- """
+ r"""
* `channels` is the number of features in the input
* `groups` is the number of groups the features are divided into
* `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability
@@ -66,7 +66,7 @@ def forward(self, x):
class EstimatedBatchNorm(nn.Module):
- """
+ r"""
## Estimated Batch Normalization
When input $X \in \mathbb{R}^{B \times C \times H \times W}$ is a batch of image representations,
@@ -88,7 +88,7 @@ class EstimatedBatchNorm(nn.Module):
"""
def __init__(self, channels: int,
eps: float = 1e-5, momentum: float = 0.1, affine: bool = True):
- """
+ r"""
* `channels` is the number of features in the input
* `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability
* `momentum` is the momentum in taking the exponential moving average
@@ -174,7 +174,7 @@ class ChannelNorm(nn.Module):
def __init__(self, channels, groups,
eps: float = 1e-5, affine: bool = True):
- """
+ r"""
* `groups` is the number of groups the features are divided into
* `channels` is the number of features in the input
* `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability
diff --git a/labml_nn/normalization/batch_norm/__init__.py b/labml_nn/normalization/batch_norm/__init__.py
index 1471b807f..8001f1bb7 100644
--- a/labml_nn/normalization/batch_norm/__init__.py
+++ b/labml_nn/normalization/batch_norm/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Batch Normalization
summary: >
@@ -138,7 +138,7 @@ def __init__(self, channels: int, *,
* `track_running_stats` is whether to calculate the moving averages or mean and variance
We've tried to use the same names for arguments as PyTorch `BatchNorm` implementation.
     """
super().__init__()
self.channels = channels
@@ -163,7 +163,7 @@ def forward(self, x: torch.Tensor):
`*` denotes any number of (possibly 0) dimensions.
For example, in an image (2D) convolution this will be
`[batch_size, channels, height, width]`
         """
# Keep the original shape
x_shape = x.shape
# Get the batch size
diff --git a/labml_nn/normalization/deep_norm/__init__.py b/labml_nn/normalization/deep_norm/__init__.py
index fcec5bd82..414a84398 100644
--- a/labml_nn/normalization/deep_norm/__init__.py
+++ b/labml_nn/normalization/deep_norm/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: DeepNorm
summary: >
@@ -82,7 +82,7 @@
class DeepNorm(nn.Module):
- """
+ r"""
## DeepNorm Normalization
$$x_{l + 1} = \mathop{LN}\Big( \alpha x_l + \mathop{G}_l \big(x_l, \theta_l \big)\Big)$$
@@ -91,7 +91,7 @@ class DeepNorm(nn.Module):
def __init__(self, alpha: float, normalized_shape: Union[int, List[int], Size], *,
eps: float = 1e-5,
elementwise_affine: bool = True):
- """
+ r"""
:param alpha: is $\alpha$
:param normalized_shape: is the shape for LayerNorm $\mathop{LN}$
:param eps: is $\epsilon$ for LayerNorm
@@ -104,7 +104,7 @@ def __init__(self, alpha: float, normalized_shape: Union[int, List[int], Size],
self.layer_norm = LayerNorm(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
def forward(self, x: torch.Tensor, gx: torch.Tensor):
- """
+ r"""
:param x: is the output from the previous layer $x_l$
:param gx: is the output of the current sub-layer $\mathop{G}_l (x_l, \theta_l)$
"""
@@ -126,7 +126,7 @@ def __init__(self, *,
deep_norm_alpha: float,
deep_norm_beta: float,
):
- """
+ r"""
:param d_model: is the token embedding size
:param self_attn: is the self attention module
:param feed_forward: is the feed forward module
diff --git a/labml_nn/normalization/deep_norm/experiment.py b/labml_nn/normalization/deep_norm/experiment.py
index 9fcdadc64..1819453e1 100644
--- a/labml_nn/normalization/deep_norm/experiment.py
+++ b/labml_nn/normalization/deep_norm/experiment.py
@@ -89,7 +89,7 @@ class Configs(NLPAutoRegressionConfigs):
@option(Configs.deep_norm_alpha)
def _deep_norm_alpha(c: Configs):
- """
+ r"""
#### Calculate $\alpha$
$\alpha = (2M)^{\frac{1}{4}}$
@@ -99,7 +99,7 @@ def _deep_norm_alpha(c: Configs):
@option(Configs.deep_norm_beta)
def _deep_norm_beta(c: Configs):
- """
+ r"""
#### Calculate $\beta$
$\beta = (8M)^{-\frac{1}{4}}$
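The two config options above are pure functions of the depth. A sketch, assuming $M$ is the number of layers as in the DeepNet paper:

```python
def deep_norm_alpha(m: int) -> float:
    return (2 * m) ** 0.25        # alpha = (2M)^(1/4)

def deep_norm_beta(m: int) -> float:
    return (8 * m) ** -0.25       # beta = (8M)^(-1/4)

print(deep_norm_alpha(64), deep_norm_beta(64))
```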
diff --git a/labml_nn/normalization/group_norm/__init__.py b/labml_nn/normalization/group_norm/__init__.py
index e9a87ac92..3c20840ff 100644
--- a/labml_nn/normalization/group_norm/__init__.py
+++ b/labml_nn/normalization/group_norm/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Group Normalization
summary: >
@@ -98,7 +98,7 @@ def __init__(self, groups: int, channels: int, *,
* `channels` is the number of features in the input
* `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability
* `affine` is whether to scale and shift the normalized value
     """
super().__init__()
assert channels % groups == 0, "Number of channels should be evenly divisible by the number of groups"
@@ -118,7 +118,7 @@ def forward(self, x: torch.Tensor):
`*` denotes any number of (possibly 0) dimensions.
For example, in an image (2D) convolution this will be
`[batch_size, channels, height, width]`
         """
# Keep the original shape
x_shape = x.shape
# Get the batch size
diff --git a/labml_nn/normalization/instance_norm/__init__.py b/labml_nn/normalization/instance_norm/__init__.py
index c7db3adc9..937d69c80 100644
--- a/labml_nn/normalization/instance_norm/__init__.py
+++ b/labml_nn/normalization/instance_norm/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Instance Normalization
summary: >
@@ -53,7 +53,7 @@ def __init__(self, channels: int, *,
* `channels` is the number of features in the input
* `eps` is $\epsilon$, used in $\sqrt{Var[X] + \epsilon}$ for numerical stability
* `affine` is whether to scale and shift the normalized value
         """
super().__init__()
self.channels = channels
@@ -71,7 +71,7 @@ def forward(self, x: torch.Tensor):
`*` denotes any number of (possibly 0) dimensions.
For example, in an image (2D) convolution this will be
`[batch_size, channels, height, width]`
         """
# Keep the original shape
x_shape = x.shape
# Get the batch size
diff --git a/labml_nn/normalization/layer_norm/__init__.py b/labml_nn/normalization/layer_norm/__init__.py
index 0d5ca8116..a0a004f57 100644
--- a/labml_nn/normalization/layer_norm/__init__.py
+++ b/labml_nn/normalization/layer_norm/__init__.py
@@ -79,7 +79,7 @@ def __init__(self, normalized_shape: Union[int, List[int], Size], *,
* `elementwise_affine` is whether to scale and shift the normalized value
We've tried to use the same names for arguments as PyTorch `LayerNorm` implementation.
     """
super().__init__()
# Convert `normalized_shape` to `torch.Size`
@@ -104,7 +104,7 @@ def forward(self, x: torch.Tensor):
`*` could be any number of dimensions.
For example, in an NLP task this will be
`[seq_len, batch_size, features]`
     """
# Sanity check to make sure the shapes match
assert self.normalized_shape == x.shape[-len(self.normalized_shape):]
diff --git a/labml_nn/normalization/weight_standardization/__init__.py b/labml_nn/normalization/weight_standardization/__init__.py
index 2fb3009b5..8165f9a9a 100644
--- a/labml_nn/normalization/weight_standardization/__init__.py
+++ b/labml_nn/normalization/weight_standardization/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Weight Standardization
summary: >
diff --git a/labml_nn/optimizers/__init__.py b/labml_nn/optimizers/__init__.py
index 172854a83..6cd5361f4 100644
--- a/labml_nn/optimizers/__init__.py
+++ b/labml_nn/optimizers/__init__.py
@@ -73,7 +73,7 @@ class GenericAdaptiveOptimizer(Optimizer):
"""
def __init__(self, params, defaults: Dict[str, Any], lr: float, betas: Tuple[float, float], eps: float):
- """
+ r"""
### Initialize
* `params` is the collection of parameters or set of parameter groups.
@@ -109,7 +109,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
pass
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.Tensor):
- """
+ r"""
### Take optimizer step on a parameter tensor
This should be overridden and take the optimization step on `param` tensor $\theta$,
diff --git a/labml_nn/optimizers/ada_belief.py b/labml_nn/optimizers/ada_belief.py
index d33b1b4a8..b3d33fab7 100644
--- a/labml_nn/optimizers/ada_belief.py
+++ b/labml_nn/optimizers/ada_belief.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: AdaBelief optimizer
summary: A simple PyTorch implementation/tutorial of AdaBelief optimizer.
@@ -53,7 +53,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
weight_decay: WeightDecay = WeightDecay(), amsgrad=False,
degenerate_to_sgd=True,
rectify=True, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -75,7 +75,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
self.rectify = rectify
def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
- """
+ r"""
### Initialize a parameter state
* `state` is the optimizer state of the parameter (tensor)
@@ -95,7 +95,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
state['max_exp_avg_var'] = torch.zeros_like(param, memory_format=torch.preserve_format)
def get_ms(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tensor):
- """
+ r"""
### Calculate $m_t$ and $s_t$ or $\max(s_1, s_2, ..., s_{t-1}, s_t)$
* `state` is the optimizer state of the parameter (tensor)
@@ -131,7 +131,7 @@ def get_ms(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tenso
return m, s
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter):
- """
+ r"""
### Take an update step for a given parameter tensor
* `state` is the optimizer state of the parameter (tensor)
diff --git a/labml_nn/optimizers/adam.py b/labml_nn/optimizers/adam.py
index 568be4d55..24b115dc8 100644
--- a/labml_nn/optimizers/adam.py
+++ b/labml_nn/optimizers/adam.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Adam Optimizer
summary: A simple PyTorch implementation/tutorial of Adam optimizer
@@ -60,7 +60,7 @@ def __init__(self, params,
weight_decay: WeightDecay = WeightDecay(),
optimized_update: bool = True,
defaults: Optional[Dict[str, Any]] = None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -81,7 +81,7 @@ def __init__(self, params,
self.optimized_update = optimized_update
def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
- """
+ r"""
### Initialize a parameter state
* `state` is the optimizer state of the parameter (tensor)
@@ -97,7 +97,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
state['exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
def get_mv(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tensor):
- """
+ r"""
-        ### Calculate $m_t$ and and $v_t$
+        ### Calculate $m_t$ and $v_t$
* `state` is the optimizer state of the parameter (tensor)
@@ -121,7 +121,7 @@ def get_mv(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tenso
return m, v
def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
- """
+ r"""
### Get learning-rate
This returns the modified learning rate based on the state.
@@ -132,7 +132,7 @@ def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
def adam_update(self, state: Dict[str, any], group: Dict[str, any], param: torch.nn.Parameter,
m: torch.Tensor, v: torch.Tensor):
- """
+ r"""
### Do the *Adam* parameter update
* `state` is the optimizer state of the parameter (tensor)
@@ -192,7 +192,7 @@ def adam_update(self, state: Dict[str, any], group: Dict[str, any], param: torch
param.data.addcdiv_(m, denominator, value=-step_size)
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter):
- """
+ r"""
### Take an update step for a given parameter tensor
* `state` is the optimizer state of the parameter (tensor)
diff --git a/labml_nn/optimizers/adam_fp16.py b/labml_nn/optimizers/adam_fp16.py
index 1b36135f8..36e55435c 100644
--- a/labml_nn/optimizers/adam_fp16.py
+++ b/labml_nn/optimizers/adam_fp16.py
@@ -35,7 +35,7 @@ def __init__(self, params, lr: float = 1e-3, betas: Tuple[float, float] = (0.9,
super().__init__(params, lr, betas, eps, weight_decay, optimized_update, defaults)
def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
- """
+ r"""
### Initialize a parameter state
* `state` is the optimizer state of the parameter (tensor)
@@ -55,7 +55,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
state['fp32_copy'] = param.to(torch.float)
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter):
- """
+ r"""
### Take an update step for a given parameter tensor
* `state` is the optimizer state of the parameter (tensor)
diff --git a/labml_nn/optimizers/adam_warmup.py b/labml_nn/optimizers/adam_warmup.py
index fb73d1529..92cd9ff9a 100644
--- a/labml_nn/optimizers/adam_warmup.py
+++ b/labml_nn/optimizers/adam_warmup.py
@@ -25,7 +25,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
weight_decay: WeightDecay = WeightDecay(),
optimized_update: bool = True,
amsgrad=False, warmup=0, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -46,7 +46,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults)
def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
- """
+ r"""
### Get learning-rate
$$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$
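`get_lr` above implements $\alpha \min \big(1, \frac{t}{w}\big)$. A sketch with illustrative names, where `t` is the optimizer step and `w` the number of warmup steps:

```python
def warmup_lr(alpha: float, t: int, w: int) -> float:
    # linear warmup to the base learning rate over w steps
    return alpha * min(1.0, t / w)

print([warmup_lr(1e-3, t, 100) for t in (1, 50, 100, 200)])
```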
diff --git a/labml_nn/optimizers/adam_warmup_cosine_decay.py b/labml_nn/optimizers/adam_warmup_cosine_decay.py
index 037f1b4cf..6358f4bcc 100644
--- a/labml_nn/optimizers/adam_warmup_cosine_decay.py
+++ b/labml_nn/optimizers/adam_warmup_cosine_decay.py
@@ -28,7 +28,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
weight_decay: WeightDecay = WeightDecay(),
optimized_update: bool = True,
amsgrad=False, warmup=0, total_steps=1e10, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -51,7 +51,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults)
def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
- """
+ r"""
### Get learning-rate
$$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$
diff --git a/labml_nn/optimizers/amsgrad.py b/labml_nn/optimizers/amsgrad.py
index 07658e09b..5d9971f6b 100644
--- a/labml_nn/optimizers/amsgrad.py
+++ b/labml_nn/optimizers/amsgrad.py
@@ -36,7 +36,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
weight_decay: WeightDecay = WeightDecay(),
optimized_update: bool = True,
amsgrad=True, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -56,7 +56,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
super().__init__(params, lr, betas, eps, weight_decay, optimized_update, defaults)
def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
- """
+ r"""
### Initialize a parameter state
* `state` is the optimizer state of the parameter (tensor)
@@ -73,7 +73,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
state['max_exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
def get_mv(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor):
- """
+ r"""
-        ### Calculate $m_t$ and and $v_t$ or $\max(v_1, v_2, ..., v_{t-1}, v_t)$
+        ### Calculate $m_t$ and $v_t$ or $\max(v_1, v_2, ..., v_{t-1}, v_t)$
* `state` is the optimizer state of the parameter (tensor)
@@ -109,7 +109,7 @@ def get_mv(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tenso
def _synthetic_experiment(is_adam: bool):
- """
+ r"""
## Synthetic Experiment
This is the synthetic experiment described in the paper,
diff --git a/labml_nn/optimizers/noam.py b/labml_nn/optimizers/noam.py
index 8443f881c..26450311f 100644
--- a/labml_nn/optimizers/noam.py
+++ b/labml_nn/optimizers/noam.py
@@ -29,7 +29,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
optimized_update: bool = True,
amsgrad=False,
warmup=0, d_model=512, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -52,7 +52,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
self.d_model = d_model
def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
- """
+ r"""
### Get learning-rate
$$\alpha \frac{1}{\sqrt{d_{model}}} \min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$
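For the Noam schedule above, the two branches of the `min` cross exactly at $t = w$, which is where the learning rate peaks. A sketch with illustrative names:

```python
def noam_lr(alpha: float, t: int, w: int, d_model: int = 512) -> float:
    return alpha * d_model ** -0.5 * min(t ** -0.5, t * w ** -1.5)

print(noam_lr(1.0, 4000, w=4000))  # the schedule peaks at t == w
```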
diff --git a/labml_nn/optimizers/radam.py b/labml_nn/optimizers/radam.py
index 3e384c4d4..bd718e370 100644
--- a/labml_nn/optimizers/radam.py
+++ b/labml_nn/optimizers/radam.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Rectified Adam (RAdam) optimizer
summary: A simple PyTorch implementation/tutorial of RAdam optimizer.
@@ -157,7 +157,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
optimized_update: bool = True,
amsgrad=False,
degenerated_to_sgd=True, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -176,7 +176,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults)
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter):
- """
+ r"""
### Take an update step for a given parameter tensor
* `state` is the optimizer state of the parameter (tensor)
@@ -221,7 +221,7 @@ def calc_rectification_term(beta2: float, step: int) -> Optional[float]:
def r_adam_update(self, state: Dict[str, any], group: Dict[str, any], param: torch.nn.Parameter,
m: torch.Tensor, v: torch.Tensor):
- """
+ r"""
### Do the *RAdam* parameter update
* `state` is the optimizer state of the parameter (tensor)
@@ -274,7 +274,7 @@ def r_adam_update(self, state: Dict[str, any], group: Dict[str, any], param: tor
def _test_rectification_term():
- """
+ r"""
### Plot $r_t$ against $t$ for various $\beta_2$
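The rectification term being plotted can be sketched directly from the RAdam paper; the $\rho_t \ge 5$ guard (below which the variance is treated as intractable) follows common implementations:

```python
def rectification_term(beta2: float, step: int):
    # Maximum length of the approximated simple moving average
    rho_inf = 2.0 / (1.0 - beta2) - 1.0
    # Length of the approximated SMA at this step
    beta2_t = beta2 ** step
    rho = rho_inf - 2.0 * step * beta2_t / (1.0 - beta2_t)
    # The rectification term is only applied when the variance is tractable
    if rho < 5.0:
        return None
    r2 = ((rho - 4) * (rho - 2) * rho_inf) / ((rho_inf - 4) * (rho_inf - 2) * rho)
    return r2 ** 0.5
```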

diff --git a/labml_nn/optimizers/sophia.py b/labml_nn/optimizers/sophia.py
index 2aa58f426..12d43c2a1 100644
--- a/labml_nn/optimizers/sophia.py
+++ b/labml_nn/optimizers/sophia.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Sophia Optimizer
summary: A simple PyTorch implementation/tutorial of Sophia optimizer
@@ -72,7 +72,7 @@ def __init__(self, params,
rho: float = 0.03,
weight_decay: WeightDecay = WeightDecay(),
defaults: Optional[Dict[str, Any]] = None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -92,7 +92,7 @@ def __init__(self, params,
self.weight_decay = weight_decay
def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
- """
+ r"""
### Initialize a parameter state
* `state` is the optimizer state of the parameter (tensor)
@@ -108,7 +108,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
state['hessian'] = torch.zeros_like(param, memory_format=torch.preserve_format)
def update_hessian(self, n_tokens_training_batch):
- """
+ r"""
### Update the EMA of Hessian diagonal $h_t$
* `n_tokens_training_batch` is the number of tokens/inputs in the batch $B$
@@ -145,7 +145,7 @@ def update_hessian(self, n_tokens_training_batch):
state['hessian'].mul_(beta2).addcmul_(p.grad, p.grad, value=(1 - beta2) * n_tokens_training_batch)
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter):
- """
+ r"""
### Take an update step for a given parameter tensor
* `state` is the optimizer state of the parameter (tensor)
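Putting the pieces of this file together, the per-parameter step is a pre-conditioned momentum update with element-wise clipping; a sketch under the assumption that the denominator uses $\rho \cdot B \cdot h_t$, matching the scaling in `update_hessian` above:

```python
import torch

def sophia_step(param: torch.Tensor, m: torch.Tensor, hessian: torch.Tensor,
                lr: float, rho: float, batch_size: int, eps: float = 1e-12):
    # theta <- theta - lr * clip(m / (rho * B * h + eps), 1)
    ratio = (m / (rho * batch_size * hessian + eps)).clamp(-1.0, 1.0)
    param.data.add_(ratio, alpha=-lr)
```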
diff --git a/labml_nn/recurrent_highway_networks/__init__.py b/labml_nn/recurrent_highway_networks/__init__.py
index f1e8b3b76..0f9590c02 100644
--- a/labml_nn/recurrent_highway_networks/__init__.py
+++ b/labml_nn/recurrent_highway_networks/__init__.py
@@ -16,7 +16,7 @@
class RHNCell(nn.Module):
- """
+ r"""
## Recurrent Highway Network Cell
This implements equations $(6) - (9)$.
diff --git a/labml_nn/resnet/__init__.py b/labml_nn/resnet/__init__.py
index bd085c470..960ecf2a3 100644
--- a/labml_nn/resnet/__init__.py
+++ b/labml_nn/resnet/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Deep Residual Learning for Image Recognition (ResNet)
summary: >
@@ -67,7 +67,7 @@ class ShortcutProjection(nn.Module):
"""
def __init__(self, in_channels: int, out_channels: int, stride: int):
- """
+ r"""
* `in_channels` is the number of channels in $x$
* `out_channels` is the number of channels in $\mathcal{F}(x, \{W_i\})$
* `stride` is the stride length in the convolution operation for $F$.
@@ -86,7 +86,7 @@ def forward(self, x: torch.Tensor):
class ResidualBlock(nn.Module):
- """
+ r"""
## Residual Block
@@ -153,7 +153,7 @@ def forward(self, x: torch.Tensor):
class BottleneckResidualBlock(nn.Module):
- """
+ r"""
## Bottleneck Residual Block
@@ -181,7 +181,7 @@ class BottleneckResidualBlock(nn.Module):
"""
def __init__(self, in_channels: int, bottleneck_channels: int, out_channels: int, stride: int):
- """
+ r"""
* `in_channels` is the number of channels in $x$
* `bottleneck_channels` is the number of channels for the $3 \times 3$ convolution
* `out_channels` is the number of output channels
diff --git a/labml_nn/rl/dqn/__init__.py b/labml_nn/rl/dqn/__init__.py
index 048bda113..8320dc214 100644
--- a/labml_nn/rl/dqn/__init__.py
+++ b/labml_nn/rl/dqn/__init__.py
@@ -31,7 +31,7 @@
class QFuncLoss(nn.Module):
- """
+ r"""
## Train the model
We want to find the optimal action-value function.
@@ -106,7 +106,7 @@ def __init__(self, gamma: float):
def forward(self, q: torch.Tensor, action: torch.Tensor, double_q: torch.Tensor,
target_q: torch.Tensor, done: torch.Tensor, reward: torch.Tensor,
weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
- """
+ r"""
* `q` - $Q(s;\theta_i)$
* `action` - $a$
* `double_q` - $\textcolor{cyan}Q(s';\textcolor{cyan}{\theta_i})$
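The tensors named in this hunk combine into the standard double Q-learning target; a sketch with assumed shapes `[batch_size, n_actions]` for the Q tensors:

```python
import torch

def double_q_target(reward, done, double_q, target_q, gamma: float):
    # Choose a* = argmax_a Q(s'; theta_i) with the online network ...
    best_action = torch.argmax(double_q, dim=-1)
    # ... but evaluate it with the target network
    q_value = target_q.gather(-1, best_action.unsqueeze(-1)).squeeze(-1)
    # r + gamma * Q(s', a*; target), zeroed where the episode ended
    return reward + gamma * q_value * (1 - done.float())
```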
diff --git a/labml_nn/rl/dqn/experiment.py b/labml_nn/rl/dqn/experiment.py
index 2a3af4381..19be0f414 100644
--- a/labml_nn/rl/dqn/experiment.py
+++ b/labml_nn/rl/dqn/experiment.py
@@ -107,7 +107,7 @@ def __init__(self, *,
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=2.5e-4)
def _sample_action(self, q_value: torch.Tensor, exploration_coefficient: float):
- """
+ r"""
#### $\epsilon$-greedy Sampling
When sampling actions we use an $\epsilon$-greedy strategy, where we
take a greedy action with probability $1 - \epsilon$ and
a random action with probability $\epsilon$.
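A minimal sketch of that sampling rule (assumed shape `[batch_size, n_actions]` for `q_value`):

```python
import torch

def sample_action(q_value: torch.Tensor, epsilon: float) -> torch.Tensor:
    # Greedy actions from the Q values
    greedy = torch.argmax(q_value, dim=-1)
    # Uniformly random actions
    random_actions = torch.randint(q_value.shape[-1], greedy.shape)
    # Take the random action with probability epsilon
    explore = torch.rand(greedy.shape) < epsilon
    return torch.where(explore, random_actions, greedy)
```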
diff --git a/labml_nn/rl/dqn/model.py b/labml_nn/rl/dqn/model.py
index 6dbe2e081..3b15f03c0 100644
--- a/labml_nn/rl/dqn/model.py
+++ b/labml_nn/rl/dqn/model.py
@@ -15,7 +15,7 @@
class Model(nn.Module):
- """
+ r"""
## Dueling Network ⚔️ Model for $Q$ Values
We are using a [dueling network](https://arxiv.org/abs/1511.06581)
diff --git a/labml_nn/rl/dqn/replay_buffer.py b/labml_nn/rl/dqn/replay_buffer.py
index 966bfcbb6..ad3882e7b 100644
--- a/labml_nn/rl/dqn/replay_buffer.py
+++ b/labml_nn/rl/dqn/replay_buffer.py
@@ -18,7 +18,7 @@
class ReplayBuffer:
- """
+ r"""
## Buffer for Prioritized Experience Replay
[Prioritized experience replay](https://arxiv.org/abs/1511.05952)
@@ -180,7 +180,7 @@ def _set_priority_sum(self, idx, priority):
self.priority_sum[idx] = self.priority_sum[2 * idx] + self.priority_sum[2 * idx + 1]
def _sum(self):
- """
+ r"""
#### $\sum_k p_k^\alpha$
"""
@@ -188,7 +188,7 @@ def _sum(self):
return self.priority_sum[1]
def _min(self):
- """
+ r"""
#### $\min_k p_k^\alpha$
"""
@@ -196,7 +196,7 @@ def _min(self):
return self.priority_min[1]
def find_prefix_sum_idx(self, prefix_sum):
- """
+ r"""
#### Find largest $i$ such that $\sum_{k=1}^{i} p_k^\alpha \le P$
"""
diff --git a/labml_nn/rl/ppo/__init__.py b/labml_nn/rl/ppo/__init__.py
index 2b878d1d4..ca5b31edb 100644
--- a/labml_nn/rl/ppo/__init__.py
+++ b/labml_nn/rl/ppo/__init__.py
@@ -31,7 +31,7 @@
class ClippedPPOLoss(nn.Module):
- """
+ r"""
## PPO Loss
Here's how the PPO update rule is derived.
@@ -179,7 +179,7 @@ def forward(self, log_pi: torch.Tensor, sampled_log_pi: torch.Tensor,
class ClippedValueFunctionLoss(nn.Module):
- """
+ r"""
## Clipped Value Function Loss
Similarly, we also clip the value function update.
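The clipped policy surrogate reduces to a few lines (the value loss clips analogously); a minimal sketch:

```python
import torch

def clipped_ppo_loss(log_pi, sampled_log_pi, advantage, clip: float):
    # r_t(theta) = pi(a|s) / pi_old(a|s), computed in log space
    ratio = torch.exp(log_pi - sampled_log_pi)
    # Pessimistic minimum of the clipped and unclipped objectives
    clipped = ratio.clamp(1.0 - clip, 1.0 + clip)
    return -torch.min(ratio * advantage, clipped * advantage).mean()
```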
diff --git a/labml_nn/rl/ppo/gae.py b/labml_nn/rl/ppo/gae.py
index 981b609ef..74864b611 100644
--- a/labml_nn/rl/ppo/gae.py
+++ b/labml_nn/rl/ppo/gae.py
@@ -23,7 +23,7 @@ def __init__(self, n_workers: int, worker_steps: int, gamma: float, lambda_: flo
self.n_workers = n_workers
def __call__(self, done: np.ndarray, rewards: np.ndarray, values: np.ndarray) -> np.ndarray:
- """
+ r"""
### Calculate advantages
\begin{align}
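The recursion this method implements can be sketched as follows, using a time-major layout for clarity (the module itself iterates over worker steps):

```python
import numpy as np

def gae(done, rewards, values, gamma: float, lambda_: float):
    # `values` carries one extra step for bootstrapping V(s_T)
    advantages = np.zeros_like(rewards)
    last_advantage = 0.0
    for t in reversed(range(rewards.shape[0])):
        mask = 1.0 - done[t]
        # TD error: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * values[t + 1] * mask - values[t]
        # A_t = delta_t + gamma * lambda * A_{t+1}
        last_advantage = delta + gamma * lambda_ * mask * last_advantage
        advantages[t] = last_advantage
    return advantages
```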
diff --git a/labml_nn/sampling/nucleus.py b/labml_nn/sampling/nucleus.py
index 6de9c719e..60daa2b9c 100644
--- a/labml_nn/sampling/nucleus.py
+++ b/labml_nn/sampling/nucleus.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Nucleus Sampling
summary: A PyTorch implementation of nucleus sampling from language models.
diff --git a/labml_nn/sampling/temperature.py b/labml_nn/sampling/temperature.py
index 4c924ee61..a8f60a5d3 100644
--- a/labml_nn/sampling/temperature.py
+++ b/labml_nn/sampling/temperature.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Sampling from Language Models with Temperature
summary: A PyTorch implementation of sampling from language models with temperature.
diff --git a/labml_nn/scaling/zero3/__init__.py b/labml_nn/scaling/zero3/__init__.py
index 9f5955350..1f3609d6f 100644
--- a/labml_nn/scaling/zero3/__init__.py
+++ b/labml_nn/scaling/zero3/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Zero-DP Memory Optimization
summary: >
diff --git a/labml_nn/sketch_rnn/__init__.py b/labml_nn/sketch_rnn/__init__.py
index fe250008d..9632dbd78 100644
--- a/labml_nn/sketch_rnn/__init__.py
+++ b/labml_nn/sketch_rnn/__init__.py
@@ -54,7 +54,7 @@ class StrokesDataset(Dataset):
"""
def __init__(self, dataset: np.array, max_seq_length: int, scale: Optional[float] = None):
- """
+ r"""
`dataset` is a list of numpy arrays of shape [seq_len, 3].
It is a sequence of strokes, and each stroke is represented by
3 integers.
@@ -126,7 +126,7 @@ def __getitem__(self, idx: int):
class BivariateGaussianMixture:
- """
+ r"""
## Bi-variate Gaussian mixture
The mixture is represented by $\Pi$ and
@@ -150,7 +150,7 @@ def n_distributions(self):
return self.pi_logits.shape[-1]
def set_temperature(self, temperature: float):
- """
+ r"""
Adjust by temperature $\tau$
"""
# $$\hat{\Pi_k} \leftarrow \frac{\hat{\Pi_k}}{\tau}$$
@@ -348,7 +348,7 @@ def forward(self, mask: torch.Tensor, target: torch.Tensor,
class KLDivLoss(nn.Module):
- """
+ r"""
## KL-Divergence loss
This calculates the KL divergence between a given normal distribution and $\mathcal{N}(0, 1)$
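With $\hat{\sigma} = \log \sigma^2$, that divergence has the familiar closed form; a minimal sketch:

```python
import torch

def kl_divergence(mu: torch.Tensor, sigma_hat: torch.Tensor) -> torch.Tensor:
    # KL(N(mu, sigma^2) || N(0, 1)) = -1/2 (1 + log sigma^2 - mu^2 - sigma^2)
    return -0.5 * torch.mean(1 + sigma_hat - mu ** 2 - torch.exp(sigma_hat))
```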
diff --git a/labml_nn/transformers/aft/__init__.py b/labml_nn/transformers/aft/__init__.py
index b3f526cbb..5aab884d6 100644
--- a/labml_nn/transformers/aft/__init__.py
+++ b/labml_nn/transformers/aft/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: An Attention Free Transformer
summary: >
@@ -64,7 +64,7 @@
class AFTLocal(nn.Module):
- """
+ r"""
### AFT Local Operation
$$Y_t = \sigma(Q_t) \odot
@@ -109,7 +109,7 @@ def __init__(self, d_model: int, seq_len: int, local_window_size: int, bias: boo
@staticmethod
def create_local_mask(seq_len, local_window_size):
- """
+ r"""
#### Create local mask
This creates a mask for the local attention window.
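A sketch of such a band mask, keeping positions with $|t - t'| \lt s$ where $s$ is the window size:

```python
import torch

def create_local_mask(seq_len: int, local_window_size: int) -> torch.Tensor:
    ones = torch.ones(seq_len, seq_len, dtype=torch.bool)
    # Keep a band of width `local_window_size` around the diagonal
    return torch.tril(ones, local_window_size - 1) & torch.triu(ones, -(local_window_size - 1))
```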
diff --git a/labml_nn/transformers/alibi/__init__.py b/labml_nn/transformers/alibi/__init__.py
index 8c1bdad27..154f93a52 100644
--- a/labml_nn/transformers/alibi/__init__.py
+++ b/labml_nn/transformers/alibi/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Attention with Linear Biases (ALiBi)
summary: >
@@ -41,7 +41,7 @@
def get_slopes(n_heads: int):
- """
+ r"""
## Get head-specific slope $m$ for each head
* `n_heads` is the number of heads in the attention layer $n$
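For $n$ a power of two, the slopes form the geometric sequence $2^{-\frac{8}{n}}, 2^{-\frac{16}{n}}, \dots$; a sketch of that case (the paper interleaves a second sequence when $n$ is not a power of two):

```python
import torch

def get_slopes_power_of_2(n_heads: int) -> torch.Tensor:
    # m_i = (2^(-8/n))^(i+1) for head i = 0 ... n-1
    m_0 = 2.0 ** (-8.0 / n_heads)
    return m_0 ** torch.arange(1, 1 + n_heads)
```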
diff --git a/labml_nn/transformers/compressive/__init__.py b/labml_nn/transformers/compressive/__init__.py
index 96339e0cc..e3e4212c3 100644
--- a/labml_nn/transformers/compressive/__init__.py
+++ b/labml_nn/transformers/compressive/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Compressive Transformer
summary: >
diff --git a/labml_nn/transformers/configs.py b/labml_nn/transformers/configs.py
index e80f3f097..aab32b427 100644
--- a/labml_nn/transformers/configs.py
+++ b/labml_nn/transformers/configs.py
@@ -50,7 +50,7 @@ class FeedForwardConfigs(BaseConfigs):
@option(FeedForwardConfigs.activation, 'ReLU')
def _ffn_activation_relu():
- """
+ r"""
### ReLU activation
$$\max(0, x)$$
@@ -60,7 +60,7 @@ def _ffn_activation_relu():
@option(FeedForwardConfigs.activation, 'GELU')
def _ffn_activation_gelu():
- """
+ r"""
### GELU activation
$$x \Phi(x)$$ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$
diff --git a/labml_nn/transformers/fast_weights/__init__.py b/labml_nn/transformers/fast_weights/__init__.py
index d4fbac2ea..2b0962fb8 100644
--- a/labml_nn/transformers/fast_weights/__init__.py
+++ b/labml_nn/transformers/fast_weights/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Linear Transformers Are Secretly Fast Weight Memory Systems
summary: >
@@ -101,7 +101,7 @@
class DPFP(nn.Module):
- """
+ r"""
## Deterministic Parameter Free Projection (DPFP)
This is the new projection function $\textcolor{lightgreen}{\phi}$ introduced in the paper.
@@ -135,7 +135,7 @@ class DPFP(nn.Module):
"""
def __init__(self, nu: int = 1, eps: float = 1e-6):
- """
+ r"""
* `nu` is the hyper-parameter $\nu$.
* `eps` is the small value used to make sure there is no division-by-zero when normalizing.
"""
@@ -151,7 +151,7 @@ def forward(self, k: torch.Tensor):
return k / (torch.sum(k, dim=-1, keepdim=True) + self.eps)
def dpfp(self, k: torch.Tensor):
- """
+ r"""
$$\textcolor{lightgreen}{\phi(k)}$$
"""
# $x = \text{ReLU}\Big(\big[k, -k\big]\Big)$
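The full projection continues from $x$ by multiplying it element-wise with rolled copies of itself; a sketch for general $\nu$ (the exact layout is an assumption):

```python
import torch

def dpfp(k: torch.Tensor, nu: int = 1) -> torch.Tensor:
    # x = ReLU([k, -k]): doubled feature dimension, non-negative entries
    x = torch.relu(torch.cat([k, -k], dim=-1))
    # Pair x with nu rolled copies of itself and multiply element-wise
    x_rolled = torch.cat([x.roll(shifts=j, dims=-1) for j in range(1, nu + 1)], dim=-1)
    x_repeat = torch.cat([x] * nu, dim=-1)
    return x_repeat * x_rolled
```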
@@ -173,7 +173,7 @@ def dpfp(self, k: torch.Tensor):
class FastWeightsAttention(nn.Module):
- """
+ r"""
## Fast Weights Attention
The paper introduces a new update rule for calculating $\textcolor{cyan}{W^{(i)}}$.
diff --git a/labml_nn/transformers/feed_forward.py b/labml_nn/transformers/feed_forward.py
index f9c8d768e..eb1ab07f7 100644
--- a/labml_nn/transformers/feed_forward.py
+++ b/labml_nn/transformers/feed_forward.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Position-wise Feed-Forward Network (FFN)
summary: Documented reusable implementation of the position-wise feed-forward network.
diff --git a/labml_nn/transformers/feedback/__init__.py b/labml_nn/transformers/feedback/__init__.py
index ee9e9b8bd..6661d12c0 100644
--- a/labml_nn/transformers/feedback/__init__.py
+++ b/labml_nn/transformers/feedback/__init__.py
@@ -136,7 +136,7 @@ def get_scores(self, query: torch.Tensor, key: torch.Tensor):
- """
+ r"""
positional encodings $P_q, P_j$.
We replace the term $\textcolor{lightgreen}{D}$ with $S_j$.
"""
# $U^K_j$
key_pos_emb = self.key_pos_embeddings[-key.shape[0]:]
@@ -160,7 +160,7 @@ def forward(self, *,
- """
+ r"""
* `query` has shape `[batch_size, d_model]`
* `key` and `value` have shape `[seq_len, batch_size, d_model]`
"""
# Prepare `query`, `key` and `value` for attention computation
# `key` and `value` will then have shape `[seq_len, batch_size, heads, d_k]`
diff --git a/labml_nn/transformers/flash/__init__.py b/labml_nn/transformers/flash/__init__.py
index fe4b2990d..84d14a8d1 100644
--- a/labml_nn/transformers/flash/__init__.py
+++ b/labml_nn/transformers/flash/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Flash Attention
summary: >
@@ -160,7 +160,7 @@ class AttentionFunc(torch.autograd.Function):
def forward(ctx: Any,
q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
causal: bool, sm_scale: float) -> torch.Tensor:
- """
+ r"""
### Forward pass
Grouped query attention forward pass. Returns the output in shape `[batch_size, n_heads, q_seq_len, d_head]`.
@@ -352,7 +352,7 @@ def _attn_fwd(t_q, t_k, t_v, sm_scale_log2e, t_lse, t_o,
BLOCK_Q: tl.constexpr,
BLOCK_K: tl.constexpr,
):
- """
+ r"""
### Triton kernel for Flash attention forward pass
:param t_q: queries $Q_i$
diff --git a/labml_nn/transformers/fnet/__init__.py b/labml_nn/transformers/fnet/__init__.py
index 4b123f376..61f45d3ca 100644
--- a/labml_nn/transformers/fnet/__init__.py
+++ b/labml_nn/transformers/fnet/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: "FNet: Mixing Tokens with Fourier Transforms"
summary: >
@@ -45,7 +45,7 @@
class FNetMix(nn.Module):
- """
+ r"""
## FNet - Mix tokens
This module simply implements
@@ -58,7 +58,7 @@ class FNetMix(nn.Module):
"""
def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None):
- """
+ r"""
The [normal attention module](../mha.html) can be fed with different token embeddings for
$\text{query}$, $\text{key}$, and $\text{value}$ and a mask.
diff --git a/labml_nn/transformers/gmlp/__init__.py b/labml_nn/transformers/gmlp/__init__.py
index 74b55e9f4..f2afd7d05 100644
--- a/labml_nn/transformers/gmlp/__init__.py
+++ b/labml_nn/transformers/gmlp/__init__.py
@@ -23,7 +23,7 @@
class GMLPBlock(nn.Module):
- """
+ r"""
## gMLP Block
Each block does the following transformations to input embeddings
@@ -87,7 +87,7 @@ def forward(self, *, x: torch.Tensor, mask: Optional[torch.Tensor] = None):
class SpacialGatingUnit(nn.Module):
- """
+ r"""
## Spatial Gating Unit
$$s(Z) = Z_1 \odot f_{W,b}(Z_2)$$
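A minimal sketch of that gating, splitting $Z$ along channels and projecting the normalized half across the sequence dimension (the paper's near-zero initialization of $W$ and the causal mask are omitted here):

```python
import torch
from torch import nn

class SpatialGatingUnit(nn.Module):
    def __init__(self, d_z: int, seq_len: int):
        super().__init__()
        self.norm = nn.LayerNorm(d_z // 2)
        # f_{W,b} acts across tokens, not channels
        self.proj = nn.Linear(seq_len, seq_len)

    def forward(self, z: torch.Tensor):
        # `z` has shape `[seq_len, batch_size, d_z]`
        z1, z2 = torch.chunk(z, 2, dim=-1)
        z2 = self.proj(self.norm(z2).transpose(0, 2)).transpose(0, 2)
        return z1 * z2
```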
diff --git a/labml_nn/transformers/gpt/__init__.py b/labml_nn/transformers/gpt/__init__.py
index 7c8beda60..6d32eef10 100644
--- a/labml_nn/transformers/gpt/__init__.py
+++ b/labml_nn/transformers/gpt/__init__.py
@@ -124,7 +124,7 @@ def _transformer_configs(c: Configs):
def _init_weights(module):
- """
+ r"""
### Initialize weights
Weights of linear layers and embedding layers are initialized
diff --git a/labml_nn/transformers/hour_glass/__init__.py b/labml_nn/transformers/hour_glass/__init__.py
index f94bd7bda..0031c7ce5 100644
--- a/labml_nn/transformers/hour_glass/__init__.py
+++ b/labml_nn/transformers/hour_glass/__init__.py
@@ -246,7 +246,7 @@ def __init__(self):
class AttentionBasedShortening(nn.Module):
- """
+ r"""
### 🚧 Down-sampling with attention
\begin{align}
@@ -263,7 +263,7 @@ def __init__(self):
class LinearUpSampling(nn.Module):
- """
+ r"""
### 🚧 Linear projection for up-sampling
Make a linear projection of dense token embeddings to a size of $d_{\text{model}} k$.
@@ -275,7 +275,7 @@ def __init__(self):
class AttentionBasedUpSampling(nn.Module):
- """
+ r"""
### 🚧 Attention based up-sampling
\begin{align}
diff --git a/labml_nn/transformers/jax_transformer/__init__.py b/labml_nn/transformers/jax_transformer/__init__.py
index fb9b1bf5e..672b49160 100644
--- a/labml_nn/transformers/jax_transformer/__init__.py
+++ b/labml_nn/transformers/jax_transformer/__init__.py
@@ -385,7 +385,7 @@ def __init__(self, normalized_shape: Union[Tuple[int], List[int]], *,
- """
+ r"""
$X \in \mathbb{R}^{* \times S[0] \times S[1] \times ... \times S[n]}$
* `eps` is $\epsilon$, used in $\sqrt{Var[X] + \epsilon}$ for numerical stability
* `elementwise_affine` is whether to scale and shift the normalized value
"""
super().__init__()
self.eps = eps
@@ -487,7 +487,7 @@ def __call__(self, *,
- """
+ r"""
`mask` has shape `[seq_len, seq_len]` and
`mask[i, j]` indicates whether query at position `i` can see key-value at position `j`.
"""
# Get sequence length
seq_len = len(query)
@@ -623,7 +623,7 @@ class CrossEntropyLoss(Module):
- """
+ r"""
## Cross Entropy Loss
"""
def __init__(self):
super().__init__()
@@ -800,7 +800,7 @@ def step(self, params: Dict, grads: Dict):
- """
+ r"""
* `params` is a tree-map of parameters
* `grads` is a tree-map of gradients
"""
# Increment step $t$
self._n_steps += 1
# Update states for each parameter
@@ -813,7 +813,7 @@ def _step(self, n_steps: int, param: jnp.ndarray, state: AdamState):
- """
+ r"""
### Update parameters
This performs an Adam update on the given parameter
"""
# Bias corrections for $\hat{m}_t$: $1 - \beta_1^t$ and for $\hat{v}_t$: $1 - \beta_2^t$
bias_correction = [1 - beta ** n_steps for beta in self.betas]
@@ -834,7 +834,7 @@ def _update_state(self, grad, state: AdamState):
- """
+ r"""
### Update state
This updates the uncorrected first and second moments $m_t$ and $v_t$
"""
# Uncorrected first and second moments $m_{t-1}$ and $v_{t-1}$
m, v = state
# Clip gradients
diff --git a/labml_nn/transformers/knn/__init__.py b/labml_nn/transformers/knn/__init__.py
index 72d8037f0..902f5ffcc 100644
--- a/labml_nn/transformers/knn/__init__.py
+++ b/labml_nn/transformers/knn/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: k-Nearest Neighbor Language Models
summary: >
diff --git a/labml_nn/transformers/knn/build_index.py b/labml_nn/transformers/knn/build_index.py
index f6deafaa9..f8c6a9942 100644
--- a/labml_nn/transformers/knn/build_index.py
+++ b/labml_nn/transformers/knn/build_index.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Build FAISS index for k-NN search
summary: This builds the FAISS index with the transformer embeddings.
@@ -51,7 +51,7 @@ def load_experiment(run_uuid: str, checkpoint: Optional[int] = None):
def gather_keys(conf: Configs):
- """
+ r"""
## Gather $\big(f(c_i), w_i\big)$ and save them in numpy arrays
*Note that these numpy arrays will take up a lot of space (even a few hundred gigabytes)
diff --git a/labml_nn/transformers/knn/eval_knn.py b/labml_nn/transformers/knn/eval_knn.py
index 703a41ca7..17c6f49a9 100644
--- a/labml_nn/transformers/knn/eval_knn.py
+++ b/labml_nn/transformers/knn/eval_knn.py
@@ -20,7 +20,7 @@
def knn(queries: torch.Tensor, index: faiss.IndexFlatL2, keys_store: np.ndarray, vals_store: np.ndarray, n_tokens: int):
- """
+ r"""
## $k$-NN to get $p(w_t, c_t)$
Here we refer to $f(\textcolor{yellowgreen}{c_t})$ as queries,
diff --git a/labml_nn/transformers/mha.py b/labml_nn/transformers/mha.py
index ff93530e0..e09516e84 100644
--- a/labml_nn/transformers/mha.py
+++ b/labml_nn/transformers/mha.py
@@ -123,7 +123,7 @@ def get_scores(self, query: torch.Tensor, key: torch.Tensor):
- """
+ r"""
### Calculate scores between queries and keys
This method can be overridden for other variations like relative attention.
"""
# Calculate $Q K^\top$ or $S_{ijbh} = \sum_d Q_{ibhd} K_{jbhd}$
return torch.einsum('ibhd,jbhd->ijbh', query, key)
diff --git a/labml_nn/transformers/mlp_mixer/__init__.py b/labml_nn/transformers/mlp_mixer/__init__.py
index 06b650583..4281efc37 100644
--- a/labml_nn/transformers/mlp_mixer/__init__.py
+++ b/labml_nn/transformers/mlp_mixer/__init__.py
@@ -48,7 +48,7 @@ def __init__(self, mlp: nn.Module):
self.mlp = mlp
def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None):
- """
+ r"""
The [normal attention module](../mha.html) can be fed with different token embeddings for
$\text{query}$, $\text{key}$, and $\text{value}$ and a mask.
diff --git a/labml_nn/transformers/positional_encoding.py b/labml_nn/transformers/positional_encoding.py
index 615ee913c..650877f3f 100644
--- a/labml_nn/transformers/positional_encoding.py
+++ b/labml_nn/transformers/positional_encoding.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Fixed Positional Encodings
summary: >
diff --git a/labml_nn/transformers/primer_ez/__init__.py b/labml_nn/transformers/primer_ez/__init__.py
index 6357f6d1c..276f26715 100644
--- a/labml_nn/transformers/primer_ez/__init__.py
+++ b/labml_nn/transformers/primer_ez/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: "Primer: Searching for Efficient Transformers for Language Modeling"
summary: >
@@ -42,7 +42,7 @@
class SquaredReLU(nn.Module):
- """
+ r"""
## Squared ReLU activation
$$y = {\max(x, 0)}^2$$
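The activation itself is a one-liner:

```python
import torch
from torch import nn

class SquaredReLU(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # y = max(x, 0)^2
        return torch.relu(x) ** 2
```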
diff --git a/labml_nn/transformers/retro/bert_embeddings.py b/labml_nn/transformers/retro/bert_embeddings.py
index 9ddef8f09..a6c53558e 100644
--- a/labml_nn/transformers/retro/bert_embeddings.py
+++ b/labml_nn/transformers/retro/bert_embeddings.py
@@ -19,7 +19,7 @@
class BERTChunkEmbeddings:
- """
+ r"""
## BERT Embeddings
For a given chunk of text $N$ this class generates BERT embeddings $\text{B\small{ERT}}(N)$.
@@ -75,7 +75,7 @@ def _trim_chunk(chunk: str):
return stripped
def __call__(self, chunks: List[str]):
- """
+ r"""
### Get $\text{B\small{ERT}}(N)$ for a list of chunks.
"""
diff --git a/labml_nn/transformers/retro/database.py b/labml_nn/transformers/retro/database.py
index 90d0a2003..8a3e94e79 100644
--- a/labml_nn/transformers/retro/database.py
+++ b/labml_nn/transformers/retro/database.py
@@ -26,7 +26,7 @@
def build_database(chunk_len: int = 16, batch_size: int = 64, d_emb: int = 768, n_centeroids: int = 256,
code_size: int = 64, n_probe: int = 8, n_train: int = 50_000):
- """
+ r"""
## Build Database
* `chunk_len` is the length of a chunk (number of characters)
diff --git a/labml_nn/transformers/retro/model.py b/labml_nn/transformers/retro/model.py
index fe72048a5..06516de3b 100644
--- a/labml_nn/transformers/retro/model.py
+++ b/labml_nn/transformers/retro/model.py
@@ -32,7 +32,7 @@ class RotaryPositionalEmbeddings(nn.Module):
"""
def __init__(self, d: int, base: int = 10_000):
- """
+ r"""
* `d` is the number of features $d$
* `base` is the constant used for calculating $\Theta$
"""
@@ -81,7 +81,7 @@ def forward(self, x: torch.Tensor):
class SelfAttention(nn.Module):
- """
+ r"""
## Self-Attention Layer $\text{A\small{TTN}}$
This applies causal and non-causal [multi-headed self-attention](../mha.html).
@@ -185,7 +185,7 @@ def forward(self, h: torch.Tensor):
class CrossAttention(nn.Module):
- """
+ r"""
## Cross-Attention Layer $\text{C\small{A}}$
This is similar to the self-attention layer defined above, except that
@@ -272,7 +272,7 @@ def forward(self, e: torch.Tensor, h: torch.Tensor):
class ChunkedCrossAttention(nn.Module):
- """
+ r"""
## Chunked Cross-Attention Layer $\text{C\small{CA}}$
This is similar to the cross-attention layer defined above.
@@ -380,7 +380,7 @@ def forward(self, h: torch.Tensor, e: torch.Tensor):
class FeedForward(nn.Module):
- """
+ r"""
### Position-wise Feed Forward Layer $\text{F\small{FW}}$
This consists of two linear layers and an activation in the middle.
@@ -425,7 +425,7 @@ def forward(self, h: torch.Tensor):
class NearestNeighborEncoder(nn.Module):
- """
+ r"""
## Nearest Neighbor Encoder $\text{E\small{NCODER}}(\text{R\small{ET}}(C_u)_{1 \le u \le l}, H)$
This module encodes the retrieved nearest neighbors
@@ -433,7 +433,7 @@ class NearestNeighborEncoder(nn.Module):
def __init__(self, chunk_len: int, n_layers: int, ca_layers: Set[int],
d_model: int, n_heads: int, d_k: int, d_ff: int):
- """
+ r"""
* `chunk_len` is the length of a chunk
* `n_layers` is the number of layers in the encoder $L_{\text{enc}}$
* `ca_layers` are the layers with cross attention $P_{\text{enc}}$
@@ -457,7 +457,7 @@ def __init__(self, chunk_len: int, n_layers: int, ca_layers: Set[int],
self.norm_h = nn.LayerNorm(d_model)
def forward(self, e: torch.Tensor, h: torch.Tensor):
- """
+ r"""
* `e` are token embeddings of the retrieved nearest neighbors,
$\text{E\small{MB}}\big(\text{R\small{ET}}(C_u)_{1 \le u \le l}\big)$
of shape `[batch_size, chunks, neighbors, neighbor_len, d_model]`
@@ -541,7 +541,7 @@ def __init__(self, n_vocab: int, d_model: int, n_layers: int, ca_layers: Set[int
self.norm_e = nn.LayerNorm(d_model)
def forward(self, x: torch.Tensor, ret: torch.Tensor):
- """
+ r"""
* `x` is the input sequence, $X$ of shape `[batch_size, seq_len]`
* `ret` are the retrieved neighbors
$\text{R\small{ET}}(C_u)_{1 \le u \le l}$
diff --git a/labml_nn/transformers/rope/__init__.py b/labml_nn/transformers/rope/__init__.py
index a200785b0..f4800220a 100644
--- a/labml_nn/transformers/rope/__init__.py
+++ b/labml_nn/transformers/rope/__init__.py
@@ -28,7 +28,7 @@
class RotaryPositionalEmbeddings(nn.Module):
- """
+ r"""
## RoPE module
Rotary encoding transforms pairs of features by rotating in the 2D plane.
@@ -116,7 +116,7 @@ class RotaryPositionalEmbeddings(nn.Module):
"""
def __init__(self, d: int, base: int = 10_000):
- """
+ r"""
* `d` is the number of features $d$
* `base` is the constant used for calculating $\Theta$
"""
@@ -128,7 +128,7 @@ def __init__(self, d: int, base: int = 10_000):
self.sin_cached = None
def _build_cache(self, x: torch.Tensor):
- """
+ r"""
Cache $\cos$ and $\sin$ values
"""
# Return if cache is already built
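The cached values follow from $\theta_i = \text{base}^{-2i/d}$; a sketch of building them:

```python
import torch

def build_rope_cache(d: int, seq_len: int, base: int = 10_000):
    # theta_i = base^(-2i/d) for each feature pair
    theta = 1.0 / (base ** (torch.arange(0, d, 2).float() / d))
    # Outer product of positions m = 0 ... seq_len-1 with theta
    idx_theta = torch.einsum('m,d->md', torch.arange(seq_len).float(), theta)
    # cos(m * theta_i) and sin(m * theta_i), each of shape `[seq_len, d / 2]`
    return idx_theta.cos(), idx_theta.sin()
```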
diff --git a/labml_nn/transformers/rope/value_pe/__init__.py b/labml_nn/transformers/rope/value_pe/__init__.py
index 8aadeab8f..7855c0efe 100644
--- a/labml_nn/transformers/rope/value_pe/__init__.py
+++ b/labml_nn/transformers/rope/value_pe/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Rotary Positional Embeddings with Relative distance (RoPER)
summary: >
diff --git a/labml_nn/uncertainty/evidence/__init__.py b/labml_nn/uncertainty/evidence/__init__.py
index 8062050a1..27f04873c 100644
--- a/labml_nn/uncertainty/evidence/__init__.py
+++ b/labml_nn/uncertainty/evidence/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: "Evidential Deep Learning to Quantify Classification Uncertainty"
summary: >
@@ -55,7 +55,7 @@
class MaximumLikelihoodLoss(nn.Module):
- """
+ r"""
## Type II Maximum Likelihood Loss
@@ -81,7 +81,7 @@ class MaximumLikelihoodLoss(nn.Module):
"""
def forward(self, evidence: torch.Tensor, target: torch.Tensor):
- """
+ r"""
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
@@ -98,7 +98,7 @@ def forward(self, evidence: torch.Tensor, target: torch.Tensor):
class CrossEntropyBayesRisk(nn.Module):
- """
+ r"""
## Bayes Risk with Cross Entropy Loss
@@ -128,7 +128,7 @@ class CrossEntropyBayesRisk(nn.Module):
"""
def forward(self, evidence: torch.Tensor, target: torch.Tensor):
- """
+ r"""
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
@@ -145,7 +145,7 @@ def forward(self, evidence: torch.Tensor, target: torch.Tensor):
class SquaredErrorBayesRisk(nn.Module):
- """
+ r"""
## Bayes Risk with Squared Error Loss
@@ -191,7 +191,7 @@ class SquaredErrorBayesRisk(nn.Module):
"""
def forward(self, evidence: torch.Tensor, target: torch.Tensor):
- """
+ r"""
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
@@ -215,7 +215,7 @@ def forward(self, evidence: torch.Tensor, target: torch.Tensor):
class KLDivergenceLoss(nn.Module):
- """
+ r"""
## KL Divergence Regularization Loss
@@ -240,7 +240,7 @@ class KLDivergenceLoss(nn.Module):
"""
def forward(self, evidence: torch.Tensor, target: torch.Tensor):
- """
+ r"""
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
diff --git a/labml_nn/unet/__init__.py b/labml_nn/unet/__init__.py
index cdfde7393..8e66cb843 100644
--- a/labml_nn/unet/__init__.py
+++ b/labml_nn/unet/__init__.py
@@ -30,7 +30,7 @@
class DoubleConvolution(nn.Module):
- """
+ r"""
### Two $3 \times 3$ Convolution Layers
Each step in the contracting path and the expansive path has two $3 \times 3$ convolution layers.
@@ -63,7 +63,7 @@ def forward(self, x: torch.Tensor):
class DownSample(nn.Module):
- """
+ r"""
### Down-sample
Each step in the contracting path down-samples the feature map with a $2 \times 2$ max pooling layer.
@@ -80,7 +80,7 @@ def forward(self, x: torch.Tensor):
class UpSample(nn.Module):
- """
+ r"""
### Up-sample
Each step in the expansive path up-samples the feature map with a $2 \times 2$ up-convolution.
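A sketch of that up-sampling step with a $2 \times 2$ transposed convolution (kernel size and stride are the usual U-Net choice, stated here as an assumption):

```python
import torch
from torch import nn

class UpSample(nn.Module):
    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        # Doubles the spatial resolution
        self.up = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.up(x)
```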