diff --git a/labml_nn/activations/fta/__init__.py b/labml_nn/activations/fta/__init__.py
index ba682a0a0..6ae143d3c 100644
--- a/labml_nn/activations/fta/__init__.py
+++ b/labml_nn/activations/fta/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Fuzzy Tiling Activations
summary: >
@@ -68,7 +68,7 @@ class FTA(nn.Module):
"""
def __init__(self, lower_limit: float, upper_limit: float, delta: float, eta: float):
- """
+ r"""
:param lower_limit: is the lower limit $l$
:param upper_limit: is the upper limit $u$
:param delta: is the bin size $\delta$
@@ -86,7 +86,7 @@ def __init__(self, lower_limit: float, upper_limit: float, delta: float, eta: fl
self.eta = eta
def fuzzy_i_plus(self, x: torch.Tensor):
- """
+ r"""
#### Fuzzy indicator function
$$I_{\eta,+}(x) = I_+(\eta - x) x + I_+ (x - \eta)$$
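The pattern above repeats throughout this patch: docstrings that embed LaTeX get the `r` prefix so backslash sequences survive. A minimal sketch of the failure mode being fixed (not part of the patch; the string contents are illustrative):

```python
# In a regular string, recognized escapes such as \f, \b and \a silently
# corrupt the LaTeX, and sequences like \u or \N can even be hard syntax
# errors; the r prefix disables escape processing entirely.
plain = "coefficient $\frac{\beta_t}{\sqrt{1 - \bar\alpha_t}}$"  # warns, corrupts
raw = r"coefficient $\frac{\beta_t}{\sqrt{1 - \bar\alpha_t}}$"   # preserved

print(plain == raw)  # False: \f, \b, \a became form-feed, backspace, bell
```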
diff --git a/labml_nn/adaptive_computation/ponder_net/__init__.py b/labml_nn/adaptive_computation/ponder_net/__init__.py
index 7dfcd2d3c..b9a56eeef 100644
--- a/labml_nn/adaptive_computation/ponder_net/__init__.py
+++ b/labml_nn/adaptive_computation/ponder_net/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: "PonderNet: Learning to Ponder"
summary: >
@@ -106,7 +106,7 @@ def __init__(self, n_elems: int, n_hidden: int, max_steps: int):
self.is_halt = False
def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
- """
+ r"""
* `x` is the input of shape `[batch_size, n_elems]`
This outputs a tuple of four tensors:
@@ -177,7 +177,7 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Te
class ReconstructionLoss(nn.Module):
- """
+ r"""
## Reconstruction loss
$$L_{Rec} = \sum_{n=1}^N p_n \mathcal{L}(y, \hat{y}_n)$$
@@ -186,14 +186,14 @@ class ReconstructionLoss(nn.Module):
"""
def __init__(self, loss_func: nn.Module):
- """
+ r"""
* `loss_func` is the loss function $\mathcal{L}$
"""
super().__init__()
self.loss_func = loss_func
def forward(self, p: torch.Tensor, y_hat: torch.Tensor, y: torch.Tensor):
- """
+ r"""
* `p` is $p_1 \dots p_N$ in a tensor of shape `[N, batch_size]`
* `y_hat` is $\hat{y}_1 \dots \hat{y}_N$ in a tensor of shape `[N, batch_size, ...]`
* `y` is the target of shape `[batch_size, ...]`
@@ -213,7 +213,7 @@ def forward(self, p: torch.Tensor, y_hat: torch.Tensor, y: torch.Tensor):
class RegularizationLoss(nn.Module):
- """
+ r"""
## Regularization loss
$$L_{Reg} = \mathop{KL} \Big(p_n \Vert p_G(\lambda_p) \Big)$$
@@ -229,7 +229,7 @@ class RegularizationLoss(nn.Module):
"""
def __init__(self, lambda_p: float, max_steps: int = 1_000):
- """
+ r"""
* `lambda_p` is $\lambda_p$ - the success probability of geometric distribution
* `max_steps` is the highest $N$; we use this to pre-compute $p_G(\lambda_p)$
"""
@@ -253,7 +253,7 @@ def __init__(self, lambda_p: float, max_steps: int = 1_000):
self.kl_div = nn.KLDivLoss(reduction='batchmean')
def forward(self, p: torch.Tensor):
- """
+ r"""
* `p` is $p_1 \dots p_N$ in a tensor of shape `[N, batch_size]`
"""
# Transpose `p` to `[batch_size, N]`
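For context on what `RegularizationLoss` pre-computes: the prior $p_G(\lambda_p)$ is a truncated geometric distribution. A standalone sketch, with illustrative names that are not taken from the patch:

```python
import torch

# p_G(k) = (1 - lambda_p)^(k - 1) * lambda_p for k = 1..max_steps,
# i.e. the probability of halting exactly at step k.
def geometric_prior(lambda_p: float, max_steps: int = 1_000) -> torch.Tensor:
    k = torch.arange(1, max_steps + 1, dtype=torch.float64)
    return (1 - lambda_p) ** (k - 1) * lambda_p

print(geometric_prior(0.2, max_steps=10).sum())  # ~0.89; the tail is truncated
```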
diff --git a/labml_nn/capsule_networks/__init__.py b/labml_nn/capsule_networks/__init__.py
index 9a9dfbeae..a2dfaf6cc 100644
--- a/labml_nn/capsule_networks/__init__.py
+++ b/labml_nn/capsule_networks/__init__.py
@@ -35,7 +35,7 @@
class Squash(nn.Module):
- """
+ r"""
## Squash
-    This is **squashing** function from paper, given by equation $(1)$.
+    This is the **squashing** function from the paper, given by equation $(1)$.
@@ -69,7 +69,7 @@ def forward(self, s: torch.Tensor):
class Router(nn.Module):
- """
+ r"""
## Routing Algorithm
This is the routing mechanism described in the paper.
@@ -132,7 +132,7 @@ def forward(self, u: torch.Tensor):
class MarginLoss(nn.Module):
- """
+ r"""
## Margin loss for class existence
A separate margin loss is used for each output capsule and the total loss is the sum of them.
@@ -161,7 +161,7 @@ def __init__(self, *, n_labels: int, lambda_: float = 0.5, m_positive: float = 0
self.n_labels = n_labels
def forward(self, v: torch.Tensor, labels: torch.Tensor):
- """
+ r"""
`v`, $\mathbf{v}_j$ are the squashed output capsules.
This has shape `[batch_size, n_labels, n_features]`; that is, there is a capsule for each label.
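For reference, the squashing function the `Squash` module implements is equation $(1)$ of Sabour et al. (2017), $\mathbf{v}_j = \frac{\Vert s \Vert^2}{1 + \Vert s \Vert^2} \frac{s}{\Vert s \Vert}$. A minimal sketch, with an $\epsilon$ added for numerical stability as an assumption:

```python
import torch

def squash(s: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    # ||s||^2 along the capsule feature dimension
    s2 = (s ** 2).sum(dim=-1, keepdim=True)
    # shrink the norm into [0, 1) while keeping the direction of s
    return (s2 / (1 + s2)) * (s / torch.sqrt(s2 + eps))

v = squash(torch.randn(2, 10, 16))  # [batch, capsules, features]
print(v.norm(dim=-1).max())         # always < 1
```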
diff --git a/labml_nn/cfr/__init__.py b/labml_nn/cfr/__init__.py
index 48e496058..1e69674ea 100644
--- a/labml_nn/cfr/__init__.py
+++ b/labml_nn/cfr/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Regret Minimization in Games with Incomplete Information (CFR)
summary: >
@@ -337,7 +337,7 @@
class History:
- """
+ r"""
## History
@@ -349,14 +349,14 @@ class History:
"""
def is_terminal(self):
- """
+ r"""
Whether it's a terminal history; i.e. game over.
$h \in Z$
"""
raise NotImplementedError()
def terminal_utility(self, i: Player) -> float:
- """
+ r"""
Utility of player $i$ for a terminal history.
$u_i(h)$ where $h \in Z$
@@ -485,7 +485,7 @@ def load_dict(self, data: Dict[str, any]):
self.calculate_strategy()
def calculate_strategy(self):
- """
+ r"""
## Calculate strategy
Calculate current strategy using [regret matching](#RegretMatching).
@@ -520,7 +520,7 @@ def calculate_strategy(self):
self.strategy = {a: 1 / count for a, r in regret.items()}
def get_average_strategy(self):
- """
+ r"""
## Get average strategy
$$\textcolor{cyan}{\bar{\sigma}^T_i(I)(a)} =
@@ -596,7 +596,7 @@ def _get_info_set(self, h: History):
return self.info_sets[info_set_key]
def walk_tree(self, h: History, i: Player, pi_i: float, pi_neg_i: float) -> float:
- """
+ r"""
### Walk Tree
This function walks the game tree.
@@ -686,7 +686,7 @@ def walk_tree(self, h: History, i: Player, pi_i: float, pi_neg_i: float) -> floa
return v
def iterate(self):
- """
+ r"""
### Iteratively update $\textcolor{lightgreen}{\sigma^t(I)(a)}$
This updates the strategies for $T$ iterations.
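The `calculate_strategy` hunk above shows only the uniform fallback branch of regret matching. A self-contained sketch of the full rule (illustrative names, not the class's internals):

```python
from typing import Dict

# sigma(a) = max(R(a), 0) / sum_a' max(R(a'), 0); uniform when the sum is 0.
def regret_matching(cumulative_regret: Dict[str, float]) -> Dict[str, float]:
    positive = {a: max(r, 0.0) for a, r in cumulative_regret.items()}
    total = sum(positive.values())
    if total > 0:
        return {a: r / total for a, r in positive.items()}
    return {a: 1 / len(positive) for a in positive}

print(regret_matching({'raise': 3.0, 'call': 1.0, 'fold': -2.0}))
# {'raise': 0.75, 'call': 0.25, 'fold': 0.0}
```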
diff --git a/labml_nn/conv_mixer/__init__.py b/labml_nn/conv_mixer/__init__.py
index 42d1804ae..5b8a91f4d 100644
--- a/labml_nn/conv_mixer/__init__.py
+++ b/labml_nn/conv_mixer/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Patches Are All You Need? (ConvMixer)
summary: >
@@ -96,7 +96,7 @@ def forward(self, x: torch.Tensor):
class PatchEmbeddings(nn.Module):
- """
+ r"""
## Get patch embeddings
diff --git a/labml_nn/diffusion/ddpm/__init__.py b/labml_nn/diffusion/ddpm/__init__.py
index c89c93eeb..013b6ee63 100644
--- a/labml_nn/diffusion/ddpm/__init__.py
+++ b/labml_nn/diffusion/ddpm/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Denoising Diffusion Probabilistic Models (DDPM)
summary: >
@@ -175,7 +175,7 @@ class DenoiseDiffusion:
"""
def __init__(self, eps_model: nn.Module, n_steps: int, device: torch.device):
- """
+ r"""
* `eps_model` is $\textcolor{lightgreen}{\epsilon_\theta}(x_t, t)$ model
-        * `n_steps` is $T$
+        * `n_steps` is $T$, the number of diffusion steps
* `device` is the device to place constants on
@@ -196,7 +196,7 @@ def __init__(self, eps_model: nn.Module, n_steps: int, device: torch.device):
self.sigma2 = self.beta
def q_xt_x0(self, x0: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
- """
+ r"""
#### Get $q(x_t|x_0)$ distribution
\begin{align}
@@ -212,7 +212,7 @@ def q_xt_x0(self, x0: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torc
return mean, var
def q_sample(self, x0: torch.Tensor, t: torch.Tensor, eps: Optional[torch.Tensor] = None):
- """
+ r"""
#### Sample from $q(x_t|x_0)$
\begin{align}
@@ -230,7 +230,7 @@ def q_sample(self, x0: torch.Tensor, t: torch.Tensor, eps: Optional[torch.Tensor
return mean + (var ** 0.5) * eps
def p_sample(self, xt: torch.Tensor, t: torch.Tensor):
- """
+ r"""
#### Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$
\begin{align}
@@ -262,7 +262,7 @@ def p_sample(self, xt: torch.Tensor, t: torch.Tensor):
return mean + (var ** .5) * eps
def loss(self, x0: torch.Tensor, noise: Optional[torch.Tensor] = None):
- """
+ r"""
#### Simplified Loss
$$L_{\text{simple}}(\theta) = \mathbb{E}_{t,x_0, \epsilon} \Bigg[ \bigg\Vert
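For orientation, `q_sample` above draws $x_t = \sqrt{\bar\alpha_t}\,x_0 + \sqrt{1-\bar\alpha_t}\,\epsilon$. A standalone sketch, assuming the linear $\beta$ schedule this class is normally configured with:

```python
import torch

n_steps = 1000
beta = torch.linspace(0.0001, 0.02, n_steps)       # assumed schedule
alpha_bar = torch.cumprod(1.0 - beta, dim=0)       # cumulative product of alphas

x0 = torch.randn(4, 3, 32, 32)                     # a batch of "images"
t = torch.randint(0, n_steps, (4,))                # one timestep per sample
eps = torch.randn_like(x0)
ab = alpha_bar[t].view(-1, 1, 1, 1)                # gather and broadcast
xt = (ab ** 0.5) * x0 + ((1 - ab) ** 0.5) * eps    # x_t ~ q(x_t | x_0)
```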
diff --git a/labml_nn/diffusion/ddpm/evaluate.py b/labml_nn/diffusion/ddpm/evaluate.py
index 52251b61e..06456925f 100644
--- a/labml_nn/diffusion/ddpm/evaluate.py
+++ b/labml_nn/diffusion/ddpm/evaluate.py
@@ -26,7 +26,13 @@ class Sampler:
## Sampler class
"""
- def __init__(self, diffusion: DenoiseDiffusion, image_channels: int, image_size: int, device: torch.device):
+ def __init__(
+ self,
+ diffusion: DenoiseDiffusion,
+ image_channels: int,
+ image_size: int,
+ device: torch.device,
+ ):
"""
* `diffusion` is the `DenoiseDiffusion` instance
* `image_channels` is the number of channels in the image
@@ -63,9 +69,11 @@ def __init__(self, diffusion: DenoiseDiffusion, image_channels: int, image_size:
# $$\tilde\beta_t = \frac{1 - \bar\alpha_{t-1}}{1 - \bar\alpha_t} \beta_t$$
self.beta_tilde = self.beta * (1 - alpha_bar_tm1) / (1 - self.alpha_bar)
# $$\frac{\sqrt{\bar\alpha_{t-1}}\beta_t}{1 - \bar\alpha_t}$$
- self.mu_tilde_coef1 = self.beta * (alpha_bar_tm1 ** 0.5) / (1 - self.alpha_bar)
+ self.mu_tilde_coef1 = self.beta * (alpha_bar_tm1**0.5) / (1 - self.alpha_bar)
-        # $$\frac{\sqrt{\alpha_t}(1 - \bar\alpha_{t-1}}{1-\bar\alpha_t}$$
+        # $$\frac{\sqrt{\alpha_t}(1 - \bar\alpha_{t-1})}{1 - \bar\alpha_t}$$
- self.mu_tilde_coef2 = (self.alpha ** 0.5) * (1 - alpha_bar_tm1) / (1 - self.alpha_bar)
+ self.mu_tilde_coef2 = (
+ (self.alpha**0.5) * (1 - alpha_bar_tm1) / (1 - self.alpha_bar)
+ )
# $\sigma^2 = \beta$
self.sigma2 = self.beta
@@ -80,6 +88,7 @@ def show_image(self, img, title=""):
def make_video(self, frames, path="video.mp4"):
"""Helper function to create a video"""
import imageio
+
# 20 second video
writer = imageio.get_writer(path, fps=len(frames) // 20)
# Add each image
@@ -91,7 +100,7 @@ def make_video(self, frames, path="video.mp4"):
writer.close()
def sample_animation(self, n_frames: int = 1000, create_video: bool = True):
- """
+ r"""
#### Sample an image step-by-step using $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$
We sample an image step-by-step using $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$ and at each step
@@ -101,14 +110,17 @@ def sample_animation(self, n_frames: int = 1000, create_video: bool = True):
"""
# $x_T \sim p(x_T) = \mathcal{N}(x_T; \mathbf{0}, \mathbf{I})$
- xt = torch.randn([1, self.image_channels, self.image_size, self.image_size], device=self.device)
+ xt = torch.randn(
+ [1, self.image_channels, self.image_size, self.image_size],
+ device=self.device,
+ )
# Interval to log $\hat{x}_0$
interval = self.n_steps // n_frames
# Frames for video
frames = []
# Sample $T$ steps
- for t_inv in monit.iterate('Denoise', self.n_steps):
+ for t_inv in monit.iterate("Denoise", self.n_steps):
# $t$
t_ = self.n_steps - t_inv - 1
# $t$ in a tensor
@@ -128,8 +140,10 @@ def sample_animation(self, n_frames: int = 1000, create_video: bool = True):
if create_video:
self.make_video(frames)
- def interpolate(self, x1: torch.Tensor, x2: torch.Tensor, lambda_: float, t_: int = 100):
- """
+ def interpolate(
+ self, x1: torch.Tensor, x2: torch.Tensor, lambda_: float, t_: int = 100
+ ):
+ r"""
#### Interpolate two images $x_0$ and $x'_0$
We get $x_t \sim q(x_t|x_0)$ and $x'_t \sim q(x'_t|x_0)$.
@@ -144,20 +158,28 @@ def interpolate(self, x1: torch.Tensor, x2: torch.Tensor, lambda_: float, t_: in
* `x2` is $x'_0$
* `lambda_` is $\lambda$
* `t_` is $t$
         """
# Number of samples
n_samples = x1.shape[0]
# $t$ tensor
t = torch.full((n_samples,), t_, device=self.device)
# $$\bar{x}_t = (1 - \lambda)x_t + \lambda x'_0$$
- xt = (1 - lambda_) * self.diffusion.q_sample(x1, t) + lambda_ * self.diffusion.q_sample(x2, t)
+ xt = (1 - lambda_) * self.diffusion.q_sample(
+ x1, t
+ ) + lambda_ * self.diffusion.q_sample(x2, t)
# $$\bar{x}_0 \sim \textcolor{lightgreen}{p_\theta}(x_0|\bar{x}_t)$$
return self._sample_x0(xt, t_)
- def interpolate_animate(self, x1: torch.Tensor, x2: torch.Tensor, n_frames: int = 100, t_: int = 100,
- create_video=True):
+ def interpolate_animate(
+ self,
+ x1: torch.Tensor,
+ x2: torch.Tensor,
+ n_frames: int = 100,
+ t_: int = 100,
+ create_video=True,
+ ):
"""
#### Interpolate two images $x_0$ and $x'_0$ and make a video
@@ -166,7 +188,7 @@ def interpolate_animate(self, x1: torch.Tensor, x2: torch.Tensor, n_frames: int
* `n_frames` is the number of frames for the image
* `t_` is $t$
* `create_video` specifies whether to make a video or to show each frame
         """
# Show original images
self.show_image(x1, "x1")
@@ -183,7 +205,7 @@ def interpolate_animate(self, x1: torch.Tensor, x2: torch.Tensor, n_frames: int
frames = []
# Get frames with different $\lambda$
- for i in monit.iterate('Interpolate', n_frames + 1, is_children_silent=True):
+ for i in monit.iterate("Interpolate", n_frames + 1, is_children_silent=True):
# $\lambda$
lambda_ = i / n_frames
# $$\bar{x}_t = (1 - \lambda)x_t + \lambda x'_0$$
@@ -206,15 +228,17 @@ def _sample_x0(self, xt: torch.Tensor, n_steps: int):
* `xt` is $x_t$
* `n_steps` is $t$
         """
-        # Number of sampels
+        # Number of samples
n_samples = xt.shape[0]
# Iterate until $t$ steps
- for t_ in monit.iterate('Denoise', n_steps):
+ for t_ in monit.iterate("Denoise", n_steps):
t = n_steps - t_ - 1
# Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$
- xt = self.diffusion.p_sample(xt, xt.new_full((n_samples,), t, dtype=torch.long))
+ xt = self.diffusion.p_sample(
+ xt, xt.new_full((n_samples,), t, dtype=torch.long)
+ )
# Return $x_0$
return xt
@@ -222,9 +246,12 @@ def _sample_x0(self, xt: torch.Tensor, n_steps: int):
def sample(self, n_samples: int = 16):
"""
#### Generate images
         """
# $x_T \sim p(x_T) = \mathcal{N}(x_T; \mathbf{0}, \mathbf{I})$
- xt = torch.randn([n_samples, self.image_channels, self.image_size, self.image_size], device=self.device)
+ xt = torch.randn(
+ [n_samples, self.image_channels, self.image_size, self.image_size],
+ device=self.device,
+ )
# $$x_0 \sim \textcolor{lightgreen}{p_\theta}(x_0|x_t)$$
x0 = self._sample_x0(xt, self.n_steps)
@@ -234,7 +261,7 @@ def sample(self, n_samples: int = 16):
self.show_image(x0[i])
def p_sample(self, xt: torch.Tensor, t: torch.Tensor, eps_theta: torch.Tensor):
- """
+ r"""
#### Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$
\begin{align}
@@ -244,23 +271,23 @@ def p_sample(self, xt: torch.Tensor, t: torch.Tensor, eps_theta: torch.Tensor):
&= \frac{1}{\sqrt{\alpha_t}} \Big(x_t -
\frac{\beta_t}{\sqrt{1-\bar\alpha_t}}\textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big)
\end{align}
         """
# [gather](utils.html) $\bar\alpha_t$
alpha_bar = gather(self.alpha_bar, t)
# $\alpha_t$
alpha = gather(self.alpha, t)
# $\frac{\beta}{\sqrt{1-\bar\alpha_t}}$
- eps_coef = (1 - alpha) / (1 - alpha_bar) ** .5
+ eps_coef = (1 - alpha) / (1 - alpha_bar) ** 0.5
# $$\frac{1}{\sqrt{\alpha_t}} \Big(x_t -
# \frac{\beta_t}{\sqrt{1-\bar\alpha_t}}\textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big)$$
- mean = 1 / (alpha ** 0.5) * (xt - eps_coef * eps_theta)
+ mean = 1 / (alpha**0.5) * (xt - eps_coef * eps_theta)
# $\sigma^2$
var = gather(self.sigma2, t)
# $\epsilon \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$
eps = torch.randn(xt.shape, device=xt.device)
# Sample
- return mean + (var ** .5) * eps
+ return mean + (var**0.5) * eps
def p_x0(self, xt: torch.Tensor, t: torch.Tensor, eps: torch.Tensor):
"""
@@ -268,13 +295,13 @@ def p_x0(self, xt: torch.Tensor, t: torch.Tensor, eps: torch.Tensor):
$$x_0 \approx \hat{x}_0 = \frac{1}{\sqrt{\bar\alpha}}
\Big( x_t - \sqrt{1 - \bar\alpha_t} \textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big)$$
         """
# [gather](utils.html) $\bar\alpha_t$
alpha_bar = gather(self.alpha_bar, t)
# $$x_0 \approx \hat{x}_0 = \frac{1}{\sqrt{\bar\alpha}}
# \Big( x_t - \sqrt{1 - \bar\alpha_t} \textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big)$$
- return (xt - (1 - alpha_bar) ** 0.5 * eps) / (alpha_bar ** 0.5)
+ return (xt - (1 - alpha_bar) ** 0.5 * eps) / (alpha_bar**0.5)
def main():
@@ -297,16 +324,18 @@ def main():
configs.init()
# Set PyTorch modules for saving and loading
- experiment.add_pytorch_models({'eps_model': configs.eps_model})
+ experiment.add_pytorch_models({"eps_model": configs.eps_model})
# Load training experiment
experiment.load(run_uuid)
# Create sampler
- sampler = Sampler(diffusion=configs.diffusion,
- image_channels=configs.image_channels,
- image_size=configs.image_size,
- device=configs.device)
+ sampler = Sampler(
+ diffusion=configs.diffusion,
+ image_channels=configs.image_channels,
+ image_size=configs.image_size,
+ device=configs.device,
+ )
# Start evaluation
with experiment.start():
@@ -324,5 +353,5 @@ def main():
#
-if __name__ == '__main__':
+if __name__ == "__main__":
main()
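A note on the two coefficients reformatted above: they are the weights of the DDPM posterior mean, $\tilde\mu_t(x_t, x_0) = \text{coef1} \cdot x_0 + \text{coef2} \cdot x_t$, with variance $\tilde\beta_t$. A sketch under the same assumed linear schedule (illustrative, not part of the patch):

```python
import torch

beta = torch.linspace(0.0001, 0.02, 1000)
alpha = 1.0 - beta
alpha_bar = torch.cumprod(alpha, dim=0)
alpha_bar_tm1 = torch.cat([alpha_bar.new_ones(1), alpha_bar[:-1]])

mu_tilde_coef1 = beta * (alpha_bar_tm1 ** 0.5) / (1 - alpha_bar)
mu_tilde_coef2 = (alpha ** 0.5) * (1 - alpha_bar_tm1) / (1 - alpha_bar)

def mu_tilde(x0: torch.Tensor, xt: torch.Tensor, t: int) -> torch.Tensor:
    # posterior mean of q(x_{t-1} | x_t, x_0)
    return mu_tilde_coef1[t] * x0 + mu_tilde_coef2[t] * xt
```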
diff --git a/labml_nn/diffusion/ddpm/unet.py b/labml_nn/diffusion/ddpm/unet.py
index f5da80901..587c619d0 100644
--- a/labml_nn/diffusion/ddpm/unet.py
+++ b/labml_nn/diffusion/ddpm/unet.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: U-Net model for Denoising Diffusion Probabilistic Models (DDPM)
summary: >
@@ -29,7 +29,7 @@
class Swish(nn.Module):
- """
+ r"""
### Swish activation function
$$x \cdot \sigma(x)$$
@@ -272,7 +272,7 @@ def forward(self, x: torch.Tensor, t: torch.Tensor):
class Upsample(nn.Module):
- """
+ r"""
### Scale up the feature map by $2 \times$
"""
@@ -288,7 +288,7 @@ def forward(self, x: torch.Tensor, t: torch.Tensor):
class Downsample(nn.Module):
- """
+ r"""
### Scale down the feature map by $\frac{1}{2} \times$
"""
diff --git a/labml_nn/diffusion/stable_diffusion/latent_diffusion.py b/labml_nn/diffusion/stable_diffusion/latent_diffusion.py
index d7f9ecd1b..330ad2e04 100644
--- a/labml_nn/diffusion/stable_diffusion/latent_diffusion.py
+++ b/labml_nn/diffusion/stable_diffusion/latent_diffusion.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Latent Diffusion Models
summary: >
@@ -70,7 +70,7 @@ def __init__(self,
linear_start: float,
linear_end: float,
):
- """
+ r"""
:param unet_model: is the [U-Net](model/unet.html) that predicts noise
$\epsilon_\text{cond}(x_t, c)$, in latent space
:param autoencoder: is the [AutoEncoder](model/autoencoder.html)
@@ -134,7 +134,7 @@ def autoencoder_decode(self, z: torch.Tensor):
return self.first_stage_model.decode(z / self.latent_scaling_factor)
def forward(self, x: torch.Tensor, t: torch.Tensor, context: torch.Tensor):
- """
+ r"""
### Predict noise
Predict noise given the latent representation $x_t$, time step $t$, and the
diff --git a/labml_nn/diffusion/stable_diffusion/model/autoencoder.py b/labml_nn/diffusion/stable_diffusion/model/autoencoder.py
index badc6cf85..ec3784060 100644
--- a/labml_nn/diffusion/stable_diffusion/model/autoencoder.py
+++ b/labml_nn/diffusion/stable_diffusion/model/autoencoder.py
@@ -416,7 +416,7 @@ def forward(self, x: torch.Tensor):
def swish(x: torch.Tensor):
- """
+ r"""
### Swish activation
$$x \cdot \sigma(x)$$
diff --git a/labml_nn/diffusion/stable_diffusion/model/unet.py b/labml_nn/diffusion/stable_diffusion/model/unet.py
index 261a4bced..4eb10afeb 100644
--- a/labml_nn/diffusion/stable_diffusion/model/unet.py
+++ b/labml_nn/diffusion/stable_diffusion/model/unet.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: U-Net for Stable Diffusion
summary: >
diff --git a/labml_nn/diffusion/stable_diffusion/model/unet_attention.py b/labml_nn/diffusion/stable_diffusion/model/unet_attention.py
index cf42efa4f..ef2044057 100644
--- a/labml_nn/diffusion/stable_diffusion/model/unet_attention.py
+++ b/labml_nn/diffusion/stable_diffusion/model/unet_attention.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Transformer for Stable Diffusion U-Net
summary: >
@@ -291,7 +291,7 @@ def forward(self, x: torch.Tensor):
class GeGLU(nn.Module):
- """
+ r"""
### GeGLU Activation
$$\text{GeGLU}(x) = (xW + b) * \text{GELU}(xV + c)$$
diff --git a/labml_nn/diffusion/stable_diffusion/sampler/__init__.py b/labml_nn/diffusion/stable_diffusion/sampler/__init__.py
index 38c063e06..75e71bbf6 100644
--- a/labml_nn/diffusion/stable_diffusion/sampler/__init__.py
+++ b/labml_nn/diffusion/stable_diffusion/sampler/__init__.py
@@ -29,7 +29,7 @@ class DiffusionSampler:
model: LatentDiffusion
def __init__(self, model: LatentDiffusion):
- """
+ r"""
:param model: is the model to predict noise $\epsilon_\text{cond}(x_t, c)$
"""
super().__init__()
@@ -40,7 +40,7 @@ def __init__(self, model: LatentDiffusion):
def get_eps(self, x: torch.Tensor, t: torch.Tensor, c: torch.Tensor, *,
uncond_scale: float, uncond_cond: Optional[torch.Tensor]):
- """
+ r"""
## Get $\epsilon(x_t, c)$
:param x: is $x_t$ of shape `[batch_size, channels, height, width]`
@@ -79,7 +79,7 @@ def sample(self,
uncond_cond: Optional[torch.Tensor] = None,
skip_steps: int = 0,
):
- """
+ r"""
### Sampling Loop
:param shape: is the shape of the generated images in the
@@ -100,7 +100,7 @@ def paint(self, x: torch.Tensor, cond: torch.Tensor, t_start: int, *,
uncond_scale: float = 1.,
uncond_cond: Optional[torch.Tensor] = None,
):
- """
+ r"""
### Painting Loop
:param x: is $x_{T'}$ of shape `[batch_size, channels, height, width]`
@@ -116,7 +116,7 @@ def paint(self, x: torch.Tensor, cond: torch.Tensor, t_start: int, *,
raise NotImplementedError()
def q_sample(self, x0: torch.Tensor, index: int, noise: Optional[torch.Tensor] = None):
- """
+ r"""
### Sample from $q(x_t|x_0)$
:param x0: is $x_0$ of shape `[batch_size, channels, height, width]`
diff --git a/labml_nn/diffusion/stable_diffusion/sampler/ddim.py b/labml_nn/diffusion/stable_diffusion/sampler/ddim.py
index 04a8837f3..fb36ab521 100644
--- a/labml_nn/diffusion/stable_diffusion/sampler/ddim.py
+++ b/labml_nn/diffusion/stable_diffusion/sampler/ddim.py
@@ -24,7 +24,7 @@
class DDIMSampler(DiffusionSampler):
- """
+ r"""
## DDIM Sampler
This extends the [`DiffusionSampler` base class](index.html).
@@ -52,7 +52,7 @@ class DDIMSampler(DiffusionSampler):
model: LatentDiffusion
def __init__(self, model: LatentDiffusion, n_steps: int, ddim_discretize: str = "uniform", ddim_eta: float = 0.):
- """
+ r"""
:param model: is the model to predict noise $\epsilon_\text{cond}(x_t, c)$
:param n_steps: is the number of DDIM sampling steps, $S$
:param ddim_discretize: specifies how to extract $\tau$ from $[1,2,\dots,T]$.
@@ -106,7 +106,7 @@ def sample(self,
uncond_cond: Optional[torch.Tensor] = None,
skip_steps: int = 0,
):
- """
+ r"""
### Sampling Loop
:param shape: is the shape of the generated images in the
@@ -153,7 +153,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int,
temperature: float = 1.,
uncond_scale: float = 1.,
uncond_cond: Optional[torch.Tensor] = None):
- """
+ r"""
### Sample $x_{\tau_{i-1}}$
:param x: is $x_{\tau_i}$ of shape `[batch_size, channels, height, width]`
@@ -184,7 +184,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int,
def get_x_prev_and_pred_x0(self, e_t: torch.Tensor, index: int, x: torch.Tensor, *,
temperature: float,
repeat_noise: bool):
- """
+ r"""
### Sample $x_{\tau_{i-1}}$ given $\epsilon_\theta(x_{\tau_i})$
"""
@@ -231,7 +231,7 @@ def get_x_prev_and_pred_x0(self, e_t: torch.Tensor, index: int, x: torch.Tensor,
@torch.no_grad()
def q_sample(self, x0: torch.Tensor, index: int, noise: Optional[torch.Tensor] = None):
- """
+ r"""
### Sample from $q_{\sigma,\tau}(x_{\tau_i}|x_0)$
$$q_{\sigma,\tau}(x_t|x_0) =
@@ -258,7 +258,7 @@ def paint(self, x: torch.Tensor, cond: torch.Tensor, t_start: int, *,
uncond_scale: float = 1.,
uncond_cond: Optional[torch.Tensor] = None,
):
- """
+ r"""
### Painting Loop
:param x: is $x_{S'}$ of shape `[batch_size, channels, height, width]`
diff --git a/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py b/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py
index f591e2b65..ffa545abf 100644
--- a/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py
+++ b/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Denoising Diffusion Probabilistic Models (DDPM) Sampling
summary: >
@@ -24,7 +24,7 @@
class DDPMSampler(DiffusionSampler):
- """
+ r"""
## DDPM Sampler
This extends the [`DiffusionSampler` base class](index.html).
@@ -49,7 +49,7 @@ class DDPMSampler(DiffusionSampler):
model: LatentDiffusion
def __init__(self, model: LatentDiffusion):
- """
+ r"""
:param model: is the model to predict noise $\epsilon_\text{cond}(x_t, c)$
"""
super().__init__(model)
@@ -94,7 +94,7 @@ def sample(self,
uncond_cond: Optional[torch.Tensor] = None,
skip_steps: int = 0,
):
- """
+ r"""
### Sampling Loop
:param shape: is the shape of the generated images in the
@@ -139,7 +139,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int,
repeat_noise: bool = False,
temperature: float = 1.,
uncond_scale: float = 1., uncond_cond: Optional[torch.Tensor] = None):
- """
+ r"""
### Sample $x_{t-1}$ from $p_\theta(x_{t-1} | x_t)$
:param x: is $x_t$ of shape `[batch_size, channels, height, width]`
@@ -208,7 +208,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int,
@torch.no_grad()
def q_sample(self, x0: torch.Tensor, index: int, noise: Optional[torch.Tensor] = None):
- """
+ r"""
### Sample from $q(x_t|x_0)$
$$q(x_t|x_0) = \mathcal{N} \Big(x_t; \sqrt{\bar\alpha_t} x_0, (1-\bar\alpha_t) \mathbf{I} \Big)$$
diff --git a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
index ef3aab4d2..8e4fec81c 100644
--- a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
+++ b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py
@@ -26,7 +26,7 @@ class Img2Img:
def __init__(self, *, checkpoint_path: Path,
ddim_steps: int = 50,
ddim_eta: float = 0.0):
- """
+ r"""
:param checkpoint_path: is the path of the checkpoint
:param ddim_steps: is the number of sampling steps
:param ddim_eta: is the [DDIM sampling](../sampler/ddim.html) $\eta$ constant
@@ -54,7 +54,7 @@ def __call__(self, *,
prompt: str,
uncond_scale: float = 5.0,
):
- """
+ r"""
:param dest_path: is the path to store the generated images
:param orig_img: is the image to transform
:param strength: specifies how much of the original image should not be preserved
diff --git a/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py b/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py
index a3504ed80..cdf731dc7 100644
--- a/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py
+++ b/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py
@@ -31,7 +31,7 @@ class InPaint:
def __init__(self, *, checkpoint_path: Path,
ddim_steps: int = 50,
ddim_eta: float = 0.0):
- """
+ r"""
:param checkpoint_path: is the path of the checkpoint
:param ddim_steps: is the number of sampling steps
:param ddim_eta: is the [DDIM sampling](../sampler/ddim.html) $\eta$ constant
@@ -60,7 +60,7 @@ def __call__(self, *,
uncond_scale: float = 5.0,
mask: Optional[torch.Tensor] = None,
):
- """
+ r"""
:param dest_path: is the path to store the generated images
:param orig_img: is the image to transform
:param strength: specifies how much of the original image should not be preserved
diff --git a/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py b/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py
index aee342bbb..30ab64ffe 100644
--- a/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py
+++ b/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py
@@ -33,7 +33,7 @@ def __init__(self, *,
n_steps: int = 50,
ddim_eta: float = 0.0,
):
- """
+ r"""
:param checkpoint_path: is the path of the checkpoint
:param sampler_name: is the name of the [sampler](../sampler/index.html)
:param n_steps: is the number of sampling steps
@@ -62,7 +62,7 @@ def __call__(self, *,
h: int = 512, w: int = 512,
uncond_scale: float = 7.5,
):
- """
+ r"""
:param dest_path: is the path to store the generated images
:param batch_size: is the number of images to generate in a batch
:param prompt: is the prompt to generate images with
diff --git a/labml_nn/distillation/__init__.py b/labml_nn/distillation/__init__.py
index a8d0d11b5..72708117a 100644
--- a/labml_nn/distillation/__init__.py
+++ b/labml_nn/distillation/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Distilling the Knowledge in a Neural Network
summary: >
diff --git a/labml_nn/gan/cycle_gan/__init__.py b/labml_nn/gan/cycle_gan/__init__.py
index 0a78e2613..3bd1c6178 100644
--- a/labml_nn/gan/cycle_gan/__init__.py
+++ b/labml_nn/gan/cycle_gan/__init__.py
@@ -188,7 +188,7 @@ def forward(self, x: torch.Tensor):
def weights_init_normal(m):
- """
+ r"""
Initialize convolution layer weights to $\mathcal{N}(0, 0.2)$
"""
classname = m.__class__.__name__
@@ -436,7 +436,7 @@ def initialize(self):
)
def run(self):
- """
+ r"""
## Training
We aim to solve:
diff --git a/labml_nn/gan/original/__init__.py b/labml_nn/gan/original/__init__.py
index 27eb1c650..dff35f8de 100644
--- a/labml_nn/gan/original/__init__.py
+++ b/labml_nn/gan/original/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Generative Adversarial Networks (GAN)
summary: A simple PyTorch implementation/tutorial of Generative Adversarial Networks (GAN) loss functions.
@@ -38,7 +38,7 @@
class DiscriminatorLogitsLoss(nn.Module):
- """
+ r"""
## Discriminator Loss
Discriminator should **ascend** on the gradient,
@@ -75,7 +75,7 @@ def __init__(self, smoothing: float = 0.2):
self.register_buffer('labels_false', _create_labels(256, 0.0, smoothing), False)
def forward(self, logits_true: torch.Tensor, logits_false: torch.Tensor):
- """
+ r"""
`logits_true` are logits from $D(\pmb{x}^{(i)})$ and
`logits_false` are logits from $D(G(\pmb{z}^{(i)}))$
"""
@@ -91,7 +91,7 @@ def forward(self, logits_true: torch.Tensor, logits_false: torch.Tensor):
class GeneratorLogitsLoss(nn.Module):
- """
+ r"""
## Generator Loss
Generator should **descend** on the gradient,
diff --git a/labml_nn/gan/original/experiment.py b/labml_nn/gan/original/experiment.py
index 71789df71..dbb621795 100644
--- a/labml_nn/gan/original/experiment.py
+++ b/labml_nn/gan/original/experiment.py
@@ -115,7 +115,7 @@ def init(self):
tracker.set_image("generated", True, 1 / 100)
def sample_z(self, batch_size: int):
- """
+ r"""
$$z \sim p(z)$$
"""
return torch.randn(batch_size, 100, device=self.device)
diff --git a/labml_nn/gan/stylegan/__init__.py b/labml_nn/gan/stylegan/__init__.py
index c1c36bbe0..528cee11b 100644
--- a/labml_nn/gan/stylegan/__init__.py
+++ b/labml_nn/gan/stylegan/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: StyleGAN 2
summary: >
@@ -156,7 +156,7 @@
class MappingNetwork(nn.Module):
- """
+ r"""
## Mapping Network
@@ -212,7 +212,7 @@ class Generator(nn.Module):
"""
def __init__(self, log_resolution: int, d_latent: int, n_features: int = 32, max_features: int = 512):
- """
+ r"""
* `log_resolution` is the $\log_2$ of image resolution
* `d_latent` is the dimensionality of $w$
* `n_features` number of features in the convolution layer at the highest resolution (final block)
@@ -276,7 +276,7 @@ def forward(self, w: torch.Tensor, input_noise: List[Tuple[Optional[torch.Tensor
class GeneratorBlock(nn.Module):
- """
+ r"""
### Generator Block
@@ -379,7 +379,7 @@ def forward(self, x: torch.Tensor, w: torch.Tensor, noise: Optional[torch.Tensor
class ToRGB(nn.Module):
- """
+ r"""
### To RGB
@@ -430,7 +430,7 @@ class Conv2dWeightModulate(nn.Module):
def __init__(self, in_features: int, out_features: int, kernel_size: int,
demodulate: float = True, eps: float = 1e-8):
- """
+ r"""
* `in_features` is the number of features in the input feature map
* `out_features` is the number of features in the output feature map
* `kernel_size` is the size of the convolution kernel
@@ -492,7 +492,7 @@ def forward(self, x: torch.Tensor, s: torch.Tensor):
class Discriminator(nn.Module):
- """
+ r"""
## StyleGAN 2 Discriminator
@@ -506,7 +506,7 @@ class Discriminator(nn.Module):
"""
def __init__(self, log_resolution: int, n_features: int = 64, max_features: int = 512):
- """
+ r"""
* `log_resolution` is the $\log_2$ of image resolution
* `n_features` number of features in the convolution layer at the highest resolution (first block)
* `max_features` maximum number of features in any generator block
@@ -561,7 +561,7 @@ def forward(self, x: torch.Tensor):
class DiscriminatorBlock(nn.Module):
- """
+ r"""
### Discriminator Block
@@ -653,7 +653,7 @@ def forward(self, x: torch.Tensor):
class DownSample(nn.Module):
- """
+ r"""
### Down-sample
@@ -677,7 +677,7 @@ def forward(self, x: torch.Tensor):
class UpSample(nn.Module):
- """
+ r"""
### Up-sample
@@ -797,7 +797,7 @@ def forward(self, x: torch.Tensor):
class EqualizedWeight(nn.Module):
- """
+ r"""
## Learning-rate Equalized Weights Parameter
@@ -835,7 +835,7 @@ def forward(self):
class GradientPenalty(nn.Module):
- """
+ r"""
## Gradient Penalty
@@ -851,7 +851,7 @@ class GradientPenalty(nn.Module):
"""
def forward(self, x: torch.Tensor, d: torch.Tensor):
- """
+ r"""
* `x` is $x \sim \mathcal{D}$
* `d` is $D(x)$
"""
@@ -877,7 +877,7 @@ def forward(self, x: torch.Tensor, d: torch.Tensor):
class PathLengthPenalty(nn.Module):
- """
+ r"""
## Path Length Penalty
@@ -901,7 +901,7 @@ class PathLengthPenalty(nn.Module):
"""
def __init__(self, beta: float):
- """
+ r"""
* `beta` is the constant $\beta$ used to calculate the exponential moving average $a$
"""
super().__init__()
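On `EqualizedWeight` above: StyleGAN 2 keeps the stored parameter at $\mathcal{N}(0, 1)$ and multiplies it by the He constant $c = 1 / \sqrt{\text{fan in}}$ on every forward pass, so all layers see gradients at the same scale. A sketch with illustrative shapes:

```python
import math

import torch

out_c, in_c, k = 64, 32, 3
weight = torch.randn(out_c, in_c, k, k)   # stored at N(0, 1)
c = 1 / math.sqrt(in_c * k * k)           # He initialization constant
w_effective = weight * c                  # applied at every forward pass
```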
diff --git a/labml_nn/gan/stylegan/experiment.py b/labml_nn/gan/stylegan/experiment.py
index 7a33aba9b..621e8f859 100644
--- a/labml_nn/gan/stylegan/experiment.py
+++ b/labml_nn/gan/stylegan/experiment.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: StyleGAN 2 Model Training
summary: >
diff --git a/labml_nn/gan/wasserstein/__init__.py b/labml_nn/gan/wasserstein/__init__.py
index d28c6283f..8f115b439 100644
--- a/labml_nn/gan/wasserstein/__init__.py
+++ b/labml_nn/gan/wasserstein/__init__.py
@@ -108,7 +108,7 @@ def forward(self, f_real: torch.Tensor, f_fake: torch.Tensor):
-        This returns the a tuple with losses for $f_w(x)$ and $f_w(g_\theta(z))$,
+        This returns a tuple with losses for $f_w(x)$ and $f_w(g_\theta(z))$,
which are later added.
They are kept separate for logging.
         """
# We use ReLUs to clip the loss to keep $f \in [-1, +1]$ range.
return F.relu(1 - f_real).mean(), F.relu(1 + f_fake).mean()
diff --git a/labml_nn/graphs/gat/__init__.py b/labml_nn/graphs/gat/__init__.py
index 81ae9eaca..b8eb71ce2 100644
--- a/labml_nn/graphs/gat/__init__.py
+++ b/labml_nn/graphs/gat/__init__.py
@@ -30,7 +30,7 @@
class GraphAttentionLayer(nn.Module):
- """
+ r"""
## Graph attention layer
This is a single graph attention layer.
@@ -82,7 +82,7 @@ def __init__(self, in_features: int, out_features: int, n_heads: int,
self.dropout = nn.Dropout(dropout)
def forward(self, h: torch.Tensor, adj_mat: torch.Tensor):
- """
+ r"""
* `h`, $\mathbf{h}$ is the input node embeddings of shape `[n_nodes, in_features]`.
* `adj_mat` is the adjacency matrix of shape `[n_nodes, n_nodes, n_heads]`.
We use shape `[n_nodes, n_nodes, 1]` since the adjacency is the same for each head.
diff --git a/labml_nn/graphs/gatv2/__init__.py b/labml_nn/graphs/gatv2/__init__.py
index f306bb0be..60efa9656 100644
--- a/labml_nn/graphs/gatv2/__init__.py
+++ b/labml_nn/graphs/gatv2/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Graph Attention Networks v2 (GATv2)
summary: >
@@ -60,7 +60,7 @@
class GraphAttentionV2Layer(nn.Module):
- """
+ r"""
## Graph attention v2 layer
This is a single graph attention v2 layer.
A GATv2 is made up of multiple such layers.
@@ -119,7 +119,7 @@ def __init__(self, in_features: int, out_features: int, n_heads: int,
self.dropout = nn.Dropout(dropout)
def forward(self, h: torch.Tensor, adj_mat: torch.Tensor):
- """
+ r"""
* `h`, $\mathbf{h}$ is the input node embeddings of shape `[n_nodes, in_features]`.
* `adj_mat` is the adjacency matrix of shape `[n_nodes, n_nodes, n_heads]`.
We use shape `[n_nodes, n_nodes, 1]` since the adjacency is the same for each head.
diff --git a/labml_nn/hypernetworks/hyper_lstm.py b/labml_nn/hypernetworks/hyper_lstm.py
index 917baf4d1..ccd84ed42 100644
--- a/labml_nn/hypernetworks/hyper_lstm.py
+++ b/labml_nn/hypernetworks/hyper_lstm.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: HyperNetworks - HyperLSTM
summary: A PyTorch implementation/tutorial of HyperLSTM introduced in paper HyperNetworks.
@@ -223,7 +223,7 @@ def __init__(self, input_size: int, hidden_size: int, hyper_size: int, n_z: int,
def forward(self, x: torch.Tensor,
state: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = None):
- """
+ r"""
* `x` has shape `[n_steps, batch_size, input_size]` and
* `state` is a tuple of $h, c, \hat{h}, \hat{c}$.
$h, c$ have shape `[batch_size, hidden_size]` and
diff --git a/labml_nn/lora/__init__.py b/labml_nn/lora/__init__.py
index bd3c42c7b..355707337 100644
--- a/labml_nn/lora/__init__.py
+++ b/labml_nn/lora/__init__.py
@@ -26,7 +26,7 @@
class Linear(nn.Module):
- """
+ r"""
## LoRA Linear Layer
LoRA linear layer adds a low-rank decomposition to the pre-trained
@@ -48,7 +48,7 @@ class Linear(nn.Module):
def __init__(self, in_features: int, out_features: int, bias: bool,
r: int, alpha: int = None):
- """
+ r"""
:param in_features: is the number of input features of the linear layer
:param out_features: is the number of output features of the linear layer
:param bias: is a flag indicating if there is a bias parameter
@@ -99,7 +99,7 @@ def forward(self, x: torch.Tensor):
class Embedding(nn.Module):
- """
+ r"""
## LoRA Embedding Layer
Similar to LoRA linear layer this adds a low-rank decomposition to the pre-trained
@@ -110,7 +110,7 @@ class Embedding(nn.Module):
def __init__(self, num_embeddings: int, embedding_dim: int,
r: int, alpha: int = None):
- """
+ r"""
:param num_embeddings: is the number of embeddings
-    :param embedding_dim: is the number embedding dimensions
+    :param embedding_dim: is the number of embedding dimensions
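For readers of the LoRA hunks: the update both layers add is the paper's $W_0 x + \frac{\alpha}{r} B A x$, with $B$ zero-initialized so training starts from the pre-trained behaviour. A minimal sketch with illustrative shapes (not the module's internals):

```python
import torch

d_in, d_out, r, alpha = 64, 64, 4, 4
w0 = torch.randn(d_out, d_in)       # frozen pre-trained weight
a = torch.randn(r, d_in)            # trainable low-rank factor A
b = torch.zeros(d_out, r)           # trainable factor B, zero-initialized

x = torch.randn(8, d_in)
h = x @ w0.T + (alpha / r) * (x @ a.T @ b.T)  # == x @ w0.T at initialization
print(h.shape)  # torch.Size([8, 64])
```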
diff --git a/labml_nn/lstm/__init__.py b/labml_nn/lstm/__init__.py
index 29edba75c..6d76b6378 100644
--- a/labml_nn/lstm/__init__.py
+++ b/labml_nn/lstm/__init__.py
@@ -17,7 +17,7 @@
class LSTMCell(nn.Module):
- """
+ r"""
## Long Short-Term Memory Cell
LSTM Cell computes $c$, and $h$. $c$ is like the long-term memory,
diff --git a/labml_nn/neox/model.py b/labml_nn/neox/model.py
index 295afd05d..1e813a89c 100644
--- a/labml_nn/neox/model.py
+++ b/labml_nn/neox/model.py
@@ -73,7 +73,7 @@ class RoPE(nn.Module):
"""
def __init__(self, d_rope: int, base: float = 10_000.):
- """
+ r"""
:param d_rope: is the number of features for RoPE embeddings
:param base: is the base for $\theta_i = 10000^{\frac{2(i-1)}{d}}$, which defaults to $10000$
"""
@@ -92,7 +92,7 @@ def __init__(self, d_rope: int, base: float = 10_000.):
@staticmethod
def rotate_half(x: torch.Tensor):
- """
+ r"""
### Rotate the features
-        $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., -x^{(\frac{d}{2})}]$
+        $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., x^{(\frac{d}{2})}]$
@@ -101,7 +101,7 @@ def rotate_half(x: torch.Tensor):
return torch.cat((-x2, x1), dim=-1)
def forward(self, x: torch.Tensor, offset: int = 0):
- """
+ r"""
:param x: has shape `[..., seq, n_heads, d_k]`
:param offset: is the starting position of `x`. This is $\gt 0$ when we have
cached the keys and queries of previous positions
@@ -513,7 +513,7 @@ def __init__(self, *, n_vocab: int = 50_432, n_hidden: int = 6_144,
llm_int8_threshold: float = 6.0,
is_flash_attention: bool = False
):
- """
+ r"""
### Generator to create layers
The layers are generated in the same order as checkpoints.
@@ -571,7 +571,7 @@ def post_load_prepare(self, layer: NeoXModule, *,
device: torch.device = None,
llm_int8_threshold: float = None,
):
- """
+ r"""
### Layer transformations after loading the checkpoint
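A self-contained reconstruction of `rotate_half` above, since the hunk only shows its return statement; the split into halves is assumed from the docstring:

```python
import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    x1, x2 = x.chunk(2, dim=-1)          # x^(1..d/2) and x^(d/2+1..d)
    return torch.cat((-x2, x1), dim=-1)

print(rotate_half(torch.tensor([1., 2., 3., 4.])))  # tensor([-3., -4., 1., 2.])
```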
diff --git a/labml_nn/neox/utils/llm_int8.py b/labml_nn/neox/utils/llm_int8.py
index cd8420855..47c349233 100644
--- a/labml_nn/neox/utils/llm_int8.py
+++ b/labml_nn/neox/utils/llm_int8.py
@@ -41,7 +41,7 @@
def make_llm_int8_linear(linear_module: nn.Linear, device: torch.device, threshold: float = 6.0):
- """
+ r"""
## Transform a `nn.Linear` layer to LLM.int8() linear layer
:param linear_module: is the `nn.Linear` layer to transform
diff --git a/labml_nn/normalization/batch_channel_norm/__init__.py b/labml_nn/normalization/batch_channel_norm/__init__.py
index 846361087..4a16dfe2f 100644
--- a/labml_nn/normalization/batch_channel_norm/__init__.py
+++ b/labml_nn/normalization/batch_channel_norm/__init__.py
@@ -40,7 +40,7 @@ class BatchChannelNorm(nn.Module):
def __init__(self, channels: int, groups: int,
eps: float = 1e-5, momentum: float = 0.1, estimate: bool = True):
- """
+ r"""
* `channels` is the number of features in the input
* `groups` is the number of groups the features are divided into
* `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability
@@ -66,7 +66,7 @@ def forward(self, x):
class EstimatedBatchNorm(nn.Module):
- """
+ r"""
## Estimated Batch Normalization
When input $X \in \mathbb{R}^{B \times C \times H \times W}$ is a batch of image representations,
@@ -88,7 +88,7 @@ class EstimatedBatchNorm(nn.Module):
"""
def __init__(self, channels: int,
eps: float = 1e-5, momentum: float = 0.1, affine: bool = True):
- """
+ r"""
* `channels` is the number of features in the input
* `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability
* `momentum` is the momentum in taking the exponential moving average
@@ -174,7 +174,7 @@ class ChannelNorm(nn.Module):
def __init__(self, channels, groups,
eps: float = 1e-5, affine: bool = True):
- """
+ r"""
* `groups` is the number of groups the features are divided into
* `channels` is the number of features in the input
* `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability
diff --git a/labml_nn/normalization/batch_norm/__init__.py b/labml_nn/normalization/batch_norm/__init__.py
index 1471b807f..8001f1bb7 100644
--- a/labml_nn/normalization/batch_norm/__init__.py
+++ b/labml_nn/normalization/batch_norm/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Batch Normalization
summary: >
@@ -138,7 +138,7 @@ def __init__(self, channels: int, *,
* `track_running_stats` is whether to calculate the moving averages or mean and variance
We've tried to use the same names for arguments as PyTorch `BatchNorm` implementation.
     """
super().__init__()
self.channels = channels
@@ -163,7 +163,7 @@ def forward(self, x: torch.Tensor):
`*` denotes any number of (possibly 0) dimensions.
For example, in an image (2D) convolution this will be
`[batch_size, channels, height, width]`
         """
# Keep the original shape
x_shape = x.shape
# Get the batch size
diff --git a/labml_nn/normalization/deep_norm/__init__.py b/labml_nn/normalization/deep_norm/__init__.py
index fcec5bd82..414a84398 100644
--- a/labml_nn/normalization/deep_norm/__init__.py
+++ b/labml_nn/normalization/deep_norm/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: DeepNorm
summary: >
@@ -82,7 +82,7 @@
class DeepNorm(nn.Module):
- """
+ r"""
## DeepNorm Normalization
$$x_{l + 1} = \mathop{LN}\Big( \alpha x_l + \mathop{G}_l \big(x_l, \theta_l \big)\Big)$$
@@ -91,7 +91,7 @@ class DeepNorm(nn.Module):
def __init__(self, alpha: float, normalized_shape: Union[int, List[int], Size], *,
eps: float = 1e-5,
elementwise_affine: bool = True):
- """
+ r"""
:param alpha: is $\alpha$
:param normalized_shape: is the shape for LayerNorm $\mathop{LN}$
:param eps: is $\epsilon$ for LayerNorm
@@ -104,7 +104,7 @@ def __init__(self, alpha: float, normalized_shape: Union[int, List[int], Size],
self.layer_norm = LayerNorm(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
def forward(self, x: torch.Tensor, gx: torch.Tensor):
- """
+ r"""
:param x: is the output from the previous layer $x_l$
:param gx: is the output of the current sub-layer $\mathop{G}_l (x_l, \theta_l)$
"""
@@ -126,7 +126,7 @@ def __init__(self, *,
deep_norm_alpha: float,
deep_norm_beta: float,
):
- """
+ r"""
:param d_model: is the token embedding size
:param self_attn: is the self attention module
:param feed_forward: is the feed forward module
diff --git a/labml_nn/normalization/deep_norm/experiment.py b/labml_nn/normalization/deep_norm/experiment.py
index 9fcdadc64..1819453e1 100644
--- a/labml_nn/normalization/deep_norm/experiment.py
+++ b/labml_nn/normalization/deep_norm/experiment.py
@@ -89,7 +89,7 @@ class Configs(NLPAutoRegressionConfigs):
@option(Configs.deep_norm_alpha)
def _deep_norm_alpha(c: Configs):
- """
+ r"""
#### Calculate $\alpha$
$\alpha = (2M)^{\frac{1}{4}}$
@@ -99,7 +99,7 @@ def _deep_norm_alpha(c: Configs):
@option(Configs.deep_norm_beta)
def _deep_norm_beta(c: Configs):
- """
+ r"""
#### Calculate $\beta$
$\beta = (8M)^{-\frac{1}{4}}$
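The two config options above are pure functions of the depth. A sketch, assuming $M$ is the number of layers as in the DeepNet paper:

```python
def deep_norm_alpha(m: int) -> float:
    return (2 * m) ** 0.25        # alpha = (2M)^(1/4)

def deep_norm_beta(m: int) -> float:
    return (8 * m) ** -0.25       # beta = (8M)^(-1/4)

print(deep_norm_alpha(64), deep_norm_beta(64))
```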
diff --git a/labml_nn/normalization/group_norm/__init__.py b/labml_nn/normalization/group_norm/__init__.py
index e9a87ac92..3c20840ff 100644
--- a/labml_nn/normalization/group_norm/__init__.py
+++ b/labml_nn/normalization/group_norm/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Group Normalization
summary: >
@@ -98,7 +98,7 @@ def __init__(self, groups: int, channels: int, *,
* `channels` is the number of features in the input
* `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability
* `affine` is whether to scale and shift the normalized value
     """
super().__init__()
assert channels % groups == 0, "Number of channels should be evenly divisible by the number of groups"
@@ -118,7 +118,7 @@ def forward(self, x: torch.Tensor):
`*` denotes any number of (possibly 0) dimensions.
For example, in an image (2D) convolution this will be
`[batch_size, channels, height, width]`
         """
# Keep the original shape
x_shape = x.shape
# Get the batch size
diff --git a/labml_nn/normalization/instance_norm/__init__.py b/labml_nn/normalization/instance_norm/__init__.py
index c7db3adc9..937d69c80 100644
--- a/labml_nn/normalization/instance_norm/__init__.py
+++ b/labml_nn/normalization/instance_norm/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Instance Normalization
summary: >
@@ -53,7 +53,7 @@ def __init__(self, channels: int, *,
* `channels` is the number of features in the input
* `eps` is $\epsilon$, used in $\sqrt{Var[X] + \epsilon}$ for numerical stability
* `affine` is whether to scale and shift the normalized value
         """
super().__init__()
self.channels = channels
@@ -71,7 +71,7 @@ def forward(self, x: torch.Tensor):
`*` denotes any number of (possibly 0) dimensions.
For example, in an image (2D) convolution this will be
`[batch_size, channels, height, width]`
         """
# Keep the original shape
x_shape = x.shape
# Get the batch size
diff --git a/labml_nn/normalization/layer_norm/__init__.py b/labml_nn/normalization/layer_norm/__init__.py
index 0d5ca8116..a0a004f57 100644
--- a/labml_nn/normalization/layer_norm/__init__.py
+++ b/labml_nn/normalization/layer_norm/__init__.py
@@ -79,7 +79,7 @@ def __init__(self, normalized_shape: Union[int, List[int], Size], *,
* `elementwise_affine` is whether to scale and shift the normalized value
We've tried to use the same names for arguments as PyTorch `LayerNorm` implementation.
     """
super().__init__()
# Convert `normalized_shape` to `torch.Size`
@@ -104,7 +104,7 @@ def forward(self, x: torch.Tensor):
`*` could be any number of dimensions.
For example, in an NLP task this will be
`[seq_len, batch_size, features]`
     """
# Sanity check to make sure the shapes match
assert self.normalized_shape == x.shape[-len(self.normalized_shape):]
diff --git a/labml_nn/normalization/weight_standardization/__init__.py b/labml_nn/normalization/weight_standardization/__init__.py
index 2fb3009b5..8165f9a9a 100644
--- a/labml_nn/normalization/weight_standardization/__init__.py
+++ b/labml_nn/normalization/weight_standardization/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Weight Standardization
summary: >
diff --git a/labml_nn/optimizers/__init__.py b/labml_nn/optimizers/__init__.py
index 172854a83..6cd5361f4 100644
--- a/labml_nn/optimizers/__init__.py
+++ b/labml_nn/optimizers/__init__.py
@@ -73,7 +73,7 @@ class GenericAdaptiveOptimizer(Optimizer):
"""
def __init__(self, params, defaults: Dict[str, Any], lr: float, betas: Tuple[float, float], eps: float):
- """
+ r"""
### Initialize
* `params` is the collection of parameters or set of parameter groups.
@@ -109,7 +109,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
pass
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.Tensor):
- """
+ r"""
### Take optimizer step on a parameter tensor
This should be overridden and take the optimization step on `param` tensor $\theta$,
diff --git a/labml_nn/optimizers/ada_belief.py b/labml_nn/optimizers/ada_belief.py
index d33b1b4a8..b3d33fab7 100644
--- a/labml_nn/optimizers/ada_belief.py
+++ b/labml_nn/optimizers/ada_belief.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: AdaBelief optimizer
summary: A simple PyTorch implementation/tutorial of AdaBelief optimizer.
@@ -53,7 +53,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
weight_decay: WeightDecay = WeightDecay(), amsgrad=False,
degenerate_to_sgd=True,
rectify=True, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -75,7 +75,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
self.rectify = rectify
def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
- """
+ r"""
### Initialize a parameter state
* `state` is the optimizer state of the parameter (tensor)
@@ -95,7 +95,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
state['max_exp_avg_var'] = torch.zeros_like(param, memory_format=torch.preserve_format)
def get_ms(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tensor):
- """
+ r"""
### Calculate $m_t$ and $s_t$ or $\max(s_1, s_2, ..., s_{t-1}, s_t)$
* `state` is the optimizer state of the parameter (tensor)
@@ -131,7 +131,7 @@ def get_ms(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tenso
return m, s
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter):
- """
+ r"""
### Take an update step for a given parameter tensor
* `state` is the optimizer state of the parameter (tensor)
diff --git a/labml_nn/optimizers/adam.py b/labml_nn/optimizers/adam.py
index 568be4d55..24b115dc8 100644
--- a/labml_nn/optimizers/adam.py
+++ b/labml_nn/optimizers/adam.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Adam Optimizer
summary: A simple PyTorch implementation/tutorial of Adam optimizer
@@ -60,7 +60,7 @@ def __init__(self, params,
weight_decay: WeightDecay = WeightDecay(),
optimized_update: bool = True,
defaults: Optional[Dict[str, Any]] = None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -81,7 +81,7 @@ def __init__(self, params,
self.optimized_update = optimized_update
def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
- """
+ r"""
### Initialize a parameter state
* `state` is the optimizer state of the parameter (tensor)
@@ -97,7 +97,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
state['exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
def get_mv(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tensor):
- """
+ r"""
-        ### Calculate $m_t$ and and $v_t$
+        ### Calculate $m_t$ and $v_t$
* `state` is the optimizer state of the parameter (tensor)
@@ -121,7 +121,7 @@ def get_mv(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tenso
return m, v
def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
- """
+ r"""
### Get learning-rate
This returns the modified learning rate based on the state.
@@ -132,7 +132,7 @@ def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
def adam_update(self, state: Dict[str, any], group: Dict[str, any], param: torch.nn.Parameter,
m: torch.Tensor, v: torch.Tensor):
- """
+ r"""
### Do the *Adam* parameter update
* `state` is the optimizer state of the parameter (tensor)
@@ -192,7 +192,7 @@ def adam_update(self, state: Dict[str, any], group: Dict[str, any], param: torch
param.data.addcdiv_(m, denominator, value=-step_size)
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter):
- """
+ r"""
### Take an update step for a given parameter tensor
* `state` is the optimizer state of the parameter (tensor)
diff --git a/labml_nn/optimizers/adam_fp16.py b/labml_nn/optimizers/adam_fp16.py
index 1b36135f8..36e55435c 100644
--- a/labml_nn/optimizers/adam_fp16.py
+++ b/labml_nn/optimizers/adam_fp16.py
@@ -35,7 +35,7 @@ def __init__(self, params, lr: float = 1e-3, betas: Tuple[float, float] = (0.9,
super().__init__(params, lr, betas, eps, weight_decay, optimized_update, defaults)
def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
- """
+ r"""
### Initialize a parameter state
* `state` is the optimizer state of the parameter (tensor)
@@ -55,7 +55,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
state['fp32_copy'] = param.to(torch.float)
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter):
- """
+ r"""
### Take an update step for a given parameter tensor
* `state` is the optimizer state of the parameter (tensor)
diff --git a/labml_nn/optimizers/adam_warmup.py b/labml_nn/optimizers/adam_warmup.py
index fb73d1529..92cd9ff9a 100644
--- a/labml_nn/optimizers/adam_warmup.py
+++ b/labml_nn/optimizers/adam_warmup.py
@@ -25,7 +25,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
weight_decay: WeightDecay = WeightDecay(),
optimized_update: bool = True,
amsgrad=False, warmup=0, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -46,7 +46,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults)
def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
- """
+ r"""
### Get learning-rate
$$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$
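`get_lr` above implements $\alpha \min \big(1, \frac{t}{w}\big)$. A sketch with illustrative names, where `t` is the optimizer step and `w` the number of warmup steps:

```python
def warmup_lr(alpha: float, t: int, w: int) -> float:
    # linear warmup to the base learning rate over w steps
    return alpha * min(1.0, t / w)

print([warmup_lr(1e-3, t, 100) for t in (1, 50, 100, 200)])
```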
diff --git a/labml_nn/optimizers/adam_warmup_cosine_decay.py b/labml_nn/optimizers/adam_warmup_cosine_decay.py
index 037f1b4cf..6358f4bcc 100644
--- a/labml_nn/optimizers/adam_warmup_cosine_decay.py
+++ b/labml_nn/optimizers/adam_warmup_cosine_decay.py
@@ -28,7 +28,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
weight_decay: WeightDecay = WeightDecay(),
optimized_update: bool = True,
amsgrad=False, warmup=0, total_steps=1e10, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -51,7 +51,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults)
def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
- """
+ r"""
### Get learning-rate
$$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$
diff --git a/labml_nn/optimizers/amsgrad.py b/labml_nn/optimizers/amsgrad.py
index 07658e09b..5d9971f6b 100644
--- a/labml_nn/optimizers/amsgrad.py
+++ b/labml_nn/optimizers/amsgrad.py
@@ -36,7 +36,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
weight_decay: WeightDecay = WeightDecay(),
optimized_update: bool = True,
amsgrad=True, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -56,7 +56,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
super().__init__(params, lr, betas, eps, weight_decay, optimized_update, defaults)
def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
- """
+ r"""
### Initialize a parameter state
* `state` is the optimizer state of the parameter (tensor)
@@ -73,7 +73,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
state['max_exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format)
def get_mv(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor):
- """
+ r"""
-        ### Calculate $m_t$ and and $v_t$ or $\max(v_1, v_2, ..., v_{t-1}, v_t)$
+        ### Calculate $m_t$ and $v_t$ or $\max(v_1, v_2, ..., v_{t-1}, v_t)$
* `state` is the optimizer state of the parameter (tensor)
@@ -109,7 +109,7 @@ def get_mv(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tenso
def _synthetic_experiment(is_adam: bool):
- """
+ r"""
## Synthetic Experiment
This is the synthetic experiment described in the paper,
diff --git a/labml_nn/optimizers/noam.py b/labml_nn/optimizers/noam.py
index 8443f881c..26450311f 100644
--- a/labml_nn/optimizers/noam.py
+++ b/labml_nn/optimizers/noam.py
@@ -29,7 +29,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
optimized_update: bool = True,
amsgrad=False,
warmup=0, d_model=512, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -52,7 +52,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16,
self.d_model = d_model
def get_lr(self, state: Dict[str, any], group: Dict[str, any]):
- """
+ r"""
### Get learning-rate
$$\alpha \frac{1}{\sqrt{d_{model}}} \min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$
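For the Noam schedule above, the two branches of the `min` cross exactly at $t = w$, which is where the learning rate peaks. A sketch with illustrative names:

```python
def noam_lr(alpha: float, t: int, w: int, d_model: int = 512) -> float:
    return alpha * d_model ** -0.5 * min(t ** -0.5, t * w ** -1.5)

print(noam_lr(1.0, 4000, w=4000))  # the schedule peaks at t == w
```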
diff --git a/labml_nn/optimizers/radam.py b/labml_nn/optimizers/radam.py
index 3e384c4d4..bd718e370 100644
--- a/labml_nn/optimizers/radam.py
+++ b/labml_nn/optimizers/radam.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Rectified Adam (RAdam) optimizer
summary: A simple PyTorch implementation/tutorial of RAdam optimizer.
@@ -157,7 +157,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
optimized_update: bool = True,
amsgrad=False,
degenerated_to_sgd=True, defaults=None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -176,7 +176,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults)
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter):
- """
+ r"""
### Take an update step for a given parameter tensor
* `state` is the optimizer state of the parameter (tensor)
@@ -221,7 +221,7 @@ def calc_rectification_term(beta2: float, step: int) -> Optional[float]:
def r_adam_update(self, state: Dict[str, any], group: Dict[str, any], param: torch.nn.Parameter,
m: torch.Tensor, v: torch.Tensor):
- """
+ r"""
### Do the *RAdam* parameter update
* `state` is the optimizer state of the parameter (tensor)
@@ -274,7 +274,7 @@ def r_adam_update(self, state: Dict[str, any], group: Dict[str, any], param: tor
def _test_rectification_term():
- """
+ r"""
### Plot $r_t$ against $t$ for various $\beta_2$
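The rectification term being plotted can be sketched directly from the RAdam paper; the $\rho_t \ge 5$ guard (below which the variance is treated as intractable) follows common implementations:

```python
def rectification_term(beta2: float, step: int):
    # Maximum length of the approximated simple moving average
    rho_inf = 2.0 / (1.0 - beta2) - 1.0
    # Length of the approximated SMA at this step
    beta2_t = beta2 ** step
    rho = rho_inf - 2.0 * step * beta2_t / (1.0 - beta2_t)
    # The rectification term is only applied when the variance is tractable
    if rho < 5.0:
        return None
    r2 = ((rho - 4) * (rho - 2) * rho_inf) / ((rho_inf - 4) * (rho_inf - 2) * rho)
    return r2 ** 0.5
```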

diff --git a/labml_nn/optimizers/sophia.py b/labml_nn/optimizers/sophia.py
index 2aa58f426..12d43c2a1 100644
--- a/labml_nn/optimizers/sophia.py
+++ b/labml_nn/optimizers/sophia.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Sophia Optimizer
summary: A simple PyTorch implementation/tutorial of Sophia optimizer
@@ -72,7 +72,7 @@ def __init__(self, params,
rho: float = 0.03,
weight_decay: WeightDecay = WeightDecay(),
defaults: Optional[Dict[str, Any]] = None):
- """
+ r"""
### Initialize the optimizer
* `params` is the list of parameters
@@ -92,7 +92,7 @@ def __init__(self, params,
self.weight_decay = weight_decay
def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter):
- """
+ r"""
### Initialize a parameter state
* `state` is the optimizer state of the parameter (tensor)
@@ -108,7 +108,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par
state['hessian'] = torch.zeros_like(param, memory_format=torch.preserve_format)
def update_hessian(self, n_tokens_training_batch):
- """
+ r"""
### Update the EMA of Hessian diagonal $h_t$
* `n_tokens_training_batch` is the number of tokens/inputs in the batch $B$
@@ -145,7 +145,7 @@ def update_hessian(self, n_tokens_training_batch):
state['hessian'].mul_(beta2).addcmul_(p.grad, p.grad, value=(1 - beta2) * n_tokens_training_batch)
def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter):
- """
+ r"""
### Take an update step for a given parameter tensor
* `state` is the optimizer state of the parameter (tensor)
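Putting the pieces of this file together, the per-parameter step is a pre-conditioned momentum update with element-wise clipping; a sketch under the assumption that the denominator uses $\rho \cdot B \cdot h_t$, matching the scaling in `update_hessian` above:

```python
import torch

def sophia_step(param: torch.Tensor, m: torch.Tensor, hessian: torch.Tensor,
                lr: float, rho: float, batch_size: int, eps: float = 1e-12):
    # theta <- theta - lr * clip(m / (rho * B * h + eps), 1)
    ratio = (m / (rho * batch_size * hessian + eps)).clamp(-1.0, 1.0)
    param.data.add_(ratio, alpha=-lr)
```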
diff --git a/labml_nn/recurrent_highway_networks/__init__.py b/labml_nn/recurrent_highway_networks/__init__.py
index f1e8b3b76..0f9590c02 100644
--- a/labml_nn/recurrent_highway_networks/__init__.py
+++ b/labml_nn/recurrent_highway_networks/__init__.py
@@ -16,7 +16,7 @@
class RHNCell(nn.Module):
- """
+ r"""
## Recurrent Highway Network Cell
This implements equations $(6) - (9)$.
diff --git a/labml_nn/resnet/__init__.py b/labml_nn/resnet/__init__.py
index bd085c470..960ecf2a3 100644
--- a/labml_nn/resnet/__init__.py
+++ b/labml_nn/resnet/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Deep Residual Learning for Image Recognition (ResNet)
summary: >
@@ -67,7 +67,7 @@ class ShortcutProjection(nn.Module):
"""
def __init__(self, in_channels: int, out_channels: int, stride: int):
- """
+ r"""
* `in_channels` is the number of channels in $x$
* `out_channels` is the number of channels in $\mathcal{F}(x, \{W_i\})$
* `stride` is the stride length in the convolution operation for $F$.
@@ -86,7 +86,7 @@ def forward(self, x: torch.Tensor):
class ResidualBlock(nn.Module):
- """
+ r"""
## Residual Block
@@ -153,7 +153,7 @@ def forward(self, x: torch.Tensor):
class BottleneckResidualBlock(nn.Module):
- """
+ r"""
## Bottleneck Residual Block
@@ -181,7 +181,7 @@ class BottleneckResidualBlock(nn.Module):
"""
def __init__(self, in_channels: int, bottleneck_channels: int, out_channels: int, stride: int):
- """
+ r"""
* `in_channels` is the number of channels in $x$
* `bottleneck_channels` is the number of channels for the $3 \times 3$ convolution
* `out_channels` is the number of output channels
diff --git a/labml_nn/rl/dqn/__init__.py b/labml_nn/rl/dqn/__init__.py
index 048bda113..8320dc214 100644
--- a/labml_nn/rl/dqn/__init__.py
+++ b/labml_nn/rl/dqn/__init__.py
@@ -31,7 +31,7 @@
class QFuncLoss(nn.Module):
- """
+ r"""
## Train the model
We want to find the optimal action-value function.
@@ -106,7 +106,7 @@ def __init__(self, gamma: float):
def forward(self, q: torch.Tensor, action: torch.Tensor, double_q: torch.Tensor,
target_q: torch.Tensor, done: torch.Tensor, reward: torch.Tensor,
weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
- """
+ r"""
* `q` - $Q(s;\theta_i)$
* `action` - $a$
* `double_q` - $\textcolor{cyan}Q(s';\textcolor{cyan}{\theta_i})$
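The tensors named in this hunk combine into the standard double Q-learning target; a sketch with assumed shapes `[batch_size, n_actions]` for the Q tensors:

```python
import torch

def double_q_target(reward, done, double_q, target_q, gamma: float):
    # Choose a* = argmax_a Q(s'; theta_i) with the online network ...
    best_action = torch.argmax(double_q, dim=-1)
    # ... but evaluate it with the target network
    q_value = target_q.gather(-1, best_action.unsqueeze(-1)).squeeze(-1)
    # r + gamma * Q(s', a*; target), zeroed where the episode ended
    return reward + gamma * q_value * (1 - done.float())
```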
diff --git a/labml_nn/rl/dqn/experiment.py b/labml_nn/rl/dqn/experiment.py
index 2a3af4381..19be0f414 100644
--- a/labml_nn/rl/dqn/experiment.py
+++ b/labml_nn/rl/dqn/experiment.py
@@ -107,7 +107,7 @@ def __init__(self, *,
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=2.5e-4)
def _sample_action(self, q_value: torch.Tensor, exploration_coefficient: float):
- """
+ r"""
#### $\epsilon$-greedy Sampling
When sampling actions we use an $\epsilon$-greedy strategy, where we
take a greedy action with probability $1 - \epsilon$ and
a random action with probability $\epsilon$.
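A minimal sketch of that sampling rule (assumed shape `[batch_size, n_actions]` for `q_value`):

```python
import torch

def sample_action(q_value: torch.Tensor, epsilon: float) -> torch.Tensor:
    # Greedy actions from the Q values
    greedy = torch.argmax(q_value, dim=-1)
    # Uniformly random actions
    random_actions = torch.randint(q_value.shape[-1], greedy.shape)
    # Take the random action with probability epsilon
    explore = torch.rand(greedy.shape) < epsilon
    return torch.where(explore, random_actions, greedy)
```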
diff --git a/labml_nn/rl/dqn/model.py b/labml_nn/rl/dqn/model.py
index 6dbe2e081..3b15f03c0 100644
--- a/labml_nn/rl/dqn/model.py
+++ b/labml_nn/rl/dqn/model.py
@@ -15,7 +15,7 @@
class Model(nn.Module):
- """
+ r"""
## Dueling Network ⚔️ Model for $Q$ Values
We are using a [dueling network](https://arxiv.org/abs/1511.06581)
diff --git a/labml_nn/rl/dqn/replay_buffer.py b/labml_nn/rl/dqn/replay_buffer.py
index 966bfcbb6..ad3882e7b 100644
--- a/labml_nn/rl/dqn/replay_buffer.py
+++ b/labml_nn/rl/dqn/replay_buffer.py
@@ -18,7 +18,7 @@
class ReplayBuffer:
- """
+ r"""
## Buffer for Prioritized Experience Replay
[Prioritized experience replay](https://arxiv.org/abs/1511.05952)
@@ -180,7 +180,7 @@ def _set_priority_sum(self, idx, priority):
self.priority_sum[idx] = self.priority_sum[2 * idx] + self.priority_sum[2 * idx + 1]
def _sum(self):
- """
+ r"""
#### $\sum_k p_k^\alpha$
"""
@@ -188,7 +188,7 @@ def _sum(self):
return self.priority_sum[1]
def _min(self):
- """
+ r"""
#### $\min_k p_k^\alpha$
"""
@@ -196,7 +196,7 @@ def _min(self):
return self.priority_min[1]
def find_prefix_sum_idx(self, prefix_sum):
- """
+ r"""
#### Find largest $i$ such that $\sum_{k=1}^{i} p_k^\alpha \le P$
"""
diff --git a/labml_nn/rl/ppo/__init__.py b/labml_nn/rl/ppo/__init__.py
index 2b878d1d4..ca5b31edb 100644
--- a/labml_nn/rl/ppo/__init__.py
+++ b/labml_nn/rl/ppo/__init__.py
@@ -31,7 +31,7 @@
class ClippedPPOLoss(nn.Module):
- """
+ r"""
## PPO Loss
Here's how the PPO update rule is derived.
@@ -179,7 +179,7 @@ def forward(self, log_pi: torch.Tensor, sampled_log_pi: torch.Tensor,
class ClippedValueFunctionLoss(nn.Module):
- """
+ r"""
## Clipped Value Function Loss
Similarly, we also clip the value function update.
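The clipped policy surrogate reduces to a few lines (the value loss clips analogously); a minimal sketch:

```python
import torch

def clipped_ppo_loss(log_pi, sampled_log_pi, advantage, clip: float):
    # r_t(theta) = pi(a|s) / pi_old(a|s), computed in log space
    ratio = torch.exp(log_pi - sampled_log_pi)
    # Pessimistic minimum of the clipped and unclipped objectives
    clipped = ratio.clamp(1.0 - clip, 1.0 + clip)
    return -torch.min(ratio * advantage, clipped * advantage).mean()
```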
diff --git a/labml_nn/rl/ppo/gae.py b/labml_nn/rl/ppo/gae.py
index 981b609ef..74864b611 100644
--- a/labml_nn/rl/ppo/gae.py
+++ b/labml_nn/rl/ppo/gae.py
@@ -23,7 +23,7 @@ def __init__(self, n_workers: int, worker_steps: int, gamma: float, lambda_: flo
self.n_workers = n_workers
def __call__(self, done: np.ndarray, rewards: np.ndarray, values: np.ndarray) -> np.ndarray:
- """
+ r"""
### Calculate advantages
\begin{align}
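The recursion this method implements can be sketched as follows, using a time-major layout for clarity (the module itself iterates over worker steps):

```python
import numpy as np

def gae(done, rewards, values, gamma: float, lambda_: float):
    # `values` carries one extra step for bootstrapping V(s_T)
    advantages = np.zeros_like(rewards)
    last_advantage = 0.0
    for t in reversed(range(rewards.shape[0])):
        mask = 1.0 - done[t]
        # TD error: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
        delta = rewards[t] + gamma * values[t + 1] * mask - values[t]
        # A_t = delta_t + gamma * lambda * A_{t+1}
        last_advantage = delta + gamma * lambda_ * mask * last_advantage
        advantages[t] = last_advantage
    return advantages
```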
diff --git a/labml_nn/sampling/nucleus.py b/labml_nn/sampling/nucleus.py
index 6de9c719e..60daa2b9c 100644
--- a/labml_nn/sampling/nucleus.py
+++ b/labml_nn/sampling/nucleus.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Nucleus Sampling
summary: A PyTorch implementation of nucleus sampling from language models.
diff --git a/labml_nn/sampling/temperature.py b/labml_nn/sampling/temperature.py
index 4c924ee61..a8f60a5d3 100644
--- a/labml_nn/sampling/temperature.py
+++ b/labml_nn/sampling/temperature.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Sampling from Language Models with Temperature
summary: A PyTorch implementation of sampling from language models with temperature.
diff --git a/labml_nn/scaling/zero3/__init__.py b/labml_nn/scaling/zero3/__init__.py
index 9f5955350..1f3609d6f 100644
--- a/labml_nn/scaling/zero3/__init__.py
+++ b/labml_nn/scaling/zero3/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Zero-DP Memory Optimization
summary: >
diff --git a/labml_nn/sketch_rnn/__init__.py b/labml_nn/sketch_rnn/__init__.py
index fe250008d..9632dbd78 100644
--- a/labml_nn/sketch_rnn/__init__.py
+++ b/labml_nn/sketch_rnn/__init__.py
@@ -54,7 +54,7 @@ class StrokesDataset(Dataset):
"""
def __init__(self, dataset: np.array, max_seq_length: int, scale: Optional[float] = None):
- """
+ r"""
`dataset` is a list of numpy arrays of shape [seq_len, 3].
It is a sequence of strokes, and each stroke is represented by
3 integers.
@@ -126,7 +126,7 @@ def __getitem__(self, idx: int):
class BivariateGaussianMixture:
- """
+ r"""
## Bi-variate Gaussian mixture
The mixture is represented by $\Pi$ and
@@ -150,7 +150,7 @@ def n_distributions(self):
return self.pi_logits.shape[-1]
def set_temperature(self, temperature: float):
- """
+ r"""
Adjust by temperature $\tau$
"""
# $$\hat{\Pi_k} \leftarrow \frac{\hat{\Pi_k}}{\tau}$$
@@ -348,7 +348,7 @@ def forward(self, mask: torch.Tensor, target: torch.Tensor,
class KLDivLoss(nn.Module):
- """
+ r"""
## KL-Divergence loss
This calculates the KL divergence between a given normal distribution and $\mathcal{N}(0, 1)$
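With $\hat{\sigma} = \log \sigma^2$, that divergence has the familiar closed form; a minimal sketch:

```python
import torch

def kl_divergence(mu: torch.Tensor, sigma_hat: torch.Tensor) -> torch.Tensor:
    # KL(N(mu, sigma^2) || N(0, 1)) = -1/2 (1 + log sigma^2 - mu^2 - sigma^2)
    return -0.5 * torch.mean(1 + sigma_hat - mu ** 2 - torch.exp(sigma_hat))
```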
diff --git a/labml_nn/transformers/aft/__init__.py b/labml_nn/transformers/aft/__init__.py
index b3f526cbb..5aab884d6 100644
--- a/labml_nn/transformers/aft/__init__.py
+++ b/labml_nn/transformers/aft/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: An Attention Free Transformer
summary: >
@@ -64,7 +64,7 @@
class AFTLocal(nn.Module):
- """
+ r"""
### AFT Local Operation
$$Y_t = \sigma(Q_t) \odot
@@ -109,7 +109,7 @@ def __init__(self, d_model: int, seq_len: int, local_window_size: int, bias: boo
@staticmethod
def create_local_mask(seq_len, local_window_size):
- """
+ r"""
#### Create local mask
This creates a mask for the local attention window.
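A sketch of such a band mask, keeping positions with $|t - t'| \lt s$ where $s$ is the window size:

```python
import torch

def create_local_mask(seq_len: int, local_window_size: int) -> torch.Tensor:
    ones = torch.ones(seq_len, seq_len, dtype=torch.bool)
    # Keep a band of width `local_window_size` around the diagonal
    return torch.tril(ones, local_window_size - 1) & torch.triu(ones, -(local_window_size - 1))
```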
diff --git a/labml_nn/transformers/alibi/__init__.py b/labml_nn/transformers/alibi/__init__.py
index 8c1bdad27..154f93a52 100644
--- a/labml_nn/transformers/alibi/__init__.py
+++ b/labml_nn/transformers/alibi/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Attention with Linear Biases (ALiBi)
summary: >
@@ -41,7 +41,7 @@
def get_slopes(n_heads: int):
- """
+ r"""
## Get head-specific slope $m$ for each head
* `n_heads` is the number of heads in the attention layer $n$
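For $n$ a power of two, the slopes form the geometric sequence $2^{-\frac{8}{n}}, 2^{-\frac{16}{n}}, \dots$; a sketch of that case (the paper interleaves a second sequence when $n$ is not a power of two):

```python
import torch

def get_slopes_power_of_2(n_heads: int) -> torch.Tensor:
    # m_i = (2^(-8/n))^(i+1) for head i = 0 ... n-1
    m_0 = 2.0 ** (-8.0 / n_heads)
    return m_0 ** torch.arange(1, 1 + n_heads)
```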
diff --git a/labml_nn/transformers/compressive/__init__.py b/labml_nn/transformers/compressive/__init__.py
index 96339e0cc..e3e4212c3 100644
--- a/labml_nn/transformers/compressive/__init__.py
+++ b/labml_nn/transformers/compressive/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Compressive Transformer
summary: >
diff --git a/labml_nn/transformers/configs.py b/labml_nn/transformers/configs.py
index e80f3f097..aab32b427 100644
--- a/labml_nn/transformers/configs.py
+++ b/labml_nn/transformers/configs.py
@@ -50,7 +50,7 @@ class FeedForwardConfigs(BaseConfigs):
@option(FeedForwardConfigs.activation, 'ReLU')
def _ffn_activation_relu():
- """
+ r"""
### ReLU activation
$$\max(0, x)$$
@@ -60,7 +60,7 @@ def _ffn_activation_relu():
@option(FeedForwardConfigs.activation, 'GELU')
def _ffn_activation_gelu():
- """
+ r"""
### GELU activation
$$x \Phi(x)$$ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$
diff --git a/labml_nn/transformers/fast_weights/__init__.py b/labml_nn/transformers/fast_weights/__init__.py
index d4fbac2ea..2b0962fb8 100644
--- a/labml_nn/transformers/fast_weights/__init__.py
+++ b/labml_nn/transformers/fast_weights/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Linear Transformers Are Secretly Fast Weight Memory Systems
summary: >
@@ -101,7 +101,7 @@
class DPFP(nn.Module):
- """
+ r"""
## Deterministic Parameter Free Projection (DPFP)
This is the new projection function $\textcolor{lightgreen}{\phi}$ introduced in the paper.
@@ -135,7 +135,7 @@ class DPFP(nn.Module):
"""
def __init__(self, nu: int = 1, eps: float = 1e-6):
- """
+ r"""
* `nu` is the hyper-parameter $\nu$.
* `eps` is the small value used to make sure there is no division-by-zero when normalizing.
"""
@@ -151,7 +151,7 @@ def forward(self, k: torch.Tensor):
return k / (torch.sum(k, dim=-1, keepdim=True) + self.eps)
def dpfp(self, k: torch.Tensor):
- """
+ r"""
$$\textcolor{lightgreen}{\phi(k)}$$
"""
# $x = \text{ReLU}\Big(\big[k, -k\big]\Big)$
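The full projection continues from $x$ by multiplying it element-wise with rolled copies of itself; a sketch for general $\nu$ (the exact layout is an assumption):

```python
import torch

def dpfp(k: torch.Tensor, nu: int = 1) -> torch.Tensor:
    # x = ReLU([k, -k]): doubled feature dimension, non-negative entries
    x = torch.relu(torch.cat([k, -k], dim=-1))
    # Pair x with nu rolled copies of itself and multiply element-wise
    x_rolled = torch.cat([x.roll(shifts=j, dims=-1) for j in range(1, nu + 1)], dim=-1)
    x_repeat = torch.cat([x] * nu, dim=-1)
    return x_repeat * x_rolled
```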
@@ -173,7 +173,7 @@ def dpfp(self, k: torch.Tensor):
class FastWeightsAttention(nn.Module):
- """
+ r"""
## Fast Weights Attention
The paper introduces a new update rule for calculating $\textcolor{cyan}{W^{(i)}}$.
diff --git a/labml_nn/transformers/feed_forward.py b/labml_nn/transformers/feed_forward.py
index f9c8d768e..eb1ab07f7 100644
--- a/labml_nn/transformers/feed_forward.py
+++ b/labml_nn/transformers/feed_forward.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Position-wise Feed-Forward Network (FFN)
summary: Documented reusable implementation of the position-wise feed-forward network.
diff --git a/labml_nn/transformers/feedback/__init__.py b/labml_nn/transformers/feedback/__init__.py
index ee9e9b8bd..6661d12c0 100644
--- a/labml_nn/transformers/feedback/__init__.py
+++ b/labml_nn/transformers/feedback/__init__.py
@@ -136,7 +136,7 @@ def get_scores(self, query: torch.Tensor, key: torch.Tensor):
- """
+ r"""
positional encodings $P_q, P_j$.
We replace the term $\textcolor{lightgreen}{D}$ with $S_j$.
"""
# $U^K_j$
key_pos_emb = self.key_pos_embeddings[-key.shape[0]:]
@@ -160,7 +160,7 @@ def forward(self, *,
- """
+ r"""
* `query` has shape `[batch_size, d_model]`
* `key` and `value` have shape `[seq_len, batch_size, d_model]`
"""
# Prepare `query`, `key` and `value` for attention computation
# `key` and `value` will then have shape `[seq_len, batch_size, heads, d_k]`
diff --git a/labml_nn/transformers/flash/__init__.py b/labml_nn/transformers/flash/__init__.py
index fe4b2990d..84d14a8d1 100644
--- a/labml_nn/transformers/flash/__init__.py
+++ b/labml_nn/transformers/flash/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Flash Attention
summary: >
@@ -160,7 +160,7 @@ class AttentionFunc(torch.autograd.Function):
def forward(ctx: Any,
q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
causal: bool, sm_scale: float) -> torch.Tensor:
- """
+ r"""
### Forward pass
Grouped query attention forward pass. Returns the output in shape `[batch_size, n_heads, q_seq_len, d_head]`.
@@ -352,7 +352,7 @@ def _attn_fwd(t_q, t_k, t_v, sm_scale_log2e, t_lse, t_o,
BLOCK_Q: tl.constexpr,
BLOCK_K: tl.constexpr,
):
- """
+ r"""
### Triton kernel for Flash attention forward pass
:param t_q: queries $Q_i$
diff --git a/labml_nn/transformers/fnet/__init__.py b/labml_nn/transformers/fnet/__init__.py
index 4b123f376..61f45d3ca 100644
--- a/labml_nn/transformers/fnet/__init__.py
+++ b/labml_nn/transformers/fnet/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: "FNet: Mixing Tokens with Fourier Transforms"
summary: >
@@ -45,7 +45,7 @@
class FNetMix(nn.Module):
- """
+ r"""
## FNet - Mix tokens
This module simply implements
@@ -58,7 +58,7 @@ class FNetMix(nn.Module):
"""
def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None):
- """
+ r"""
The [normal attention module](../mha.html) can be fed with different token embeddings for
$\text{query}$, $\text{key}$, and $\text{value}$ and a mask.
diff --git a/labml_nn/transformers/gmlp/__init__.py b/labml_nn/transformers/gmlp/__init__.py
index 74b55e9f4..f2afd7d05 100644
--- a/labml_nn/transformers/gmlp/__init__.py
+++ b/labml_nn/transformers/gmlp/__init__.py
@@ -23,7 +23,7 @@
class GMLPBlock(nn.Module):
- """
+ r"""
## gMLP Block
Each block does the following transformations to input embeddings
@@ -87,7 +87,7 @@ def forward(self, *, x: torch.Tensor, mask: Optional[torch.Tensor] = None):
class SpacialGatingUnit(nn.Module):
- """
+ r"""
## Spatial Gating Unit
$$s(Z) = Z_1 \odot f_{W,b}(Z_2)$$
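A minimal sketch of that gating, splitting $Z$ along channels and projecting the normalized half across the sequence dimension (the paper's near-zero initialization of $W$ and the causal mask are omitted here):

```python
import torch
from torch import nn

class SpatialGatingUnit(nn.Module):
    def __init__(self, d_z: int, seq_len: int):
        super().__init__()
        self.norm = nn.LayerNorm(d_z // 2)
        # f_{W,b} acts across tokens, not channels
        self.proj = nn.Linear(seq_len, seq_len)

    def forward(self, z: torch.Tensor):
        # `z` has shape `[seq_len, batch_size, d_z]`
        z1, z2 = torch.chunk(z, 2, dim=-1)
        z2 = self.proj(self.norm(z2).transpose(0, 2)).transpose(0, 2)
        return z1 * z2
```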
diff --git a/labml_nn/transformers/gpt/__init__.py b/labml_nn/transformers/gpt/__init__.py
index 7c8beda60..6d32eef10 100644
--- a/labml_nn/transformers/gpt/__init__.py
+++ b/labml_nn/transformers/gpt/__init__.py
@@ -124,7 +124,7 @@ def _transformer_configs(c: Configs):
def _init_weights(module):
- """
+ r"""
### Initialize weights
Weights of linear layers and embedding layers are initialized
diff --git a/labml_nn/transformers/hour_glass/__init__.py b/labml_nn/transformers/hour_glass/__init__.py
index f94bd7bda..0031c7ce5 100644
--- a/labml_nn/transformers/hour_glass/__init__.py
+++ b/labml_nn/transformers/hour_glass/__init__.py
@@ -246,7 +246,7 @@ def __init__(self):
class AttentionBasedShortening(nn.Module):
- """
+ r"""
### 🚧 Down-sampling with attention
\begin{align}
@@ -263,7 +263,7 @@ def __init__(self):
class LinearUpSampling(nn.Module):
- """
+ r"""
### 🚧 Linear projection for up-sampling
Make a linear projection of dense token embeddings to a size of $d_{\text{model}} k$.
@@ -275,7 +275,7 @@ def __init__(self):
class AttentionBasedUpSampling(nn.Module):
- """
+ r"""
### 🚧 Attention based up-sampling
\begin{align}
diff --git a/labml_nn/transformers/jax_transformer/__init__.py b/labml_nn/transformers/jax_transformer/__init__.py
index fb9b1bf5e..672b49160 100644
--- a/labml_nn/transformers/jax_transformer/__init__.py
+++ b/labml_nn/transformers/jax_transformer/__init__.py
@@ -385,7 +385,7 @@ def __init__(self, normalized_shape: Union[Tuple[int], List[int]], *,
- """
+ r"""
$X \in \mathbb{R}^{* \times S[0] \times S[1] \times ... \times S[n]}$
* `eps` is $\epsilon$, used in $\sqrt{Var[X] + \epsilon}$ for numerical stability
* `elementwise_affine` is whether to scale and shift the normalized value
"""
super().__init__()
self.eps = eps
@@ -487,7 +487,7 @@ def __call__(self, *,
- """
+ r"""
`mask` has shape `[seq_len, seq_len]` and
`mask[i, j]` indicates whether query at position `i` can see key-value at position `j`.
"""
# Get sequence length
seq_len = len(query)
@@ -623,7 +623,7 @@ class CrossEntropyLoss(Module):
- """
+ r"""
## Cross Entropy Loss
"""
def __init__(self):
super().__init__()
@@ -800,7 +800,7 @@ def step(self, params: Dict, grads: Dict):
- """
+ r"""
* `params` is a tree-map of parameters
* `grads` is a tree-map of gradients
"""
# Increment step $t$
self._n_steps += 1
# Update states for each parameter
@@ -813,7 +813,7 @@ def _step(self, n_steps: int, param: jnp.ndarray, state: AdamState):
- """
+ r"""
### Update parameters
This performs an Adam update on the given parameter
"""
# Bias corrections for $\hat{m}_t$: $1 - \beta_1^t$ and for $\hat{v}_t$: $1 - \beta_2^t$
bias_correction = [1 - beta ** n_steps for beta in self.betas]
@@ -834,7 +834,7 @@ def _update_state(self, grad, state: AdamState):
- """
+ r"""
### Update state
This updates the uncorrected first and second moments $m_t$ and $v_t$
"""
# Uncorrected first and second moments $m_{t-1}$ and $v_{t-1}$
m, v = state
# Clip gradients
diff --git a/labml_nn/transformers/knn/__init__.py b/labml_nn/transformers/knn/__init__.py
index 72d8037f0..902f5ffcc 100644
--- a/labml_nn/transformers/knn/__init__.py
+++ b/labml_nn/transformers/knn/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: k-Nearest Neighbor Language Models
summary: >
diff --git a/labml_nn/transformers/knn/build_index.py b/labml_nn/transformers/knn/build_index.py
index f6deafaa9..f8c6a9942 100644
--- a/labml_nn/transformers/knn/build_index.py
+++ b/labml_nn/transformers/knn/build_index.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Build FAISS index for k-NN search
summary: This builds the FAISS index with the transformer embeddings.
@@ -51,7 +51,7 @@ def load_experiment(run_uuid: str, checkpoint: Optional[int] = None):
def gather_keys(conf: Configs):
- """
+ r"""
## Gather $\big(f(c_i), w_i\big)$ and save them in numpy arrays
*Note that these numpy arrays will take up a lot of space (even a few hundred gigabytes)
diff --git a/labml_nn/transformers/knn/eval_knn.py b/labml_nn/transformers/knn/eval_knn.py
index 703a41ca7..17c6f49a9 100644
--- a/labml_nn/transformers/knn/eval_knn.py
+++ b/labml_nn/transformers/knn/eval_knn.py
@@ -20,7 +20,7 @@
def knn(queries: torch.Tensor, index: faiss.IndexFlatL2, keys_store: np.ndarray, vals_store: np.ndarray, n_tokens: int):
- """
+ r"""
## $k$-NN to get $p(w_t, c_t)$
Here we refer to $f(\textcolor{yellowgreen}{c_t})$ as queries,
diff --git a/labml_nn/transformers/mha.py b/labml_nn/transformers/mha.py
index ff93530e0..e09516e84 100644
--- a/labml_nn/transformers/mha.py
+++ b/labml_nn/transformers/mha.py
@@ -123,7 +123,7 @@ def get_scores(self, query: torch.Tensor, key: torch.Tensor):
- """
+ r"""
### Calculate scores between queries and keys
This method can be overridden for other variations like relative attention.
"""
# Calculate $Q K^\top$ or $S_{ijbh} = \sum_d Q_{ibhd} K_{jbhd}$
return torch.einsum('ibhd,jbhd->ijbh', query, key)
diff --git a/labml_nn/transformers/mlp_mixer/__init__.py b/labml_nn/transformers/mlp_mixer/__init__.py
index 06b650583..4281efc37 100644
--- a/labml_nn/transformers/mlp_mixer/__init__.py
+++ b/labml_nn/transformers/mlp_mixer/__init__.py
@@ -48,7 +48,7 @@ def __init__(self, mlp: nn.Module):
self.mlp = mlp
def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None):
- """
+ r"""
The [normal attention module](../mha.html) can be fed with different token embeddings for
$\text{query}$, $\text{key}$, and $\text{value}$ and a mask.
diff --git a/labml_nn/transformers/positional_encoding.py b/labml_nn/transformers/positional_encoding.py
index 615ee913c..650877f3f 100644
--- a/labml_nn/transformers/positional_encoding.py
+++ b/labml_nn/transformers/positional_encoding.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Fixed Positional Encodings
summary: >
diff --git a/labml_nn/transformers/primer_ez/__init__.py b/labml_nn/transformers/primer_ez/__init__.py
index 6357f6d1c..276f26715 100644
--- a/labml_nn/transformers/primer_ez/__init__.py
+++ b/labml_nn/transformers/primer_ez/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: "Primer: Searching for Efficient Transformers for Language Modeling"
summary: >
@@ -42,7 +42,7 @@
class SquaredReLU(nn.Module):
- """
+ r"""
## Squared ReLU activation
$$y = {\max(x, 0)}^2$$
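The activation itself is a one-liner:

```python
import torch
from torch import nn

class SquaredReLU(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # y = max(x, 0)^2
        return torch.relu(x) ** 2
```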
diff --git a/labml_nn/transformers/retro/bert_embeddings.py b/labml_nn/transformers/retro/bert_embeddings.py
index 9ddef8f09..a6c53558e 100644
--- a/labml_nn/transformers/retro/bert_embeddings.py
+++ b/labml_nn/transformers/retro/bert_embeddings.py
@@ -19,7 +19,7 @@
class BERTChunkEmbeddings:
- """
+ r"""
## BERT Embeddings
For a given chunk of text $N$ this class generates BERT embeddings $\text{B\small{ERT}}(N)$.
@@ -75,7 +75,7 @@ def _trim_chunk(chunk: str):
return stripped
def __call__(self, chunks: List[str]):
- """
+ r"""
### Get $\text{B\small{ERT}}(N)$ for a list of chunks.
"""
diff --git a/labml_nn/transformers/retro/database.py b/labml_nn/transformers/retro/database.py
index 90d0a2003..8a3e94e79 100644
--- a/labml_nn/transformers/retro/database.py
+++ b/labml_nn/transformers/retro/database.py
@@ -26,7 +26,7 @@
def build_database(chunk_len: int = 16, batch_size: int = 64, d_emb: int = 768, n_centeroids: int = 256,
code_size: int = 64, n_probe: int = 8, n_train: int = 50_000):
- """
+ r"""
## Build Database
* `chunk_len` is the length of a chunk (number of characters)
diff --git a/labml_nn/transformers/retro/model.py b/labml_nn/transformers/retro/model.py
index fe72048a5..06516de3b 100644
--- a/labml_nn/transformers/retro/model.py
+++ b/labml_nn/transformers/retro/model.py
@@ -32,7 +32,7 @@ class RotaryPositionalEmbeddings(nn.Module):
"""
def __init__(self, d: int, base: int = 10_000):
- """
+ r"""
* `d` is the number of features $d$
* `base` is the constant used for calculating $\Theta$
"""
@@ -81,7 +81,7 @@ def forward(self, x: torch.Tensor):
class SelfAttention(nn.Module):
- """
+ r"""
## Self-Attention Layer $\text{A\small{TTN}}$
This applies causal and non-causal [multi-headed self-attention](../mha.html).
@@ -185,7 +185,7 @@ def forward(self, h: torch.Tensor):
class CrossAttention(nn.Module):
- """
+ r"""
## Cross-Attention Layer $\text{C\small{A}}$
This is similar to the self-attention layer defined above, except that
@@ -272,7 +272,7 @@ def forward(self, e: torch.Tensor, h: torch.Tensor):
class ChunkedCrossAttention(nn.Module):
- """
+ r"""
## Chunked Cross-Attention Layer $\text{C\small{CA}}$
This is similar to the cross-attention layer defined above.
@@ -380,7 +380,7 @@ def forward(self, h: torch.Tensor, e: torch.Tensor):
class FeedForward(nn.Module):
- """
+ r"""
### Position-wise Feed Forward Layer $\text{F\small{FW}}$
This consists of two linear layers and an activation in the middle.
@@ -425,7 +425,7 @@ def forward(self, h: torch.Tensor):
class NearestNeighborEncoder(nn.Module):
- """
+ r"""
## Nearest Neighbor Encoder $\text{E\small{NCODER}}(\text{R\small{ET}}(C_u)_{1 \le u \le l}, H)$
This module encodes the retrieved nearest neighbors
@@ -433,7 +433,7 @@ class NearestNeighborEncoder(nn.Module):
def __init__(self, chunk_len: int, n_layers: int, ca_layers: Set[int],
d_model: int, n_heads: int, d_k: int, d_ff: int):
- """
+ r"""
* `chunk_len` is the length of a chunk
* `n_layers` is the number of layers in the encoder $L_{\text{enc}}$
* `ca_layers` are the layers with cross attention $P_{\text{enc}}$
@@ -457,7 +457,7 @@ def __init__(self, chunk_len: int, n_layers: int, ca_layers: Set[int],
self.norm_h = nn.LayerNorm(d_model)
def forward(self, e: torch.Tensor, h: torch.Tensor):
- """
+ r"""
* `e` are token embeddings of the retrieved nearest neighbors,
$\text{E\small{MB}}\big(\text{R\small{ET}}(C_u)_{1 \le u \le l}\big)$
of shape `[batch_size, chunks, neighbors, neighbor_len, d_model]`
@@ -541,7 +541,7 @@ def __init__(self, n_vocab: int, d_model: int, n_layers: int, ca_layers: Set[int
self.norm_e = nn.LayerNorm(d_model)
def forward(self, x: torch.Tensor, ret: torch.Tensor):
- """
+ r"""
* `x` is the input sequence, $X$ of shape `[batch_size, seq_len]`
* `ret` are the retrieved neighbors
$\text{R\small{ET}}(C_u)_{1 \le u \le l}$
diff --git a/labml_nn/transformers/rope/__init__.py b/labml_nn/transformers/rope/__init__.py
index a200785b0..f4800220a 100644
--- a/labml_nn/transformers/rope/__init__.py
+++ b/labml_nn/transformers/rope/__init__.py
@@ -28,7 +28,7 @@
class RotaryPositionalEmbeddings(nn.Module):
- """
+ r"""
## RoPE module
Rotary encoding transforms pairs of features by rotating in the 2D plane.
@@ -116,7 +116,7 @@ class RotaryPositionalEmbeddings(nn.Module):
"""
def __init__(self, d: int, base: int = 10_000):
- """
+ r"""
* `d` is the number of features $d$
* `base` is the constant used for calculating $\Theta$
"""
@@ -128,7 +128,7 @@ def __init__(self, d: int, base: int = 10_000):
self.sin_cached = None
def _build_cache(self, x: torch.Tensor):
- """
+ r"""
Cache $\cos$ and $\sin$ values
"""
# Return if cache is already built
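The cached values follow from $\theta_i = \text{base}^{-2i/d}$; a sketch of building them:

```python
import torch

def build_rope_cache(d: int, seq_len: int, base: int = 10_000):
    # theta_i = base^(-2i/d) for each feature pair
    theta = 1.0 / (base ** (torch.arange(0, d, 2).float() / d))
    # Outer product of positions m = 0 ... seq_len-1 with theta
    idx_theta = torch.einsum('m,d->md', torch.arange(seq_len).float(), theta)
    # cos(m * theta_i) and sin(m * theta_i), each of shape `[seq_len, d / 2]`
    return idx_theta.cos(), idx_theta.sin()
```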
diff --git a/labml_nn/transformers/rope/value_pe/__init__.py b/labml_nn/transformers/rope/value_pe/__init__.py
index 8aadeab8f..7855c0efe 100644
--- a/labml_nn/transformers/rope/value_pe/__init__.py
+++ b/labml_nn/transformers/rope/value_pe/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: Rotary Positional Embeddings with Relative distance (RoPER)
summary: >
diff --git a/labml_nn/uncertainty/evidence/__init__.py b/labml_nn/uncertainty/evidence/__init__.py
index 8062050a1..27f04873c 100644
--- a/labml_nn/uncertainty/evidence/__init__.py
+++ b/labml_nn/uncertainty/evidence/__init__.py
@@ -1,4 +1,4 @@
-"""
+r"""
---
title: "Evidential Deep Learning to Quantify Classification Uncertainty"
summary: >
@@ -55,7 +55,7 @@
class MaximumLikelihoodLoss(nn.Module):
- """
+ r"""
## Type II Maximum Likelihood Loss
@@ -81,7 +81,7 @@ class MaximumLikelihoodLoss(nn.Module):
"""
def forward(self, evidence: torch.Tensor, target: torch.Tensor):
- """
+ r"""
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
@@ -98,7 +98,7 @@ def forward(self, evidence: torch.Tensor, target: torch.Tensor):
class CrossEntropyBayesRisk(nn.Module):
- """
+ r"""
## Bayes Risk with Cross Entropy Loss
@@ -128,7 +128,7 @@ class CrossEntropyBayesRisk(nn.Module):
"""
def forward(self, evidence: torch.Tensor, target: torch.Tensor):
- """
+ r"""
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
@@ -145,7 +145,7 @@ def forward(self, evidence: torch.Tensor, target: torch.Tensor):
class SquaredErrorBayesRisk(nn.Module):
- """
+ r"""
## Bayes Risk with Squared Error Loss
@@ -191,7 +191,7 @@ class SquaredErrorBayesRisk(nn.Module):
"""
def forward(self, evidence: torch.Tensor, target: torch.Tensor):
- """
+ r"""
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
@@ -215,7 +215,7 @@ def forward(self, evidence: torch.Tensor, target: torch.Tensor):
class KLDivergenceLoss(nn.Module):
- """
+ r"""
## KL Divergence Regularization Loss
@@ -240,7 +240,7 @@ class KLDivergenceLoss(nn.Module):
"""
def forward(self, evidence: torch.Tensor, target: torch.Tensor):
- """
+ r"""
* `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]`
* `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]`
"""
diff --git a/labml_nn/unet/__init__.py b/labml_nn/unet/__init__.py
index cdfde7393..8e66cb843 100644
--- a/labml_nn/unet/__init__.py
+++ b/labml_nn/unet/__init__.py
@@ -30,7 +30,7 @@
class DoubleConvolution(nn.Module):
- """
+ r"""
### Two $3 \times 3$ Convolution Layers
Each step in the contracting path and the expansive path has two $3 \times 3$ convolution layers.
@@ -63,7 +63,7 @@ def forward(self, x: torch.Tensor):
class DownSample(nn.Module):
- """
+ r"""
### Down-sample
Each step in the contracting path down-samples the feature map with a $2 \times 2$ max pooling layer.
@@ -80,7 +80,7 @@ def forward(self, x: torch.Tensor):
class UpSample(nn.Module):
- """
+ r"""
### Up-sample
Each step in the expansive path up-samples the feature map with a $2 \times 2$ up-convolution.
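A sketch of that up-sampling step with a $2 \times 2$ transposed convolution (kernel size and stride are the usual U-Net choice, stated here as an assumption):

```python
import torch
from torch import nn

class UpSample(nn.Module):
    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        # Doubles the spatial resolution
        self.up = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.up(x)
```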