From cf5af5598d49bae020f142cfc6628a1f85b858e3 Mon Sep 17 00:00:00 2001 From: Creeken Date: Sat, 2 May 2026 03:51:54 +0800 Subject: [PATCH] fix: use raw docstrings (r""") for LaTeX math to avoid invalid escape warnings Convert """ to r""" for docstrings containing LaTeX formulas. In Python 3.12+, backslash sequences like \l (from \lambda), \b (from \bar), \t (from \theta) trigger SyntaxWarning or are misinterpreted. Raw strings preserve the backslash as literal text, fixing Pylance reportInvalidStringEscapeSequence diagnostics. Co-Authored-By: Claude Opus 4.7 --- labml_nn/activations/fta/__init__.py | 6 +- .../ponder_net/__init__.py | 16 ++-- labml_nn/capsule_networks/__init__.py | 8 +- labml_nn/cfr/__init__.py | 16 ++-- labml_nn/conv_mixer/__init__.py | 4 +- labml_nn/diffusion/ddpm/__init__.py | 12 +-- labml_nn/diffusion/ddpm/evaluate.py | 93 ++++++++++++------- labml_nn/diffusion/ddpm/unet.py | 8 +- .../stable_diffusion/latent_diffusion.py | 6 +- .../stable_diffusion/model/autoencoder.py | 2 +- .../diffusion/stable_diffusion/model/unet.py | 2 +- .../stable_diffusion/model/unet_attention.py | 4 +- .../stable_diffusion/sampler/__init__.py | 10 +- .../stable_diffusion/sampler/ddim.py | 14 +-- .../stable_diffusion/sampler/ddpm.py | 12 +-- .../scripts/image_to_image.py | 4 +- .../stable_diffusion/scripts/in_paint.py | 4 +- .../stable_diffusion/scripts/text_to_image.py | 4 +- labml_nn/distillation/__init__.py | 2 +- labml_nn/gan/cycle_gan/__init__.py | 4 +- labml_nn/gan/original/__init__.py | 8 +- labml_nn/gan/original/experiment.py | 2 +- labml_nn/gan/stylegan/__init__.py | 32 +++---- labml_nn/gan/stylegan/experiment.py | 2 +- labml_nn/gan/wasserstein/__init__.py | 2 +- labml_nn/graphs/gat/__init__.py | 4 +- labml_nn/graphs/gatv2/__init__.py | 6 +- labml_nn/hypernetworks/hyper_lstm.py | 4 +- labml_nn/lora/__init__.py | 8 +- labml_nn/lstm/__init__.py | 2 +- labml_nn/neox/model.py | 10 +- labml_nn/neox/utils/llm_int8.py | 2 +- .../batch_channel_norm/__init__.py | 8 +- labml_nn/normalization/batch_norm/__init__.py | 6 +- labml_nn/normalization/deep_norm/__init__.py | 10 +- .../normalization/deep_norm/experiment.py | 4 +- labml_nn/normalization/group_norm/__init__.py | 6 +- .../normalization/instance_norm/__init__.py | 6 +- labml_nn/normalization/layer_norm/__init__.py | 4 +- .../weight_standardization/__init__.py | 2 +- labml_nn/optimizers/__init__.py | 4 +- labml_nn/optimizers/ada_belief.py | 10 +- labml_nn/optimizers/adam.py | 14 +-- labml_nn/optimizers/adam_fp16.py | 4 +- labml_nn/optimizers/adam_warmup.py | 4 +- .../optimizers/adam_warmup_cosine_decay.py | 4 +- labml_nn/optimizers/amsgrad.py | 8 +- labml_nn/optimizers/noam.py | 4 +- labml_nn/optimizers/radam.py | 10 +- labml_nn/optimizers/sophia.py | 10 +- .../recurrent_highway_networks/__init__.py | 2 +- labml_nn/resnet/__init__.py | 10 +- labml_nn/rl/dqn/__init__.py | 4 +- labml_nn/rl/dqn/experiment.py | 2 +- labml_nn/rl/dqn/model.py | 2 +- labml_nn/rl/dqn/replay_buffer.py | 8 +- labml_nn/rl/ppo/__init__.py | 4 +- labml_nn/rl/ppo/gae.py | 2 +- labml_nn/sampling/nucleus.py | 2 +- labml_nn/sampling/temperature.py | 2 +- labml_nn/scaling/zero3/__init__.py | 2 +- labml_nn/sketch_rnn/__init__.py | 8 +- labml_nn/transformers/aft/__init__.py | 6 +- labml_nn/transformers/alibi/__init__.py | 4 +- labml_nn/transformers/compressive/__init__.py | 2 +- labml_nn/transformers/configs.py | 4 +- .../transformers/fast_weights/__init__.py | 10 +- labml_nn/transformers/feed_forward.py | 2 +- labml_nn/transformers/feedback/__init__.py | 4 +- 
labml_nn/transformers/flash/__init__.py | 6 +- labml_nn/transformers/fnet/__init__.py | 6 +- labml_nn/transformers/gmlp/__init__.py | 4 +- labml_nn/transformers/gpt/__init__.py | 2 +- labml_nn/transformers/hour_glass/__init__.py | 6 +- .../transformers/jax_transformer/__init__.py | 12 +-- labml_nn/transformers/knn/__init__.py | 2 +- labml_nn/transformers/knn/build_index.py | 4 +- labml_nn/transformers/knn/eval_knn.py | 2 +- labml_nn/transformers/mha.py | 2 +- labml_nn/transformers/mlp_mixer/__init__.py | 2 +- labml_nn/transformers/positional_encoding.py | 2 +- labml_nn/transformers/primer_ez/__init__.py | 4 +- .../transformers/retro/bert_embeddings.py | 4 +- labml_nn/transformers/retro/database.py | 2 +- labml_nn/transformers/retro/model.py | 18 ++-- labml_nn/transformers/rope/__init__.py | 6 +- .../transformers/rope/value_pe/__init__.py | 2 +- labml_nn/uncertainty/evidence/__init__.py | 18 ++-- labml_nn/unet/__init__.py | 6 +- 89 files changed, 325 insertions(+), 296 deletions(-) diff --git a/labml_nn/activations/fta/__init__.py b/labml_nn/activations/fta/__init__.py index ba682a0a0..6ae143d3c 100644 --- a/labml_nn/activations/fta/__init__.py +++ b/labml_nn/activations/fta/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Fuzzy Tiling Activations summary: > @@ -68,7 +68,7 @@ class FTA(nn.Module): """ def __init__(self, lower_limit: float, upper_limit: float, delta: float, eta: float): - """ + r""" :param lower_limit: is the lower limit $l$ :param upper_limit: is the upper limit $u$ :param delta: is the bin size $\delta$ @@ -86,7 +86,7 @@ def __init__(self, lower_limit: float, upper_limit: float, delta: float, eta: fl self.eta = eta def fuzzy_i_plus(self, x: torch.Tensor): - """ + r""" #### Fuzzy indicator function $$I_{\eta,+}(x) = I_+(\eta - x) x + I_+ (x - \eta)$$ diff --git a/labml_nn/adaptive_computation/ponder_net/__init__.py b/labml_nn/adaptive_computation/ponder_net/__init__.py index 7dfcd2d3c..b9a56eeef 100644 --- a/labml_nn/adaptive_computation/ponder_net/__init__.py +++ b/labml_nn/adaptive_computation/ponder_net/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: "PonderNet: Learning to Ponder" summary: > @@ -106,7 +106,7 @@ def __init__(self, n_elems: int, n_hidden: int, max_steps: int): self.is_halt = False def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: - """ + r""" * `x` is the input of shape `[batch_size, n_elems]` This outputs a tuple of four tensors: @@ -177,7 +177,7 @@ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Te class ReconstructionLoss(nn.Module): - """ + r""" ## Reconstruction loss $$L_{Rec} = \sum_{n=1}^N p_n \mathcal{L}(y, \hat{y}_n)$$ @@ -186,14 +186,14 @@ class ReconstructionLoss(nn.Module): """ def __init__(self, loss_func: nn.Module): - """ + r""" * `loss_func` is the loss function $\mathcal{L}$ """ super().__init__() self.loss_func = loss_func def forward(self, p: torch.Tensor, y_hat: torch.Tensor, y: torch.Tensor): - """ + r""" * `p` is $p_1 \dots p_N$ in a tensor of shape `[N, batch_size]` * `y_hat` is $\hat{y}_1 \dots \hat{y}_N$ in a tensor of shape `[N, batch_size, ...]` * `y` is the target of shape `[batch_size, ...]` @@ -213,7 +213,7 @@ def forward(self, p: torch.Tensor, y_hat: torch.Tensor, y: torch.Tensor): class RegularizationLoss(nn.Module): - """ + r""" ## Regularization loss $$L_{Reg} = \mathop{KL} \Big(p_n \Vert p_G(\lambda_p) \Big)$$ @@ -229,7 +229,7 @@ class RegularizationLoss(nn.Module): """ def __init__(self, lambda_p: float, max_steps: int = 1_000): - """ + 
r""" * `lambda_p` is $\lambda_p$ - the success probability of geometric distribution * `max_steps` is the highest $N$; we use this to pre-compute $p_G(\lambda_p)$ """ @@ -253,7 +253,7 @@ def __init__(self, lambda_p: float, max_steps: int = 1_000): self.kl_div = nn.KLDivLoss(reduction='batchmean') def forward(self, p: torch.Tensor): - """ + r""" * `p` is $p_1 \dots p_N$ in a tensor of shape `[N, batch_size]` """ # Transpose `p` to `[batch_size, N]` diff --git a/labml_nn/capsule_networks/__init__.py b/labml_nn/capsule_networks/__init__.py index 9a9dfbeae..a2dfaf6cc 100644 --- a/labml_nn/capsule_networks/__init__.py +++ b/labml_nn/capsule_networks/__init__.py @@ -35,7 +35,7 @@ class Squash(nn.Module): - """ + r""" ## Squash This is **squashing** function from paper, given by equation $(1)$. @@ -69,7 +69,7 @@ def forward(self, s: torch.Tensor): class Router(nn.Module): - """ + r""" ## Routing Algorithm This is the routing mechanism described in the paper. @@ -132,7 +132,7 @@ def forward(self, u: torch.Tensor): class MarginLoss(nn.Module): - """ + r""" ## Margin loss for class existence A separate margin loss is used for each output capsule and the total loss is the sum of them. @@ -161,7 +161,7 @@ def __init__(self, *, n_labels: int, lambda_: float = 0.5, m_positive: float = 0 self.n_labels = n_labels def forward(self, v: torch.Tensor, labels: torch.Tensor): - """ + r""" `v`, $\mathbf{v}_j$ are the squashed output capsules. This has shape `[batch_size, n_labels, n_features]`; that is, there is a capsule for each label. diff --git a/labml_nn/cfr/__init__.py b/labml_nn/cfr/__init__.py index 48e496058..1e69674ea 100644 --- a/labml_nn/cfr/__init__.py +++ b/labml_nn/cfr/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Regret Minimization in Games with Incomplete Information (CFR) summary: > @@ -337,7 +337,7 @@ class History: - """ + r""" ## History @@ -349,14 +349,14 @@ class History: """ def is_terminal(self): - """ + r""" Whether it's a terminal history; i.e. game over. $h \in Z$ """ raise NotImplementedError() def terminal_utility(self, i: Player) -> float: - """ + r""" Utility of player $i$ for a terminal history. $u_i(h)$ where $h \in Z$ @@ -485,7 +485,7 @@ def load_dict(self, data: Dict[str, any]): self.calculate_strategy() def calculate_strategy(self): - """ + r""" ## Calculate strategy Calculate current strategy using [regret matching](#RegretMatching). @@ -520,7 +520,7 @@ def calculate_strategy(self): self.strategy = {a: 1 / count for a, r in regret.items()} def get_average_strategy(self): - """ + r""" ## Get average strategy $$\textcolor{cyan}{\bar{\sigma}^T_i(I)(a)} = @@ -596,7 +596,7 @@ def _get_info_set(self, h: History): return self.info_sets[info_set_key] def walk_tree(self, h: History, i: Player, pi_i: float, pi_neg_i: float) -> float: - """ + r""" ### Walk Tree This function walks the game tree. @@ -686,7 +686,7 @@ def walk_tree(self, h: History, i: Player, pi_i: float, pi_neg_i: float) -> floa return v def iterate(self): - """ + r""" ### Iteratively update $\textcolor{lightgreen}{\sigma^t(I)(a)}$ This updates the strategies for $T$ iterations. diff --git a/labml_nn/conv_mixer/__init__.py b/labml_nn/conv_mixer/__init__.py index 42d1804ae..5b8a91f4d 100644 --- a/labml_nn/conv_mixer/__init__.py +++ b/labml_nn/conv_mixer/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Patches Are All You Need? 
(ConvMixer) summary: > @@ -96,7 +96,7 @@ def forward(self, x: torch.Tensor): class PatchEmbeddings(nn.Module): - """ + r""" ## Get patch embeddings diff --git a/labml_nn/diffusion/ddpm/__init__.py b/labml_nn/diffusion/ddpm/__init__.py index c89c93eeb..013b6ee63 100644 --- a/labml_nn/diffusion/ddpm/__init__.py +++ b/labml_nn/diffusion/ddpm/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Denoising Diffusion Probabilistic Models (DDPM) summary: > @@ -175,7 +175,7 @@ class DenoiseDiffusion: """ def __init__(self, eps_model: nn.Module, n_steps: int, device: torch.device): - """ + r""" * `eps_model` is $\textcolor{lightgreen}{\epsilon_\theta}(x_t, t)$ model * `n_steps` is $t$ * `device` is the device to place constants on @@ -196,7 +196,7 @@ def __init__(self, eps_model: nn.Module, n_steps: int, device: torch.device): self.sigma2 = self.beta def q_xt_x0(self, x0: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """ + r""" #### Get $q(x_t|x_0)$ distribution \begin{align} @@ -212,7 +212,7 @@ def q_xt_x0(self, x0: torch.Tensor, t: torch.Tensor) -> Tuple[torch.Tensor, torc return mean, var def q_sample(self, x0: torch.Tensor, t: torch.Tensor, eps: Optional[torch.Tensor] = None): - """ + r""" #### Sample from $q(x_t|x_0)$ \begin{align} @@ -230,7 +230,7 @@ def q_sample(self, x0: torch.Tensor, t: torch.Tensor, eps: Optional[torch.Tensor return mean + (var ** 0.5) * eps def p_sample(self, xt: torch.Tensor, t: torch.Tensor): - """ + r""" #### Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$ \begin{align} @@ -262,7 +262,7 @@ def p_sample(self, xt: torch.Tensor, t: torch.Tensor): return mean + (var ** .5) * eps def loss(self, x0: torch.Tensor, noise: Optional[torch.Tensor] = None): - """ + r""" #### Simplified Loss $$L_{\text{simple}}(\theta) = \mathbb{E}_{t,x_0, \epsilon} \Bigg[ \bigg\Vert diff --git a/labml_nn/diffusion/ddpm/evaluate.py b/labml_nn/diffusion/ddpm/evaluate.py index 52251b61e..06456925f 100644 --- a/labml_nn/diffusion/ddpm/evaluate.py +++ b/labml_nn/diffusion/ddpm/evaluate.py @@ -26,7 +26,13 @@ class Sampler: ## Sampler class """ - def __init__(self, diffusion: DenoiseDiffusion, image_channels: int, image_size: int, device: torch.device): + def __init__( + self, + diffusion: DenoiseDiffusion, + image_channels: int, + image_size: int, + device: torch.device, + ): """ * `diffusion` is the `DenoiseDiffusion` instance * `image_channels` is the number of channels in the image @@ -63,9 +69,11 @@ def __init__(self, diffusion: DenoiseDiffusion, image_channels: int, image_size: # $$\tilde\beta_t = \frac{1 - \bar\alpha_{t-1}}{1 - \bar\alpha_t} \beta_t$$ self.beta_tilde = self.beta * (1 - alpha_bar_tm1) / (1 - self.alpha_bar) # $$\frac{\sqrt{\bar\alpha_{t-1}}\beta_t}{1 - \bar\alpha_t}$$ - self.mu_tilde_coef1 = self.beta * (alpha_bar_tm1 ** 0.5) / (1 - self.alpha_bar) + self.mu_tilde_coef1 = self.beta * (alpha_bar_tm1**0.5) / (1 - self.alpha_bar) # $$\frac{\sqrt{\alpha_t}(1 - \bar\alpha_{t-1}}{1-\bar\alpha_t}$$ - self.mu_tilde_coef2 = (self.alpha ** 0.5) * (1 - alpha_bar_tm1) / (1 - self.alpha_bar) + self.mu_tilde_coef2 = ( + (self.alpha**0.5) * (1 - alpha_bar_tm1) / (1 - self.alpha_bar) + ) # $\sigma^2 = \beta$ self.sigma2 = self.beta @@ -80,6 +88,7 @@ def show_image(self, img, title=""): def make_video(self, frames, path="video.mp4"): """Helper function to create a video""" import imageio + # 20 second video writer = imageio.get_writer(path, fps=len(frames) // 20) # Add each image @@ -91,7 +100,7 @@ def make_video(self, frames, path="video.mp4"): writer.close() def 
sample_animation(self, n_frames: int = 1000, create_video: bool = True): - """ + r""" #### Sample an image step-by-step using $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$ We sample an image step-by-step using $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$ and at each step show the estimate @@ -101,14 +110,17 @@ def sample_animation(self, n_frames: int = 1000, create_video: bool = True): """ # $x_T \sim p(x_T) = \mathcal{N}(x_T; \mathbf{0}, \mathbf{I})$ - xt = torch.randn([1, self.image_channels, self.image_size, self.image_size], device=self.device) + xt = torch.randn( + [1, self.image_channels, self.image_size, self.image_size], + device=self.device, + ) # Interval to log $\hat{x}_0$ interval = self.n_steps // n_frames # Frames for video frames = [] # Sample $T$ steps - for t_inv in monit.iterate('Denoise', self.n_steps): + for t_inv in monit.iterate("Denoise", self.n_steps): # $t$ t_ = self.n_steps - t_inv - 1 # $t$ in a tensor @@ -128,8 +140,10 @@ def sample_animation(self, n_frames: int = 1000, create_video: bool = True): if create_video: self.make_video(frames) - def interpolate(self, x1: torch.Tensor, x2: torch.Tensor, lambda_: float, t_: int = 100): - """ + def interpolate( + self, x1: torch.Tensor, x2: torch.Tensor, lambda_: float, t_: int = 100 + ): + r""" #### Interpolate two images $x_0$ and $x'_0$ We get $x_t \sim q(x_t|x_0)$ and $x'_t \sim q(x'_t|x_0)$. @@ -144,20 +158,28 @@ def interpolate(self, x1: torch.Tensor, x2: torch.Tensor, lambda_: float, t_: in * `x2` is $x'_0$ * `lambda_` is $\lambda$ * `t_` is $t$ """ # Number of samples n_samples = x1.shape[0] # $t$ tensor t = torch.full((n_samples,), t_, device=self.device) # $$\bar{x}_t = (1 - \lambda)x_t + \lambda x'_0$$ - xt = (1 - lambda_) * self.diffusion.q_sample(x1, t) + lambda_ * self.diffusion.q_sample(x2, t) + xt = (1 - lambda_) * self.diffusion.q_sample( + x1, t + ) + lambda_ * self.diffusion.q_sample(x2, t) # $$\bar{x}_0 \sim \textcolor{lightgreen}{p_\theta}(x_0|\bar{x}_t)$$ return self._sample_x0(xt, t_) - def interpolate_animate(self, x1: torch.Tensor, x2: torch.Tensor, n_frames: int = 100, t_: int = 100, - create_video=True): + def interpolate_animate( + self, + x1: torch.Tensor, + x2: torch.Tensor, + n_frames: int = 100, + t_: int = 100, + create_video=True, + ): """ #### Interpolate two images $x_0$ and $x'_0$ and make a video @@ -183,7 +205,7 @@ def interpolate_animate(self, x1: torch.Tensor, x2: torch.Tensor, n_frames: int frames = [] # Get frames with different $\lambda$ - for i in monit.iterate('Interpolate', n_frames + 1, is_children_silent=True): + for i in monit.iterate("Interpolate", n_frames + 1, is_children_silent=True): # $\lambda$ lambda_ = i / n_frames # $$\bar{x}_t = (1 - \lambda)x_t + \lambda x'_0$$ @@ -206,15 +228,17 @@ def _sample_x0(self, xt: torch.Tensor, n_steps: int): * `xt` is $x_t$ * `n_steps` is $t$ """ # Number of sampels n_samples = xt.shape[0] # Iterate until $t$ steps - for t_ in monit.iterate('Denoise', n_steps): + for t_ in monit.iterate("Denoise", n_steps): t = n_steps - t_ - 1 # Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$ - xt = self.diffusion.p_sample(xt, xt.new_full((n_samples,), t, dtype=torch.long)) + xt = self.diffusion.p_sample( + xt, xt.new_full((n_samples,),
t, dtype=torch.long) + ) # Return $x_0$ return xt @@ -222,9 +246,12 @@ def sample(self, n_samples: int = 16): """ #### Generate images """ # $x_T \sim p(x_T) = \mathcal{N}(x_T; \mathbf{0}, \mathbf{I})$ - xt = torch.randn([n_samples, self.image_channels, self.image_size, self.image_size], device=self.device) + xt = torch.randn( + [n_samples, self.image_channels, self.image_size, self.image_size], + device=self.device, + ) # $$x_0 \sim \textcolor{lightgreen}{p_\theta}(x_0|x_t)$$ x0 = self._sample_x0(xt, self.n_steps) @@ -234,7 +261,7 @@ def sample(self, n_samples: int = 16): self.show_image(x0[i]) def p_sample(self, xt: torch.Tensor, t: torch.Tensor, eps_theta: torch.Tensor): - """ + r""" #### Sample from $\textcolor{lightgreen}{p_\theta}(x_{t-1}|x_t)$ \begin{align} @@ -244,23 +271,23 @@ def p_sample(self, xt: torch.Tensor, t: torch.Tensor, eps_theta: torch.Tensor): &= \frac{1}{\sqrt{\alpha_t}} \Big(x_t - \frac{\beta_t}{\sqrt{1-\bar\alpha_t}}\textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big) \end{align} """ # [gather](utils.html) $\bar\alpha_t$ alpha_bar = gather(self.alpha_bar, t) # $\alpha_t$ alpha = gather(self.alpha, t) # $\frac{\beta}{\sqrt{1-\bar\alpha_t}}$ - eps_coef = (1 - alpha) / (1 - alpha_bar) ** .5 + eps_coef = (1 - alpha) / (1 - alpha_bar) ** 0.5 # $$\frac{1}{\sqrt{\alpha_t}} \Big(x_t - # \frac{\beta_t}{\sqrt{1-\bar\alpha_t}}\textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big)$$ - mean = 1 / (alpha ** 0.5) * (xt - eps_coef * eps_theta) + mean = 1 / (alpha**0.5) * (xt - eps_coef * eps_theta) # $\sigma^2$ var = gather(self.sigma2, t) # $\epsilon \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$ eps = torch.randn(xt.shape, device=xt.device) # Sample - return mean + (var ** .5) * eps + return mean + (var**0.5) * eps def p_x0(self, xt: torch.Tensor, t: torch.Tensor, eps: torch.Tensor): """ @@ -268,13 +295,13 @@ def p_x0(self, xt: torch.Tensor, t: torch.Tensor, eps: torch.Tensor): $$x_0 \approx \hat{x}_0 = \frac{1}{\sqrt{\bar\alpha}} \Big( x_t - \sqrt{1 - \bar\alpha_t} \textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big)$$ """ # [gather](utils.html) $\bar\alpha_t$ alpha_bar = gather(self.alpha_bar, t) # $$x_0 \approx \hat{x}_0 = \frac{1}{\sqrt{\bar\alpha}} # \Big( x_t - \sqrt{1 - \bar\alpha_t} \textcolor{lightgreen}{\epsilon_\theta}(x_t, t) \Big)$$ - return (xt - (1 - alpha_bar) ** 0.5 * eps) / (alpha_bar ** 0.5) + return (xt - (1 - alpha_bar) ** 0.5 * eps) / (alpha_bar**0.5) def main(): @@ -297,16 +324,18 @@ def main(): configs.init() # Set PyTorch modules for saving and loading - experiment.add_pytorch_models({'eps_model': configs.eps_model}) + experiment.add_pytorch_models({"eps_model": configs.eps_model}) # Load training experiment experiment.load(run_uuid) # Create sampler - sampler = Sampler(diffusion=configs.diffusion, - image_channels=configs.image_channels, - image_size=configs.image_size, - device=configs.device) + sampler = Sampler( + diffusion=configs.diffusion, + image_channels=configs.image_channels, + image_size=configs.image_size, + device=configs.device, + ) # Start evaluation with experiment.start(): @@ -324,5 +353,5 @@ def main(): # -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/labml_nn/diffusion/ddpm/unet.py b/labml_nn/diffusion/ddpm/unet.py index f5da80901..587c619d0 100644 --- a/labml_nn/diffusion/ddpm/unet.py +++ b/labml_nn/diffusion/ddpm/unet.py @@ -1,4 +1,4 @@ -""" +r""" --- title: U-Net model for Denoising Diffusion Probabilistic Models (DDPM) summary:
> @@ -29,7 +29,7 @@ class Swish(nn.Module): - """ + r""" ### Swish activation function $$x \cdot \sigma(x)$$ @@ -272,7 +272,7 @@ def forward(self, x: torch.Tensor, t: torch.Tensor): class Upsample(nn.Module): - """ + r""" ### Scale up the feature map by $2 \times$ """ @@ -288,7 +288,7 @@ def forward(self, x: torch.Tensor, t: torch.Tensor): class Downsample(nn.Module): - """ + r""" ### Scale down the feature map by $\frac{1}{2} \times$ """ diff --git a/labml_nn/diffusion/stable_diffusion/latent_diffusion.py b/labml_nn/diffusion/stable_diffusion/latent_diffusion.py index d7f9ecd1b..330ad2e04 100644 --- a/labml_nn/diffusion/stable_diffusion/latent_diffusion.py +++ b/labml_nn/diffusion/stable_diffusion/latent_diffusion.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Latent Diffusion Models summary: > @@ -70,7 +70,7 @@ def __init__(self, linear_start: float, linear_end: float, ): - """ + r""" :param unet_model: is the [U-Net](model/unet.html) that predicts noise $\epsilon_\text{cond}(x_t, c)$, in latent space :param autoencoder: is the [AutoEncoder](model/autoencoder.html) @@ -134,7 +134,7 @@ def autoencoder_decode(self, z: torch.Tensor): return self.first_stage_model.decode(z / self.latent_scaling_factor) def forward(self, x: torch.Tensor, t: torch.Tensor, context: torch.Tensor): - """ + r""" ### Predict noise Predict noise given the latent representation $x_t$, time step $t$, and the diff --git a/labml_nn/diffusion/stable_diffusion/model/autoencoder.py b/labml_nn/diffusion/stable_diffusion/model/autoencoder.py index badc6cf85..ec3784060 100644 --- a/labml_nn/diffusion/stable_diffusion/model/autoencoder.py +++ b/labml_nn/diffusion/stable_diffusion/model/autoencoder.py @@ -416,7 +416,7 @@ def forward(self, x: torch.Tensor): def swish(x: torch.Tensor): - """ + r""" ### Swish activation $$x \cdot \sigma(x)$$ diff --git a/labml_nn/diffusion/stable_diffusion/model/unet.py b/labml_nn/diffusion/stable_diffusion/model/unet.py index 261a4bced..4eb10afeb 100644 --- a/labml_nn/diffusion/stable_diffusion/model/unet.py +++ b/labml_nn/diffusion/stable_diffusion/model/unet.py @@ -1,4 +1,4 @@ -""" +r""" --- title: U-Net for Stable Diffusion summary: > diff --git a/labml_nn/diffusion/stable_diffusion/model/unet_attention.py b/labml_nn/diffusion/stable_diffusion/model/unet_attention.py index cf42efa4f..ef2044057 100644 --- a/labml_nn/diffusion/stable_diffusion/model/unet_attention.py +++ b/labml_nn/diffusion/stable_diffusion/model/unet_attention.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Transformer for Stable Diffusion U-Net summary: > @@ -291,7 +291,7 @@ def forward(self, x: torch.Tensor): class GeGLU(nn.Module): - """ + r""" ### GeGLU Activation $$\text{GeGLU}(x) = (xW + b) * \text{GELU}(xV + c)$$ diff --git a/labml_nn/diffusion/stable_diffusion/sampler/__init__.py b/labml_nn/diffusion/stable_diffusion/sampler/__init__.py index 38c063e06..75e71bbf6 100644 --- a/labml_nn/diffusion/stable_diffusion/sampler/__init__.py +++ b/labml_nn/diffusion/stable_diffusion/sampler/__init__.py @@ -29,7 +29,7 @@ class DiffusionSampler: model: LatentDiffusion def __init__(self, model: LatentDiffusion): - """ + r""" :param model: is the model to predict noise $\epsilon_\text{cond}(x_t, c)$ """ super().__init__() @@ -40,7 +40,7 @@ def __init__(self, model: LatentDiffusion): def get_eps(self, x: torch.Tensor, t: torch.Tensor, c: torch.Tensor, *, uncond_scale: float, uncond_cond: Optional[torch.Tensor]): - """ + r""" ## Get $\epsilon(x_t, c)$ :param x: is $x_t$ of shape `[batch_size, channels, height, width]` @@ -79,7 +79,7 @@ def 
sample(self, uncond_cond: Optional[torch.Tensor] = None, skip_steps: int = 0, ): - """ + r""" ### Sampling Loop :param shape: is the shape of the generated images in the @@ -100,7 +100,7 @@ def paint(self, x: torch.Tensor, cond: torch.Tensor, t_start: int, *, uncond_scale: float = 1., uncond_cond: Optional[torch.Tensor] = None, ): - """ + r""" ### Painting Loop :param x: is $x_{T'}$ of shape `[batch_size, channels, height, width]` @@ -116,7 +116,7 @@ def paint(self, x: torch.Tensor, cond: torch.Tensor, t_start: int, *, raise NotImplementedError() def q_sample(self, x0: torch.Tensor, index: int, noise: Optional[torch.Tensor] = None): - """ + r""" ### Sample from $q(x_t|x_0)$ :param x0: is $x_0$ of shape `[batch_size, channels, height, width]` diff --git a/labml_nn/diffusion/stable_diffusion/sampler/ddim.py b/labml_nn/diffusion/stable_diffusion/sampler/ddim.py index 04a8837f3..fb36ab521 100644 --- a/labml_nn/diffusion/stable_diffusion/sampler/ddim.py +++ b/labml_nn/diffusion/stable_diffusion/sampler/ddim.py @@ -24,7 +24,7 @@ class DDIMSampler(DiffusionSampler): - """ + r""" ## DDIM Sampler This extends the [`DiffusionSampler` base class](index.html). @@ -52,7 +52,7 @@ class DDIMSampler(DiffusionSampler): model: LatentDiffusion def __init__(self, model: LatentDiffusion, n_steps: int, ddim_discretize: str = "uniform", ddim_eta: float = 0.): - """ + r""" :param model: is the model to predict noise $\epsilon_\text{cond}(x_t, c)$ :param n_steps: is the number of DDIM sampling steps, $S$ :param ddim_discretize: specifies how to extract $\tau$ from $[1,2,\dots,T]$. @@ -106,7 +106,7 @@ def sample(self, uncond_cond: Optional[torch.Tensor] = None, skip_steps: int = 0, ): - """ + r""" ### Sampling Loop :param shape: is the shape of the generated images in the @@ -153,7 +153,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int, temperature: float = 1., uncond_scale: float = 1., uncond_cond: Optional[torch.Tensor] = None): - """ + r""" ### Sample $x_{\tau_{i-1}}$ :param x: is $x_{\tau_i}$ of shape `[batch_size, channels, height, width]` @@ -184,7 +184,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int, def get_x_prev_and_pred_x0(self, e_t: torch.Tensor, index: int, x: torch.Tensor, *, temperature: float, repeat_noise: bool): - """ + r""" ### Sample $x_{\tau_{i-1}}$ given $\epsilon_\theta(x_{\tau_i})$ """ @@ -231,7 +231,7 @@ def get_x_prev_and_pred_x0(self, e_t: torch.Tensor, index: int, x: torch.Tensor, @torch.no_grad() def q_sample(self, x0: torch.Tensor, index: int, noise: Optional[torch.Tensor] = None): - """ + r""" ### Sample from $q_{\sigma,\tau}(x_{\tau_i}|x_0)$ $$q_{\sigma,\tau}(x_t|x_0) = @@ -258,7 +258,7 @@ def paint(self, x: torch.Tensor, cond: torch.Tensor, t_start: int, *, uncond_scale: float = 1., uncond_cond: Optional[torch.Tensor] = None, ): - """ + r""" ### Painting Loop :param x: is $x_{S'}$ of shape `[batch_size, channels, height, width]` diff --git a/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py b/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py index f591e2b65..ffa545abf 100644 --- a/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py +++ b/labml_nn/diffusion/stable_diffusion/sampler/ddpm.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Denoising Diffusion Probabilistic Models (DDPM) Sampling summary: > @@ -24,7 +24,7 @@ class DDPMSampler(DiffusionSampler): - """ + r""" ## DDPM Sampler This extends the [`DiffusionSampler` base class](index.html). 
@@ -49,7 +49,7 @@ class DDPMSampler(DiffusionSampler): model: LatentDiffusion def __init__(self, model: LatentDiffusion): - """ + r""" :param model: is the model to predict noise $\epsilon_\text{cond}(x_t, c)$ """ super().__init__(model) @@ -94,7 +94,7 @@ def sample(self, uncond_cond: Optional[torch.Tensor] = None, skip_steps: int = 0, ): - """ + r""" ### Sampling Loop :param shape: is the shape of the generated images in the @@ -139,7 +139,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int, repeat_noise: bool = False, temperature: float = 1., uncond_scale: float = 1., uncond_cond: Optional[torch.Tensor] = None): - """ + r""" ### Sample $x_{t-1}$ from $p_\theta(x_{t-1} | x_t)$ :param x: is $x_t$ of shape `[batch_size, channels, height, width]` @@ -208,7 +208,7 @@ def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int, @torch.no_grad() def q_sample(self, x0: torch.Tensor, index: int, noise: Optional[torch.Tensor] = None): - """ + r""" ### Sample from $q(x_t|x_0)$ $$q(x_t|x_0) = \mathcal{N} \Big(x_t; \sqrt{\bar\alpha_t} x_0, (1-\bar\alpha_t) \mathbf{I} \Big)$$ diff --git a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py index ef3aab4d2..8e4fec81c 100644 --- a/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py +++ b/labml_nn/diffusion/stable_diffusion/scripts/image_to_image.py @@ -26,7 +26,7 @@ class Img2Img: def __init__(self, *, checkpoint_path: Path, ddim_steps: int = 50, ddim_eta: float = 0.0): - """ + r""" :param checkpoint_path: is the path of the checkpoint :param ddim_steps: is the number of sampling steps :param ddim_eta: is the [DDIM sampling](../sampler/ddim.html) $\eta$ constant @@ -54,7 +54,7 @@ def __call__(self, *, prompt: str, uncond_scale: float = 5.0, ): - """ + r""" :param dest_path: is the path to store the generated images :param orig_img: is the image to transform :param strength: specifies how much of the original image should not be preserved diff --git a/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py b/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py index a3504ed80..cdf731dc7 100644 --- a/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py +++ b/labml_nn/diffusion/stable_diffusion/scripts/in_paint.py @@ -31,7 +31,7 @@ class InPaint: def __init__(self, *, checkpoint_path: Path, ddim_steps: int = 50, ddim_eta: float = 0.0): - """ + r""" :param checkpoint_path: is the path of the checkpoint :param ddim_steps: is the number of sampling steps :param ddim_eta: is the [DDIM sampling](../sampler/ddim.html) $\eta$ constant @@ -60,7 +60,7 @@ def __call__(self, *, uncond_scale: float = 5.0, mask: Optional[torch.Tensor] = None, ): - """ + r""" :param dest_path: is the path to store the generated images :param orig_img: is the image to transform :param strength: specifies how much of the original image should not be preserved diff --git a/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py b/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py index aee342bbb..30ab64ffe 100644 --- a/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py +++ b/labml_nn/diffusion/stable_diffusion/scripts/text_to_image.py @@ -33,7 +33,7 @@ def __init__(self, *, n_steps: int = 50, ddim_eta: float = 0.0, ): - """ + r""" :param checkpoint_path: is the path of the checkpoint :param sampler_name: is the name of the [sampler](../sampler/index.html) :param n_steps: is the number of sampling steps @@ -62,7 +62,7 @@ def 
__call__(self, *, h: int = 512, w: int = 512, uncond_scale: float = 7.5, ): - """ + r""" :param dest_path: is the path to store the generated images :param batch_size: is the number of images to generate in a batch :param prompt: is the prompt to generate images with diff --git a/labml_nn/distillation/__init__.py b/labml_nn/distillation/__init__.py index a8d0d11b5..72708117a 100644 --- a/labml_nn/distillation/__init__.py +++ b/labml_nn/distillation/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Distilling the Knowledge in a Neural Network summary: > diff --git a/labml_nn/gan/cycle_gan/__init__.py b/labml_nn/gan/cycle_gan/__init__.py index 0a78e2613..3bd1c6178 100644 --- a/labml_nn/gan/cycle_gan/__init__.py +++ b/labml_nn/gan/cycle_gan/__init__.py @@ -188,7 +188,7 @@ def forward(self, x: torch.Tensor): def weights_init_normal(m): - """ + r""" Initialize convolution layer weights to $\mathcal{N}(0, 0.2)$ """ classname = m.__class__.__name__ @@ -436,7 +436,7 @@ def initialize(self): ) def run(self): - """ + r""" ## Training We aim to solve: diff --git a/labml_nn/gan/original/__init__.py b/labml_nn/gan/original/__init__.py index 27eb1c650..dff35f8de 100644 --- a/labml_nn/gan/original/__init__.py +++ b/labml_nn/gan/original/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Generative Adversarial Networks (GAN) summary: A simple PyTorch implementation/tutorial of Generative Adversarial Networks (GAN) loss functions. @@ -38,7 +38,7 @@ class DiscriminatorLogitsLoss(nn.Module): - """ + r""" ## Discriminator Loss Discriminator should **ascend** on the gradient, @@ -75,7 +75,7 @@ def __init__(self, smoothing: float = 0.2): self.register_buffer('labels_false', _create_labels(256, 0.0, smoothing), False) def forward(self, logits_true: torch.Tensor, logits_false: torch.Tensor): - """ + r""" `logits_true` are logits from $D(\pmb{x}^{(i)})$ and `logits_false` are logits from $D(G(\pmb{z}^{(i)}))$ """ @@ -91,7 +91,7 @@ def forward(self, logits_true: torch.Tensor, logits_false: torch.Tensor): class GeneratorLogitsLoss(nn.Module): - """ + r""" ## Generator Loss Generator should **descend** on the gradient, diff --git a/labml_nn/gan/original/experiment.py b/labml_nn/gan/original/experiment.py index 71789df71..dbb621795 100644 --- a/labml_nn/gan/original/experiment.py +++ b/labml_nn/gan/original/experiment.py @@ -115,7 +115,7 @@ def init(self): tracker.set_image("generated", True, 1 / 100) def sample_z(self, batch_size: int): - """ + r""" $$z \sim p(z)$$ """ return torch.randn(batch_size, 100, device=self.device) diff --git a/labml_nn/gan/stylegan/__init__.py b/labml_nn/gan/stylegan/__init__.py index c1c36bbe0..528cee11b 100644 --- a/labml_nn/gan/stylegan/__init__.py +++ b/labml_nn/gan/stylegan/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: StyleGAN 2 summary: > @@ -156,7 +156,7 @@ class MappingNetwork(nn.Module): - """ + r""" ## Mapping Network @@ -212,7 +212,7 @@ class Generator(nn.Module): """ def __init__(self, log_resolution: int, d_latent: int, n_features: int = 32, max_features: int = 512): - """ + r""" * `log_resolution` is the $\log_2$ of image resolution * `d_latent` is the dimensionality of $w$ * `n_features` number of features in the convolution layer at the highest resolution (final block) @@ -276,7 +276,7 @@ def forward(self, w: torch.Tensor, input_noise: List[Tuple[Optional[torch.Tensor class GeneratorBlock(nn.Module): - """ + r""" ### Generator Block @@ -379,7 +379,7 @@ def forward(self, x: torch.Tensor, w: torch.Tensor, noise: Optional[torch.Tensor class ToRGB(nn.Module): - """ + r""" 
### To RGB @@ -430,7 +430,7 @@ class Conv2dWeightModulate(nn.Module): def __init__(self, in_features: int, out_features: int, kernel_size: int, demodulate: float = True, eps: float = 1e-8): - """ + r""" * `in_features` is the number of features in the input feature map * `out_features` is the number of features in the output feature map * `kernel_size` is the size of the convolution kernel @@ -492,7 +492,7 @@ def forward(self, x: torch.Tensor, s: torch.Tensor): class Discriminator(nn.Module): - """ + r""" ## StyleGAN 2 Discriminator @@ -506,7 +506,7 @@ class Discriminator(nn.Module): def __init__(self, log_resolution: int, n_features: int = 64, max_features: int = 512): - """ + r""" * `log_resolution` is the $\log_2$ of image resolution * `n_features` number of features in the convolution layer at the highest resolution (first block) * `max_features` maximum number of features in any generator block @@ -561,7 +561,7 @@ def forward(self, x: torch.Tensor): class DiscriminatorBlock(nn.Module): - """ + r""" ### Discriminator Block @@ -653,7 +653,7 @@ def forward(self, x: torch.Tensor): class DownSample(nn.Module): - """ + r""" ### Down-sample @@ -677,7 +677,7 @@ def forward(self, x: torch.Tensor): class UpSample(nn.Module): - """ + r""" ### Up-sample @@ -797,7 +797,7 @@ def forward(self, x: torch.Tensor): class EqualizedWeight(nn.Module): - """ + r""" ## Learning-rate Equalized Weights Parameter @@ -835,7 +835,7 @@ def forward(self): class GradientPenalty(nn.Module): - """ + r""" ## Gradient Penalty @@ -851,7 +851,7 @@ class GradientPenalty(nn.Module): """ def forward(self, x: torch.Tensor, d: torch.Tensor): - """ + r""" * `x` is $x \sim \mathcal{D}$ * `d` is $D(x)$ """ @@ -877,7 +877,7 @@ def forward(self, x: torch.Tensor, d: torch.Tensor): class PathLengthPenalty(nn.Module): - """ + r""" ## Path Length Penalty @@ -901,7 +901,7 @@ class PathLengthPenalty(nn.Module): def __init__(self, beta: float): - """ + r""" * `beta` is the constant $\beta$ used to calculate the exponential moving average $a$ """ super().__init__() diff --git a/labml_nn/gan/stylegan/experiment.py b/labml_nn/gan/stylegan/experiment.py index 7a33aba9b..621e8f859 100644 --- a/labml_nn/gan/stylegan/experiment.py +++ b/labml_nn/gan/stylegan/experiment.py @@ -1,4 +1,4 @@ -""" +r""" --- title: StyleGAN 2 Model Training summary: > diff --git a/labml_nn/graphs/gat/__init__.py b/labml_nn/graphs/gat/__init__.py index 81ae9eaca..b8eb71ce2 100644 --- a/labml_nn/graphs/gat/__init__.py +++ b/labml_nn/graphs/gat/__init__.py @@ -30,7 +30,7 @@ class GraphAttentionLayer(nn.Module): - """ + r""" ## Graph attention layer This is a single graph attention layer. @@ -82,7 +82,7 @@ def __init__(self, in_features: int, out_features: int, n_heads: int, self.dropout = nn.Dropout(dropout) def forward(self, h: torch.Tensor, adj_mat: torch.Tensor): - """ + r""" * `h`, $\mathbf{h}$ is the input node embeddings of shape `[n_nodes, in_features]`.
* `adj_mat` is the adjacency matrix of shape `[n_nodes, n_nodes, n_heads]`. We use shape `[n_nodes, n_nodes, 1]` since the adjacency is the same for each head. diff --git a/labml_nn/graphs/gatv2/__init__.py b/labml_nn/graphs/gatv2/__init__.py index f306bb0be..60efa9656 100644 --- a/labml_nn/graphs/gatv2/__init__.py +++ b/labml_nn/graphs/gatv2/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Graph Attention Networks v2 (GATv2) summary: > @@ -60,7 +60,7 @@ class GraphAttentionV2Layer(nn.Module): - """ + r""" ## Graph attention v2 layer This is a single graph attention v2 layer. A GATv2 is made up of multiple such layers. @@ -119,7 +119,7 @@ def __init__(self, in_features: int, out_features: int, n_heads: int, self.dropout = nn.Dropout(dropout) def forward(self, h: torch.Tensor, adj_mat: torch.Tensor): - """ + r""" * `h`, $\mathbf{h}$ is the input node embeddings of shape `[n_nodes, in_features]`. * `adj_mat` is the adjacency matrix of shape `[n_nodes, n_nodes, n_heads]`. We use shape `[n_nodes, n_nodes, 1]` since the adjacency is the same for each head. diff --git a/labml_nn/hypernetworks/hyper_lstm.py b/labml_nn/hypernetworks/hyper_lstm.py index 917baf4d1..ccd84ed42 100644 --- a/labml_nn/hypernetworks/hyper_lstm.py +++ b/labml_nn/hypernetworks/hyper_lstm.py @@ -1,4 +1,4 @@ -""" +r""" --- title: HyperNetworks - HyperLSTM summary: A PyTorch implementation/tutorial of HyperLSTM introduced in paper HyperNetworks. @@ -223,7 +223,7 @@ def __init__(self, input_size: int, hidden_size: int, hyper_size: int, n_z: int, def forward(self, x: torch.Tensor, state: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]] = None): - """ + r""" * `x` has shape `[n_steps, batch_size, input_size]` and * `state` is a tuple of $h, c, \hat{h}, \hat{c}$. $h, c$ have shape `[batch_size, hidden_size]` and diff --git a/labml_nn/lora/__init__.py b/labml_nn/lora/__init__.py index bd3c42c7b..355707337 100644 --- a/labml_nn/lora/__init__.py +++ b/labml_nn/lora/__init__.py @@ -26,7 +26,7 @@ class Linear(nn.Module): - """ + r""" ## LoRA Linear Layer LoRA linear layer adds a low-rank decomposition to the pre-trained @@ -48,7 +48,7 @@ class Linear(nn.Module): def __init__(self, in_features: int, out_features: int, bias: bool, r: int, alpha: int = None): - """ + r""" :param in_features: is the number of input features of the linear layer :param out_features: is the number of output features of the linear layer :param bias: is a flag indicating if there is a bias parameter @@ -99,7 +99,7 @@ def forward(self, x: torch.Tensor): class Embedding(nn.Module): - """ + r""" ## LoRA Embedding Layer Similar to LoRA linear layer this adds a low-rank decomposition to the pre-trained @@ -110,7 +110,7 @@ class Embedding(nn.Module): def __init__(self, num_embeddings: int, embedding_dim: int, r: int, alpha: int = None): - """ + r""" :param num_embeddings: is the number of embeddings :param embedding_dim: is the number embedding dimensions diff --git a/labml_nn/lstm/__init__.py b/labml_nn/lstm/__init__.py index 29edba75c..6d76b6378 100644 --- a/labml_nn/lstm/__init__.py +++ b/labml_nn/lstm/__init__.py @@ -17,7 +17,7 @@ class LSTMCell(nn.Module): - """ + r""" ## Long Short-Term Memory Cell LSTM Cell computes $c$, and $h$. 
$c$ is like the long-term memory, diff --git a/labml_nn/neox/model.py b/labml_nn/neox/model.py index 295afd05d..1e813a89c 100644 --- a/labml_nn/neox/model.py +++ b/labml_nn/neox/model.py @@ -73,7 +73,7 @@ class RoPE(nn.Module): """ def __init__(self, d_rope: int, base: float = 10_000.): - """ + r""" :param d_rope: is the number of features for RoPE embeddings :param base: is the base for $\theta_i = 10000^{\frac{2(i-1)}{d}}$, which defaults to $10000$ """ @@ -92,7 +92,7 @@ def __init__(self, d_rope: int, base: float = 10_000.): @staticmethod def rotate_half(x: torch.Tensor): - """ + r""" ### Rotate the features $[-x^{(\frac{d}{2} + 1)}, -x^{(\frac{d}{2} + 2)}, ..., -x^{(d)}, x^{(1)}, x^{(2)}, ..., -x^{(\frac{d}{2})}]$ @@ -101,7 +101,7 @@ def rotate_half(x: torch.Tensor): return torch.cat((-x2, x1), dim=-1) def forward(self, x: torch.Tensor, offset: int = 0): - """ + r""" :param x: has shape `[..., seq, n_heads, d_k]` :param offset: is the starting position of `x`. This is $\gt 0$ when we have cached the keys and queries of previous positions @@ -513,7 +513,7 @@ def __init__(self, *, n_vocab: int = 50_432, n_hidden: int = 6_144, llm_int8_threshold: float = 6.0, is_flash_attention: bool = False ): - """ + r""" ### Generator to create layers The layers are generated in the same order as checkpoints. @@ -571,7 +571,7 @@ def post_load_prepare(self, layer: NeoXModule, *, device: torch.device = None, llm_int8_threshold: float = None, ): - """ + r""" ### Layer transformations after loading the checkpoint diff --git a/labml_nn/neox/utils/llm_int8.py b/labml_nn/neox/utils/llm_int8.py index cd8420855..47c349233 100644 --- a/labml_nn/neox/utils/llm_int8.py +++ b/labml_nn/neox/utils/llm_int8.py @@ -41,7 +41,7 @@ def make_llm_int8_linear(linear_module: nn.Linear, device: torch.device, threshold: float = 6.0): - """ + r""" ## Transform a `nn.Linear` layer to LLM.int8() linear layer :param linear_module: is the `nn.Linear` layer to transform diff --git a/labml_nn/normalization/batch_channel_norm/__init__.py b/labml_nn/normalization/batch_channel_norm/__init__.py index 846361087..4a16dfe2f 100644 --- a/labml_nn/normalization/batch_channel_norm/__init__.py +++ b/labml_nn/normalization/batch_channel_norm/__init__.py @@ -40,7 +40,7 @@ class BatchChannelNorm(nn.Module): def __init__(self, channels: int, groups: int, eps: float = 1e-5, momentum: float = 0.1, estimate: bool = True): - """ + r""" * `channels` is the number of features in the input * `groups` is the number of groups the features are divided into * `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability @@ -66,7 +66,7 @@ def forward(self, x): class EstimatedBatchNorm(nn.Module): - """ + r""" ## Estimated Batch Normalization When input $X \in \mathbb{R}^{B \times C \times H \times W}$ is a batch of image representations, @@ -88,7 +88,7 @@ class EstimatedBatchNorm(nn.Module): """ def __init__(self, channels: int, eps: float = 1e-5, momentum: float = 0.1, affine: bool = True): - """ + r""" * `channels` is the number of features in the input * `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for numerical stability * `momentum` is the momentum in taking the exponential moving average @@ -174,7 +174,7 @@ class ChannelNorm(nn.Module): def __init__(self, channels, groups, eps: float = 1e-5, affine: bool = True): - """ + r""" * `groups` is the number of groups the features are divided into * `channels` is the number of features in the input * `eps` is $\epsilon$, used in $\sqrt{Var[x^{(k)}] + \epsilon}$ for 
numerical stability diff --git a/labml_nn/normalization/batch_norm/__init__.py b/labml_nn/normalization/batch_norm/__init__.py index 1471b807f..8001f1bb7 100644 --- a/labml_nn/normalization/batch_norm/__init__.py +++ b/labml_nn/normalization/batch_norm/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Batch Normalization summary: > diff --git a/labml_nn/normalization/deep_norm/__init__.py b/labml_nn/normalization/deep_norm/__init__.py index fcec5bd82..414a84398 100644 --- a/labml_nn/normalization/deep_norm/__init__.py +++ b/labml_nn/normalization/deep_norm/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: DeepNorm summary: > @@ -82,7 +82,7 @@ class DeepNorm(nn.Module): - """ + r""" ## DeepNorm Normalization $$x_{l + 1} = \mathop{LN}\Big( \alpha x_l + \mathop{G}_l \big(x_l, \theta_l \big)\Big)$$ @@ -91,7 +91,7 @@ class DeepNorm(nn.Module): def __init__(self, alpha: float, normalized_shape: Union[int, List[int], Size], *, eps: float = 1e-5, elementwise_affine: bool = True): - """ + r""" :param alpha: is $\alpha$ :param normalized_shape: is the shape for LayerNorm $\mathop{LN}$ :param eps: is $\epsilon$ for LayerNorm @@ -104,7 +104,7 @@ def __init__(self, alpha: float, normalized_shape: Union[int, List[int], Size], self.layer_norm = LayerNorm(normalized_shape, eps=eps, elementwise_affine=elementwise_affine) def forward(self, x: torch.Tensor, gx: torch.Tensor): - """ + r""" :param x: is the output from the previous layer $x_l$ :param gx: is the output of the current sub-layer $\mathop{G}_l (x_l, \theta_l)$ """ @@ -126,7 +126,7 @@ def __init__(self, *, deep_norm_alpha: float, deep_norm_beta: float, ): - """ + r""" :param d_model: is the token embedding size :param self_attn: is the self attention module :param feed_forward: is the feed forward module diff --git a/labml_nn/normalization/deep_norm/experiment.py b/labml_nn/normalization/deep_norm/experiment.py index 9fcdadc64..1819453e1 100644 --- a/labml_nn/normalization/deep_norm/experiment.py +++ b/labml_nn/normalization/deep_norm/experiment.py @@ -89,7 +89,7 @@ class Configs(NLPAutoRegressionConfigs): @option(Configs.deep_norm_alpha) def _deep_norm_alpha(c: Configs): - """ + r""" #### Calculate $\alpha$ $\alpha = (2M)^{\frac{1}{4}}$ @@ -99,7 +99,7 @@ class Configs(NLPAutoRegressionConfigs): @option(Configs.deep_norm_beta) def _deep_norm_beta(c: Configs): - """ + r""" #### Calculate $\beta$ $\beta = (8M)^{-\frac{1}{4}}$ diff --git a/labml_nn/normalization/group_norm/__init__.py b/labml_nn/normalization/group_norm/__init__.py index e9a87ac92..3c20840ff 100644 --- a/labml_nn/normalization/group_norm/__init__.py +++ b/labml_nn/normalization/group_norm/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Group Normalization summary: >
diff --git a/labml_nn/normalization/instance_norm/__init__.py b/labml_nn/normalization/instance_norm/__init__.py index c7db3adc9..937d69c80 100644 --- a/labml_nn/normalization/instance_norm/__init__.py +++ b/labml_nn/normalization/instance_norm/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Instance Normalization summary: > diff --git a/labml_nn/normalization/weight_standardization/__init__.py b/labml_nn/normalization/weight_standardization/__init__.py index 2fb3009b5..8165f9a9a 100644 --- a/labml_nn/normalization/weight_standardization/__init__.py +++ b/labml_nn/normalization/weight_standardization/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Weight Standardization summary: > diff --git a/labml_nn/optimizers/__init__.py b/labml_nn/optimizers/__init__.py index 172854a83..6cd5361f4 100644 --- a/labml_nn/optimizers/__init__.py +++ b/labml_nn/optimizers/__init__.py @@ -73,7 +73,7 @@ class GenericAdaptiveOptimizer(Optimizer): """ def __init__(self, params, defaults: Dict[str, Any], lr: float, betas: Tuple[float, float], eps: float): - """ + r""" ### Initialize * `params` is the collection of parameters or set of parameter groups.
@@ -109,7 +109,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par pass def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.Tensor): - """ + r""" ### Take optimizer step on a parameter tensor This should be overridden and take the optimization step on `param` tensor $\theta$, diff --git a/labml_nn/optimizers/ada_belief.py b/labml_nn/optimizers/ada_belief.py index d33b1b4a8..b3d33fab7 100644 --- a/labml_nn/optimizers/ada_belief.py +++ b/labml_nn/optimizers/ada_belief.py @@ -1,4 +1,4 @@ -""" +r""" --- title: AdaBelief optimizer summary: A simple PyTorch implementation/tutorial of AdaBelief optimizer. @@ -53,7 +53,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, weight_decay: WeightDecay = WeightDecay(), amsgrad=False, degenerate_to_sgd=True, rectify=True, defaults=None): - """ + r""" ### Initialize the optimizer * `params` is the list of parameters @@ -75,7 +75,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, self.rectify = rectify def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter): - """ + r""" ### Initialize a parameter state * `state` is the optimizer state of the parameter (tensor) @@ -95,7 +95,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par state['max_exp_avg_var'] = torch.zeros_like(param, memory_format=torch.preserve_format) def get_ms(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tensor): - """ + r""" ### Calculate $m_t$ and $s_t$ or $\max(s_1, s_2, ..., s_{t-1}, s_t)$ * `state` is the optimizer state of the parameter (tensor) @@ -131,7 +131,7 @@ def get_ms(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tenso return m, s def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter): - """ + r""" ### Take an update step for a given parameter tensor * `state` is the optimizer state of the parameter (tensor) diff --git a/labml_nn/optimizers/adam.py b/labml_nn/optimizers/adam.py index 568be4d55..24b115dc8 100644 --- a/labml_nn/optimizers/adam.py +++ b/labml_nn/optimizers/adam.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Adam Optimizer summary: A simple PyTorch implementation/tutorial of Adam optimizer @@ -60,7 +60,7 @@ def __init__(self, params, weight_decay: WeightDecay = WeightDecay(), optimized_update: bool = True, defaults: Optional[Dict[str, Any]] = None): - """ + r""" ### Initialize the optimizer * `params` is the list of parameters @@ -81,7 +81,7 @@ def __init__(self, params, self.optimized_update = optimized_update def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter): - """ + r""" ### Initialize a parameter state * `state` is the optimizer state of the parameter (tensor) @@ -97,7 +97,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par state['exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format) def get_mv(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tensor): - """ + r""" ### Calculate $m_t$ and and $v_t$ * `state` is the optimizer state of the parameter (tensor) @@ -121,7 +121,7 @@ def get_mv(self, state: Dict[str, Any], group: Dict[str, Any], grad: torch.Tenso return m, v def get_lr(self, state: Dict[str, any], group: Dict[str, any]): - """ + r""" ### Get learning-rate This returns the modified learning rate based on the state. 
@@ -132,7 +132,7 @@ def get_lr(self, state: Dict[str, any], group: Dict[str, any]): def adam_update(self, state: Dict[str, any], group: Dict[str, any], param: torch.nn.Parameter, m: torch.Tensor, v: torch.Tensor): - """ + r""" ### Do the *Adam* parameter update * `state` is the optimizer state of the parameter (tensor) @@ -192,7 +192,7 @@ def adam_update(self, state: Dict[str, any], group: Dict[str, any], param: torch param.data.addcdiv_(m, denominator, value=-step_size) def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter): - """ + r""" ### Take an update step for a given parameter tensor * `state` is the optimizer state of the parameter (tensor) diff --git a/labml_nn/optimizers/adam_fp16.py b/labml_nn/optimizers/adam_fp16.py index 1b36135f8..36e55435c 100644 --- a/labml_nn/optimizers/adam_fp16.py +++ b/labml_nn/optimizers/adam_fp16.py @@ -35,7 +35,7 @@ def __init__(self, params, lr: float = 1e-3, betas: Tuple[float, float] = (0.9, super().__init__(params, lr, betas, eps, weight_decay, optimized_update, defaults) def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter): - """ + r""" ### Initialize a parameter state * `state` is the optimizer state of the parameter (tensor) @@ -55,7 +55,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par state['fp32_copy'] = param.to(torch.float) def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter): - """ + r""" ### Take an update step for a given parameter tensor * `state` is the optimizer state of the parameter (tensor) diff --git a/labml_nn/optimizers/adam_warmup.py b/labml_nn/optimizers/adam_warmup.py index fb73d1529..92cd9ff9a 100644 --- a/labml_nn/optimizers/adam_warmup.py +++ b/labml_nn/optimizers/adam_warmup.py @@ -25,7 +25,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, weight_decay: WeightDecay = WeightDecay(), optimized_update: bool = True, amsgrad=False, warmup=0, defaults=None): - """ + r""" ### Initialize the optimizer * `params` is the list of parameters @@ -46,7 +46,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults) def get_lr(self, state: Dict[str, any], group: Dict[str, any]): - """ + r""" ### Get learning-rate $$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$ diff --git a/labml_nn/optimizers/adam_warmup_cosine_decay.py b/labml_nn/optimizers/adam_warmup_cosine_decay.py index 037f1b4cf..6358f4bcc 100644 --- a/labml_nn/optimizers/adam_warmup_cosine_decay.py +++ b/labml_nn/optimizers/adam_warmup_cosine_decay.py @@ -28,7 +28,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, weight_decay: WeightDecay = WeightDecay(), optimized_update: bool = True, amsgrad=False, warmup=0, total_steps=1e10, defaults=None): - """ + r""" ### Initialize the optimizer * `params` is the list of parameters @@ -51,7 +51,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults) def get_lr(self, state: Dict[str, any], group: Dict[str, any]): - """ + r""" ### Get learning-rate $$\alpha \min \bigg(1, \frac{t}{w}\bigg)$$ diff --git a/labml_nn/optimizers/amsgrad.py b/labml_nn/optimizers/amsgrad.py index 07658e09b..5d9971f6b 100644 --- a/labml_nn/optimizers/amsgrad.py +++ b/labml_nn/optimizers/amsgrad.py @@ -36,7 +36,7 @@ def 
__init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, weight_decay: WeightDecay = WeightDecay(), optimized_update: bool = True, amsgrad=True, defaults=None): - """ + r""" ### Initialize the optimizer * `params` is the list of parameters @@ -56,7 +56,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, super().__init__(params, lr, betas, eps, weight_decay, optimized_update, defaults) def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter): - """ + r""" ### Initialize a parameter state * `state` is the optimizer state of the parameter (tensor) @@ -73,7 +73,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par state['max_exp_avg_sq'] = torch.zeros_like(param, memory_format=torch.preserve_format) def get_mv(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor): - """ + r""" ### Calculate $m_t$ and and $v_t$ or $\max(v_1, v_2, ..., v_{t-1}, v_t)$ * `state` is the optimizer state of the parameter (tensor) @@ -109,7 +109,7 @@ def get_mv(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tenso def _synthetic_experiment(is_adam: bool): - """ + r""" ## Synthetic Experiment This is the synthetic experiment described in the paper, diff --git a/labml_nn/optimizers/noam.py b/labml_nn/optimizers/noam.py index 8443f881c..26450311f 100644 --- a/labml_nn/optimizers/noam.py +++ b/labml_nn/optimizers/noam.py @@ -29,7 +29,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, optimized_update: bool = True, amsgrad=False, warmup=0, d_model=512, defaults=None): - """ + r""" ### Initialize the optimizer * `params` is the list of parameters @@ -52,7 +52,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, self.d_model = d_model def get_lr(self, state: Dict[str, any], group: Dict[str, any]): - """ + r""" ### Get learning-rate $$\alpha \frac{1}{\sqrt{d_{model}}} \min \bigg(\frac{1}{\sqrt{t}}, \frac{t}{w^{3/2}}\bigg)$$ diff --git a/labml_nn/optimizers/radam.py b/labml_nn/optimizers/radam.py index 3e384c4d4..bd718e370 100644 --- a/labml_nn/optimizers/radam.py +++ b/labml_nn/optimizers/radam.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Rectified Adam (RAdam) optimizer summary: A simple PyTorch implementation/tutorial of RAdam optimizer. 
@@ -157,7 +157,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, optimized_update: bool = True, amsgrad=False, degenerated_to_sgd=True, defaults=None): - """ + r""" ### Initialize the optimizer * `params` is the list of parameters @@ -176,7 +176,7 @@ def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, super().__init__(params, lr, betas, eps, weight_decay, optimized_update, amsgrad, defaults) def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter): - """ + r""" ### Take an update step for a given parameter tensor * `state` is the optimizer state of the parameter (tensor) @@ -221,7 +221,7 @@ def calc_rectification_term(beta2: float, step: int) -> Optional[float]: def r_adam_update(self, state: Dict[str, any], group: Dict[str, any], param: torch.nn.Parameter, m: torch.Tensor, v: torch.Tensor): - """ + r""" ### Do the *RAdam* parameter update * `state` is the optimizer state of the parameter (tensor) @@ -274,7 +274,7 @@ def r_adam_update(self, state: Dict[str, any], group: Dict[str, any], param: tor def _test_rectification_term(): - """ + r""" ### Plot $r_t$ against $t$ for various $\beta_2$ ![Plot of r_t](radam_r_t.png) diff --git a/labml_nn/optimizers/sophia.py b/labml_nn/optimizers/sophia.py index 2aa58f426..12d43c2a1 100644 --- a/labml_nn/optimizers/sophia.py +++ b/labml_nn/optimizers/sophia.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Sophia Optimizer summary: A simple PyTorch implementation/tutorial of Sophia optimizer @@ -72,7 +72,7 @@ def __init__(self, params, rho: float = 0.03, weight_decay: WeightDecay = WeightDecay(), defaults: Optional[Dict[str, Any]] = None): - """ + r""" ### Initialize the optimizer * `params` is the list of parameters @@ -92,7 +92,7 @@ def __init__(self, params, self.weight_decay = weight_decay def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Parameter): - """ + r""" ### Initialize a parameter state * `state` is the optimizer state of the parameter (tensor) @@ -108,7 +108,7 @@ def init_state(self, state: Dict[str, any], group: Dict[str, any], param: nn.Par state['hessian'] = torch.zeros_like(param, memory_format=torch.preserve_format) def update_hessian(self, n_tokens_training_batch): - """ + r""" ### Update the EMA of Hessian diagonal $h_t$ * `n_tokens_training_batch` is the number of tokens/inputs in the batch $B$ @@ -145,7 +145,7 @@ def update_hessian(self, n_tokens_training_batch): state['hessian'].mul_(beta2).addcmul_(p.grad, p.grad, value=(1 - beta2) * n_tokens_training_batch) def step_param(self, state: Dict[str, any], group: Dict[str, any], grad: torch.Tensor, param: torch.nn.Parameter): - """ + r""" ### Take an update step for a given parameter tensor * `state` is the optimizer state of the parameter (tensor) diff --git a/labml_nn/recurrent_highway_networks/__init__.py b/labml_nn/recurrent_highway_networks/__init__.py index f1e8b3b76..0f9590c02 100644 --- a/labml_nn/recurrent_highway_networks/__init__.py +++ b/labml_nn/recurrent_highway_networks/__init__.py @@ -16,7 +16,7 @@ class RHNCell(nn.Module): - """ + r""" ## Recurrent Highway Network Cell This implements equations $(6) - (9)$. 
diff --git a/labml_nn/resnet/__init__.py b/labml_nn/resnet/__init__.py index bd085c470..960ecf2a3 100644 --- a/labml_nn/resnet/__init__.py +++ b/labml_nn/resnet/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Deep Residual Learning for Image Recognition (ResNet) summary: > @@ -67,7 +67,7 @@ class ShortcutProjection(nn.Module): """ def __init__(self, in_channels: int, out_channels: int, stride: int): - """ + r""" * `in_channels` is the number of channels in $x$ * `out_channels` is the number of channels in $\mathcal{F}(x, \{W_i\})$ * `stride` is the stride length in the convolution operation for $F$. @@ -86,7 +86,7 @@ def forward(self, x: torch.Tensor): class ResidualBlock(nn.Module): - """ + r""" ## Residual Block @@ -153,7 +153,7 @@ def forward(self, x: torch.Tensor): class BottleneckResidualBlock(nn.Module): - """ + r""" ## Bottleneck Residual Block @@ -181,7 +181,7 @@ class BottleneckResidualBlock(nn.Module): """ def __init__(self, in_channels: int, bottleneck_channels: int, out_channels: int, stride: int): - """ + r""" * `in_channels` is the number of channels in $x$ * `bottleneck_channels` is the number of channels for the $3 \times 3$ convlution * `out_channels` is the number of output channels diff --git a/labml_nn/rl/dqn/__init__.py b/labml_nn/rl/dqn/__init__.py index 048bda113..8320dc214 100644 --- a/labml_nn/rl/dqn/__init__.py +++ b/labml_nn/rl/dqn/__init__.py @@ -31,7 +31,7 @@ class QFuncLoss(nn.Module): - """ + r""" ## Train the model We want to find optimal action-value function. @@ -106,7 +106,7 @@ def __init__(self, gamma: float): def forward(self, q: torch.Tensor, action: torch.Tensor, double_q: torch.Tensor, target_q: torch.Tensor, done: torch.Tensor, reward: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """ + r""" * `q` - $Q(s;\theta_i)$ * `action` - $a$ * `double_q` - $\textcolor{cyan}Q(s';\textcolor{cyan}{\theta_i})$ diff --git a/labml_nn/rl/dqn/experiment.py b/labml_nn/rl/dqn/experiment.py index 2a3af4381..19be0f414 100644 --- a/labml_nn/rl/dqn/experiment.py +++ b/labml_nn/rl/dqn/experiment.py @@ -107,7 +107,7 @@ def __init__(self, *, self.optimizer = torch.optim.Adam(self.model.parameters(), lr=2.5e-4) def _sample_action(self, q_value: torch.Tensor, exploration_coefficient: float): - """ + r""" #### $\epsilon$-greedy Sampling When sampling actions we use a $\epsilon$-greedy strategy, where we take a greedy action with probabiliy $1 - \epsilon$ and diff --git a/labml_nn/rl/dqn/model.py b/labml_nn/rl/dqn/model.py index 6dbe2e081..3b15f03c0 100644 --- a/labml_nn/rl/dqn/model.py +++ b/labml_nn/rl/dqn/model.py @@ -15,7 +15,7 @@ class Model(nn.Module): - """ + r""" ## Dueling Network ⚔️ Model for $Q$ Values We are using a [dueling network](https://arxiv.org/abs/1511.06581) diff --git a/labml_nn/rl/dqn/replay_buffer.py b/labml_nn/rl/dqn/replay_buffer.py index 966bfcbb6..ad3882e7b 100644 --- a/labml_nn/rl/dqn/replay_buffer.py +++ b/labml_nn/rl/dqn/replay_buffer.py @@ -18,7 +18,7 @@ class ReplayBuffer: - """ + r""" ## Buffer for Prioritized Experience Replay [Prioritized experience replay](https://arxiv.org/abs/1511.05952) @@ -180,7 +180,7 @@ def _set_priority_sum(self, idx, priority): self.priority_sum[idx] = self.priority_sum[2 * idx] + self.priority_sum[2 * idx + 1] def _sum(self): - """ + r""" #### $\sum_k p_k^\alpha$ """ @@ -188,7 +188,7 @@ def _sum(self): return self.priority_sum[1] def _min(self): - """ + r""" #### $\min_k p_k^\alpha$ """ @@ -196,7 +196,7 @@ def _min(self): return self.priority_min[1] def find_prefix_sum_idx(self, 
prefix_sum): - """ + r""" #### Find largest $i$ such that $\sum_{k=1}^{i} p_k^\alpha \le P$ """ diff --git a/labml_nn/rl/ppo/__init__.py b/labml_nn/rl/ppo/__init__.py index 2b878d1d4..ca5b31edb 100644 --- a/labml_nn/rl/ppo/__init__.py +++ b/labml_nn/rl/ppo/__init__.py @@ -31,7 +31,7 @@ class ClippedPPOLoss(nn.Module): - """ + r""" ## PPO Loss Here's how the PPO update rule is derived. @@ -179,7 +179,7 @@ def forward(self, log_pi: torch.Tensor, sampled_log_pi: torch.Tensor, class ClippedValueFunctionLoss(nn.Module): - """ + r""" ## Clipped Value Function Loss Similarly we clip the value function update also. diff --git a/labml_nn/rl/ppo/gae.py b/labml_nn/rl/ppo/gae.py index 981b609ef..74864b611 100644 --- a/labml_nn/rl/ppo/gae.py +++ b/labml_nn/rl/ppo/gae.py @@ -23,7 +23,7 @@ def __init__(self, n_workers: int, worker_steps: int, gamma: float, lambda_: flo self.n_workers = n_workers def __call__(self, done: np.ndarray, rewards: np.ndarray, values: np.ndarray) -> np.ndarray: - """ + r""" ### Calculate advantages \begin{align} diff --git a/labml_nn/sampling/nucleus.py b/labml_nn/sampling/nucleus.py index 6de9c719e..60daa2b9c 100644 --- a/labml_nn/sampling/nucleus.py +++ b/labml_nn/sampling/nucleus.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Nucleus Sampling summary: A PyTorch implementation of nucleus sampling from language models. diff --git a/labml_nn/sampling/temperature.py b/labml_nn/sampling/temperature.py index 4c924ee61..a8f60a5d3 100644 --- a/labml_nn/sampling/temperature.py +++ b/labml_nn/sampling/temperature.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Sampling from Language Models with Temperature summary: A PyTorch implementation of sampling from language models with temperature. diff --git a/labml_nn/scaling/zero3/__init__.py b/labml_nn/scaling/zero3/__init__.py index 9f5955350..1f3609d6f 100644 --- a/labml_nn/scaling/zero3/__init__.py +++ b/labml_nn/scaling/zero3/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Zero-DP Memory Optimization summary: > diff --git a/labml_nn/sketch_rnn/__init__.py b/labml_nn/sketch_rnn/__init__.py index fe250008d..9632dbd78 100644 --- a/labml_nn/sketch_rnn/__init__.py +++ b/labml_nn/sketch_rnn/__init__.py @@ -54,7 +54,7 @@ class StrokesDataset(Dataset): """ def __init__(self, dataset: np.array, max_seq_length: int, scale: Optional[float] = None): - """ + r""" `dataset` is a list of numpy arrays of shape [seq_len, 3]. It is a sequence of strokes, and each stroke is represented by 3 integers. 
@@ -126,7 +126,7 @@ def __getitem__(self, idx: int): class BivariateGaussianMixture: - """ + r""" ## Bi-variate Gaussian mixture The mixture is represented by $\Pi$ and @@ -150,7 +150,7 @@ def n_distributions(self): return self.pi_logits.shape[-1] def set_temperature(self, temperature: float): - """ + r""" Adjust by temperature $\tau$ """ # $$\hat{\Pi_k} \leftarrow \frac{\hat{\Pi_k}}{\tau}$$ @@ -348,7 +348,7 @@ def forward(self, mask: torch.Tensor, target: torch.Tensor, class KLDivLoss(nn.Module): - """ + r""" ## KL-Divergence loss This calculates the KL divergence between a given normal distribution and $\mathcal{N}(0, 1)$ diff --git a/labml_nn/transformers/aft/__init__.py b/labml_nn/transformers/aft/__init__.py index b3f526cbb..5aab884d6 100644 --- a/labml_nn/transformers/aft/__init__.py +++ b/labml_nn/transformers/aft/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: An Attention Free Transformer summary: > @@ -64,7 +64,7 @@ class AFTLocal(nn.Module): - """ + r""" ### AFT Local Operation $$Y_t = \sigma(Q_t) \odot @@ -109,7 +109,7 @@ def __init__(self, d_model: int, seq_len: int, local_window_size: int, bias: boo @staticmethod def create_local_mask(seq_len, local_window_size): - """ + r""" #### Create local mask This creates a mask for diff --git a/labml_nn/transformers/alibi/__init__.py b/labml_nn/transformers/alibi/__init__.py index 8c1bdad27..154f93a52 100644 --- a/labml_nn/transformers/alibi/__init__.py +++ b/labml_nn/transformers/alibi/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Attention with Linear Biases (ALiBi) summary: > @@ -41,7 +41,7 @@ def get_slopes(n_heads: int): - """ + r""" ## Get head-specific slope $m$ for each head * `n_heads` is the number of heads in the attention layer $n$ diff --git a/labml_nn/transformers/compressive/__init__.py b/labml_nn/transformers/compressive/__init__.py index 96339e0cc..e3e4212c3 100644 --- a/labml_nn/transformers/compressive/__init__.py +++ b/labml_nn/transformers/compressive/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Compressive Transformer summary: > diff --git a/labml_nn/transformers/configs.py b/labml_nn/transformers/configs.py index e80f3f097..aab32b427 100644 --- a/labml_nn/transformers/configs.py +++ b/labml_nn/transformers/configs.py @@ -50,7 +50,7 @@ class FeedForwardConfigs(BaseConfigs): @option(FeedForwardConfigs.activation, 'ReLU') def _ffn_activation_relu(): - """ + r""" ### ReLU activation $$\max(0, x)$$ @@ -60,7 +60,7 @@ def _ffn_activation_relu(): @option(FeedForwardConfigs.activation, 'GELU') def _ffn_activation_gelu(): - """ + r""" ### GELU activation $$x \Phi(x)$$ where $\Phi(x) = P(X \le x), X \sim \mathcal{N}(0,1)$ diff --git a/labml_nn/transformers/fast_weights/__init__.py b/labml_nn/transformers/fast_weights/__init__.py index d4fbac2ea..2b0962fb8 100644 --- a/labml_nn/transformers/fast_weights/__init__.py +++ b/labml_nn/transformers/fast_weights/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Linear Transformers Are Secretly Fast Weight Memory Systems summary: > @@ -101,7 +101,7 @@ class DPFP(nn.Module): - """ + r""" ## Deterministic Parameter Free Project (DPFP) This is the new projection function $\textcolor{lightgreen}{\phi}$ introduced in the paper. @@ -135,7 +135,7 @@ class DPFP(nn.Module): """ def __init__(self, nu: int = 1, eps: float = 1e-6): - """ + r""" * `nu` is the hyper-parameter $\nu$. * `eps` is the small value used to make sure there is no division-by-zero when normalizing. 
""" @@ -151,7 +151,7 @@ def forward(self, k: torch.Tensor): return k / (torch.sum(k, dim=-1, keepdim=True) + self.eps) def dpfp(self, k: torch.Tensor): - """ + r""" $$\textcolor{lightgreen}{\phi(k)}$$ """ # $x = \text{ReLU}\Big(\big[k, -k\big]\Big)$ @@ -173,7 +173,7 @@ def dpfp(self, k: torch.Tensor): class FastWeightsAttention(nn.Module): - """ + r""" ## Fast Weights Attention The paper introduces a new update rule for calculating $\textcolor{cyan}{W^{(i)}}$. diff --git a/labml_nn/transformers/feed_forward.py b/labml_nn/transformers/feed_forward.py index f9c8d768e..eb1ab07f7 100644 --- a/labml_nn/transformers/feed_forward.py +++ b/labml_nn/transformers/feed_forward.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Position-wise Feed-Forward Network (FFN) summary: Documented reusable implementation of the position wise feedforward network. diff --git a/labml_nn/transformers/feedback/__init__.py b/labml_nn/transformers/feedback/__init__.py index ee9e9b8bd..6661d12c0 100644 --- a/labml_nn/transformers/feedback/__init__.py +++ b/labml_nn/transformers/feedback/__init__.py @@ -136,7 +136,7 @@ def get_scores(self, query: torch.Tensor, key: torch.Tensor): positional encodings $P_q, P_j$. We replace term $\textcolor{lightgreen}{D}$ with $S_j$. - """ + r""" # $U^K_j$ key_pos_emb = self.key_pos_embeddings[-key.shape[0]:] @@ -160,7 +160,7 @@ def forward(self, *, """ * `query` has shape `[batch_size, d_model]` * `key` and `value` has shape `[seq_len, batch_size, d_model]` - """ + r""" # Prepare `query`, `key` and `value` for attention computation # `key` and `value` will then have shape `[seq_len, batch_size, heads, d_k]` diff --git a/labml_nn/transformers/flash/__init__.py b/labml_nn/transformers/flash/__init__.py index fe4b2990d..84d14a8d1 100644 --- a/labml_nn/transformers/flash/__init__.py +++ b/labml_nn/transformers/flash/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Flash Attention summary: > @@ -160,7 +160,7 @@ class AttentionFunc(torch.autograd.Function): def forward(ctx: Any, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, causal: bool, sm_scale: float) -> torch.Tensor: - """ + r""" ### Forward pass Group query attention forward pass. Returns the output in shape `[batch_size, n_heads, q_seq_len, d_head]`. @@ -352,7 +352,7 @@ def _attn_fwd(t_q, t_k, t_v, sm_scale_log2e, t_lse, t_o, BLOCK_Q: tl.constexpr, BLOCK_K: tl.constexpr, ): - """ + r""" ### Triton kernel for Flash attention forward pass :param t_q: queries $Q_i$ diff --git a/labml_nn/transformers/fnet/__init__.py b/labml_nn/transformers/fnet/__init__.py index 4b123f376..61f45d3ca 100644 --- a/labml_nn/transformers/fnet/__init__.py +++ b/labml_nn/transformers/fnet/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: "FNet: Mixing Tokens with Fourier Transforms" summary: > @@ -45,7 +45,7 @@ class FNetMix(nn.Module): - """ + r""" ## FNet - Mix tokens This module simply implements @@ -58,7 +58,7 @@ class FNetMix(nn.Module): """ def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None): - """ + r""" The [normal attention module](../mha.html) can be fed with different token embeddings for $\text{query}$,$\text{key}$, and $\text{value}$ and a mask. 
diff --git a/labml_nn/transformers/gmlp/__init__.py b/labml_nn/transformers/gmlp/__init__.py index 74b55e9f4..f2afd7d05 100644 --- a/labml_nn/transformers/gmlp/__init__.py +++ b/labml_nn/transformers/gmlp/__init__.py @@ -23,7 +23,7 @@ class GMLPBlock(nn.Module): - """ + r""" ## gMLP Block Each block does the following transformations to input embeddings @@ -87,7 +87,7 @@ def forward(self, *, x: torch.Tensor, mask: Optional[torch.Tensor] = None): class SpacialGatingUnit(nn.Module): - """ + r""" ## Spatial Gating Unit $$s(Z) = Z_1 \odot f_{W,b}(Z_2)$$ diff --git a/labml_nn/transformers/gpt/__init__.py b/labml_nn/transformers/gpt/__init__.py index 7c8beda60..6d32eef10 100644 --- a/labml_nn/transformers/gpt/__init__.py +++ b/labml_nn/transformers/gpt/__init__.py @@ -124,7 +124,7 @@ def _transformer_configs(c: Configs): def _init_weights(module): - """ + r""" ### Initialize weights Weights of linear layers and embedding layers are initialized diff --git a/labml_nn/transformers/hour_glass/__init__.py b/labml_nn/transformers/hour_glass/__init__.py index f94bd7bda..0031c7ce5 100644 --- a/labml_nn/transformers/hour_glass/__init__.py +++ b/labml_nn/transformers/hour_glass/__init__.py @@ -246,7 +246,7 @@ def __init__(self): class AttentionBasedShortening(nn.Module): - """ + r""" ### 🚧 Down-sampling with attention \begin{align} @@ -263,7 +263,7 @@ def __init__(self): class LinearUpSampling(nn.Module): - """ + r""" ### 🚧 Linear projection for up-sampling Make a linear projection of dense token embeddings to a size of $d_{\text{model}} k$. @@ -275,7 +275,7 @@ def __init__(self): class AttentionBasedUpSampling(nn.Module): - """ + r""" ### 🚧 Attention based up-sampling \begin{align} diff --git a/labml_nn/transformers/jax_transformer/__init__.py b/labml_nn/transformers/jax_transformer/__init__.py index fb9b1bf5e..672b49160 100644 --- a/labml_nn/transformers/jax_transformer/__init__.py +++ b/labml_nn/transformers/jax_transformer/__init__.py @@ -385,7 +385,7 @@ def __init__(self, normalized_shape: Union[Tuple[int], List[int]], *, $X \in \mathbb{R}^{* \times S[0] \times S[1] \times ... \times S[n]}$ * `eps` is $\epsilon$, used in $\sqrt{Var[X] + \epsilon}$ for numerical stability * `elementwise_affine` is whether to scale and shift the normalized value - """ + r""" super().__init__() self.eps = eps @@ -487,7 +487,7 @@ def __call__(self, *, `mask` has shape `[seq_len, seq_len]` and `mask[i, j]` indicates whether query at position `i` can see key-value at position `j`. 
- """ + r""" # Get sequence length seq_len = len(query) @@ -623,7 +623,7 @@ class CrossEntropyLoss(Module): ## Cross Entropy Loss - """ + r""" def __init__(self): super().__init__() @@ -800,7 +800,7 @@ def step(self, params: Dict, grads: Dict): * `params` is a tree-map of parameters * `grads` is a tree-map of gradients - """ + r""" # Increment step $t$ self._n_steps += 1 # Update states for each parameter @@ -813,7 +813,7 @@ def _step(self, n_steps: int, param: jnp.ndarray, state: AdamState): ### Update parameters This performs a Adam update on the given parameter - """ + r""" # Bias corrections for $\hat{m}_t$: $1 - \beta_1^t$ and for $\hat{v}_t$: $1 - \beta_2^t$ bias_correction = [1 - beta ** n_steps for beta in self.betas] @@ -834,7 +834,7 @@ def _update_state(self, grad, state: AdamState): ### Update state This updates uncorrected first and second moments $m_t$ and $v_t$ - """ + r""" # Uncorrected first and second moments $m_{t-1}$ and $v_{t-1}$ m, v = state # Clip gradients diff --git a/labml_nn/transformers/knn/__init__.py b/labml_nn/transformers/knn/__init__.py index 72d8037f0..902f5ffcc 100644 --- a/labml_nn/transformers/knn/__init__.py +++ b/labml_nn/transformers/knn/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: k-Nearest Neighbor Language Models summary: > diff --git a/labml_nn/transformers/knn/build_index.py b/labml_nn/transformers/knn/build_index.py index f6deafaa9..f8c6a9942 100644 --- a/labml_nn/transformers/knn/build_index.py +++ b/labml_nn/transformers/knn/build_index.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Build FAISS index for k-NN search summary: This builds the FAISS index with the transformer embeddings. @@ -51,7 +51,7 @@ def load_experiment(run_uuid: str, checkpoint: Optional[int] = None): def gather_keys(conf: Configs): - """ + r""" ## Gather $\big(f(c_i), w_i\big)$ and save them in numpy arrays *Note that these numpy arrays will take up a lot of space (even few hundred gigabytes) diff --git a/labml_nn/transformers/knn/eval_knn.py b/labml_nn/transformers/knn/eval_knn.py index 703a41ca7..17c6f49a9 100644 --- a/labml_nn/transformers/knn/eval_knn.py +++ b/labml_nn/transformers/knn/eval_knn.py @@ -20,7 +20,7 @@ def knn(queries: torch.Tensor, index: faiss.IndexFlatL2, keys_store: np.ndarray, vals_store: np.ndarray, n_tokens: int): - """ + r""" ## $k$-NN to get $p(w_t, c_t)$ Here we refer to $f(\textcolor{yellowgreen}{c_t})$ as queries, diff --git a/labml_nn/transformers/mha.py b/labml_nn/transformers/mha.py index ff93530e0..e09516e84 100644 --- a/labml_nn/transformers/mha.py +++ b/labml_nn/transformers/mha.py @@ -123,7 +123,7 @@ def get_scores(self, query: torch.Tensor, key: torch.Tensor): ### Calculate scores between queries and keys This method can be overridden for other variations like relative attention. - """ + r""" # Calculate $Q K^\top$ or $S_{ijbh} = \sum_d Q_{ibhd} K_{jbhd}$ return torch.einsum('ibhd,jbhd->ijbh', query, key) diff --git a/labml_nn/transformers/mlp_mixer/__init__.py b/labml_nn/transformers/mlp_mixer/__init__.py index 06b650583..4281efc37 100644 --- a/labml_nn/transformers/mlp_mixer/__init__.py +++ b/labml_nn/transformers/mlp_mixer/__init__.py @@ -48,7 +48,7 @@ def __init__(self, mlp: nn.Module): self.mlp = mlp def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: Optional[torch.Tensor] = None): - """ + r""" The [normal attention module](../mha.html) can be fed with different token embeddings for $\text{query}$,$\text{key}$, and $\text{value}$ and a mask. 
diff --git a/labml_nn/transformers/positional_encoding.py b/labml_nn/transformers/positional_encoding.py index 615ee913c..650877f3f 100644 --- a/labml_nn/transformers/positional_encoding.py +++ b/labml_nn/transformers/positional_encoding.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Fixed Positional Encodings summary: > diff --git a/labml_nn/transformers/primer_ez/__init__.py b/labml_nn/transformers/primer_ez/__init__.py index 6357f6d1c..276f26715 100644 --- a/labml_nn/transformers/primer_ez/__init__.py +++ b/labml_nn/transformers/primer_ez/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: "Primer: Searching for Efficient Transformers for Language Modeling" summary: > @@ -42,7 +42,7 @@ class SquaredReLU(nn.Module): - """ + r""" ## Squared ReLU activation $$y = {\max(x, 0)}^2$$ diff --git a/labml_nn/transformers/retro/bert_embeddings.py b/labml_nn/transformers/retro/bert_embeddings.py index 9ddef8f09..a6c53558e 100644 --- a/labml_nn/transformers/retro/bert_embeddings.py +++ b/labml_nn/transformers/retro/bert_embeddings.py @@ -19,7 +19,7 @@ class BERTChunkEmbeddings: - """ + r""" ## BERT Embeddings For a given chunk of text $N$ this class generates BERT embeddings $\text{B\small{ERT}}(N)$. @@ -75,7 +75,7 @@ def _trim_chunk(chunk: str): return stripped def __call__(self, chunks: List[str]): - """ + r""" ### Get $\text{B\small{ERT}}(N)$ for a list of chunks. """ diff --git a/labml_nn/transformers/retro/database.py b/labml_nn/transformers/retro/database.py index 90d0a2003..8a3e94e79 100644 --- a/labml_nn/transformers/retro/database.py +++ b/labml_nn/transformers/retro/database.py @@ -26,7 +26,7 @@ def build_database(chunk_len: int = 16, batch_size: int = 64, d_emb: int = 768, n_centeroids: int = 256, code_size: int = 64, n_probe: int = 8, n_train: int = 50_000): - """ + r""" ## Build Database * `chunk_len` is the length of a chunk (number of characters) diff --git a/labml_nn/transformers/retro/model.py b/labml_nn/transformers/retro/model.py index fe72048a5..06516de3b 100644 --- a/labml_nn/transformers/retro/model.py +++ b/labml_nn/transformers/retro/model.py @@ -32,7 +32,7 @@ class RotaryPositionalEmbeddings(nn.Module): """ def __init__(self, d: int, base: int = 10_000): - """ + r""" * `d` is the number of features $d$ * `base` is the constant used for calculating $\Theta$ """ @@ -81,7 +81,7 @@ def forward(self, x: torch.Tensor): class SelfAttention(nn.Module): - """ + r""" ## Self-Attention Layer $\text{A\small{TTN}}$ This applies causal and non-causal [multi-headed self-attention](../mha.html). @@ -185,7 +185,7 @@ def forward(self, h: torch.Tensor): class CrossAttention(nn.Module): - """ + r""" ## Cross-Attention Layer $\text{C\small{A}}$ This is similar to the self-attention layer defined above, except that @@ -272,7 +272,7 @@ def forward(self, e: torch.Tensor, h: torch.Tensor): class ChunkedCrossAttention(nn.Module): - """ + r""" ## Chunked Cross-Attention Layer $\text{C\small{CA}}$ This is similar to the cross-attention layer defined above. @@ -380,7 +380,7 @@ def forward(self, h: torch.Tensor, e: torch.Tensor): class FeedForward(nn.Module): - """ + r""" ### Position-wise Feed Forward Layer $\text{F\small{FW}}$ This consists of two linear layers and an activation in the middle. 
@@ -425,7 +425,7 @@ def forward(self, h: torch.Tensor): class NearestNeighborEncoder(nn.Module): - """ + r""" ## Nearest Neighbor Encoder $\text{E\small{NCODER}}(\text{R\small{ET}}(C_u)_{1 \le u \le l}, H)$ This module encodes the retrieved nearest neighbors @@ -433,7 +433,7 @@ class NearestNeighborEncoder(nn.Module): def __init__(self, chunk_len: int, n_layers: int, ca_layers: Set[int], d_model: int, n_heads: int, d_k: int, d_ff: int): - """ + r""" * `chunk_len` is the length of a chunk * `n_layer` is the number of layers in the encoder $L_{\text{enc}}$ * `ca_layers` are the layers with cross attention $P_{\text{enc}}$ @@ -457,7 +457,7 @@ def __init__(self, chunk_len: int, n_layers: int, ca_layers: Set[int], self.norm_h = nn.LayerNorm(d_model) def forward(self, e: torch.Tensor, h: torch.Tensor): - """ + r""" * `e` are token embeddings of the retrieved nearest neighbors, $\text{E\small{MB}}\big(\text{R\small{ET}}(C_u)_{1 \le u \le l}\big)$ of shape `[batch_size, chunks, neighbors, neighbor_len, d_model]` @@ -541,7 +541,7 @@ def __init__(self, n_vocab: int, d_model: int, n_layers: int, ca_layers: Set[int self.norm_e = nn.LayerNorm(d_model) def forward(self, x: torch.Tensor, ret: torch.Tensor): - """ + r""" * `x` is the input sequence, $X$ of shape `[batch_size, seq_len]` * `ret` are the retrieved neighbors $\text{R\small{ET}}(C_u)_{1 \le u \le l}$ diff --git a/labml_nn/transformers/rope/__init__.py b/labml_nn/transformers/rope/__init__.py index a200785b0..f4800220a 100644 --- a/labml_nn/transformers/rope/__init__.py +++ b/labml_nn/transformers/rope/__init__.py @@ -28,7 +28,7 @@ class RotaryPositionalEmbeddings(nn.Module): - """ + r""" ## RoPE module Rotary encoding transforms pairs of features by rotating in the 2D plane. @@ -116,7 +116,7 @@ class RotaryPositionalEmbeddings(nn.Module): """ def __init__(self, d: int, base: int = 10_000): - """ + r""" * `d` is the number of features $d$ * `base` is the constant used for calculating $\Theta$ """ @@ -128,7 +128,7 @@ def __init__(self, d: int, base: int = 10_000): self.sin_cached = None def _build_cache(self, x: torch.Tensor): - """ + r""" Cache $\cos$ and $\sin$ values """ # Return if cache is already built diff --git a/labml_nn/transformers/rope/value_pe/__init__.py b/labml_nn/transformers/rope/value_pe/__init__.py index 8aadeab8f..7855c0efe 100644 --- a/labml_nn/transformers/rope/value_pe/__init__.py +++ b/labml_nn/transformers/rope/value_pe/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: Rotary Positional Embeddings with Relative distance (RoPER) summary: > diff --git a/labml_nn/uncertainty/evidence/__init__.py b/labml_nn/uncertainty/evidence/__init__.py index 8062050a1..27f04873c 100644 --- a/labml_nn/uncertainty/evidence/__init__.py +++ b/labml_nn/uncertainty/evidence/__init__.py @@ -1,4 +1,4 @@ -""" +r""" --- title: "Evidential Deep Learning to Quantify Classification Uncertainty" summary: > @@ -55,7 +55,7 @@ class MaximumLikelihoodLoss(nn.Module): - """ + r""" ## Type II Maximum Likelihood Loss @@ -81,7 +81,7 @@ class MaximumLikelihoodLoss(nn.Module): """ def forward(self, evidence: torch.Tensor, target: torch.Tensor): - """ + r""" * `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]` * `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]` """ @@ -98,7 +98,7 @@ def forward(self, evidence: torch.Tensor, target: torch.Tensor): class CrossEntropyBayesRisk(nn.Module): - """ + r""" ## Bayes Risk with Cross Entropy Loss @@ -128,7 +128,7 @@ class CrossEntropyBayesRisk(nn.Module): """ def forward(self, evidence: 
torch.Tensor, target: torch.Tensor): - """ + r""" * `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]` * `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]` """ @@ -145,7 +145,7 @@ def forward(self, evidence: torch.Tensor, target: torch.Tensor): class SquaredErrorBayesRisk(nn.Module): - """ + r""" ## Bayes Risk with Squared Error Loss @@ -191,7 +191,7 @@ class SquaredErrorBayesRisk(nn.Module): """ def forward(self, evidence: torch.Tensor, target: torch.Tensor): - """ + r""" * `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]` * `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]` """ @@ -215,7 +215,7 @@ def forward(self, evidence: torch.Tensor, target: torch.Tensor): class KLDivergenceLoss(nn.Module): - """ + r""" ## KL Divergence Regularization Loss @@ -240,7 +240,7 @@ class KLDivergenceLoss(nn.Module): """ def forward(self, evidence: torch.Tensor, target: torch.Tensor): - """ + r""" * `evidence` is $\mathbf{e} \ge 0$ with shape `[batch_size, n_classes]` * `target` is $\mathbf{y}$ with shape `[batch_size, n_classes]` """ diff --git a/labml_nn/unet/__init__.py b/labml_nn/unet/__init__.py index cdfde7393..8e66cb843 100644 --- a/labml_nn/unet/__init__.py +++ b/labml_nn/unet/__init__.py @@ -30,7 +30,7 @@ class DoubleConvolution(nn.Module): - """ + r""" ### Two $3 \times 3$ Convolution Layers Each step in the contraction path and expansive path have two $3 \times 3$ @@ -63,7 +63,7 @@ def forward(self, x: torch.Tensor): class DownSample(nn.Module): - """ + r""" ### Down-sample Each step in the contracting path down-samples the feature map with @@ -80,7 +80,7 @@ def forward(self, x: torch.Tensor): class UpSample(nn.Module): - """ + r""" ### Up-sample Each step in the expansive path up-samples the feature map with