Upload inference directory

Files changed:
- inference/model.py +189 -0
- inference/optimized_diffattn.py +177 -0
- inference/rotary.py +76 -0
inference/model.py (ADDED, +189 lines)
import torch
import torch.nn as nn
import math

from .optimized_diffattn import MultiheadDiffAttn

# --- Tokenizer Definition ---
# Vocabulary: 256 bytes + IM_START_TOKEN + IM_END_TOKEN + <pad>
IM_START_TOKEN = "<|im_start|>"
IM_END_TOKEN = "<|im_end|>"
PAD_TOKEN = "<pad>"

SPECIAL_TOKENS = [IM_START_TOKEN, IM_END_TOKEN, PAD_TOKEN]
VOCAB_SIZE = 256 + len(SPECIAL_TOKENS)

# Create token to id mapping
token_to_id = {}
id_to_token = {}

for i in range(256):
    token_to_id[bytes([i])] = i
    id_to_token[i] = bytes([i])

for i, token_str in enumerate(SPECIAL_TOKENS):
    token_id = 256 + i
    token_to_id[token_str] = token_id
    id_to_token[token_id] = token_str

PAD_ID = token_to_id[PAD_TOKEN]
IM_START_ID = token_to_id[IM_START_TOKEN]
IM_END_ID = token_to_id[IM_END_TOKEN]


class ByteTokenizer:
    def __init__(self):
        self.token_to_id = token_to_id
        self.id_to_token = id_to_token
        self.vocab_size = VOCAB_SIZE
        self.pad_id = PAD_ID
        self.im_start_id = IM_START_ID
        self.im_end_id = IM_END_ID

    def encode(self, text_bytes: bytes, add_special_tokens=True):
        ids = [self.token_to_id[bytes([b])] for b in text_bytes]
        if add_special_tokens:
            return [self.im_start_id] + ids + [self.im_end_id]
        return ids

    def decode(self, ids: list[int]):
        tokens = []
        for i in ids:
            token = self.id_to_token.get(i)
            if token is None:
                # Handle unknown token ID if necessary, or raise error
                tokens.append(b"?")  # Placeholder for unknown
            elif isinstance(token, bytes):
                tokens.append(token)
            # Special tokens (str values) are skipped when decoding to raw bytes
        return b"".join(tokens)
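
# A minimal usage sketch of ByteTokenizer (illustrative): `encode` wraps the
# payload in <|im_start|> / <|im_end|> ids, and `decode` drops special tokens
# and returns the raw bytes.
#
#   tok = ByteTokenizer()
#   ids = tok.encode(b"hi")          # [256, 104, 105, 257]
#   assert tok.decode(ids) == b"hi"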


# --- RoPE Embeddings --- (Reused from previous script)
def get_rotary_embeddings(seq_len, dim_model, theta=10000.0):
    if dim_model % 2 != 0:
        raise ValueError(f"dim_model must be even, got {dim_model}")
    position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, dim_model, 2).float() * -(math.log(theta) / dim_model)
    )
    angles = position * div_term
    cos_emb = torch.cos(angles)
    sin_emb = torch.sin(angles)
    return cos_emb, sin_emb
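
# Shape note (illustrative): cos_emb and sin_emb both have shape
# [seq_len, dim_model // 2], i.e. the (seqlen, rotary_dim / 2) layout that
# apply_rotary_emb in rotary.py expects.
#
#   cos, sin = get_rotary_embeddings(8, 16)   # both [8, 8]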


# --- Model Definition ---
class FeedForward(nn.Module):
    def __init__(self, embed_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.GELU()

    def forward(self, x):
        return self.fc2(self.dropout(self.act(self.fc1(x))))


class DiffTransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, depth, ffn_hidden_dim, dropout=0.1):
        super().__init__()
        self.attn = MultiheadDiffAttn(embed_dim, depth, num_heads, dropout=dropout)
        self.ffn = FeedForward(embed_dim, ffn_hidden_dim, dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, rel_pos, attn_mask=None):
        # Pre-norm
        attn_out = self.attn(self.norm1(x), rel_pos, attn_mask)
        x = x + self.dropout(attn_out)
        ffn_out = self.ffn(self.norm2(x))
        x = x + self.dropout(ffn_out)
        return x


class DiffTransformerLLM(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim,
        num_layers,
        num_heads,
        ffn_hidden_dim,
        max_seq_len,
        dropout=0.1,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len

        self.token_embeddings = nn.Embedding(vocab_size, embed_dim)
        # Positional embeddings are handled by RoPE, so no separate nn.Embedding for positions
        self.dropout = nn.Dropout(dropout)

        self.layers = nn.ModuleList(
            [
                DiffTransformerBlock(
                    embed_dim, num_heads, depth, ffn_hidden_dim, dropout
                )
                for depth in range(num_layers)
            ]
        )
        self.norm_out = nn.LayerNorm(embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size, bias=False)

        # Tie weights
        self.token_embeddings.weight = self.lm_head.weight

        # RoPE precomputation
        # The head_dim for MultiheadDiffAttn is embed_dim // num_heads // 2
        self.rope_head_dim = embed_dim // num_heads // 2
        cos_emb, sin_emb = get_rotary_embeddings(max_seq_len, self.rope_head_dim)
        self.register_buffer("cos_emb", cos_emb, persistent=False)
        self.register_buffer("sin_emb", sin_emb, persistent=False)

    def forward(self, input_ids, attn_mask=None):
        batch_size, seq_len = input_ids.shape

        x = self.token_embeddings(input_ids) * math.sqrt(self.embed_dim)
        x = self.dropout(x)

        # Ensure RoPE embeddings are on the same device *and* dtype as activations
        rel_pos = (
            self.cos_emb[:seq_len, :].to(x.device, dtype=x.dtype),
            self.sin_emb[:seq_len, :].to(x.device, dtype=x.dtype),
        )

        # Causal mask for autoregressive decoding. MultiheadDiffAttn expects
        # an additive mask (0 = attend, -inf = masked). A user-supplied
        # attn_mask (e.g. a Hugging Face-style 1/0 padding mask) is currently
        # ignored: padding is handled by the loss function's ignore_index, so
        # the standard causal mask is always used.
        causal_mask = torch.triu(
            torch.full((seq_len, seq_len), float("-inf"), device=x.device),
            diagonal=1,
        )

        for layer in self.layers:
            x = layer(x, rel_pos, attn_mask=causal_mask)

        x = self.norm_out(x)
        logits = self.lm_head(x)
        return logits

    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)
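
A quick smoke test of the pieces above, sketched by hand. The hyperparameters
are illustrative rather than a trained configuration, and it assumes the
inference directory is importable as a package:

    import torch

    from inference.model import ByteTokenizer, DiffTransformerLLM, VOCAB_SIZE

    model = DiffTransformerLLM(
        vocab_size=VOCAB_SIZE,
        embed_dim=256,        # must be divisible by num_heads * 2
        num_layers=4,
        num_heads=4,
        ffn_hidden_dim=1024,
        max_seq_len=512,
    ).eval()

    tok = ByteTokenizer()
    ids = torch.tensor([tok.encode(b"hello world")])   # [1, seq_len]
    with torch.no_grad():
        logits = model(ids)                            # [1, seq_len, 259]
    print(logits.shape, model.count_parameters())
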
inference/optimized_diffattn.py (ADDED, +177 lines)
import math
from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

# Re-use rotary embedding helper from the original codebase
from .rotary import apply_rotary_emb

# -----------------------------------------------------------------------------
# Utility helpers (copied from the original implementation)
# -----------------------------------------------------------------------------


def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat keys/values n_rep times along the head dimension for GQA
    (expand + reshape; the reshape materialises a copy when n_rep > 1)."""
    bs, n_kv_heads, slen, head_dim = x.shape
    if n_rep == 1:
        return x
    return (
        x[:, :, None, :, :]
        .expand(bs, n_kv_heads, n_rep, slen, head_dim)
        .reshape(bs, n_kv_heads * n_rep, slen, head_dim)
    )


def lambda_init_fn(depth: int) -> float:
    """Depth-dependent λ init schedule described in the DiffAttention paper."""
    return 0.8 - 0.6 * math.exp(-0.3 * depth)
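
# For intuition, the schedule above gives λ_init ≈ 0.20 at depth 0, ≈ 0.36 at
# depth 1 and ≈ 0.67 at depth 5, approaching 0.8 for very deep layers:
#
#   >>> [round(lambda_init_fn(d), 2) for d in (0, 1, 5)]
#   [0.2, 0.36, 0.67]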

# -----------------------------------------------------------------------------
# Optimised Multi-head DiffAttention implementation
# -----------------------------------------------------------------------------


class MultiheadDiffAttn(nn.Module):
    """Optimised DiffAttention block.

    Differences from the original implementation:
    1. Removes the dependency on Apex / FusedRMSNorm; uses native LayerNorm.
    2. Keeps all tensors on-device and works well with autocast fp16/bf16.
    3. Minimises Python-side tensor reshapes and kernel launches.

    Note: causality is enforced via ``is_causal=True`` in the SDPA calls
    below; the ``attn_mask`` argument is accepted for interface compatibility
    but is not applied.
    """

    def __init__(
        self,
        embed_dim: int,
        depth: int,
        num_heads: int,
        num_kv_heads: Optional[int] = None,
        dropout: float = 0.1,
    ) -> None:
        super().__init__()

        self.embed_dim = embed_dim
        self.num_heads = num_heads  # query heads (will be doubled internally)
        self.num_kv_heads = num_kv_heads or num_heads
        self.n_rep = (
            self.num_heads // self.num_kv_heads
        )  # replication factor for keys / values (GQA)
        self.attn_dropout = dropout  # Store dropout rate for attention

        # One half of a traditional head - DiffAttention uses pairs of heads
        self.head_dim = embed_dim // self.num_heads // 2
        assert (
            self.head_dim * self.num_heads * 2 == embed_dim
        ), "embed_dim must be divisible by num_heads * 2"
        self.scaling = self.head_dim**-0.5  # unused: SDPA applies 1/sqrt(head_dim) itself

        # Projections. We keep them separated because K/V are smaller (GQA)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, embed_dim // self.n_rep, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim // self.n_rep, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

        # Add dropout for regularization
        self.dropout = nn.Dropout(dropout)

        # DiffAttention lambda parameters (learnable)
        self.lambda_init = lambda_init_fn(depth)
        self.lambda_q1 = nn.Parameter(torch.randn(self.head_dim) * 0.1)
        self.lambda_k1 = nn.Parameter(torch.randn(self.head_dim) * 0.1)
        self.lambda_q2 = nn.Parameter(torch.randn(self.head_dim) * 0.1)
        self.lambda_k2 = nn.Parameter(torch.randn(self.head_dim) * 0.1)

        # Use standard LayerNorm which has a highly-optimised CUDA kernel
        self.subln = nn.LayerNorm(2 * self.head_dim, eps=1e-5)

    # ---------------------------------------------------------------------
    # Forward
    # ---------------------------------------------------------------------
    def forward(
        self,
        x: torch.Tensor,  # [bsz, seq_len, embed_dim]
        rel_pos: tuple[torch.Tensor, torch.Tensor],
        attn_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        bsz, seq_len, _ = x.size()

        # ---- Projections --------------------------------------------------
        # Run inside the outer autocast context so they stay in the
        # low-precision dtype and use tensor cores.
        q = self.q_proj(x)
        k = self.k_proj(x)
        v = self.v_proj(x)

        # Reshape into paired heads (2 x heads)
        q = q.view(bsz, seq_len, 2 * self.num_heads, self.head_dim)
        k = k.view(bsz, seq_len, 2 * self.num_kv_heads, self.head_dim)
        v = v.view(bsz, seq_len, self.num_kv_heads, 2 * self.head_dim)

        # Rotary position encodings (ensure dtype matches q)
        cos, sin = rel_pos
        cos = cos.to(dtype=q.dtype)
        sin = sin.to(dtype=q.dtype)
        q = apply_rotary_emb(q, cos, sin, interleaved=True)
        k = apply_rotary_emb(k, cos, sin, interleaved=True)

        # ---- Prepare tensors for matmul ----------------------------------
        # Shape conventions follow PyTorch's `scaled_dot_product_attention`:
        # (bsz, heads, seq, head_dim)
        q = q.transpose(1, 2)  # [bsz, 2*heads, seq, head_dim]
        k = k.transpose(1, 2)  # [bsz, 2*kv_heads, seq, head_dim]
        v = v.transpose(1, 2)  # [bsz, kv_heads, seq, 2*head_dim]

        # Replicate k/v heads when using GQA
        k = repeat_kv(k, self.n_rep)  # [bsz, 2*heads, seq, head_dim]
        v = repeat_kv(v, self.n_rep)  # [bsz, heads, seq, 2*head_dim]

        # ---- Fused scaled dot-product attention (Flash / SDPA) -----------
        # We avoid instantiating the full (seq x seq) score matrix. Instead we
        # run the fused attention kernel twice (positive/negative queries) and
        # combine the resulting context tensors with the λ weighting. This
        # keeps everything in fp16/bf16 and leverages Blackwell's Flash/SDPA
        # path, giving ~30-80x speed-up vs. the naive implementation.
        # ------------------------------------------------------------------

        # Re-arrange the paired heads: [bsz, 2*H, S, D] -> [bsz, H, 2, S, D]
        q_pairs = q.view(bsz, 2, self.num_heads, seq_len, self.head_dim).permute(
            0, 2, 1, 3, 4
        )
        k_pairs = k.view(bsz, 2, self.num_heads, seq_len, self.head_dim).permute(
            0, 2, 1, 3, 4
        )

        q_pos, q_neg = q_pairs[:, :, 0], q_pairs[:, :, 1]  # [bsz, H, S, D]
        k_pos, k_neg = k_pairs[:, :, 0], k_pairs[:, :, 1]

        # λ scalar (identical across heads / sequence)
        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1)).type_as(q_pos)
        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2)).type_as(q_pos)
        lambda_full = lambda_1 - lambda_2 + self.lambda_init  # scalar tensor

        # --- Fused attention (only TWO SDPA calls) -------------------------
        # SDPA applies dropout_p unconditionally, so gate it on self.training
        # to keep evaluation/inference deterministic.
        dropout_p = self.attn_dropout if self.training else 0.0
        ctx_pos = F.scaled_dot_product_attention(
            q_pos, k_pos, v, dropout_p=dropout_p, is_causal=True
        )  # [bsz, H, S, 2*D]
        ctx_neg = F.scaled_dot_product_attention(
            q_neg, k_neg, v, dropout_p=dropout_p, is_causal=True
        )  # [bsz, H, S, 2*D]

        # DiffAttention combination
        attn_out = ctx_pos - lambda_full * ctx_neg  # [bsz, H, S, 2*D]

        # LayerNorm & residual scaling
        attn_out = self.subln(attn_out) * (1.0 - self.lambda_init)

        # Collapse heads and project out
        attn_out = attn_out.transpose(1, 2).reshape(  # [bsz, seq, heads, 2*head_dim]
            bsz, seq_len, self.embed_dim
        )
        # Apply output projection and dropout
        out = self.out_proj(attn_out)
        return self.dropout(out)
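
A standalone shape check of the attention block, under illustrative
dimensions; the RoPE tables come from get_rotary_embeddings in model.py, with
dim_model equal to the module's rotary head size (embed_dim // num_heads // 2):

    import torch

    from inference.model import get_rotary_embeddings
    from inference.optimized_diffattn import MultiheadDiffAttn

    attn = MultiheadDiffAttn(embed_dim=256, depth=0, num_heads=4).eval()
    cos, sin = get_rotary_embeddings(seq_len=16, dim_model=32)  # 32 = 256 // 4 // 2

    x = torch.randn(2, 16, 256)      # [bsz, seq_len, embed_dim]
    with torch.no_grad():
        y = attn(x, (cos, sin))
    print(y.shape)                   # torch.Size([2, 16, 256])
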
inference/rotary.py (ADDED, +76 lines)
# Copyright (c) 2023, Tri Dao.

from typing import Optional, Union

import torch


def apply_rotary_emb_torch(
    x,
    cos,
    sin,
    interleaved=False,
    inplace=False,
    seqlen_offsets=0,
    cu_seqlens=None,
    max_seqlen=None,
):
    # Pure-PyTorch implementation. It always rotates (even, odd) feature pairs
    # (the GPT-J / interleaved layout); the interleaved, inplace,
    # seqlen_offsets, cu_seqlens and max_seqlen arguments are accepted for API
    # compatibility but ignored (no variable-length support).
    rotary_dim = cos.shape[1] * 2
    x1 = x[..., :rotary_dim]
    x2 = x[..., rotary_dim:]

    # Split [even, odd] pairs
    x1_1, x1_2 = x1[..., ::2], x1[..., 1::2]  # (..., rotary_dim/2)

    # Reshape cos/sin for broadcasting:
    # x:       [batch, seqlen, nheads, rotary_dim]
    # cos/sin: [seqlen, rotary_dim/2] -> [1, seqlen, 1, rotary_dim/2]
    cos = cos.unsqueeze(0).unsqueeze(2)
    sin = sin.unsqueeze(0).unsqueeze(2)

    rot_x1 = x1_1 * cos - x1_2 * sin
    rot_x2 = x1_1 * sin + x1_2 * cos
    # Interleave last dimension: (..., rotary_dim/2, 2) -> (..., rotary_dim)
    rot_x = torch.stack([rot_x1, rot_x2], dim=-1).reshape_as(x1)
    out = torch.cat([rot_x, x2], dim=-1)
    return out


def apply_rotary_emb(
    x,
    cos,
    sin,
    interleaved=False,
    inplace=False,
    seqlen_offsets: Union[int, torch.Tensor] = 0,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[int] = None,
):
    """
    Arguments:
        x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
        cos, sin: (seqlen_rotary, rotary_dim / 2)
        interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style)
            instead of 1st half and 2nd half (GPT-NeoX style).
        inplace: if True, apply rotary embedding in-place.
        seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by
            this amount. Most commonly used in inference when we have a KV cache.
        cu_seqlens: (batch + 1,) or None
        max_seqlen: int
    Return:
        out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None
            else (total_seqlen, nheads, headdim)
    rotary_dim must be <= headdim.
    Apply rotary embedding to the first rotary_dim of x.
    """
    # We force the pure PyTorch implementation (`apply_rotary_emb_torch`) for
    # all devices. The custom Triton kernel (`ApplyRotaryEmb`) was causing a
    # graph break in `torch.compile`, pushing expensive operations to the CPU.
    # With the pure PyTorch version, `torch.compile` can create a single,
    # fully-optimised graph, which should resolve the CPU bottleneck and
    # improve GPU utilisation.
    return apply_rotary_emb_torch(
        x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen
    )
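
Because the helper applies a pure rotation to (even, odd) feature pairs, a
quick sanity check (illustrative shapes, assuming the package layout above) is
that it preserves the norm of the rotated features:

    import torch

    from inference.model import get_rotary_embeddings
    from inference.rotary import apply_rotary_emb

    cos, sin = get_rotary_embeddings(8, 32)      # [8, 16] each
    x = torch.randn(1, 8, 2, 32)                 # [batch, seqlen, nheads, headdim]
    y = apply_rotary_emb(x, cos, sin, interleaved=True)

    assert y.shape == x.shape
    assert torch.allclose(x.norm(dim=-1), y.norm(dim=-1), atol=1e-5)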