Instructions to use OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints", trust_remote_code=True)
```
```python
# Load model directly
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints
- SGLang
How to use OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints with Docker Model Runner:
docker model run hf.co/OpenNLPLab/TransNormerLLM3-15B-Intermediate-Checkpoints
| # CREDITS: This comes almost as-is from the Triton layer norm tutorial | |
| # https://github.com/openai/triton/blob/master/python/tutorials/05-layer-norm.py | |
| # Copyright 2024 OpenNLPLab | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # coding=utf-8 | |
| import torch | |
| import torch.nn.functional as F | |
| import triton | |
| import triton.language as tl | |
| # fmt: off | |
@triton.jit
def srms_norm_fw(X, Y, V, stride, N, eps, BLOCK_SIZE_N: tl.constexpr):
    """Forward kernel for simple RMS normalization (no weight, no bias).

    One program instance handles one row. It computes
    ``rstd = 1 / sqrt(sum(x * x) / N + eps)`` over the first ``N`` elements
    of the row, writes ``x * rstd`` to ``Y`` and stores ``rstd`` in ``V``.

    Args:
        X: pointer to the input. Element ``(row, col)`` is read from
            ``X + row * stride + col``.
        Y: pointer to the output, addressed exactly like ``X``.
        V: pointer to a per-row buffer; slot ``row`` receives ``rstd``.
        stride: distance, in elements, between the starts of two rows.
        N: number of valid elements in each row.
        eps: constant added to the mean square before the square root.
        BLOCK_SIZE_N: compile-time block width. The row is processed in a
            single block (there is no loop over columns), so it must be
            at least ``N``; lanes at or beyond ``N`` are masked out.
    """
    # fmt: on
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK_SIZE_N)
    mask = cols < N

    # Move to this row; accumulate in fp32 whatever the input dtype is
    x_ptrs = X + row * stride + cols
    x = tl.load(x_ptrs, mask=mask, other=0.0).to(tl.float32)

    # Keep padding lanes at zero so they do not contribute to the sum
    x_masked = tl.where(mask, x, 0.0)
    mean_sq = tl.sum(x_masked * x_masked, axis=0) / N
    rstd = 1.0 / tl.sqrt(mean_sq + eps)

    # Normalize (this variant has no affine transform)
    y = x_masked * rstd

    # Keep rstd around so the backward kernel does not have to recompute it
    tl.store(V + row, rstd)

    y_ptrs = Y + row * stride + cols
    tl.store(y_ptrs, y, mask=mask)
| # Backward pass (DX only: this norm has no weight or bias, so there is no DW/DB to accumulate) | |
| # fmt: off | |
@triton.jit
def srms_norm_bwd_dx_fused(
    DX, DY,
    X, V,
    stride, N,
    # META-parameters
    BLOCK_SIZE_N: tl.constexpr,
):
    """Backward kernel for simple RMS normalization: input gradient only.

    One program instance handles one row. With ``xhat = x * rstd`` it
    writes ``dx = (dy - xhat * sum(xhat * dy) / N) * rstd``.

    Args:
        DX: pointer to the output gradient w.r.t. the input.
        DY: pointer to the incoming gradient w.r.t. the forward output.
        X: pointer to the forward input.
        V: pointer to a per-row buffer; slot ``row`` is read as ``rstd``.
        stride: distance, in elements, between the starts of two rows.
            The same value is used for ``DX``, ``DY`` and ``X``, so all
            three must share one row layout.
        N: number of valid elements in each row.
        BLOCK_SIZE_N: compile-time block width. The row is processed in a
            single block (there is no loop over columns), so it must be
            at least ``N``; lanes at or beyond ``N`` are masked out.
    """
    # fmt: on

    # position of elements processed by this program
    row = tl.program_id(0)
    cols = tl.arange(0, BLOCK_SIZE_N)
    mask = cols < N

    # offset data pointers to start at the row of interest
    x_ptrs = X + row * stride + cols
    dy_ptrs = DY + row * stride + cols

    # load this row's data
    x = tl.load(x_ptrs, mask=mask, other=0)
    dy = tl.load(dy_ptrs, mask=mask, other=0)
    rstd = tl.load(V + row)

    # compute dx
    xhat = x * rstd
    wdy = dy  # no weight in this variant, so the "weighted" dy is just dy

    # keep padding lanes at zero so they do not contribute to the reduction
    xhat = tl.where(mask, xhat, 0.)
    wdy = tl.where(mask, wdy, 0.)
    mean1 = tl.sum(xhat * wdy, axis=0) / N
    dx = (wdy - (xhat * mean1)) * rstd

    # write-back dx
    mask = cols < N  # re-materialize the mask to save registers
    dx_ptrs = DX + row * stride + cols
    tl.store(dx_ptrs, dx, mask=mask)
class _SrmsNorm(torch.autograd.Function):
    """Autograd function that runs the Triton simple-RMS-norm kernels.

    ``forward`` launches ``srms_norm_fw`` and ``backward`` launches
    ``srms_norm_bwd_dx_fused``. Both kernels address memory as
    ``row * stride + col``, so every tensor handed to them is a contiguous
    2D ``(rows, features)`` tensor.
    """

    @staticmethod
    def forward(ctx, x, eps):
        """Normalize ``x`` over its last dimension.

        Args:
            ctx: autograd context used to stash tensors for ``backward``.
            x: input tensor; all leading dimensions are flattened into rows.
            eps: constant added to the mean square before the square root.
                Raised to at least ``1.6e-5`` when ``x`` is float16.

        Returns:
            A contiguous tensor with the same shape as ``x``.

        Raises:
            RuntimeError: if one row of ``x`` does not fit in 64KB.
        """
        # catch eps being too small if the tensors are fp16
        if x.dtype == torch.float16:
            eps = max(eps, 1.6e-5)

        # reshape input data into a contiguous 2D tensor; `.contiguous()`
        # is a no-op when the reshaped view is already contiguous
        x_arg = x.reshape(-1, x.shape[-1]).contiguous()
        M, N = x_arg.shape

        # Less than 64KB per feature: enqueue fused kernel
        MAX_FUSED_SIZE = 65536 // x.element_size()
        BLOCK_SIZE_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
        if N > BLOCK_SIZE_N:
            raise RuntimeError(
                "This layer norm doesn't support feature dim >= 64KB.")

        # allocate output with the same layout as the kernel input
        y = torch.empty_like(x_arg)

        # allocate the per-row reciprocal RMS, reused by the backward pass
        rstd = torch.empty((M, ), dtype=torch.float32, device=x.device)

        # heuristics for number of warps.
        num_warps = min(max(BLOCK_SIZE_N // 256, 1), 16)

        # enqueue kernel
        # fmt: off
        srms_norm_fw[(M,)](
            x_arg, y, rstd,
            x_arg.stride(0),
            N,
            eps,
            num_warps=num_warps,
            BLOCK_SIZE_N=BLOCK_SIZE_N,
        )
        # fmt: on

        # Save the contiguous 2D tensor, not the caller's `x`: backward uses
        # a single row stride for x, dy and dx, which is only valid when
        # they all share this layout.
        ctx.save_for_backward(x_arg, rstd)
        ctx.BLOCK_SIZE_N = BLOCK_SIZE_N
        ctx.num_warps = num_warps

        return y.reshape_as(x)

    @staticmethod
    def backward(
        ctx, dy
    ):  # pragma: no cover # this is covered, but called directly from C++
        """Compute the gradient with respect to the forward input.

        Args:
            ctx: autograd context populated by ``forward``.
            dy: gradient with respect to the forward output.

        Returns:
            ``(dx, None)``: one entry per ``forward`` input; ``eps`` gets
            no gradient.
        """
        # x is the contiguous (rows, features) tensor saved by forward
        x, rstd = ctx.saved_tensors
        M, N = x.shape

        # allocate output
        dy = dy.contiguous()
        dx = torch.empty_like(dy)

        # Check the tensor shapes and layouts
        # we suppose in the kernel that they have the same size and are contiguous
        assert (
            dy.numel() == x.numel()
        ), "Something is wrong in the backward graph, possibly because of an inplace operation after the layernorm"

        # enqueue kernel using forward pass heuristics
        # fmt: off
        srms_norm_bwd_dx_fused[(M,)](
            dx, dy, x,
            rstd,
            x.stride(0),
            N,
            BLOCK_SIZE_N=ctx.BLOCK_SIZE_N,
            num_warps=ctx.num_warps,
        )
        # fmt: on

        return dx, None
class SimpleRMSNorm(torch.nn.Module):
    """Module front-end for ``_SrmsNorm``; it holds no parameters."""

    def __init__(self, dim: int, eps: float = 1e-6):
        """Record the configuration.

        Args:
            dim: feature dimension. It is stored on the module but not
                otherwise used by this class.
            eps: epsilon forwarded to ``_SrmsNorm.apply`` on every call.
        """
        super().__init__()
        self.dim = dim
        self.eps = eps

    def forward(self, x):
        """Return ``_SrmsNorm.apply(x, self.eps)``."""
        normalized = _SrmsNorm.apply(x, self.eps)
        return normalized