Erland committed on
Commit 7e92010 · verified · 1 Parent(s): 372fade

Add files using upload-large-folder tool

Files changed (50)
  1. README.md +512 -0
  2. torchtitan/components/__pycache__/ft.cpython-311.pyc +0 -0
  3. torchtitan/components/optimizer.py +303 -0
  4. torchtitan/distributed/__pycache__/pipeline.cpython-311.pyc +0 -0
  5. torchtitan/distributed/pipeline.py +201 -0
  6. torchtitan/experiments/deepseek_v3/checkpoint.py +154 -0
  7. torchtitan/experiments/deepseek_v3/download.py +70 -0
  8. torchtitan/experiments/deepseek_v3/generate.py +308 -0
  9. torchtitan/experiments/deepseek_v3/inference.sh +15 -0
  10. torchtitan/experiments/deepseek_v3/model.py +1325 -0
  11. torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_barrier.py +159 -0
  12. torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_on_device_all_to_all_v.py +260 -0
  13. torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_utils.py +63 -0
  14. torchtitan/experiments/deepseek_v3/train.py +142 -0
  15. torchtitan/experiments/flux/README.md +23 -0
  16. torchtitan/experiments/flux/dataset/flux_dataset.py +267 -0
  17. torchtitan/experiments/flux/model/autoencoder.py +388 -0
  18. torchtitan/experiments/flux/model/hf_embedder.py +40 -0
  19. torchtitan/experiments/flux/model/layers.py +286 -0
  20. torchtitan/experiments/flux/model/model.py +177 -0
  21. torchtitan/experiments/flux/parallelize_flux.py +26 -0
  22. torchtitan/experiments/flux/requirements.txt +2 -0
  23. torchtitan/experiments/flux/tests/test_flux_dataloader.py +103 -0
  24. torchtitan/experiments/flux/tests/test_generate_image.py +252 -0
  25. torchtitan/experiments/flux/train_configs/debug_model.toml +68 -0
  26. torchtitan/experiments/kernels/triton_mg_group_gemm/benchmark.py +630 -0
  27. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/__init__.py +13 -0
  28. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/tma_autotuning.py +240 -0
  29. torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_forwards.py +82 -0
  30. torchtitan/experiments/llama4/__pycache__/__init__.cpython-311.pyc +0 -0
  31. torchtitan/experiments/llama4/infra/expert_parallel.py +145 -0
  32. torchtitan/experiments/llama4/infra/parallelize_llama.py +159 -0
  33. torchtitan/experiments/llama4/model/__pycache__/model.cpython-311.pyc +0 -0
  34. torchtitan/experiments/llama4/model/model.py +466 -0
  35. torchtitan/experiments/llama4/model/moe.py +228 -0
  36. torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.sh +25 -0
  37. torchtitan/experiments/llama4/train_configs/llama4_17bx128e.toml +65 -0
  38. torchtitan/experiments/multimodal/mm_collator.py +227 -0
  39. torchtitan/experiments/multimodal/mm_dataset.py +268 -0
  40. torchtitan/experiments/multimodal/tests/test_multimodal_model.py +128 -0
  41. torchtitan/experiments/multimodal/utils.py +437 -0
  42. torchtitan/experiments/simple_fsdp/__init__.py +33 -0
  43. torchtitan/experiments/simple_fsdp/__pycache__/parallelize_llama.cpython-311.pyc +0 -0
  44. torchtitan/experiments/simple_fsdp/__pycache__/simple_fsdp.cpython-311.pyc +0 -0
  45. torchtitan/experiments/simple_fsdp/model.py +18 -0
  46. torchtitan/experiments/simple_fsdp/parallelize_llama.py +98 -0
  47. torchtitan/models/llama3/train_configs/llama3_8b.toml +63 -0
  48. torchtitan/models/norms.py +35 -0
  49. torchtitan/protocols/__pycache__/model_converter.cpython-311.pyc +0 -0
  50. torchtitan/tools/utils.py +143 -0
README.md ADDED
@@ -0,0 +1,512 @@
1
+ <div align="center">
2
+
3
+ # 🔥 Flame: Flash Linear Attention Made Easy
4
+ # This is a fork for the paper:
5
+ # Softpick: No Attention Sink, No Massive Activations with Rectified Softmax
6
+
7
+ </div>
8
+
9
+ ## Instructions for Softpick Attention
10
+
11
+ This fork only works with an older commit of torchtitan and flame, so the setup looks like this:
12
+
13
+ ```bash
14
+ git clone https://github.com/zaydzuhri/flame.git
15
+ cd flame
16
+ git checkout softpick-attention
17
+ git submodule update --init --recursive --remote
18
+ cd 3rdparty/torchtitan
19
+ git checkout 4f532e0
20
+ cd ../../
21
+
22
+ pip install .
23
+ pip install flash-attn --no-build-isolation
24
+ ```
25
+ The flash-linear-attention submodule has been changed to point to our fork (https://github.com/zaydzuhri/flash-linear-attention/tree/softpick-attention),
26
+ so there is no need to clone it manually.
27
+
28
+ Then prepare the fineweb-edu sample-100BT dataset the same way as described in the flame guide below.
29
+
30
+ These are the training commands used in the paper:
31
+ ```bash
32
+ NGPU=8 bash train.sh --job.config_file flame/models/fla.toml --job.dump_folder exp/vanilla.340M.batch16.seqlen4096.context4096.warmup1000.update1.steps100000.lr3e-4.cosine --model.config configs/vanilla_transformer_340M.json --model.tokenizer_path fla-hub/transformer-1.3B-100B --optimizer.name AdamW --optimizer.eps 1e-15 --optimizer.lr 3e-4 --lr_scheduler.warmup_steps 1000 --lr_scheduler.lr_min 0.1 --lr_scheduler.decay_type cosine --training.batch_size 16 --training.seq_len 4096 --training.context_len 4096 --training.gradient_accumulation_steps 1 --training.steps 100000 --training.max_norm 1.0 --training.skip_nan_inf --training.dataset ~/.cache/HuggingFaceFW___fineweb-edu/sample-100BT --training.dataset_split train --training.num_workers 32 --training.prefetch_factor 2 --training.seed 79 --training.compile --checkpoint.interval 10000 --checkpoint.load_step -1 --metrics.log_freq 5 --checkpoint.hf_upload_enabled --checkpoint.hf_repo_base_name "zaydzuhri/vanilla-340M-4096-batch16-steps100000" --comm.init_timeout_seconds 600 --comm.train_timeout_seconds 300
33
+
34
+ NGPU=8 bash train.sh --job.config_file flame/models/fla.toml --job.dump_folder exp/softpick.340M.batch16.seqlen4096.context4096.warmup1000.update1.steps100000.lr3e-4.cosine --model.config configs/softpick_transformer_340M.json --model.tokenizer_path fla-hub/transformer-1.3B-100B --optimizer.name AdamW --optimizer.eps 1e-15 --optimizer.lr 3e-4 --lr_scheduler.warmup_steps 1000 --lr_scheduler.lr_min 0.1 --lr_scheduler.decay_type cosine --training.batch_size 16 --training.seq_len 4096 --training.context_len 4096 --training.gradient_accumulation_steps 1 --training.steps 100000 --training.max_norm 1.0 --training.skip_nan_inf --training.dataset ~/.cache/HuggingFaceFW___fineweb-edu/sample-100BT --training.dataset_split train --training.num_workers 32 --training.prefetch_factor 2 --training.seed 79 --training.compile --checkpoint.interval 10000 --checkpoint.load_step -1 --metrics.log_freq 5 --checkpoint.hf_upload_enabled --checkpoint.hf_repo_base_name "zaydzuhri/softpick-340M-4096-batch16-steps100000" --comm.init_timeout_seconds 600 --comm.train_timeout_seconds 300
35
+ ```
36
+
37
+ The same goes for the extra experiments in the appendix:
38
+ ```bash
39
+ NGPU=8 bash train.sh --job.config_file flame/models/fla.toml --job.dump_folder exp/rectified.340M.batch16.seqlen4096.context4096.warmup1000.update1.steps100000.lr3e-4.cosine --model.config configs/rectified_transformer_340M.json --model.tokenizer_path fla-hub/transformer-1.3B-100B --optimizer.name AdamW --optimizer.eps 1e-15 --optimizer.lr 3e-4 --lr_scheduler.warmup_steps 1000 --lr_scheduler.lr_min 0.1 --lr_scheduler.decay_type cosine --training.batch_size 16 --training.seq_len 4096 --training.context_len 4096 --training.gradient_accumulation_steps 1 --training.steps 100000 --training.max_norm 1.0 --training.skip_nan_inf --training.dataset ~/.cache/HuggingFaceFW___fineweb-edu/sample-100BT --training.dataset_split train --training.num_workers 32 --training.prefetch_factor 2 --training.seed 79 --training.compile --checkpoint.interval 10000 --checkpoint.load_step -1 --metrics.log_freq 5 --checkpoint.hf_upload_enabled --checkpoint.hf_repo_base_name "zaydzuhri/rectified-340M-4096-batch16-steps100000" --comm.init_timeout_seconds 600 --comm.train_timeout_seconds 300
40
+
41
+ NGPU=8 bash train.sh --job.config_file flame/models/fla.toml --job.dump_folder exp/softpick.scaled.340M.batch16.seqlen4096.context4096.warmup1000.update1.steps100000.lr3e-4.cosine --model.config configs/softpick_scaled_transformer_340M.json --model.tokenizer_path fla-hub/transformer-1.3B-100B --optimizer.name AdamW --optimizer.eps 1e-15 --optimizer.lr 3e-4 --lr_scheduler.warmup_steps 1000 --lr_scheduler.lr_min 0.1 --lr_scheduler.decay_type cosine --training.batch_size 16 --training.seq_len 4096 --training.context_len 4096 --training.gradient_accumulation_steps 1 --training.steps 100000 --training.max_norm 1.0 --training.skip_nan_inf --training.dataset ~/.cache/HuggingFaceFW___fineweb-edu/sample-100BT --training.dataset_split train --training.num_workers 32 --training.prefetch_factor 2 --training.seed 79 --training.compile --checkpoint.interval 10000 --checkpoint.load_step -1 --metrics.log_freq 5 --checkpoint.hf_upload_enabled --checkpoint.hf_repo_base_name "zaydzuhri/softpick-scaled-340M-4096-batch16-steps100000" --comm.init_timeout_seconds 600 --comm.train_timeout_seconds 300
42
+ ```
43
+
44
+ Feel free to DM @zmkzmkz on X for any questions regarding the paper or this code!
45
+
46
+ ## Flame
47
+
48
+ Welcome to 🔥 `flame`, a minimal and efficient framework built on `torchtitan` for training Flash Linear Attention (FLA) models (and more broadly, arbitrary autoregressive language models) with blazing efficiency.
49
+
50
+ **Feature Highlights:**
51
+
52
+ - 🚀 Minimal, easy-to-use, extensible training framework
53
+ - 🤗 Seamless integration with `fla` and `transformers`
54
+ - 🔄 Zero-cost data preprocessing: online tokenization, dataset shuffling, and multiple datasets support
55
+ - 🔮 4D parallelism (coming soon)
56
+
57
+ ## Setup
58
+
59
+ To get started, clone the `flame` repository and install the required dependencies:
60
+
61
+ ```bash
62
+ git clone https://github.com/fla-org/flame.git
63
+ cd flame
64
+ pip install .
65
+ ```
66
+
67
+ `flame` manages minimal dependencies, only including `fla` and `torchtitan` as submodules.
68
+ After installation, initialize and update the submodules:
69
+ ```sh
70
+ git submodule update --init --recursive
71
+ ```
72
+
73
+ ## Dataset Preparation
74
+ To download the dataset to your local disk, create a new Python file with the following content and execute it:
75
+
76
+ ```py
77
+ from datasets import load_dataset
78
+
79
+ # load fineweb-edu with parallel processing
80
+ dataset = load_dataset("HuggingFaceFW/fineweb-edu", name="default", num_proc=64, cache_dir="/your/cache/path")
81
+
82
+ # or load a subset with roughly 100B tokens, suitable for small- or medium-sized experiments
83
+ dataset = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-100BT", num_proc=64, cache_dir="/your/cache/path")
84
+ ```
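+
+ Once the download finishes, point training at the local copy rather than streaming. The relevant flags to change look like this (the exact path depends on your `cache_dir`; the softpick commands above use the default HF cache layout):
+
+ ```sh
+ --training.dataset ~/.cache/HuggingFaceFW___fineweb-edu/sample-100BT \
+ --training.dataset_split train \
+ ```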
85
+
86
+ ## Training Recipes
87
+
88
+ Here's an example of training a 340M FLA Transformer model with a LLaMA-like architecture from scratch on a 100BT subset of the Fineweb-edu corpus in streaming mode.
89
+
90
+ > [!WARNING]
91
+ > If the dataset is not downloaded beforehand, the streaming mode will attempt to fetch it from a remote server and download it on-the-fly, which can be highly unstable during training due to network issues.
92
+ > For stable training, ensure the dataset is downloaded locally (see [**Dataset Preparation**](#dataset-preparation)); otherwise, we assume you are only experimenting with a new corpus.
93
+
94
+ ```sh
95
+ bash train.sh \
96
+ --job.config_file flame/models/fla.toml \
97
+ --job.dump_folder exp/transformer-340M-4K-10B/batch1.seqlen65536.context4096.warmup1024.update1.steps20480.lr3e-4.cosine \
98
+ --model.config configs/transformer_340M.json \
99
+ --model.tokenizer_path fla-hub/transformer-1.3B-100B \
100
+ --optimizer.name AdamW \
101
+ --optimizer.eps 1e-15 \
102
+ --optimizer.lr 3e-4 \
103
+ --lr_scheduler.warmup_steps 1024 \
104
+ --lr_scheduler.lr_min 0.1 \
105
+ --lr_scheduler.decay_type cosine \
106
+ --training.batch_size 1 \
107
+ --training.seq_len 65536 \
108
+ --training.context_len 4096 \
109
+ --training.varlen \
110
+ --training.gradient_accumulation_steps 1 \
111
+ --training.steps 20480 \
112
+ --training.max_norm 1.0 \
113
+ --training.skip_nan_inf \
114
+ --training.dataset HuggingFaceFW/fineweb-edu \
115
+ --training.dataset_name sample-100BT \
116
+ --training.dataset_split train \
117
+ --training.streaming \
118
+ --training.num_workers 32 \
119
+ --training.prefetch_factor 2 \
120
+ --training.seed 42 \
121
+ --training.compile \
122
+ --checkpoint.interval 2048 \
123
+ --checkpoint.load_step -1 \
124
+ --checkpoint.keep_latest_k 2 \
125
+ --metrics.log_freq 1
126
+ ```
127
+
128
+ You can specify the number of GPUs by setting the environment variable `NGPU`, which defaults to 8.
129
+ **For single-GPU debugging, set `NGPU=1`.**
130
+
131
+ We provide several [config files](https://github.com/fla-org/flame/tree/main/configs) for different models.
132
+ By default, the learning rate is set to 3e-4 with a cosine scheduler. Other schedulers, such as WSD (wsd), are also supported.
133
+
134
+ **Key parameters:**
135
+ - `--lr_scheduler.decay_ratio`: The proportion of the steps allocated to the decay phase. The learning rate will remain stable after the warmup period and only start decaying during the last `decay_ratio` portion of the total training steps, which is known as the Warmup-Stable-Decay (WSD) schedule.
136
+ - `--lr_scheduler.warmup_steps`: The number of steps for the learning rate warmup phase.
137
+ - `--training.steps`: Total number of training steps.
138
+ - `--training.batch_size`: Batch size per device, must be 1 if `--training.varlen` is set.
139
+ - `--training.seq_len`: The length of each sequence in the batch, which is concatenated from multiple samples.
140
+ - `--training.context_len`: The max allowed length of a sample. For non-varlen mode, this is equivalent to `seq_len`.
141
+ - `--training.varlen`: Whether to conduct variable-length sequence training.
142
+ - `--training.gradient_accumulation_steps`: Number of gradient accumulation steps.
143
+
144
+ > [!WARNING]
145
+ > The total number of sequences processed per optimizer step, referred to as `global_batch_size`, is calculated as `batch_size × gradient_accumulation_steps × num_gpus`.
146
+ > Each step processes `global_batch_size * seq_len` tokens.
147
+ > Monitor the values of `global_batch_size`, `warmup_steps`, and `steps` carefully when modifying any of these hyperparameters!
148
+
149
+ For a detailed explanation of all parameters, run:
150
+
151
+ ```sh
152
+ bash train.sh -h
153
+ ```
154
+
155
+ <details>
156
+ <summary>Usage</summary>
157
+
158
+ ```text
159
+ options:
160
+ -h, --help show this help message and exit
161
+ --job.config_file JOB.CONFIG_FILE
162
+ Job config file
163
+ --job.dump_folder JOB.DUMP_FOLDER
164
+ Folder to dump job outputs
165
+ --job.description JOB.DESCRIPTION
166
+ Description of the job
167
+ --job.use_for_integration_test
168
+ Add this config to the integration test suite
169
+ --job.print_args Print the args to terminal
170
+ --model.config MODEL.CONFIG
171
+ Path to the model config
172
+ --model.norm_type MODEL.NORM_TYPE
173
+ Type of layer normalization to use [layernorm,
174
+ np_layernorm, rmsnorm, fused_rmsnorm]
175
+ --model.tokenizer_path MODEL.TOKENIZER_PATH
176
+ Tokenizer path
177
+ --profiling.enable_profiling
178
+ Whether to enable pytorch profiler
179
+ --profiling.save_traces_folder PROFILING.SAVE_TRACES_FOLDER
180
+ Trace files location
181
+ --profiling.profile_freq PROFILING.PROFILE_FREQ
182
+ How often to collect profiler traces, in iterations
183
+ --profiling.enable_memory_snapshot
184
+ Whether to dump memory snapshot
185
+ --profiling.save_memory_snapshot_folder PROFILING.SAVE_MEMORY_SNAPSHOT_FOLDER
186
+ Memory snapshot files location
187
+ --optimizer.name OPTIMIZER.NAME
188
+ Optimizer to use
189
+ --optimizer.eps OPTIMIZER.EPS
190
+ Epsilon value for the optimizer.
191
+ --optimizer.fused Whether the fused implementation (CUDA only) is used.
192
+ --optimizer.scheduler {wsd,cosine,linear}
193
+ Scheduler to use. Currently supported: wsd, cosine,
194
+ and linear.
195
+ --optimizer.lr OPTIMIZER.LR
196
+ Learning rate to use
197
+ --optimizer.min_lr_ratio OPTIMIZER.MIN_LR_RATIO
198
+ Min lr ratio for lr scheduler
199
+ --optimizer.early_step_in_backward
200
+ Whether to apply optimizer in the backward. Caution,
201
+ optimizer_in_backward is not compatible with gradient
202
+ clipping, users should not call
203
+ register_post_accumulate_grad_hook after the optimizer
204
+ is built.
205
+ --training.batch_size TRAINING.BATCH_SIZE
206
+ Batch size
207
+ --training.seq_len TRAINING.SEQ_LEN
208
+ Sequence length
209
+ --training.context_len TRAINING.CONTEXT_LEN
210
+ Max length allowed for each sequence
211
+ --training.varlen Whether to take sequences of variable length as input
212
+ --training.warmup_steps TRAINING.WARMUP_STEPS
213
+ Steps for lr scheduler warmup, normally 1/5 of
214
+ --training.steps
215
+ --training.gradient_accumulation_steps TRAINING.GRADIENT_ACCUMULATION_STEPS
216
+ Number of steps to accumulate gradients before
217
+ updating parameters
218
+ --training.steps TRAINING.STEPS
219
+ How many train steps to run
220
+ --training.max_norm TRAINING.MAX_NORM
221
+ Max norm for gradient clipping
222
+ --training.skip_nan_inf
223
+ Skip batch updates when NaN or INF gradients are
224
+ encountered during training
225
+ --training.dataset TRAINING.DATASET
226
+ Dataset to use, with comma separated values
227
+ --training.dataset_name TRAINING.DATASET_NAME
228
+ The name of the dataset config, with comma separated
229
+ values if provided
230
+ --training.dataset_split TRAINING.DATASET_SPLIT
231
+ Dataset split to use, with comma separated values if
232
+ provided
233
+ --training.data_dir TRAINING.DATA_DIR
234
+ Data dirs to use, with comma separated values if
235
+ provided
236
+ --training.data_files TRAINING.DATA_FILES
237
+ Data files to use, with comma separated values if
238
+ provided
239
+ --training.data_probs TRAINING.DATA_PROBS
240
+ Data sampling probabilities, with comma separated
241
+ values if provided
242
+ --training.streaming Whether to load dataset in streaming mode, used for
243
+ huge dataset
244
+ --training.num_workers TRAINING.NUM_WORKERS
245
+ Number of subprocesses to use for data loading. 0
246
+ means that the data will be loaded in the main
247
+ process.
248
+ --training.prefetch_factor TRAINING.PREFETCH_FACTOR
249
+ Number of batches loaded in advance by each worker. 2
250
+ means there will be a total of 2 * num_workers batches
251
+ prefetched across all workers.
252
+ --training.data_parallel_replicate_degree TRAINING.DATA_PARALLEL_REPLICATE_DEGREE
253
+ The `data_parallel_replicate_degree` argument
254
+ specifies the degree of data parallelism for weight
255
+ replication. When this value is greater than 1,
256
+ weights will be replicated across
257
+ `data_parallel_replicate_degree` ranks. If
258
+ `data_parallel_shard_degree` is also greater than 1,
259
+ the parallelism method used is HSDP (Hybrid Sharded
260
+ Data Parallelism). Otherwise, the parallelism method
261
+ used is DDP (Distributed Data Parallelism). 1 means
262
+ disabled.
263
+ --training.data_parallel_shard_degree TRAINING.DATA_PARALLEL_SHARD_DEGREE
264
+ The `data_parallel_shard_degree` argument specifies
265
+ the degree of data parallelism for weight sharding.
266
+ When this value is greater than 1, weights will be
267
+ sharded across `data_parallel_shard_degree` ranks. If
268
+ `data_parallel_replicate_degree` is also greater than
269
+ 1, the parallelism method used is HSDP (Hybrid Sharded
270
+ Data Parallelism). Otherwise, the parallelism method
271
+ used is FSDP (Fully Sharded Data Parallelism). -1
272
+ means leftover ranks will be used (After
273
+ DP_REPLICATE/SP/PP). Note that only
274
+ `data_parallel_shard_degree` can be negative. 1 means
275
+ disabled.
276
+ --training.enable_cpu_offload
277
+ Whether to apply CPU offloading of parameters,
278
+ gradients, and optimizer states in FSDP
279
+ --training.tensor_parallel_degree TRAINING.TENSOR_PARALLEL_DEGREE
280
+ Tensor Parallelism degree. 1 means disabled.
281
+ --training.disable_loss_parallel
282
+ Whether to apply loss parallel when sequence parallel
283
+ is enabled
284
+ --training.mixed_precision_param {bfloat16,float32}
285
+ torch dtype to use for parameters when applying mixed
286
+ precision via FSDP. This feature only takes effect
287
+ when data_parallel_shard_degree > 1
288
+ --training.mixed_precision_reduce {float32}
289
+ torch dtype to use for reductions when applying mixed
290
+ precision via FSDP. This feature only takes effect
291
+ when data_parallel_shard_degree > 1
292
+ --training.compile Whether to compile the model
293
+ --training.gc_freq TRAINING.GC_FREQ
294
+ Python garbage control scheduling interval, in steps
295
+ --training.seed TRAINING.SEED
296
+ Choose the base RNG seed used for training
297
+ --training.deterministic
298
+ Use deterministic algorithms wherever possible, may be
299
+ slower
300
+ --metrics.log_freq METRICS.LOG_FREQ
301
+ How often to log metrics to TensorBoard, in iterations
302
+ --metrics.enable_tensorboard
303
+ Whether to log metrics to TensorBoard
304
+ --metrics.disable_color_printing
305
+ Whether to disable color printing in logs
306
+ --metrics.save_tb_folder METRICS.SAVE_TB_FOLDER
307
+ Folder to dump TensorBoard states
308
+ --metrics.rank_0_only
309
+ Whether to save TensorBoard metrics only for rank 0 or
310
+ for all ranks. When pipeline_parallel_degree is > 1,
311
+ this option uses the 0th rank of the last stage
312
+ pipeline group, which is the only stage that computes
313
+ loss metrics.
314
+ --metrics.enable_wandb
315
+ Whether to log metrics to Weights & Biases
316
+ --experimental.enable_async_tensor_parallel
317
+ Whether to apply async tensor parallel (currently only
318
+ effective when compile is enabled)
319
+ --experimental.pipeline_parallel_degree EXPERIMENTAL.PIPELINE_PARALLEL_DEGREE
320
+ Pipeline Parallelism degree, or number of ranks. 1
321
+ means disabled. If using looped schedules, this still
322
+ specifies the number of physical ranks, not the number
323
+ of stages. Stages per rank are inferred from split
324
+ points degree, and schedule.
325
+ --experimental.pipeline_parallel_split_points EXPERIMENTAL.PIPELINE_PARALLEL_SPLIT_POINTS [EXPERIMENTAL.PIPELINE_PARALLEL_SPLIT_POINTS ...]
326
+ Specify comma-separated names of modules to use as the
327
+ beginning of a split point. e.g. "layers.0,layers.2"
328
+ will cause the model to be split into 3 stages, the
329
+ first containing all the layers up to layers.0, the
330
+ second containing layers.0 and up to layers.2, the
331
+ third containing layers.2 and all the remaining
332
+ layers. Note: fully-automated splitting may be enabled
333
+ in the future, but currently the split points must be
334
+ specified manually.
335
+ --experimental.pipeline_parallel_schedule EXPERIMENTAL.PIPELINE_PARALLEL_SCHEDULE
336
+ Specify the Pipeline Parallel schedule to use. The
337
+ supported schedules are: https://github.com/pytorch/py
338
+ torch/blob/de4c2a3b4e89d96334dc678d1c3f2ae51a6630a0/to
339
+ rch/distributed/pipelining/schedules.py#L2161. The
340
+ schedule must be compatible with the split points and
341
+ stages_per_rank. Looped schedules (e.g.
342
+ Interleaved1F1B) require specifying
343
+ pipeline_parallel_degree = number of ranks, and
344
+ split_points = number of stages - 1
345
+ --experimental.pipeline_parallel_schedule_csv EXPERIMENTAL.PIPELINE_PARALLEL_SCHEDULE_CSV
346
+ Specify the path to the pipeline parallel schedule csv
347
+ file to use. The pipeline_parallel_schedule argument
348
+ must be either PipelineScheduleSingle,
349
+ PipelineScheduleMulti, or _PipelineScheduleRuntime.
350
+ --experimental.pipeline_parallel_microbatches EXPERIMENTAL.PIPELINE_PARALLEL_MICROBATCHES
351
+ How many microbatches to split the global training
352
+ batch into when using pipeline parallelism. The global
353
+ training batch size must be evenly divisible by the
354
+ number of microbatches. The default value will be the
355
+ number of pipeline stages, if unspecified.
356
+ --experimental.enable_compiled_autograd
357
+ Enable CompiledAutograd to compile the backward.
358
+ --experimental.context_parallel_degree EXPERIMENTAL.CONTEXT_PARALLEL_DEGREE
359
+ Context parallelism degree. 1 means disabled.
360
+ --experimental.context_parallel_rotate_method EXPERIMENTAL.CONTEXT_PARALLEL_ROTATE_METHOD
361
+ The collective to use in context parallel SDPA for kv
362
+ shards exchange. 'allgather' means to all-gather all
363
+ kv shards on ranks after the first sub-SDPA
364
+ computation, 'alltoall' means to all-to-all shuffle
365
+ the kv shards. The default value is 'allgather'.
366
+ --checkpoint.enable_checkpoint
367
+ Whether to enable checkpoint
368
+ --checkpoint.folder CHECKPOINT.FOLDER
369
+ The folder to store the checkpoints. When
370
+ enable_checkpoint is set to true, checkpoints will be
371
+ in {--job.dump_folder}/{--checkpoint.folder}.
372
+ --checkpoint.interval_type CHECKPOINT.INTERVAL_TYPE
373
+ Checkpointing interval unit of measurement ['step',
374
+ 'seconds']
375
+ --checkpoint.interval CHECKPOINT.INTERVAL
376
+ Checkpointing interval, in steps or seconds depending
377
+ on --checkpoint.interval_type
378
+ --checkpoint.model_weights_only
379
+ When model_weights_only=True, only model weights will
380
+ be saved at the end of training. With this,
381
+ checkpoints can be loaded using `torch.load(...,
382
+ weights_only=True)` after conversion. When
383
+ model_weights_only=False, the full checkpoint will be
384
+ saved. A full checkpoint includes model, optimizer and
385
+ train_state, which can be used to resume training. The
386
+ default value is false.
387
+ --checkpoint.export_dtype {float16,bfloat16,float32}
388
+ Converts to the specified precision when training
389
+ completes and model_weights_only=true. Currently
390
+ supports float32, float16, and bfloat16. The default
391
+ value is float32.
392
+ --checkpoint.create_seed_checkpoint
393
+ Initializes the full model without applying
394
+ parallelisms, and then saves it as a seed checkpoint.
395
+ Note: requires user to call train.py without
396
+ specifying any parallelisms, e.g. NGPU=1. Could be
397
+ implemented as a separate script, but this way shares
398
+ more code.
399
+ --checkpoint.async_mode CHECKPOINT.ASYNC_MODE
400
+ Which async checkpoint mode to use. Currently there
401
+ are 3 different modes. 1. "disabled": synchronized
402
+ checkpointing will be used. 2. "async":
403
+ torch.distributed.checkpoint.async_save will be used.
404
+ 1. "async_with_pinned_mem": this option utilizes a
405
+ dedicated pinned memory space and creates a separate
406
+ process for faster GPU->CPU transfer performance and
407
+ eliminating GIL contention. The cost is increased CPU
408
+ memory usage. If insufficient CPU memory is available,
409
+ performance may degrade due to memory paging. For most
410
+ users, "async" should suffice as the performance
411
+ overhead is typically small (on the order of tens of
412
+ seconds) compared to checkpointing frequency. This
413
+ mode can be employed to pursue near-zero checkpointing
414
+ times (e.g., < 1 second) given appropriate hardware
415
+ support such as ample CPU memory and fast PCIe.
416
+ "disabled" is the default mode.
417
+ --checkpoint.keep_latest_k CHECKPOINT.KEEP_LATEST_K
418
+ Keeps only the latest k checkpoints, purging older
419
+ ones. If 0, keep all checkpoints. 0 is the default
420
+ value.
421
+ --checkpoint.load_step CHECKPOINT.LOAD_STEP
422
+ Load the checkpoint at the specified step. If -1, load
423
+ the latest checkpoint.
424
+ --float8.enable_float8_linear
425
+ If true, swaps `torch.nn.Linear` with `Float8Linear`.
426
+ This feature requires you to install 'torchao' which
427
+ can be found here: https://github.com/pytorch/ao
428
+ --float8.enable_fsdp_float8_all_gather
429
+ Whether enable float8 all-gather in FSDP
430
+ --float8.precompute_float8_dynamic_scale_for_fsdp
431
+ Whether precompute float8 scales dynamically for FSDP
432
+ --float8.scaling_type_input {dynamic,delayed}
433
+ float8 scaling for input, dynamic (default) or delayed
434
+ --float8.scaling_type_weight FLOAT8.SCALING_TYPE_WEIGHT
435
+ float8 scaling for weight, dynamic (default) or delayed
436
+ --float8.scaling_type_grad_output FLOAT8.SCALING_TYPE_GRAD_OUTPUT
437
+ float8 scaling for grad_output, dynamic (default) or delayed
438
+ --comm.init_timeout_seconds COMM.INIT_TIMEOUT_SECONDS
439
+ Timeout for communication operations, during
440
+ initialization and first train step.
441
+ --comm.train_timeout_seconds COMM.TRAIN_TIMEOUT_SECONDS
442
+ Timeout for communication operations after the first
443
+ train step -- usually a tighter bound than during
444
+ initialization.
445
+ --comm.trace_buf_size COMM.TRACE_BUF_SIZE
446
+ Flight recorder ring buffer size, >0 means recording
447
+ by default, 0 means disabled
448
+ --memory_estimation.enabled
449
+ Whether to estimate memory usage for FSDP
450
+ --memory_estimation.disable_fake_mode
451
+ Whether to estimate memory under FakeTensorMode
452
+ ```
453
+ </details>
454
+
455
+ ### Training with `torch.compile`
456
+
457
+ Starting from `torch 2.0`, `torch.compile` has been available as a feature to seamlessly accelerate training.
458
+ In `flame`, you can enable `torch.compile` simply by adding the `--training.compile` flag to your training script.
459
+
460
+ However, `fla` has integrated numerous fused kernels for acceleration, which may conflict with `torch.compile`.
461
+ We are actively working on resolving these issues to make compilation transparent to users.
462
+ In the meantime, please ensure you are using the latest dependencies.
463
+
464
+ Specifically, **we recommend using `torch>=2.6` and `triton>=3.0`**.
465
+
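+ For instance, you can quickly check which versions are installed with:
+
+ ```sh
+ python -c "import torch, triton; print(torch.__version__, triton.__version__)"
+ ```
+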
466
+ ### Training with multiple datasets
467
+
468
+ If you wish to train a model with all-round capabilities (e.g., code, math, and multilingual ability), it's necessary to train on multiple datasets.
469
+ `flame` allows training with multiple datasets easily.
470
+ For example, you can specify the following arguments to train on 6 datasets with different proportions:
471
+
472
+ ```sh
473
+ --training.dataset HuggingFaceFW/fineweb-edu,opencsg/Fineweb-Edu-Chinese-V2.1,OpenCoder-LLM/opc-fineweb-code-corpus,math-ai/AutoMathText,EleutherAI/proof-pile-2,OpenCoder-LLM/opc-fineweb-math-corpus \
474
+ --training.data_probs 0.6,0.15,0.15,0.014,0.058,0.028 \
475
+ ```
476
+
477
+ ### ~Finalizing training~
478
+
479
+ > [!NOTE]
480
+ > We have done this conversion automatically in the training script since our latest updates.
481
+
482
+ Once training is complete, you may want to convert the distributed checkpoints (DCPs) into the 🤗 format for broader use.
483
+ To facilitate this, we provide a straightforward conversion script:
484
+
485
+ ```sh
486
+ python -m flame.utils.convert_dcp_to_hf --path <path_to_model> --step <step> --config <path_to_config> --tokenizer <path_to_tokenizer>
487
+ ```
488
+ After this, your model will be in the 🤗 format, ready to be shared or deployed.
489
+ You can then easily publish your model using the `huggingface_hub` for wider accessibility.
490
+
491
+ ### Continual training
492
+
493
+ If you wish to build upon a strong pre-trained model (in 🤗 format) and continue training, we also offer a script to convert the 🤗 format model back into DCP format.
494
+ This allows you to seamlessly resume training with `flame`.
495
+ ```sh
496
+ python -m flame.utils.convert_hf_to_dcp --model <path_to_hf> --checkpoint <path_to_dcp/checkpoint/step-0>
497
+ ```
498
+ Here, `<path_to_dcp>` is the directory where your distributed checkpoints will be stored.
499
+ The checkpoint is intentionally saved at `<step-0>` within the checkpoint folder to ensure it is loadable by `flame` during the initial training step, similar to how a seed checkpoint is handled.
500
+
501
+ Once the conversion is complete, you can proceed with training using `flame` as usual, continuing from where the pretrained model left off.
502
+
503
+ ## Multi-node training
504
+
505
+ If you have access to multi-node GPUs, consider leveraging them for optimal performance.
506
+ This process is straightforward and well-documented in the PyTorch [docs](https://pytorch.org/docs/stable/elastic/run.html).
507
+
508
+ To set up multi-node training:
509
+ * Set the environment variables `MASTER_ADDR=<ip>` and `MASTER_PORT=<port>` before running the training script across all nodes.
510
+ * If you're using a job scheduler like Slurm, it will handle these variables for you.
511
+
512
+ `torchtitan` provides a [Slurm script](https://github.com/pytorch/torchtitan/blob/main/multinode_trainer.slurm) for multi-node training, which you can use as a reference or starting point.
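+
+ As a minimal sketch (assuming `train.sh` forwards these variables to `torchrun`; depending on your launcher you may also need to export the node count and node rank), each node would run something like:
+
+ ```sh
+ # run the same command on every node, pointing at the rank-0 node
+ export MASTER_ADDR=10.0.0.1   # example IP of the rank-0 node
+ export MASTER_PORT=29500      # example free port
+ NGPU=8 bash train.sh \
+   --job.config_file flame/models/fla.toml \
+   --model.config configs/transformer_340M.json \
+   --model.tokenizer_path fla-hub/transformer-1.3B-100B
+ # ... plus the remaining training arguments shown above
+ ```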
torchtitan/components/__pycache__/ft.cpython-311.pyc ADDED
Binary file (7.05 kB).
 
torchtitan/components/optimizer.py ADDED
@@ -0,0 +1,303 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import functools
8
+ from typing import Any, Generic, Iterator, TypeVar
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ from torch.distributed.checkpoint.state_dict import (
13
+ get_optimizer_state_dict,
14
+ set_optimizer_state_dict,
15
+ StateDictOptions,
16
+ )
17
+ from torch.distributed.checkpoint.stateful import Stateful
18
+ from torch.optim import Optimizer
19
+
20
+ from torchtitan.components.ft import FTManager, has_torchft
21
+ from torchtitan.config_manager import JobConfig
22
+
23
+ __all__ = [
24
+ "OptimizersContainer",
25
+ "build_optimizers",
26
+ ]
27
+
28
+
29
+ if has_torchft:
30
+ import torchft as ft
31
+
32
+
33
+ T = TypeVar("T", bound=Optimizer)
34
+
35
+
36
+ class OptimizersContainer(Optimizer, Stateful, Generic[T]):
37
+ """A container for multiple optimizers.
38
+
39
+ This class is used to wrap multiple optimizers into a single object that can be
40
+ used to reduce the complexity of the training loop. This mimics the behavior of
41
+ ``torch.optim.Optimizer``. This class currently only supports ``Adam`` and ``AdamW``.
42
+
43
+ **Note**
44
+ Users who want to customize the optimizer behavior can inherit from this class and
45
+ extend the functionality as needed. The following methods must follow the same signature
46
+ as ``torch.optim.Optimizer`` class: ``step()``, ``zero_grad()``, ``state_dict()``,
47
+ ``load_state_dict()``.
48
+
49
+ **Limitations**
50
+ This class assumes that all the optimizers are the same type and have the same
51
+ configurations. With this assumption, TorchTitan can support lr scheduler resharding
52
+ (e.g., loading a checkpoint with a different number of GPUs and/or different
53
+ parallelization strategy). Note that ``get_optimizer_state_dict`` already enables the
54
+ resharding for the optimizer state but not for the lr scheduler state, hence the limitation.
55
+
56
+ Args:
57
+ model_parts (List[nn.Module]): List of model parts to be optimized.
58
+ optimizer_kwargs (Dict[str, Any]): Keyword arguments for the optimizers.
59
+ name (str): Name of the optimizers.
60
+ """
61
+
62
+ optimizers: list[T]
63
+ model_parts: list[nn.Module]
64
+
65
+ def __init__(
66
+ self,
67
+ model_parts: list[nn.Module],
68
+ optimizer_cls: type[T],
69
+ optimizer_kwargs: dict[str, Any],
70
+ ) -> None:
71
+ all_params = []
72
+ self.optimizers = []
73
+ self.model_parts = model_parts
74
+ for model in self.model_parts:
75
+ params = [p for p in model.parameters() if p.requires_grad]
76
+ self.optimizers.append(optimizer_cls(params, **optimizer_kwargs))
77
+ all_params.extend(params)
78
+ self._validate_length(len(self.model_parts))
79
+ self._post_init(all_params, optimizer_kwargs)
80
+
81
+ def __iter__(self) -> Iterator[T]:
82
+ return iter(self.optimizers)
83
+
84
+ def __len__(self) -> int:
85
+ return len(self.optimizers)
86
+
87
+ def step(self, *args, **kwargs) -> None:
88
+ for optimizer in self.optimizers:
89
+ optimizer.step(*args, **kwargs)
90
+
91
+ def zero_grad(self, *args, **kwargs) -> None:
92
+ for optimizer in self.optimizers:
93
+ optimizer.zero_grad(*args, **kwargs)
94
+
95
+ def state_dict(self) -> dict[str, Any]:
96
+ func = functools.partial(
97
+ get_optimizer_state_dict,
98
+ options=StateDictOptions(flatten_optimizer_state_dict=True),
99
+ )
100
+ return {
101
+ k: v
102
+ for sd in map(func, self.model_parts, self.optimizers)
103
+ for k, v in sd.items()
104
+ }
105
+
106
+ def load_state_dict(self, state_dict: dict[str, Any]) -> None:
107
+ func = functools.partial(
108
+ set_optimizer_state_dict,
109
+ optim_state_dict=state_dict,
110
+ options=StateDictOptions(flatten_optimizer_state_dict=True),
111
+ )
112
+ list(map(func, self.model_parts, self.optimizers))
113
+
114
+ def _validate_length(self, expected_length: int) -> None:
115
+ assert expected_length == len(self.optimizers), (
116
+ "Must pass one optimizer per model part or per param if "
117
+ "using OptimizersInBackwardContainer."
118
+ )
119
+
120
+ def _post_init(
121
+ self, all_params: list[nn.Parameter], optimizer_kwargs: dict[str, Any]
122
+ ) -> None:
123
+ # We need to call Optimizer.__init__() to initialize some necessary optimizer
124
+ # functionality such as hooks.
125
+ Optimizer.__init__(self, all_params, optimizer_kwargs)
126
+
127
+
128
+ class OptimizersInBackwardContainer(OptimizersContainer):
129
+ """OptimizersContainer for executing ``optim.step()`` in backward pass.
130
+
131
+ This class extend ``OptimizersContainer`` to support optimizer step in
132
+ backward pass. ``step()`` and ``zero_grad()`` are no-op in this class.
133
+ Instead, ``register_post_accumulate_grad_hook`` is used to register a hook to
134
+ execute these methods when the gradient is accumulated.
135
+ """
136
+
137
+ def __init__(
138
+ self,
139
+ model_parts: list[nn.Module],
140
+ optimizer_cls: type[T],
141
+ optimizer_kwargs: dict[str, Any],
142
+ ) -> None:
143
+ all_params = []
144
+ self.model_parts = model_parts
145
+
146
+ optim_dict = {}
147
+ for model in self.model_parts:
148
+ for p in model.parameters():
149
+ if p.requires_grad:
150
+ optim_dict[p] = optimizer_cls([p], **optimizer_kwargs)
151
+ all_params.append(p)
152
+
153
+ def optim_hook(param) -> None:
154
+ optim_dict[param].step()
155
+ optim_dict[param].zero_grad()
156
+
157
+ for model in self.model_parts:
158
+ for param in model.parameters():
159
+ if param.requires_grad:
160
+ param.register_post_accumulate_grad_hook(optim_hook)
161
+
162
+ self.optimizers = list(optim_dict.values())
163
+
164
+ self._validate_length(
165
+ sum(len(list(model.parameters())) for model in self.model_parts)
166
+ )
167
+ self._post_init(all_params, optimizer_kwargs)
168
+
169
+ def step(self) -> None:
170
+ pass
171
+
172
+ def zero_grad(self) -> None:
173
+ pass
174
+
175
+
176
+ class FTOptimizersContainer(OptimizersContainer):
177
+ def __init__(
178
+ self,
179
+ model_parts: list[nn.Module],
180
+ optimizer_cls: type[T],
181
+ optimizer_kwargs: dict[str, Any],
182
+ ft_manager: "ft.Manager",
183
+ ) -> None:
184
+ super().__init__(model_parts, optimizer_cls, optimizer_kwargs)
185
+
186
+ # Force to initialize the optimizer state so that `optim.step()`
187
+ # won't be called by state_dict() and load_state_dict().
188
+ _ = {
189
+ k: v
190
+ for sd in map(get_optimizer_state_dict, model_parts, self.optimizers)
191
+ for k, v in sd.items()
192
+ }
193
+ self.cache_state_dict: dict[str, Any] = {}
194
+ self._ft_optimizer = ft.Optimizer(ft_manager, self)
195
+ self._call_from_ft: bool = False
196
+
197
+ def init_cache_state_dict(self) -> None:
198
+ self.cache_state_dict = super().state_dict()
199
+
200
+ def state_dict(self) -> dict[str, Any]:
201
+ return self.cache_state_dict
202
+
203
+ def load_state_dict(self, state_dict: dict[str, Any]) -> None:
204
+ # We have to invalidate the `cache_state_dict` because optimizer uses
205
+ # assign instead of copy when doing `load_state_dict()`. Without
206
+ # invalidating the `cache_state_dict`, there will be memory leakage.
207
+ self.cache_state_dict = {}
208
+ super().load_state_dict(state_dict)
209
+ self.init_cache_state_dict()
210
+
211
+ def step(self, *args, **kwargs) -> None:
212
+ """Calling the correct step() depending on the caller.
213
+
214
+ TorchFT's OptimizerWrapper.step() is designed to be callled only once
215
+ per train step per ft.Manager regardless how many optimizers are used.
216
+ Hence we will need to appropriately dispatch the call.
217
+ """
218
+ if self._call_from_ft:
219
+ super().step(*args, **kwargs)
220
+ else:
221
+ self._call_from_ft = True
222
+ self._ft_optimizer.step(*args, **kwargs)
223
+ self._call_from_ft = False
224
+
225
+ def zero_grad(self, *args, **kwargs) -> None:
226
+ """Calling the correct zero_grad() depending on the caller.
227
+
228
+ Check the comment in ``step()``.
229
+ """
230
+ if self._call_from_ft:
231
+ super().zero_grad(*args, **kwargs)
232
+ else:
233
+ self._call_from_ft = True
234
+ self._ft_optimizer.zero_grad(*args, **kwargs)
235
+ self._call_from_ft = False
236
+
237
+
238
+ def build_optimizers(
239
+ model_parts: list[nn.Module],
240
+ job_config: JobConfig,
241
+ ft_manager: FTManager,
242
+ ) -> OptimizersContainer:
243
+ """Create a OptimizersContainer for the given model parts and job config.
244
+
245
+ This function creates a ``OptimizersContainer`` for the given model parts.
246
+ ``job_config`` should define the correct optimizer name and parameters.
247
+ This function currently supports creating ``OptimizersContainer`` and
248
+ ``OptimizersInBackwardContainer``.
249
+
250
+ **Note**
251
+ Users who want to customize the optimizer behavior can create their own
252
+ ``OptimizersContainer`` subclass and ``build_optimizers``. Passing the
253
+ customized ``build_optimizers`` to ``TrainSpec`` will create the customized
254
+ ``OptimizersContainer``.
255
+
256
+ Args:
257
+ model_parts (List[nn.Module]): List of model parts to be optimized.
258
+ job_config (JobConfig): Job config containing the optimizer name and parameters.
259
+ """
260
+ optim_in_bwd = job_config.optimizer.early_step_in_backward
261
+ if optim_in_bwd and job_config.parallelism.pipeline_parallel_degree > 1:
262
+ raise NotImplementedError(
263
+ "Optimizers in backward is not supported with pipeline parallelism."
264
+ )
265
+ name = job_config.optimizer.name
266
+ lr = job_config.optimizer.lr
267
+ eps = job_config.optimizer.eps
268
+
269
+ optim_implementation = job_config.optimizer.implementation
270
+ assert optim_implementation in ["fused", "foreach", "for-loop"]
271
+
272
+ fused = optim_implementation == "fused"
273
+ foreach = optim_implementation == "foreach"
274
+
275
+ optimizer_kwargs = {
276
+ "lr": lr,
277
+ "eps": eps,
278
+ "betas": (0.9, 0.95),
279
+ "weight_decay": 0.1,
280
+ "fused": fused,
281
+ "foreach": foreach,
282
+ }
283
+
284
+ optimizer_classes = {
285
+ "Adam": torch.optim.Adam,
286
+ "AdamW": torch.optim.AdamW,
287
+ }
288
+ if name not in optimizer_classes:
289
+ raise NotImplementedError(f"Optimizer {name} not added.")
290
+ optimizer_cls = optimizer_classes[name]
291
+
292
+ if optim_in_bwd and ft_manager.enabled:
293
+ raise ValueError("TorchFT is not supported with optimizers in backward.")
294
+ elif optim_in_bwd:
295
+ return OptimizersInBackwardContainer(
296
+ model_parts, optimizer_cls, optimizer_kwargs
297
+ )
298
+ elif ft_manager.enabled:
299
+ return FTOptimizersContainer(
300
+ model_parts, optimizer_cls, optimizer_kwargs, ft_manager.manager
301
+ )
302
+ else:
303
+ return OptimizersContainer(model_parts, optimizer_cls, optimizer_kwargs)
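
A minimal usage sketch of `OptimizersContainer`, assuming the class above is in scope; the toy model and hyperparameters are illustrative only:

```python
import torch
import torch.nn as nn

# Illustrative toy model standing in for a real model part.
model = nn.Linear(16, 16)

# Wrap it in the container defined above; it steps/zeros all wrapped optimizers.
optimizers = OptimizersContainer(
    model_parts=[model],
    optimizer_cls=torch.optim.AdamW,
    optimizer_kwargs={"lr": 3e-4, "eps": 1e-15, "betas": (0.9, 0.95),
                      "weight_decay": 0.1, "fused": False, "foreach": False},
)

loss = model(torch.randn(4, 16)).sum()
loss.backward()
optimizers.step()                  # steps every wrapped optimizer
optimizers.zero_grad()             # zeros gradients on every wrapped optimizer
state = optimizers.state_dict()    # flattened, resharding-friendly state dict
optimizers.load_state_dict(state)  # restores the same state
```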
torchtitan/distributed/__pycache__/pipeline.cpython-311.pyc ADDED
Binary file (8.48 kB).
 
torchtitan/distributed/pipeline.py ADDED
@@ -0,0 +1,201 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ import math
7
+ import os
8
+ from typing import Callable, Optional
9
+
10
+ from torch.distributed.pipelining.schedules import (
11
+ _PipelineSchedule,
12
+ _PipelineScheduleRuntime,
13
+ get_schedule_class,
14
+ PipelineScheduleMulti,
15
+ PipelineScheduleSingle,
16
+ )
17
+ from torch.distributed.pipelining.stage import PipelineStage
18
+
19
+ from torchtitan.config_manager import JobConfig
20
+ from torchtitan.tools.logging import logger
21
+
22
+
23
+ __all__ = ["build_pipeline_schedule", "generate_split_points", "stage_ids_this_rank"]
24
+
25
+
26
+ # TODO: It's unclear if this API is general enough to be used by other models.
27
+ # If not, we should move it to a Transformer-specific directory.
28
+ def generate_split_points(
29
+ schedule_str: str,
30
+ layers_per_stage: Optional[int],
31
+ pp_dim: int,
32
+ num_layers: int,
33
+ input_weight: int = 1,
34
+ output_weight: int = 1,
35
+ ) -> list[str]:
36
+ """
37
+ Generate a list of split points based on the number of layers and
38
+ pipeline parallel dimension, ensuring the first and last stages have the fewest layers.
39
+
40
+ Args:
41
+ schedule_str (str): The string of the schedule name.
42
+ layers_per_stage (int): The number of layers per stage.
43
+ pp_dim (int): The pipeline parallel dimension.
44
+ num_layers (int): The number of layers in the model.
45
+ input_weight (int), output_weight (int): The number of layers' worth of weight assigned to the input/output modules in the layer calculation.
46
+
47
+ Returns:
48
+ list[str]: A list of split point FQNs.
49
+ """
50
+
51
+ schedule_class = get_schedule_class(schedule_str)
52
+ is_single_stage_schedule = issubclass(schedule_class, PipelineScheduleSingle)
53
+ num_stages_per_rank = 1 if is_single_stage_schedule else 2
54
+
55
+ if layers_per_stage is not None:
56
+ total_stages = math.ceil(num_layers / layers_per_stage)
57
+ if total_stages % pp_dim != 0:
58
+ raise ValueError(
59
+ f"Number of stages ({total_stages}) must be divisible by the pipeline parallel dimension ({pp_dim})."
60
+ f"Each rank should have the same number of stages. "
61
+ )
62
+ num_stages_per_rank = total_stages // pp_dim
63
+
64
+ if is_single_stage_schedule and num_stages_per_rank != 1:
65
+ raise ValueError(
66
+ f"Number of stages per rank ({num_stages_per_rank}) must be 1 for single stage schedules."
67
+ )
68
+ elif not is_single_stage_schedule and num_stages_per_rank < 2:
69
+ raise ValueError(
70
+ f"Number of stages per rank ({num_stages_per_rank}) must be >= 2 for multi stage schedules."
71
+ )
72
+ else:
73
+ total_stages = pp_dim * num_stages_per_rank
74
+ if total_stages > num_layers:
75
+ raise ValueError("Total stages cannot be greater than the number of layers")
76
+
77
+ # Calculate effective number of layers including input and output weights
78
+ effective_num_layers = num_layers + input_weight + output_weight
79
+ base_layers_per_stage = effective_num_layers // total_stages
80
+
81
+ splits = [""] * (total_stages - 1)
82
+ current_layer_index = 0
83
+
84
+ # First stage
85
+ layers_on_first_stage = max(0, base_layers_per_stage - input_weight)
86
+ current_layer_index += layers_on_first_stage
87
+ splits[0] = "layers." + str(current_layer_index)
88
+
89
+ # Last stage
90
+ layers_on_last_stage = max(0, base_layers_per_stage - output_weight)
91
+ splits[-1] = "layers." + str(num_layers - layers_on_last_stage)
92
+
93
+ # Middle stages
94
+ remaining_layers = num_layers - layers_on_first_stage - layers_on_last_stage - 1
95
+ middle_stages = len(splits) - 2
96
+ layers_per_middle_stage = remaining_layers // middle_stages
97
+ # split remainder evenly across middle stages
98
+ remainder = remaining_layers % middle_stages
99
+
100
+ for i in range(1, middle_stages + 1):
101
+ current_layer_index += layers_per_middle_stage
102
+ if remainder > 0:
103
+ current_layer_index += 1
104
+ remainder -= 1
105
+ splits[i] = "layers." + str(current_layer_index)
106
+
107
+ logger.info(
108
+ f"No 'pipeline_parallel_split_points' provided so the generated splits are: {splits} "
109
+ "This may be sub-optimal as the number of layers per stage may be unbalanced."
110
+ )
111
+ return splits
112
+
113
+
114
+ def build_pipeline_schedule(
115
+ job_config: JobConfig, stages: list[PipelineStage], loss_fn: Callable
116
+ ) -> _PipelineSchedule:
117
+ """Builds a pipeline schedule for the given job configuration and stages.
118
+
119
+ Args:
120
+ job_config (JobConfig): The job configuration.
121
+ stages (list[PipelineStage]): The stages to be scheduled.
122
+ loss_fn (Callable): The loss function.
123
+
124
+ Returns:
125
+ _PipelineSchedule: The pipeline schedule for the given stages.
126
+ """
127
+ pp_schedule_csv = job_config.parallelism.pipeline_parallel_schedule_csv
128
+
129
+ # Validate that pp_schedule_csv is a valid path
130
+ if pp_schedule_csv:
131
+ if not os.path.isfile(pp_schedule_csv):
132
+ raise FileNotFoundError(
133
+ f"The specified path {pp_schedule_csv} does not exist or is not a file."
134
+ )
135
+ schedule_class = _PipelineScheduleRuntime
136
+ else:
137
+ schedule_class = get_schedule_class(
138
+ job_config.parallelism.pipeline_parallel_schedule
139
+ )
140
+
141
+ looped_schedule = issubclass(schedule_class, PipelineScheduleMulti)
142
+ microbatch_size = job_config.parallelism.pipeline_parallel_microbatch_size
143
+ batch_size = job_config.training.batch_size
144
+ # validate that the batch size is divisible by the microbatch_size otherwise we'll hang or error during training
145
+ if batch_size % microbatch_size != 0:
146
+ raise ValueError(
147
+ f"Batch size {job_config.training.batch_size} must be divisible by number of microbatches {n_microbatches}. "
148
+ "Update the config arguments for either batch_size or pipeline_parallel_microbatch_size."
149
+ )
150
+ n_microbatches = batch_size // microbatch_size
151
+ # We expect that the number of local stages (`len(stages)`) is the same across all ranks
152
+ num_total_stages = job_config.parallelism.pipeline_parallel_degree * len(stages)
153
+ if n_microbatches < num_total_stages:
154
+ logger.warning(
155
+ f"Number of microbatches ({n_microbatches}) is less than the total number "
156
+ f"of stages ({num_total_stages}) which may result in a bubble in the pipeline."
157
+ )
158
+
159
+ schedule = schedule_class(
160
+ stages if looped_schedule else stages[0],
161
+ n_microbatches=n_microbatches,
162
+ loss_fn=loss_fn,
163
+ )
164
+ logger.info(
165
+ f"Using pipeline schedule {job_config.parallelism.pipeline_parallel_schedule} "
166
+ f"with {n_microbatches} microbatches and {num_total_stages} stages."
167
+ )
168
+
169
+ if pp_schedule_csv:
170
+ assert schedule_class in [
171
+ PipelineScheduleSingle,
172
+ PipelineScheduleMulti,
173
+ _PipelineScheduleRuntime,
174
+ ], (
175
+ "Only PipelineScheduleSingle (single stage), PipelineScheduleMulti (multistage), "
176
+ "and _PipelineScheduleRuntime support csv schedules"
177
+ )
178
+ schedule._load_csv(pp_schedule_csv)
179
+
180
+ return schedule
181
+
182
+
183
+ # TODO(whc) should this be a utility inside torch.pipelining?
184
+ def stage_ids_this_rank(
185
+ pp_rank: int, pp_size: int, num_stages: int, style: str = "loop"
186
+ ) -> tuple[int]:
187
+ """Compute the stage ids for the stages that will run on this pp rank for either a looped or V style schedule"""
188
+ assert (
189
+ num_stages % pp_size == 0
190
+ ), f"num_stages {num_stages} must be evenly divisible by pp_size {pp_size}"
191
+ stages_per_rank = num_stages // pp_size
192
+ if style == "loop":
193
+ return tuple(pp_rank + s * pp_size for s in range(stages_per_rank))
194
+ elif style == "v":
195
+ assert (
196
+ stages_per_rank == 2
197
+ ), f"v schedules assume 2 stages per rank, got {stages_per_rank}"
198
+ stage_v_pairs = list(
199
+ zip(range(pp_size), range(num_stages - 1, pp_size - 1, -1))
200
+ )
201
+ return stage_v_pairs[pp_rank]
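
A minimal sketch of how `stage_ids_this_rank` lays out stages across ranks for looped vs. V-style schedules, assuming the function above is in scope:

```python
# 4 pipeline ranks, 8 total stages => 2 stages per rank.
pp_size, num_stages = 4, 8

for pp_rank in range(pp_size):
    loop_ids = stage_ids_this_rank(pp_rank, pp_size, num_stages, style="loop")
    v_ids = stage_ids_this_rank(pp_rank, pp_size, num_stages, style="v")
    print(pp_rank, loop_ids, v_ids)

# rank 0 -> loop (0, 4), v (0, 7)
# rank 1 -> loop (1, 5), v (1, 6)
# rank 2 -> loop (2, 6), v (2, 5)
# rank 3 -> loop (3, 7), v (3, 4)
```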
torchtitan/experiments/deepseek_v3/checkpoint.py ADDED
@@ -0,0 +1,154 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import json
8
+ import logging
9
+ import os
10
+ from typing import Dict, Optional, Set, Tuple
11
+
12
+ import torch
13
+ from safetensors import safe_open
14
+
15
+ from transformers.utils import cached_file
16
+
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ _DEFAULT_SAFETENSOR_FILE_NAME = "model.safetensors.index.json"
21
+
22
+
23
+ def read_weights_from_json(file_path: str) -> Optional[Dict[str, str]]:
24
+ try:
25
+ with open(file_path, "r") as file:
26
+ data = json.load(file)
27
+
28
+ if "weight_map" in data and isinstance(data["weight_map"], dict):
29
+ return data["weight_map"]
30
+ else:
31
+ logger.info("No 'weight_map' dictionary found in the JSON file.")
32
+ return None
33
+ except (json.JSONDecodeError, Exception) as e:
34
+ logger.info(f"An error occurred while reading the JSON file: {str(e)}")
35
+ return None
36
+
37
+
38
+ def get_hf_weight_map_and_path(
39
+ model_id: str,
40
+ ) -> Tuple[Dict[str, str], str]:
41
+ """Get the weight map for a given HF model id and also the cache path for loading the weights"""
42
+ try:
43
+ index_file = cached_file(model_id, _DEFAULT_SAFETENSOR_FILE_NAME)
44
+ except Exception as e:
45
+ logger.error(
46
+ f"Model `{model_id}` not found in HF cache. "
47
+ f"You can download the model using `python download.py {model_id}"
48
+ )
49
+ raise e
50
+
51
+ weight_map = read_weights_from_json(index_file)
52
+ weight_path = os.path.dirname(index_file)
53
+ logger.info(f"Loading weights from: {weight_path}")
54
+ return weight_map, weight_path
55
+
56
+
57
+ def get_needed_files(
58
+ state_dict: Dict[str, torch.Tensor], weight_map: Dict[str, str]
59
+ ) -> Set[str]:
60
+ needed_files = set()
61
+ for param in state_dict.keys():
62
+ file = weight_map.get(param)
63
+ if file:
64
+ needed_files.add(file)
65
+ elif param.endswith("weight"):
66
+ raise ValueError(
67
+ f"Parameter {param} not found in weight map, please check..."
68
+ )
69
+ logger.info(f"Needed files: {needed_files}")
70
+ return needed_files
71
+
72
+
73
+ def load_safetensor_file(
74
+ full_path: str, device: torch.device
75
+ ) -> Dict[str, torch.Tensor]:
76
+ tensors = {}
77
+ with safe_open(full_path, framework="pt", device=device) as f:
78
+ for k in f.keys():
79
+ tensors[k] = f.get_tensor(k)
80
+ logger.info(f"Loaded {len(tensors)} tensors from {full_path}")
81
+ return tensors
82
+
83
+
84
+ def load_safetensor_weights(
85
+ model: torch.nn.Module,
86
+ weight_map: Dict[str, str],
87
+ file_location: str,
88
+ device: torch.device,
89
+ ):
90
+ """
91
+ Load safetensor weights into a `nn.Module`.
92
+
93
+ Args:
94
+ model (Module): The PyTorch module to load weights into. It may be a
95
+ model chunk or a full model.
96
+ weight_map (Dict[str, str]): Mapping of model parameters to file names.
97
+ file_location (str): Directory containing the weight files.
98
+ device (torch.device): The device to load tensors onto.
99
+ """
100
+ model_state_dict = model.state_dict()
101
+ needed_files = get_needed_files(model_state_dict, weight_map)
102
+ updated_states: Set[str] = set()
103
+
104
+ for file in needed_files:
105
+ full_path = os.path.join(file_location, file)
106
+ try:
107
+ checkpoint = load_safetensor_file(full_path, "cpu")
108
+ except FileNotFoundError:
109
+ logger.error(f"File not found: {full_path}")
110
+ except Exception as e:
111
+ logger.error(f"Error during checkpoint processing of {full_path}: {str(e)}")
112
+
113
+ matched_keys = set(checkpoint.keys()) & set(model_state_dict.keys())
114
+ for key in matched_keys:
115
+ # Check shape
116
+ if model_state_dict[key].shape != checkpoint[key].shape:
117
+ raise ValueError(
118
+ f"Shape mismatch for {key}: "
119
+ f"model needs {model_state_dict[key].shape}, but "
120
+ f"checkpoint has {checkpoint[key].shape}"
121
+ )
122
+ model_state_dict[key] = checkpoint[key].to(device)
123
+
124
+ updated_states.update(matched_keys)
125
+
126
+ missing_keys = set(model_state_dict.keys()) - updated_states
127
+ if missing_keys:
128
+ raise RuntimeError(
129
+ f"Partially updated state dict. Missing parameters: {missing_keys}"
130
+ )
131
+
132
+ model.load_state_dict(model_state_dict, strict=False, assign=True)
133
+ logger.info(f"Successfully loaded {len(updated_states)} weights into model")
134
+
135
+
136
+ def load_weights_from_hf(
137
+ model: torch.nn.Module,
138
+ distribution: str,
139
+ device: torch.device,
140
+ ):
141
+ """
142
+ Load the weights from Hugging Face format (index file + multiple safetensor
143
+ files), and fill them into `model`. The index file maps each parameter
144
+ FQN to the safetensor shard that contains it.
145
+ """
146
+
147
+ weight_map, weight_path = get_hf_weight_map_and_path(distribution)
148
+
149
+ load_safetensor_weights(
150
+ model,
151
+ weight_map,
152
+ weight_path,
153
+ device,
154
+ )
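A minimal usage sketch for the helpers above; the model builder is a placeholder, and this assumes the safetensors index is already in the HF cache (e.g. after running download.py):

```python
# Hypothetical usage sketch -- `build_model_chunk` is a placeholder for any code
# that builds an nn.Module whose state_dict keys match the FQNs in the HF index.
import torch
from checkpoint import load_weights_from_hf

device = torch.device("cuda", 0)
model = build_model_chunk()  # placeholder, e.g. one pipeline stage of the model
load_weights_from_hf(model, "deepseek-ai/DeepSeek-V2-Lite", device)
```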
torchtitan/experiments/deepseek_v3/download.py ADDED
@@ -0,0 +1,70 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Usage:
8
+ # Downloads a given model to the HF cache. Pass in a listed option such as "v3", or your own custom model path.
9
+ # python download.py {model_id} [custom_model_path]
10
+ # Examples:
11
+ # python download.py v2 # Use predefined model: deepseek-ai/DeepSeek-V2
12
+ # python download.py custom "deepseek-ai/new-model" # Download a custom model path
13
+
14
+ # Available models:
15
+ # "v2-lite-chat": "deepseek-ai/DeepSeek-V2-Lite-Chat",
16
+ # "v2-lite": "deepseek-ai/DeepSeek-V2-Lite",
17
+ # "v2": "deepseek-ai/DeepSeek-V2",
18
+ # "v3": "deepseek-ai/deepseek-v3",
19
+ # "v3-0324": "deepseek-ai/DeepSeek-V3-0324",
20
+ # "custom": None, # Placeholder for custom models
21
+
22
+
23
+ import sys
24
+
25
+ from transformers import AutoModelForCausalLM
26
+
27
+
28
+ MODELS = {
29
+ "v2-lite-chat": "deepseek-ai/DeepSeek-V2-Lite-Chat",
30
+ "v2-lite": "deepseek-ai/DeepSeek-V2-Lite",
31
+ "v2": "deepseek-ai/DeepSeek-V2",
32
+ "v3": "deepseek-ai/deepseek-v3",
33
+ "v3-0324": "deepseek-ai/DeepSeek-V3-0324",
34
+ "custom": None, # For custom (any) models
35
+ }
36
+
37
+
38
+ def print_usage():
39
+ print("Usage:")
40
+ print(" python download.py [model_version]")
41
+ print(" python download.py custom [custom_model_path]")
42
+ print("\nAvailable predefined models:")
43
+ for key, model in MODELS.items():
44
+ if key != "custom": # Skip the custom placeholder
45
+ print(f" {key}: {model}")
46
+ print("\nFor custom models:")
47
+ print(" custom: Specify your own model path")
48
+ print(' Example: python download.py custom "organization/model-name"')
49
+ sys.exit(1)
50
+
51
+
52
+ # Process command line arguments
53
+ if len(sys.argv) < 2 or sys.argv[1] not in MODELS:
54
+ print_usage()
55
+
56
+ if sys.argv[1] == "custom":
57
+ if len(sys.argv) != 3:
58
+ print("Error: Custom model requires a model path")
59
+ print_usage()
60
+ model_id = sys.argv[2]
61
+ print(f"Using custom model: {model_id}")
62
+ else:
63
+ model_id = MODELS[sys.argv[1]]
64
+ print(f"Downloading model: {model_id}")
65
+
66
+ model = AutoModelForCausalLM.from_pretrained(
67
+ model_id,
68
+ device_map="auto",
69
+ trust_remote_code=True,
70
+ )
torchtitan/experiments/deepseek_v3/generate.py ADDED
@@ -0,0 +1,308 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # torchrun --standalone --nproc-per-node 4 generate.py
8
+
9
+ # use inference.sh "Your Question Here?" to run inference with a single prompt.
10
+
11
+ import sys
12
+ from dataclasses import dataclass
13
+
14
+ import torch
15
+ import torch.distributed as dist
16
+
17
+ from checkpoint import load_weights_from_hf
18
+ from model import DeepseekForCausalLM
19
+ from model_config import deepseek_config_registry
20
+ from torch.distributed.device_mesh import DeviceMesh
21
+ from torch.distributed.pipelining import PipelineStage, ScheduleGPipe
22
+ from torchtitan.tools.utils import Color
23
+ from transformers import AutoTokenizer
24
+
25
+ # Choose the model you want to run (uncomment exactly one line below).
26
+ model_id, mesh_shape = "deepseek-ai/DeepSeek-V2-Lite-Chat", (1, 4)
27
+ # model_id, mesh_shape = "deepseek-ai/deepseek-v3", (8, 4)
28
+
29
+
30
+ def colorize_chat(text, user_color=None, assistant_color=None, output_color=None):
31
+ """Parse and colorize chat output with optional colors for each role."""
32
+ lines = text.split("\n")
33
+ result = []
34
+
35
+ current_role = None
36
+ current_content = []
37
+
38
+ def _process_current_content():
39
+ if not current_role or not current_content:
40
+ return None
41
+
42
+ content = "\n".join(current_content)
43
+ if current_role == "output":
44
+ return (
45
+ f"Output: {output_color}{content}{color.reset}"
46
+ if output_color
47
+ else f"Output: {content}"
48
+ )
49
+ else:
50
+ try:
51
+ prefix, rest = current_content[0].split(":", 1)
52
+ role_color = user_color if current_role == "user" else assistant_color
53
+ if role_color:
54
+ formatted = f"{prefix}:{role_color}{rest}{color.reset}"
55
+ if len(current_content) > 1:
56
+ formatted += (
57
+ f"{role_color}\n"
58
+ + "\n".join(current_content[1:])
59
+ + f"{color.reset}"
60
+ )
61
+ return formatted
62
+ except ValueError:
63
+ pass
64
+ return content
65
+
66
+ for line in lines:
67
+ if line.startswith("Output:"):
68
+ if processed := _process_current_content():
69
+ result.append(processed)
70
+ current_role = "output"
71
+ content = line[len("Output:") :].strip()
72
+ if output_color:
73
+ content = f"Output: {output_color}{content}{color.reset}"
74
+ else:
75
+ content = f"Output: {content}"
76
+ result.append(content)
77
+ current_content = []
78
+
79
+ elif line.startswith("User:"):
80
+ if processed := _process_current_content():
81
+ result.append(processed)
82
+ current_role = "user"
83
+ current_content = [line]
84
+
85
+ elif line.startswith("Assistant:"):
86
+ if processed := _process_current_content():
87
+ result.append(processed)
88
+ current_role = "assistant"
89
+ current_content = [line]
90
+
91
+ else:
92
+ if current_content:
93
+ current_content.append(line)
94
+ elif line.strip() and current_role is None:
95
+ # Handle system message at the beginning
96
+ current_role = "output"
97
+ if output_color:
98
+ result.append(f"Output: {output_color}{line.strip()}{color.reset}")
99
+ else:
100
+ result.append(f"Output: {line.strip()}")
101
+
102
+ # Process the last segment
103
+ if processed := _process_current_content():
104
+ result.append(processed)
105
+
106
+ return "\n".join(result)
107
+
108
+
109
+ color = Color()
110
+
111
+
112
+ @dataclass
113
+ class DistConfig:
114
+ mesh: DeviceMesh
115
+ pp_mesh: DeviceMesh
116
+ ep_mesh: DeviceMesh
117
+ pp_size: int
118
+ ep_size: int
119
+ ep_rank: int
120
+ pp_rank: int
121
+ device: torch.device
122
+
123
+
124
+ def create_model(dist_config: DistConfig):
125
+ model_args = deepseek_config_registry[model_id]
126
+ model_args.ep_size = dist_config.ep_size
127
+ model_args.num_stages = dist_config.pp_size
128
+ model_args.stage_idx = dist_config.pp_rank
129
+ model_args.max_seq_len = 16384
130
+
131
+ with dist_config.device, dist_config.mesh:
132
+ model = DeepseekForCausalLM(model_args)
133
+ load_weights_from_hf(model, model_id, dist_config.device)
134
+ model.eval()
135
+ model.setup_symm_mem(torch.bfloat16, dist_config.device)
136
+
137
+ stage = PipelineStage(
138
+ model,
139
+ dist_config.pp_rank,
140
+ dist_config.pp_size,
141
+ dist_config.device,
142
+ group=dist_config.pp_mesh.get_group(),
143
+ )
144
+ pp_schedule = ScheduleGPipe(stage, dist_config.pp_size)
145
+ return model, pp_schedule
146
+
147
+
148
+ def create_dist_config(mesh: DeviceMesh):
149
+ rank = dist.get_rank()
150
+ device_count = torch.cuda.device_count()
151
+ device = torch.device("cuda", rank % device_count)
152
+
153
+ dist_config = DistConfig(
154
+ mesh=mesh,
155
+ pp_mesh=mesh["pp"],
156
+ ep_mesh=mesh["ep"],
157
+ pp_rank=mesh["pp"].get_local_rank(),
158
+ pp_size=mesh["pp"].size(),
159
+ ep_size=mesh["ep"].size(),
160
+ ep_rank=mesh["ep"].get_local_rank(),
161
+ device=device,
162
+ )
163
+ return dist_config
164
+
165
+
166
+ def decode(tokenizer, x):
167
+ output = tokenizer.decode(x[0])
168
+ # Clean up the output by removing special tokens
169
+ bos = tokenizer.bos_token
170
+ output = output.replace(bos, "")
171
+ # Truncate at end of sentence token
172
+ eos_token = tokenizer.eos_token
173
+ if eos_token and eos_token in output:
174
+ output = output.split(eos_token)[0]
175
+ colored_output = colorize_chat(
176
+ output,
177
+ user_color=color.green,
178
+ assistant_color=color.cyan,
179
+ output_color=color.blue,
180
+ )
181
+ return colored_output
182
+
183
+
184
+ @torch.inference_mode()
185
+ def generate(
186
+ model,
187
+ pp_schedule,
188
+ tokenizer,
189
+ dist_config,
190
+ messages: list[dict],
191
+ n_tokens: int = 50,
192
+ ):
193
+ rank = dist.get_rank()
194
+ device = dist_config.device
195
+ x = tokenizer.apply_chat_template(
196
+ [messages] * dist_config.pp_size,
197
+ add_generation_prompt=True,
198
+ return_tensors="pt",
199
+ )
200
+ next_idx = x.shape[-1]
201
+ x = torch.cat([x, torch.zeros(x.shape[0], n_tokens, dtype=torch.int64)], dim=-1)
202
+ x = x.to(device)
203
+
204
+ for _ in range(n_tokens):
205
+ if dist_config.pp_size > 1:
206
+ if dist_config.pp_rank == 0:
207
+ pp_schedule.step(x)
208
+ torch.distributed.broadcast(
209
+ x,
210
+ group=dist_config.pp_mesh.get_group(),
211
+ group_src=dist_config.pp_size - 1,
212
+ )
213
+ elif dist_config.pp_rank == dist_config.pp_size - 1:
214
+ preds = pp_schedule.step()
215
+ next_token = torch.argmax(preds[:, next_idx - 1], dim=-1)
216
+ x[:, next_idx] = next_token
217
+ torch.distributed.broadcast(
218
+ x,
219
+ group=dist_config.pp_mesh.get_group(),
220
+ group_src=dist_config.pp_size - 1,
221
+ )
222
+ else:
223
+ pp_schedule.step()
224
+ torch.distributed.broadcast(
225
+ x,
226
+ group=dist_config.pp_mesh.get_group(),
227
+ group_src=dist_config.pp_size - 1,
228
+ )
229
+
230
+ next_idx += 1
231
+ else:
232
+ preds = model(x)
233
+ next_token = torch.argmax(preds[:, next_idx - 1], dim=-1)
234
+ x[:, next_idx] = next_token
235
+ next_idx += 1
236
+
237
+ if rank == 0:
238
+ colored_output = decode(tokenizer, x)
239
+ print(f"Without CUDA Graph:\n{colored_output}")
240
+
241
+
242
+ @torch.inference_mode()
243
+ def generate_with_cuda_graph(
244
+ model,
245
+ tokenizer,
246
+ dist_config,
247
+ messages: list[dict],
248
+ n_tokens: int = 10,
249
+ ):
250
+ rank = dist.get_rank()
251
+ device = dist_config.device
252
+ x = tokenizer.apply_chat_template(
253
+ [messages] * dist_config.pp_size,
254
+ add_generation_prompt=True,
255
+ return_tensors="pt",
256
+ )
257
+ next_idx = x.shape[-1]
258
+ x = torch.cat([x, torch.zeros(x.shape[0], n_tokens, dtype=torch.int64)], dim=-1)
259
+ x = x.to(device)
260
+
261
+ torch.cuda.synchronize()
262
+
263
+ # Create CUDA graph
264
+ g = torch.cuda.CUDAGraph()
265
+ with torch.cuda.graph(g):
266
+ preds = model(x)
267
+
268
+ # Run CUDA graph
269
+ for _ in range(n_tokens):
270
+ g.replay()
271
+ next_token = torch.argmax(preds[:, next_idx - 1], dim=-1)
272
+ x[:, next_idx] = next_token
273
+ next_idx += 1
274
+
275
+ if rank == 0:
276
+ colored_output = decode(tokenizer, x)
277
+ print(f"With CUDA Graph:\n{colored_output}")
278
+
279
+
280
+ if __name__ == "__main__":
281
+ # Get user prompt from command line arguments
282
+ user_prompt = "What is 2+2?" # Default prompt
283
+ if len(sys.argv) > 1:
284
+ user_prompt = sys.argv[1]
285
+
286
+ mesh = dist.init_device_mesh("cuda", mesh_shape, mesh_dim_names=("pp", "ep"))
287
+ rank = dist.get_rank()
288
+ if rank == 0:
289
+ print(
290
+ f"{color.yellow}Running inference with {model_id} on {mesh_shape} mesh{color.reset}"
291
+ )
292
+
293
+ dist_config = create_dist_config(mesh)
294
+ model, pp_schedule = create_model(dist_config)
295
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
296
+
297
+ messages = [
298
+ {"role": "system", "content": "You are a helpful assistant."},
299
+ {"role": "user", "content": user_prompt},
300
+ ]
301
+
302
+ generate(model, pp_schedule, tokenizer, dist_config, messages)
303
+ generate_with_cuda_graph(model, tokenizer, dist_config, messages)
304
+
305
+ if rank == 0:
306
+ print(f"\n{color.yellow}Closing inference mesh...{color.reset}")
307
+
308
+ dist.destroy_process_group()
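One detail worth calling out from `generate` above: every rank in the pipeline group issues the same broadcast each step, because a collective must be entered by all members of the group, not only the sender. A stripped-down sketch of that per-step pattern (assumes the process group is initialized and `x` is allocated on every rank):

```python
# Sketch of the per-step token broadcast used in `generate` above.
import torch.distributed as dist

def sync_tokens_from_last_stage(x, pp_mesh):
    # The last pipeline stage holds the freshly sampled token; every other rank
    # receives it so the next step's schedule sees a consistent input buffer.
    dist.broadcast(
        x,
        group=pp_mesh.get_group(),
        group_src=pp_mesh.size() - 1,
    )
```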
torchtitan/experiments/deepseek_v3/inference.sh ADDED
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/bash
2
+
3
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
4
+ # All rights reserved.
5
+
6
+ # This source code is licensed under the BSD-style license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+
9
+ NGPU=${NGPU:-"4"}
10
+
11
+ # Get the prompt from command line argument or use a default
12
+ prompt="${1:-What is 2+2?}"
13
+
14
+ # Run the model with the prompt
15
+ torchrun --standalone --nproc-per-node ${NGPU} generate.py "$prompt"
torchtitan/experiments/deepseek_v3/model.py ADDED
@@ -0,0 +1,1325 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # This code is based on model definition of `deepseek-ai/DeepSeek-V3-Base` on
8
+ # Hugging Face Model Hub. Url:
9
+ # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/blob/main/modeling_deepseek.py
10
+ # https://huggingface.co/deepseek-ai/DeepSeek-V3-Base/resolve/main/configuration_deepseek.py
11
+ #
12
+ # It has been modified from its original form to accommodate the naming conventions
13
+ # and usage patterns of the TorchTitan project.
14
+
15
+ # Copyright 2023 DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
16
+ #
17
+ # Licensed under the Apache License, Version 2.0 (the "License");
18
+ # you may not use this file except in compliance with the License.
19
+ # You may obtain a copy of the License at
20
+ #
21
+ # http://www.apache.org/licenses/LICENSE-2.0
22
+ #
23
+ # Unless required by applicable law or agreed to in writing, software
24
+ # distributed under the License is distributed on an "AS IS" BASIS,
25
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26
+ # See the License for the specific language governing permissions and
27
+ # limitations under the License.
28
+ """ PyTorch DeepSeek model."""
29
+ import math
30
+ from typing import Optional, Tuple
31
+
32
+ import torch
33
+ import torch.distributed as dist
34
+
35
+ import torch.distributed._symmetric_memory as symm_mem
36
+ import torch.nn.functional as F
37
+ import torch.utils.checkpoint
38
+
39
+ from attn_mask_utils import _prepare_4d_causal_attention_mask
40
+ from indices import generate_permute_indices
41
+ from model_config import ModelArgs
42
+ from symm_mem_recipes import OnDeviceAllToAllV
43
+ from torch import nn
44
+ from torch.distributed._functional_collectives import all_to_all_single_autograd
45
+
46
+ from torchtitan.experiments.kernels.triton_mg_group_gemm.torchao_pr import (
47
+ ALIGN_SIZE_M,
48
+ grouped_gemm_forward,
49
+ )
50
+
51
+ # Get model parallel subgroup by name:
52
+ # e.g. "pp", "ep", None
53
+ def get_group(dim_name: Optional[str] = None) -> dist.ProcessGroup:
54
+ glob = torch.distributed.device_mesh._mesh_resources.get_current_mesh()
55
+ return glob.get_group(dim_name)
56
+
57
+
58
+ class RMSNorm(nn.Module):
59
+ def __init__(self, hidden_size, eps=1e-6):
60
+ super().__init__()
61
+ self.weight = nn.Parameter(torch.ones(hidden_size))
62
+ self.variance_epsilon = eps
63
+
64
+ def forward(self, hidden_states):
65
+ input_dtype = hidden_states.dtype
66
+ hidden_states = hidden_states.to(torch.float32)
67
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
68
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
69
+ return self.weight * hidden_states.to(input_dtype)
70
+
71
+
72
+ class RotaryEmbedding(nn.Module):
73
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
74
+ super().__init__()
75
+
76
+ self.dim = dim
77
+ self.max_position_embeddings = max_position_embeddings
78
+ self.base = base
79
+ inv_freq = 1.0 / (
80
+ self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
81
+ )
82
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
83
+
84
+ # Build here to make `torch.jit.trace` work.
85
+ self._set_cos_sin_cache(
86
+ seq_len=max_position_embeddings,
87
+ device=self.inv_freq.device,
88
+ dtype=torch.get_default_dtype(),
89
+ )
90
+
91
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
92
+ self.max_seq_len_cached = seq_len
93
+ t = torch.arange(
94
+ self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
95
+ )
96
+
97
+ freqs = torch.outer(t, self.inv_freq.to(t.device))
98
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
99
+ emb = torch.cat((freqs, freqs), dim=-1)
100
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
101
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
102
+
103
+ def forward(self, x, seq_len=None):
104
+ # x: [bs, num_attention_heads, seq_len, head_size]
105
+ if self.max_seq_len_cached is None or seq_len > self.max_seq_len_cached:
106
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
107
+
108
+ return (
109
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
110
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
111
+ )
112
+
113
+
114
+ class LinearScalingRotaryEmbedding(RotaryEmbedding):
115
+ """RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
116
+
117
+ def __init__(
118
+ self,
119
+ dim,
120
+ max_position_embeddings=2048,
121
+ base=10000,
122
+ device=None,
123
+ scaling_factor=1.0,
124
+ ):
125
+ self.scaling_factor = scaling_factor
126
+ super().__init__(dim, max_position_embeddings, base, device)
127
+
128
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
129
+ self.max_seq_len_cached = seq_len
130
+ t = torch.arange(
131
+ self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
132
+ )
133
+ t = t / self.scaling_factor
134
+
135
+ freqs = torch.outer(t, self.inv_freq)
136
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
137
+ emb = torch.cat((freqs, freqs), dim=-1)
138
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
139
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
140
+
141
+
142
+ # Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Deepseek
143
+ class DynamicNTKScalingRotaryEmbedding(RotaryEmbedding):
144
+ """RotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
145
+
146
+ def __init__(
147
+ self,
148
+ dim,
149
+ max_position_embeddings=2048,
150
+ base=10000,
151
+ device=None,
152
+ scaling_factor=1.0,
153
+ ):
154
+ self.scaling_factor = scaling_factor
155
+ super().__init__(dim, max_position_embeddings, base, device)
156
+
157
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
158
+ self.max_seq_len_cached = seq_len
159
+
160
+ if seq_len > self.max_position_embeddings:
161
+ base = self.base * (
162
+ (self.scaling_factor * seq_len / self.max_position_embeddings)
163
+ - (self.scaling_factor - 1)
164
+ ) ** (self.dim / (self.dim - 2))
165
+ inv_freq = 1.0 / (
166
+ base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)
167
+ )
168
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
169
+
170
+ t = torch.arange(
171
+ self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype
172
+ )
173
+
174
+ freqs = torch.outer(t, self.inv_freq)
175
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
176
+ emb = torch.cat((freqs, freqs), dim=-1)
177
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
178
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
179
+
180
+
181
+ # Inverse dim formula to find dim based on number of rotations
182
+ def yarn_find_correction_dim(
183
+ num_rotations, dim, base=10000, max_position_embeddings=2048
184
+ ):
185
+ return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (
186
+ 2 * math.log(base)
187
+ )
188
+
189
+
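A short sketch of the algebra behind `yarn_find_correction_dim` above (symbols follow the code: base b, head dim D = `dim`, L = `max_position_embeddings`, r = `num_rotations`); `yarn_find_correction_range` below just floors/ceils this at the `beta_fast`/`beta_slow` rotation counts and clamps to [0, dim - 1]:

```latex
% Rotary pair $i$ has frequency $\theta_i = b^{-2i/D}$, so over $L$ positions it
% completes $r_i = \frac{L\,\theta_i}{2\pi} = \frac{L}{2\pi\, b^{2i/D}}$ rotations.
% Solving $r_i = r$ for $i$ gives the expression returned by the function:
i \;=\; \frac{D \,\ln\!\bigl(L / (2\pi r)\bigr)}{2 \ln b}
```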
190
+ # Find dim range bounds based on rotations
191
+ def yarn_find_correction_range(
192
+ low_rot, high_rot, dim, base=10000, max_position_embeddings=2048
193
+ ):
194
+ low = math.floor(
195
+ yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)
196
+ )
197
+ high = math.ceil(
198
+ yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)
199
+ )
200
+ return max(low, 0), min(high, dim - 1) # Clamp values just in case
201
+
202
+
203
+ def yarn_get_mscale(scale=1, mscale=1):
204
+ if scale <= 1:
205
+ return 1.0
206
+ return 0.1 * mscale * math.log(scale) + 1.0
207
+
208
+
209
+ def yarn_linear_ramp_mask(min, max, dim):
210
+ if min == max:
211
+ max += 0.001 # Prevent singularity
212
+
213
+ linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
214
+ ramp_func = torch.clamp(linear_func, 0, 1)
215
+ return ramp_func
216
+
217
+
218
+ class YarnRotaryEmbedding(RotaryEmbedding):
219
+ def __init__(
220
+ self,
221
+ dim,
222
+ max_position_embeddings=2048,
223
+ base=10000,
224
+ device=None,
225
+ scaling_factor=1.0,
226
+ original_max_position_embeddings=4096,
227
+ beta_fast=32,
228
+ beta_slow=1,
229
+ mscale=1,
230
+ mscale_all_dim=0,
231
+ ):
232
+ self.scaling_factor = scaling_factor
233
+ self.original_max_position_embeddings = original_max_position_embeddings
234
+ self.beta_fast = beta_fast
235
+ self.beta_slow = beta_slow
236
+ self.mscale = mscale
237
+ self.mscale_all_dim = mscale_all_dim
238
+ super().__init__(dim, max_position_embeddings, base, device)
239
+
240
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
241
+ self.max_seq_len_cached = seq_len
242
+ dim = self.dim
243
+
244
+ freq_extra = 1.0 / (
245
+ self.base
246
+ ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
247
+ )
248
+ freq_inter = 1.0 / (
249
+ self.scaling_factor
250
+ * self.base
251
+ ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
252
+ )
253
+
254
+ low, high = yarn_find_correction_range(
255
+ self.beta_fast,
256
+ self.beta_slow,
257
+ dim,
258
+ self.base,
259
+ self.original_max_position_embeddings,
260
+ )
261
+ inv_freq_mask = 1.0 - yarn_linear_ramp_mask(low, high, dim // 2).to(
262
+ device=device, dtype=torch.float32
263
+ )
264
+ inv_freq = freq_inter * (1 - inv_freq_mask) + freq_extra * inv_freq_mask
265
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
266
+
267
+ t = torch.arange(seq_len, device=device, dtype=torch.float32)
268
+
269
+ freqs = torch.outer(t, inv_freq)
270
+
271
+ _mscale = float(
272
+ yarn_get_mscale(self.scaling_factor, self.mscale)
273
+ / yarn_get_mscale(self.scaling_factor, self.mscale_all_dim)
274
+ )
275
+
276
+ emb = torch.cat((freqs, freqs), dim=-1)
277
+ self.register_buffer(
278
+ "cos_cached", (emb.cos() * _mscale).to(dtype), persistent=False
279
+ )
280
+ self.register_buffer(
281
+ "sin_cached", (emb.sin() * _mscale).to(dtype), persistent=False
282
+ )
283
+
284
+
285
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
286
+ def rotate_half(x):
287
+ """Rotates half the hidden dims of the input."""
288
+ x1 = x[..., : x.shape[-1] // 2]
289
+ x2 = x[..., x.shape[-1] // 2 :]
290
+ return torch.cat((-x2, x1), dim=-1)
291
+
292
+
293
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
294
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
295
+ """Applies Rotary Position Embedding to the query and key tensors.
296
+
297
+ Args:
298
+ q (`torch.Tensor`): The query tensor.
299
+ k (`torch.Tensor`): The key tensor.
300
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
301
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
302
+ position_ids (`torch.Tensor`):
303
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
304
+ used to pass offsetted position ids when working with a KV-cache.
305
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
306
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
307
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
308
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
309
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
310
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
311
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
312
+ Returns:
313
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
314
+ """
315
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
316
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
317
+
318
+ b, h, s, d = q.shape
319
+ q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
320
+
321
+ b, h, s, d = k.shape
322
+ k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
323
+
324
+ q_embed = (q * cos) + (rotate_half(q) * sin)
325
+ k_embed = (k * cos) + (rotate_half(k) * sin)
326
+ return q_embed, k_embed
327
+
328
+
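The `view(...).transpose(4, 3).reshape(...)` lines in `apply_rotary_pos_emb` de-interleave the last dimension (pairs `[x0, x1, x2, x3]` become `[x0, x2, x1, x3]`) so that the half-split `rotate_half` convention matches weights stored in the interleaved RoPE layout. A tiny sketch of just that permutation (shapes are illustrative):

```python
# Illustrative check of the de-interleave trick used in apply_rotary_pos_emb.
import torch

b, h, s, d = 1, 1, 1, 6
q = torch.arange(d).view(b, h, s, d)                                 # [0, 1, 2, 3, 4, 5]
q_deint = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
assert q_deint.flatten().tolist() == [0, 2, 4, 1, 3, 5]              # evens first, then odds
```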
329
+ class MLP(nn.Module):
330
+ act_fn = nn.SiLU()
331
+
332
+ def __init__(self, config, hidden_size=None, intermediate_size=None):
333
+ super().__init__()
334
+ self.config = config
335
+ self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
336
+ self.intermediate_size = (
337
+ config.intermediate_size if intermediate_size is None else intermediate_size
338
+ )
339
+
340
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
341
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
342
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
343
+
344
+ def forward(self, x):
345
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
346
+ return down_proj
347
+
348
+
349
+ class MoEGate(nn.Module):
350
+ def __init__(self, config):
351
+ super().__init__()
352
+ self.config = config
353
+ self.top_k = config.num_experts_per_tok
354
+ self.n_routed_experts = config.n_routed_experts
355
+ self.routed_scaling_factor = config.routed_scaling_factor
356
+ self.scoring_func = config.scoring_func
357
+ self.seq_aux = config.seq_aux
358
+ self.topk_method = config.topk_method
359
+ self.n_group = config.n_group
360
+ self.topk_group = config.topk_group
361
+
362
+ # topk selection algorithm
363
+ self.norm_topk_prob = config.norm_topk_prob
364
+ self.gating_dim = config.hidden_size
365
+ self.weight = nn.Parameter(
366
+ torch.empty((self.n_routed_experts, self.gating_dim))
367
+ )
368
+ if self.topk_method == "noaux_tc":
369
+ self.e_score_correction_bias = nn.Parameter(
370
+ # Changed from torch.empty to torch.rand to avoid non-even
371
+ # distribution for runs without actual weights
372
+ torch.rand((self.n_routed_experts))
373
+ )
374
+ self.reset_parameters()
375
+
376
+ def reset_parameters(self) -> None:
377
+ import torch.nn.init as init
378
+
379
+ init.kaiming_uniform_(self.weight, a=math.sqrt(5))
380
+
381
+ def forward(self, hidden_states):
382
+ bsz, seq_len, h = hidden_states.shape
383
+ # compute gating score
384
+ hidden_states = hidden_states.view(-1, h)
385
+ logits = F.linear(
386
+ hidden_states.type(torch.float32), self.weight.type(torch.float32), None
387
+ )
388
+ if self.scoring_func == "sigmoid":
389
+ scores = logits.sigmoid()
390
+ elif self.scoring_func == "softmax":
391
+ scores = logits.softmax(dim=-1, dtype=torch.float32)
392
+ else:
393
+ raise NotImplementedError(
394
+ f"insupportable scoring function for MoE gating: {self.scoring_func}"
395
+ )
396
+
397
+ # select top-k experts
398
+ if self.topk_method == "noaux_tc":
399
+ scores_for_choice = scores.view(
400
+ bsz * seq_len, -1
401
+ ) + self.e_score_correction_bias.unsqueeze(0)
402
+ group_scores = (
403
+ scores_for_choice.view(bsz * seq_len, self.n_group, -1)
404
+ .topk(2, dim=-1)[0]
405
+ .sum(dim=-1)
406
+ ) # [n, n_group]
407
+ group_idx = torch.topk(
408
+ group_scores, k=self.topk_group, dim=-1, sorted=False
409
+ )[
410
+ 1
411
+ ] # [n, top_k_group]
412
+ group_mask = torch.zeros_like(group_scores) # [n, n_group]
413
+ group_mask.scatter_(1, group_idx, 1) # [n, n_group]
414
+ score_mask = (
415
+ group_mask.unsqueeze(-1)
416
+ .expand(
417
+ bsz * seq_len, self.n_group, self.n_routed_experts // self.n_group
418
+ )
419
+ .reshape(bsz * seq_len, -1)
420
+ ) # [n, e]
421
+ tmp_scores = scores_for_choice.masked_fill(
422
+ ~score_mask.bool(), 0.0
423
+ ) # [n, e]
424
+ _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
425
+ topk_weight = scores.gather(1, topk_idx)
426
+ elif self.topk_method == "greedy":
427
+ topk_weight, topk_idx = torch.topk(
428
+ scores, k=self.top_k, dim=-1, sorted=False
429
+ )
430
+ else:
431
+ raise NotImplementedError(
432
+ f"insupportable TopK function for MoE gating: {self.topk_method}"
433
+ )
434
+
435
+ # norm gate to sum 1
436
+ if self.top_k > 1 and self.norm_topk_prob:
437
+ denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
438
+ topk_weight = topk_weight / denominator
439
+ topk_weight = (
440
+ topk_weight * self.routed_scaling_factor
441
+ ) # must multiply the scaling factor
442
+
443
+ return topk_idx, topk_weight
444
+
445
+
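Whichever scoring/top-k branch the gate takes, its contract is the same: it flattens the batch and returns per-token expert ids and routing weights. A shape-only sketch of that contract (configuration numbers are made up for illustration):

```python
# Shape-only sketch of the MoEGate contract (illustrative numbers, not a real config).
import torch

bsz, seq_len, n_experts, top_k = 2, 4, 16, 2
scores = torch.rand(bsz * seq_len, n_experts)         # stand-in for the gating scores
topk_weight, topk_idx = torch.topk(scores, k=top_k, dim=-1, sorted=False)

# The gate returns per-(flattened)-token expert ids and routing weights:
assert topk_idx.shape == (bsz * seq_len, top_k)       # int64 expert ids
assert topk_weight.shape == (bsz * seq_len, top_k)    # float routing weights
```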
446
+ class MoE(nn.Module):
447
+ """
448
+ A mixed expert module containing shared experts.
449
+ """
450
+
451
+ # Class attributes:
452
+ # Two shuffle methods are supported:
453
+ # 1. "torch_all_to_all"
454
+ # 2. "symm_mem" (see `setup_symm_mem` below)
455
+ shuffle_method = "torch_all_to_all"
456
+
457
+ # Symmetric memory buffers shared by all MoE instances across layers
458
+ token_send_buf: Optional[torch.Tensor] = None
459
+ token_gather_buf: Optional[torch.Tensor] = None
460
+
461
+ def __init__(self, config):
462
+ super().__init__()
463
+ self.config = config
464
+ self.num_experts_per_tok = config.num_experts_per_tok
465
+
466
+ # ep_size is the number of ranks in expert dimension
467
+ if config.ep_size <= 1:
468
+ raise ValueError(
469
+ "For code simplicity, this model only supports distributed experts, "
470
+ "thus EP size must be > 1, please modify your model config"
471
+ )
472
+ self.ep_group = get_group("ep")
473
+ assert config.ep_size == self.ep_group.size()
474
+ self.ep_size = config.ep_size
475
+ self.ep_rank = self.ep_group.rank()
476
+ self.experts_per_rank = config.n_routed_experts // config.ep_size
477
+ # Use ModuleDict instead of ModuleList to preserve absolute expert
478
+ # IDs while avoiding `None` experts. The absolute expert IDs match
479
+ # with checkpoint FQNs.
480
+ self.experts = nn.ModuleDict()
481
+ for i in range(self.experts_per_rank):
482
+ abs_expert_id = self.ep_rank * self.experts_per_rank + i
483
+ self.experts[str(abs_expert_id)] = MLP(
484
+ config, intermediate_size=config.moe_intermediate_size
485
+ )
486
+ self.gate = MoEGate(config)
487
+ if config.n_shared_experts is not None:
488
+ intermediate_size = config.moe_intermediate_size * config.n_shared_experts
489
+ self.shared_experts = MLP(
490
+ config=config, intermediate_size=intermediate_size
491
+ )
492
+
493
+ def combine_experts(self, submod_name):
494
+ all_weights = []
495
+ for expert in self.experts.values():
496
+ lin = expert.get_submodule(submod_name)
497
+ all_weights.append(lin.weight)
498
+ lin.weight = None
499
+
500
+ concat_weight = torch.cat(all_weights)
501
+ self.register_parameter(f"{submod_name}_weight", nn.Parameter(concat_weight))
502
+
503
+ # This function is used to create a symm mem buffer for MoE's. It is for
504
+ # shuffling tokens fully "on-device", as compared to traditional torch
505
+ # all_to_all APIs which require a GPU-to-CPU sync of the splits. If a user
506
+ # calls this function, the `shuffle_method` would switch from
507
+ # `torch_all_to_all` to `symm_mem`.
508
+ def setup_symm_mem(self, dtype: torch.dtype, device: torch.device):
509
+ # Switch shuffle method
510
+ self.shuffle_method = "symm_mem"
511
+
512
+ # Combine expert weights
513
+ print("Combining expert weights for Group GEMM")
514
+ self.combine_experts("gate_proj")
515
+ self.combine_experts("up_proj")
516
+ self.combine_experts("down_proj")
517
+
518
+ # Assuming worst case, 2x tokens are routed to one EP rank
519
+ overflow = 2
520
+ OnDeviceAllToAllV.max_output_len = (
521
+ self.config.max_seq_len * self.num_experts_per_tok * overflow
522
+ )
523
+
524
+ # Symmetric memory buffers are shared by all MoE instances across
525
+ # layers, we only need to initialize them once
526
+ if MoE.token_send_buf is not None:
527
+ return
528
+
529
+ # Input buffer for DP-to-EP shuffle
530
+ MoE.token_send_buf = symm_mem.empty(
531
+ self.config.max_seq_len
532
+ * self.num_experts_per_tok, # seq len * top k (flattened)
533
+ self.config.hidden_size, # hidden dim
534
+ dtype=dtype,
535
+ device=device,
536
+ )
537
+ # Input buffer for EP-to-DP shuffle
538
+ MoE.token_gather_buf = symm_mem.empty(
539
+ self.config.max_seq_len
540
+ * self.num_experts_per_tok # seq len * top k (flattened)
541
+ * overflow,
542
+ self.config.hidden_size, # hidden dim
543
+ dtype=dtype,
544
+ device=device,
545
+ )
546
+ print(f"EP rank [{self.ep_rank}]: Created Symmetric Memory for MoE")
547
+
548
+ def get_send_buf(self):
549
+ # [Why detach?] During a first forward-backward step, the buffer would
550
+ # be included in a computational graph. In a second step, autograd will
551
+ # return an error saying "Trying to backward through the graph a second
552
+ # time (or directly access saved tensors more than once)". This is
553
+ # because the buffer is still in the graph, and autograd is trying to
554
+ # backward through the graph a second time. To avoid this, we detach the
555
+ # buffer from the graph. `detach()` returns a new tensor, which shares
556
+ # the same storage with the original one.
557
+ self.token_send_buf.grad = None
558
+ return self.token_send_buf.detach()
559
+
560
+ def get_gather_buf(self):
561
+ # See [Why detach?] in `get_send_buf`
562
+ self.token_gather_buf.grad = None
563
+ return self.token_gather_buf.detach()
564
+
565
+ def forward(self, hidden_states):
566
+ identity = hidden_states
567
+ orig_shape = hidden_states.shape
568
+ # for each token, select top-k experts, and compute the weight for each expert
569
+ topk_idx, topk_weight = self.gate(hidden_states)
570
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
571
+ if self.shuffle_method == "symm_mem":
572
+ y = self.moe_on_device(hidden_states, topk_idx, topk_weight)
573
+ else: # "torch_all_to_all"
574
+ y = self.moe_forward(hidden_states, topk_idx, topk_weight)
575
+
576
+ y = y.view(*orig_shape)
577
+ if self.config.n_shared_experts is not None:
578
+ y = y + self.shared_experts(identity)
579
+ return y
580
+
581
+ def moe_forward(self, x, topk_ids, topk_weight):
582
+ # This part sorts the token indices so that tokens routed to the same expert reside consecutively.
583
+ # An implication is that tokens to the same "expert group" (i.e., device) are also consecutive.
584
+ # Since this is an "aritificial" index creation (final outcome being
585
+ # `idxs`), we don't need gradients here.
586
+ with torch.no_grad():
587
+ # [seq_len, n_routed_experts]
588
+ cnts = topk_ids.new_zeros((topk_ids.shape[0], self.config.n_routed_experts))
589
+ # Fill 1 to the selected experts
590
+ cnts.scatter_(1, topk_ids, 1)
591
+ tokens_per_expert = cnts.sum(dim=0)
592
+ # Token indices for each expert
593
+ idxs = topk_ids.view(-1).argsort()
594
+ sorted_tokens_shape = idxs.shape + x.shape[1:]
595
+
596
+ sorted_tokens = x[idxs // topk_ids.shape[1]]
597
+ assert sorted_tokens.shape == sorted_tokens_shape
598
+
599
+ # This part exchanges information about the number of tokens sent and
600
+ # received by each expert. We can understand this information as "side
601
+ # band", which is not part of the actual data. Thus no gradient is
602
+ # needed.
603
+ with torch.no_grad():
604
+ # Sum the tokens over local experts, then we get tokens per EP rank,
605
+ # which is the input splits
606
+ tokens_per_expert_group = tokens_per_expert.new_empty(
607
+ tokens_per_expert.shape[0]
608
+ )
609
+ dist.all_to_all_single(
610
+ tokens_per_expert_group, tokens_per_expert, group=self.ep_group
611
+ )
612
+ input_splits = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
613
+
614
+ # DP to EP token shuffle. This part needs gradient.
615
+ if self.shuffle_method == "symm_mem":
616
+ # Move input to the `token_send_buf` symm mem
617
+ token_send_buf = self.get_send_buf()
618
+ token_send_buf[: idxs.shape[0]].copy_(sorted_tokens)
619
+ # Note: `out=` avoids copy, but it is not differentiable
620
+ # torch.index_select(x, 0, idxs // topk_ids.shape[1], out=self.token_send_buf[: idxs.shape[0]])
621
+ token_gather_buf, output_splits = OnDeviceAllToAllV.apply(
622
+ token_send_buf,
623
+ input_splits,
624
+ self.ep_group,
625
+ )
626
+ with torch.no_grad():
627
+ # Received tokens from all other ranks. TODO: use mask instead
628
+ received = output_splits.sum()
629
+ # TODO: don't use `received`
630
+ gathered_tokens = token_gather_buf[:received]
631
+ else: # "torch_all_to_all"
632
+ # Prepare input and output splits
633
+ with torch.no_grad():
634
+ output_splits = tokens_per_expert_group.view(self.ep_size, -1).sum(
635
+ dim=1
636
+ )
637
+ gathered_tokens = all_to_all_single_autograd(
638
+ sorted_tokens,
639
+ output_splits.tolist(),
640
+ input_splits.tolist(),
641
+ self.ep_group,
642
+ )
643
+
644
+ # This part prepares a 1D tensor with the same length as
645
+ # `gathered_tokens`. The 1D tensor is filled with local expert IDs which
646
+ # the tokens in `gathered_tokens` are headed for. This part doesn't need
647
+ # gradient.
648
+ with torch.no_grad():
649
+ gatherd_idxs = (
650
+ torch.arange(
651
+ tokens_per_expert_group.numel(),
652
+ device=tokens_per_expert_group.device,
653
+ )
654
+ % self.experts_per_rank
655
+ )
656
+ gatherd_idxs = gatherd_idxs.repeat_interleave(tokens_per_expert_group)
657
+
658
+ # Prepare buffer for tokens processed by experts
659
+ if self.shuffle_method == "symm_mem":
660
+ # Take necessary space from `token_gather_buf` symm mem because we are
661
+ # going to send them out after expert processing
662
+ processed_tokens = self.get_gather_buf()[: gathered_tokens.shape[0]]
663
+ else: # "torch_all_to_all"
664
+ processed_tokens = torch.empty_like(gathered_tokens)
665
+
666
+ # This part processes the tokens routed to the local experts.
667
+ # TODO: can we use group GEMM here?
668
+ for i, expert in enumerate(self.experts.values()):
669
+ processed_tokens[gatherd_idxs == i] = expert(
670
+ gathered_tokens[gatherd_idxs == i]
671
+ )
672
+
673
+ # Now shuffle the tokens back to their original owner, i.e. EP to DP shuffle.
674
+ # The input/output splits are just a reverse of the previous shuffle.
675
+ if self.shuffle_method == "symm_mem":
676
+ token_return_buf, _ = OnDeviceAllToAllV.apply(
677
+ processed_tokens,
678
+ output_splits,
679
+ self.ep_group,
680
+ )
681
+ returned_tokens = token_return_buf[: sorted_tokens_shape[0]]
682
+ else: # "torch_all_to_all"
683
+ returned_tokens = all_to_all_single_autograd(
684
+ processed_tokens,
685
+ input_splits.tolist(),
686
+ output_splits.tolist(),
687
+ self.ep_group,
688
+ )
689
+
690
+ output_tokens = torch.empty_like(returned_tokens)
691
+ output_tokens[idxs] = returned_tokens
692
+ final_out = (
693
+ output_tokens.view(*topk_ids.shape, -1)
694
+ .type(topk_weight.dtype)
695
+ .mul_(topk_weight.unsqueeze(dim=-1))
696
+ .sum(dim=1)
697
+ .type(returned_tokens.dtype)
698
+ )
699
+ return final_out
700
+
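A tiny worked example of the index bookkeeping shared by `moe_forward` above and `moe_on_device` below (3 tokens, top_k = 2, 4 routed experts; tie order inside `argsort` may differ, which is fine since only the grouping matters):

```python
# Worked example of the routing-index bookkeeping (illustrative values).
import torch

topk_ids = torch.tensor([[2, 0], [1, 2], [3, 1]])    # 3 tokens, top_k = 2
n_routed_experts, top_k = 4, topk_ids.shape[1]

cnts = topk_ids.new_zeros((topk_ids.shape[0], n_routed_experts))
cnts.scatter_(1, topk_ids, 1)
tokens_per_expert = cnts.sum(dim=0)                   # tensor([1, 2, 2, 1])

idxs = topk_ids.view(-1).argsort()                    # e.g. [1, 2, 5, 0, 3, 4]
src_token = idxs // top_k                             # e.g. [0, 1, 2, 0, 1, 2]
# `x[src_token]` duplicates and groups the tokens by destination expert:
# expert 0 <- token 0, expert 1 <- tokens 1 and 2,
# expert 2 <- tokens 0 and 1, expert 3 <- token 2.
```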
701
+ def moe_on_device(self, x, topk_ids, topk_weight):
702
+ # This part sorts the token indices so that tokens routed to the same expert reside consecutively.
703
+ # An implication is that tokens to the same "expert group" (i.e., device) are also consecutive.
704
+ # Since this is an "aritificial" index creation (final outcome being
705
+ # `idxs`), we don't need gradients here.
706
+ with torch.no_grad():
707
+ # [seq_len, n_routed_experts]
708
+ cnts = topk_ids.new_zeros((topk_ids.shape[0], self.config.n_routed_experts))
709
+ # Fill 1 to the selected experts
710
+ cnts.scatter_(1, topk_ids, 1)
711
+ tokens_per_expert = cnts.sum(dim=0)
712
+ # Token indices for each expert
713
+ idxs = topk_ids.view(-1).argsort()
714
+ sorted_tokens_shape = idxs.shape + x.shape[1:]
715
+
716
+ sorted_tokens = x[idxs // topk_ids.shape[1]]
717
+ assert sorted_tokens.shape == sorted_tokens_shape
718
+
719
+ # This part exchanges information about the number of tokens sent and
720
+ # received by each expert. We can understand this information as "side
721
+ # band", which is not part of the actual data. Thus no gradient is
722
+ # needed.
723
+ with torch.no_grad():
724
+ # Sum the tokens over local experts, then we get tokens per EP rank,
725
+ # which is the input splits
726
+ tokens_per_expert_group = tokens_per_expert.new_empty(
727
+ tokens_per_expert.shape[0]
728
+ )
729
+ dist.all_to_all_single(
730
+ tokens_per_expert_group, tokens_per_expert, group=self.ep_group
731
+ )
732
+ input_splits = tokens_per_expert.view(self.ep_size, -1).sum(dim=1)
733
+
734
+ # Move input to the `token_send_buf` symm mem
735
+ token_send_buf = self.get_send_buf()
736
+ token_send_buf[: idxs.shape[0]].copy_(sorted_tokens)
737
+ # Note: `out=` avoids copy, but it is not differentiable
738
+ # torch.index_select(x, 0, idxs // topk_ids.shape[1], out=self.token_send_buf[: idxs.shape[0]])
739
+ token_gather_buf, output_splits = OnDeviceAllToAllV.apply(
740
+ token_send_buf,
741
+ input_splits,
742
+ self.ep_group,
743
+ )
744
+
745
+ # We need to permute the received tokens so that tokens for the same expert are contiguous.
746
+ # This part prepares a 1D tensor `permuted_indices` for such permutation.
747
+ # This part doesn't need gradient.
748
+ with torch.no_grad():
749
+ permuted_indices, m_sizes = generate_permute_indices(
750
+ tokens_per_expert_group,
751
+ self.experts_per_rank,
752
+ self.ep_size,
753
+ token_gather_buf.shape[0],
754
+ ALIGN_SIZE_M,
755
+ )
756
+
757
+ # Permute the received tokens so that tokens for the same expert are contiguous.
758
+ contig_tokens = token_gather_buf[permuted_indices]
759
+
760
+ # Run the first grouped GEMM
761
+ w1 = self.get_parameter("gate_proj_weight")
762
+ gate_proj = grouped_gemm_forward(contig_tokens, w1, m_sizes)
763
+
764
+ # Run the second grouped GEMM
765
+ w3 = self.get_parameter("up_proj_weight")
766
+ up_proj = grouped_gemm_forward(contig_tokens, w3, m_sizes)
767
+
768
+ # Apply activation
769
+ hidden_outputs = MLP.act_fn(gate_proj) * up_proj
770
+
771
+ # Run the third grouped GEMM
772
+ w2 = self.get_parameter("down_proj_weight")
773
+ hidden_outputs = grouped_gemm_forward(hidden_outputs, w2, m_sizes)
774
+
775
+ # Prepare buffer for tokens processed by experts
776
+ # Take necessary space from `token_gather_buf` symm mem because we are
777
+ # going to send them out after expert processing
778
+ processed_tokens = self.get_gather_buf()
779
+
780
+ # Move into Symmetric Memory for the return shuffle
781
+ processed_tokens[permuted_indices] = hidden_outputs
782
+
783
+ # Now shuffle the tokens back to their original owner, i.e. EP to DP shuffle.
784
+ # The input/output splits are just a reverse of the previous shuffle.
785
+ token_return_buf, _ = OnDeviceAllToAllV.apply(
786
+ processed_tokens,
787
+ output_splits,
788
+ self.ep_group,
789
+ )
790
+ returned_tokens = token_return_buf[: sorted_tokens_shape[0]]
791
+
792
+ output_tokens = torch.empty_like(returned_tokens)
793
+ output_tokens[idxs] = returned_tokens
794
+ final_out = (
795
+ output_tokens.view(*topk_ids.shape, -1)
796
+ .type(topk_weight.dtype)
797
+ .mul_(topk_weight.unsqueeze(dim=-1))
798
+ .sum(dim=1)
799
+ .type(returned_tokens.dtype)
800
+ )
801
+ return final_out
802
+
803
+
804
+ class Attention(nn.Module):
805
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
806
+
807
+ def __init__(self, config: ModelArgs, layer_idx: Optional[int] = None):
808
+ super().__init__()
809
+ self.config = config
810
+ self.layer_idx = layer_idx
811
+ self.attention_dropout = config.attention_dropout
812
+ self.hidden_size = config.hidden_size
813
+ self.num_heads = config.num_attention_heads
814
+
815
+ self.max_position_embeddings = config.max_position_embeddings
816
+ self.rope_theta = config.rope_theta
817
+ self.q_lora_rank = config.q_lora_rank
818
+ self.qk_rope_head_dim = config.qk_rope_head_dim
819
+ self.kv_lora_rank = config.kv_lora_rank
820
+ self.v_head_dim = config.v_head_dim
821
+ self.qk_nope_head_dim = config.qk_nope_head_dim
822
+ self.q_head_dim = config.qk_nope_head_dim + config.qk_rope_head_dim
823
+
824
+ self.is_causal = True
825
+
826
+ if self.q_lora_rank is None:
827
+ self.q_proj = nn.Linear(
828
+ self.hidden_size, self.num_heads * self.q_head_dim, bias=False
829
+ )
830
+ else:
831
+ self.q_a_proj = nn.Linear(
832
+ self.hidden_size, config.q_lora_rank, bias=config.attention_bias
833
+ )
834
+ self.q_a_layernorm = RMSNorm(config.q_lora_rank)
835
+ self.q_b_proj = nn.Linear(
836
+ config.q_lora_rank, self.num_heads * self.q_head_dim, bias=False
837
+ )
838
+
839
+ self.kv_a_proj_with_mqa = nn.Linear(
840
+ self.hidden_size,
841
+ config.kv_lora_rank + config.qk_rope_head_dim,
842
+ bias=config.attention_bias,
843
+ )
844
+ self.kv_a_layernorm = RMSNorm(config.kv_lora_rank)
845
+ self.kv_b_proj = nn.Linear(
846
+ config.kv_lora_rank,
847
+ self.num_heads
848
+ * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
849
+ bias=False,
850
+ )
851
+
852
+ self.o_proj = nn.Linear(
853
+ self.num_heads * self.v_head_dim,
854
+ self.hidden_size,
855
+ bias=config.attention_bias,
856
+ )
857
+ self._init_rope()
858
+
859
+ self.softmax_scale = self.q_head_dim ** (-0.5)
860
+ if self.config.rope_scaling is not None:
861
+ mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
862
+ scaling_factor = self.config.rope_scaling["factor"]
863
+ if mscale_all_dim:
864
+ mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
865
+ self.softmax_scale = self.softmax_scale * mscale * mscale
866
+
867
+ def _init_rope(self):
868
+ if self.config.rope_scaling is None:
869
+ self.rotary_emb = RotaryEmbedding(
870
+ self.qk_rope_head_dim,
871
+ max_position_embeddings=self.max_position_embeddings,
872
+ base=self.rope_theta,
873
+ )
874
+ else:
875
+ scaling_type = self.config.rope_scaling["type"]
876
+ scaling_factor = self.config.rope_scaling["factor"]
877
+ if scaling_type == "linear":
878
+ self.rotary_emb = LinearScalingRotaryEmbedding(
879
+ self.qk_rope_head_dim,
880
+ max_position_embeddings=self.max_position_embeddings,
881
+ scaling_factor=scaling_factor,
882
+ base=self.rope_theta,
883
+ )
884
+ elif scaling_type == "dynamic":
885
+ self.rotary_emb = DynamicNTKScalingRotaryEmbedding(
886
+ self.qk_rope_head_dim,
887
+ max_position_embeddings=self.max_position_embeddings,
888
+ scaling_factor=scaling_factor,
889
+ base=self.rope_theta,
890
+ )
891
+ elif scaling_type == "yarn":
892
+ kwargs = {
893
+ key: self.config.rope_scaling[key]
894
+ for key in [
895
+ "original_max_position_embeddings",
896
+ "beta_fast",
897
+ "beta_slow",
898
+ "mscale",
899
+ "mscale_all_dim",
900
+ ]
901
+ if key in self.config.rope_scaling
902
+ }
903
+ self.rotary_emb = YarnRotaryEmbedding(
904
+ self.qk_rope_head_dim,
905
+ max_position_embeddings=self.max_position_embeddings,
906
+ scaling_factor=scaling_factor,
907
+ base=self.rope_theta,
908
+ **kwargs,
909
+ )
910
+ else:
911
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
912
+
913
+ def forward(
914
+ self,
915
+ hidden_states: torch.Tensor,
916
+ attention_mask: Optional[torch.Tensor] = None,
917
+ position_ids: Optional[torch.LongTensor] = None,
918
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
919
+ bsz, q_len, _ = hidden_states.size()
920
+
921
+ if self.q_lora_rank is None:
922
+ q = self.q_proj(hidden_states)
923
+ else:
924
+ q = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states)))
925
+ q = q.view(bsz, q_len, self.num_heads, self.q_head_dim).transpose(1, 2)
926
+ q_nope, q_pe = torch.split(
927
+ q, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
928
+ )
929
+
930
+ compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
931
+ compressed_kv, k_pe = torch.split(
932
+ compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
933
+ )
934
+ k_pe = k_pe.view(bsz, q_len, 1, self.qk_rope_head_dim).transpose(1, 2)
935
+ kv = (
936
+ self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
937
+ .view(bsz, q_len, self.num_heads, self.qk_nope_head_dim + self.v_head_dim)
938
+ .transpose(1, 2)
939
+ )
940
+
941
+ k_nope, value_states = torch.split(
942
+ kv, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
943
+ )
944
+ kv_seq_len = value_states.shape[-2]
945
+
946
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
947
+
948
+ q_pe, k_pe = apply_rotary_pos_emb(q_pe, k_pe, cos, sin, position_ids)
949
+
950
+ query_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
951
+ query_states[:, :, :, : self.qk_nope_head_dim] = q_nope
952
+ query_states[:, :, :, self.qk_nope_head_dim :] = q_pe
953
+
954
+ key_states = k_pe.new_empty(bsz, self.num_heads, q_len, self.q_head_dim)
955
+ key_states[:, :, :, : self.qk_nope_head_dim] = k_nope
956
+ key_states[:, :, :, self.qk_nope_head_dim :] = k_pe
957
+
958
+ if attention_mask is not None:
959
+ # Attention mask was made 4D because the `attn_weights` above is 4D.
960
+ # We probably can make this mask smarter if we want to pack sequences
961
+ # together, instead of using padding. This optimization can be used in
962
+ # inference. For training, if we want to pack sequences, data loader
963
+ # will pass in a mask containing such info.
964
+ attention_mask = _prepare_4d_causal_attention_mask(
965
+ attention_mask, # None, or user provided mask in 2D
966
+ (bsz, q_len),
967
+ hidden_states,
968
+ 0, # past_key_values_length, 0 when training
969
+ )
970
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
971
+ raise ValueError(
972
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
973
+ )
974
+
975
+ attn_output = torch.nn.functional.scaled_dot_product_attention(
976
+ query=query_states,
977
+ key=key_states,
978
+ value=value_states,
979
+ attn_mask=attention_mask,
980
+ dropout_p=self.attention_dropout,
981
+ is_causal=attention_mask is None,
982
+ scale=self.softmax_scale,
983
+ )
984
+
985
+ attn_output = attn_output.transpose(1, 2).contiguous()
986
+ attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.v_head_dim)
987
+ attn_output = self.o_proj(attn_output)
988
+
989
+ return attn_output
990
+
991
+
992
+ class DecoderLayer(nn.Module):
993
+ def __init__(self, config: ModelArgs, layer_idx: int):
994
+ super().__init__()
995
+ self.hidden_size = config.hidden_size
996
+
997
+ self.self_attn = Attention(config=config, layer_idx=layer_idx)
998
+
999
+ self.mlp = (
1000
+ MoE(config)
1001
+ if (
1002
+ config.n_routed_experts is not None
1003
+ and layer_idx >= config.first_k_dense_replace
1004
+ and layer_idx % config.moe_layer_freq == 0
1005
+ )
1006
+ else MLP(config)
1007
+ )
1008
+ self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1009
+ self.post_attention_layernorm = RMSNorm(
1010
+ config.hidden_size, eps=config.rms_norm_eps
1011
+ )
1012
+
1013
+ def forward(
1014
+ self,
1015
+ hidden_states: torch.Tensor,
1016
+ attention_mask: Optional[torch.Tensor] = None,
1017
+ position_ids: Optional[torch.LongTensor] = None,
1018
+ ) -> torch.Tensor:
1019
+ """
1020
+ Args:
1021
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
1022
+ attention_mask (`torch.FloatTensor`, *optional*):
1023
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
1024
+ query_sequence_length, key_sequence_length)` if default attention is used.
1025
+ """
1026
+ residual = hidden_states
1027
+
1028
+ hidden_states = self.input_layernorm(hidden_states)
1029
+
1030
+ # Self Attention
1031
+ hidden_states = self.self_attn(
1032
+ hidden_states=hidden_states,
1033
+ attention_mask=attention_mask,
1034
+ position_ids=position_ids,
1035
+ )
1036
+ hidden_states = residual + hidden_states
1037
+
1038
+ # Fully Connected
1039
+ residual = hidden_states
1040
+ hidden_states = self.post_attention_layernorm(hidden_states)
1041
+ hidden_states = self.mlp(hidden_states)
1042
+ hidden_states = residual + hidden_states
1043
+
1044
+ return hidden_states
1045
+
1046
+
1047
+ Deepseek_INPUTS_DOCSTRING = r"""
1048
+ Args:
1049
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1050
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1051
+ it.
1052
+
1053
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1054
+ [`PreTrainedTokenizer.__call__`] for details.
1055
+
1056
+ [What are input IDs?](../glossary#input-ids)
1057
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1058
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1059
+
1060
+ - 1 for tokens that are **not masked**,
1061
+ - 0 for tokens that are **masked**.
1062
+
1063
+ [What are attention masks?](../glossary#attention-mask)
1064
+
1065
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1066
+ [`PreTrainedTokenizer.__call__`] for details.
1067
+
1068
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
1069
+ `past_key_values`).
1070
+
1071
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
1072
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
1073
+ information on the default strategy.
1074
+
1075
+ - 1 indicates the head is **not masked**,
1076
+ - 0 indicates the head is **masked**.
1077
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1078
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1079
+ config.n_positions - 1]`.
1080
+
1081
+ [What are position IDs?](../glossary#position-ids)
1082
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
1083
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
1084
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
1085
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
1086
+
1087
+ Two formats are allowed:
1088
+ - a [`~cache_utils.Cache`] instance;
1089
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
1090
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
1091
+ cache format.
1092
+
1093
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
1094
+ legacy cache format will be returned.
1095
+
1096
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
1097
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
1098
+ of shape `(batch_size, sequence_length)`.
1099
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1100
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1101
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1102
+ model's internal embedding lookup matrix.
1103
+ use_cache (`bool`, *optional*):
1104
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1105
+ `past_key_values`).
1106
+ output_attentions (`bool`, *optional*):
1107
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1108
+ tensors for more detail.
1109
+ output_hidden_states (`bool`, *optional*):
1110
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1111
+ more detail.
1112
+ return_dict (`bool`, *optional*):
1113
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1114
+ """
1115
+
1116
+
1117
+ class DeepseekModel(torch.nn.Module):
1118
+ """
1119
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`DecoderLayer`]
1120
+
1121
+ Args:
1122
+ config: ModelArgs
1123
+ """
1124
+
1125
+ def __init__(self, config: ModelArgs):
1126
+ super().__init__()
1127
+ self.config = config
1128
+ self.padding_idx = config.pad_token_id
1129
+ self.vocab_size = config.vocab_size
1130
+
1131
+ # Creating model parts related to my stage
1132
+ assert (
1133
+ config.stage_idx < config.num_stages
1134
+ ), f"Stage {config.stage_idx} is not in the model"
1135
+ print(f"Creating model stage {config.stage_idx} of {config.num_stages}")
1136
+
1137
+ self.embed_tokens = (
1138
+ nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
1139
+ if config.stage_idx == 0
1140
+ else None
1141
+ )
1142
+
1143
+ self.layers = torch.nn.ModuleDict()
1144
+ division = config.num_hidden_layers // config.num_stages
1145
+ residual = config.num_hidden_layers % config.num_stages
1146
+ # Some earlier stages may have 1 more layer than latter stages because
1147
+ # the division may have residual; this is more even than giving the
1148
+ # entire residual to the last stage.
1149
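+ # For example, 16 layers over 3 stages gives division=5, residual=1, i.e. [6, 5, 5].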
+ layers_per_stage = [
1150
+ division + 1 if stage < residual else division
1151
+ for stage in range(config.num_stages)
1152
+ ]
1153
+ assert sum(layers_per_stage) == config.num_hidden_layers
1154
+ layer_id_start = sum(layers_per_stage[: config.stage_idx])
1155
+ layer_id_end = layer_id_start + layers_per_stage[config.stage_idx]
1156
+ for layer_id in range(layer_id_start, layer_id_end):
1157
+ self.layers[str(layer_id)] = DecoderLayer(config, layer_id)
1158
+
1159
+ self.norm = (
1160
+ RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
1161
+ if config.stage_idx == config.num_stages - 1
1162
+ else None
1163
+ )
1164
+
1165
+ # Initialize weights and apply final processing
1166
+ self.apply(self._init_weights)
1167
+
1168
+ def _init_weights(self, module):
1169
+ std = self.config.initializer_range
1170
+ if isinstance(module, nn.Linear):
1171
+ module.weight.data.normal_(mean=0.0, std=std)
1172
+ if module.bias is not None:
1173
+ module.bias.data.zero_()
1174
+ elif isinstance(module, nn.Embedding):
1175
+ module.weight.data.normal_(mean=0.0, std=std)
1176
+ if module.padding_idx is not None:
1177
+ module.weight.data[module.padding_idx].zero_()
1178
+
1179
+ def forward(
1180
+ self,
1181
+ tokens: torch.Tensor,
1182
+ attention_mask: Optional[torch.Tensor] = None,
1183
+ position_ids: Optional[torch.LongTensor] = None,
1184
+ ) -> torch.Tensor:
1185
+ # Embedding
1186
+ hidden_states = (
1187
+ self.embed_tokens(tokens) if self.embed_tokens is not None else tokens
1188
+ )
1189
+
1190
+ # decoder layers
1191
+ for decoder_layer in self.layers.values():
1192
+ hidden_states = decoder_layer(
1193
+ hidden_states,
1194
+ attention_mask=attention_mask,
1195
+ position_ids=position_ids,
1196
+ )
1197
+
1198
+ hidden_states = (
1199
+ self.norm(hidden_states) if self.norm is not None else hidden_states
1200
+ )
1201
+ return hidden_states
1202
+
1203
+
1204
+ class DeepseekForCausalLM(torch.nn.Module):
1205
+ def __init__(self, config):
1206
+ super().__init__()
1207
+ self.model = DeepseekModel(config)
1208
+ self.lm_head = (
1209
+ nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1210
+ if config.stage_idx == config.num_stages - 1
1211
+ else None
1212
+ )
1213
+
1214
+ # Initialize weights and apply final processing
1215
+ # self.post_init()
1216
+
1217
+ def forward(
1218
+ self,
1219
+ tokens: torch.Tensor,
1220
+ attention_mask: Optional[torch.Tensor] = None,
1221
+ position_ids: Optional[torch.LongTensor] = None,
1222
+ ) -> torch.Tensor:
1223
+ r"""
1224
+ Example:
1225
+
1226
+ ```python
1227
+ >>> from transformers import AutoTokenizer, DeepseekForCausalLM
1228
+
1229
+ >>> model = DeepseekForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1230
+ >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1231
+
1232
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
1233
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
1234
+
1235
+ >>> # Generate
1236
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1237
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1238
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1239
+ ```"""
1240
+ hidden_states = self.model(
1241
+ tokens,
1242
+ attention_mask=attention_mask,
1243
+ position_ids=position_ids,
1244
+ )
1245
+
1246
+ logits = (
1247
+ self.lm_head(hidden_states) if self.lm_head is not None else hidden_states
1248
+ )
1249
+ return logits
1250
+
1251
+ def prepare_inputs_for_generation(
1252
+ self,
1253
+ input_ids,
1254
+ past_key_values=None,
1255
+ attention_mask=None,
1256
+ **kwargs,
1257
+ ):
1258
+ if past_key_values is not None:
1259
+ # Assuming isinstance(past_key_values, Cache):
1260
+ cache_length = past_key_values.get_seq_length()
1261
+ past_length = past_key_values.seen_tokens
1262
+ max_cache_length = past_key_values.get_max_length()
1263
+
1264
+ # Keep only the unprocessed tokens:
1265
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1266
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
1267
+ # input)
1268
+ if (
1269
+ attention_mask is not None
1270
+ and attention_mask.shape[1] > input_ids.shape[1]
1271
+ ):
1272
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
1273
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1274
+ # input_ids based on the past_length.
1275
+ elif past_length < input_ids.shape[1]:
1276
+ input_ids = input_ids[:, past_length:]
1277
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1278
+
1279
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1280
+ if (
1281
+ max_cache_length is not None
1282
+ and attention_mask is not None
1283
+ and cache_length + input_ids.shape[1] > max_cache_length
1284
+ ):
1285
+ attention_mask = attention_mask[:, -max_cache_length:]
1286
+
1287
+ position_ids = kwargs.get("position_ids", None)
1288
+ if attention_mask is not None and position_ids is None:
1289
+ # create position_ids on the fly for batch generation
1290
+ position_ids = attention_mask.long().cumsum(-1) - 1
1291
+ position_ids.masked_fill_(attention_mask == 0, 1)
1292
+ if past_key_values:
1293
+ position_ids = position_ids[:, -input_ids.shape[1] :]
1294
+
1295
+ model_inputs = {"input_ids": input_ids}
1296
+
1297
+ model_inputs.update(
1298
+ {
1299
+ "position_ids": position_ids,
1300
+ "past_key_values": past_key_values,
1301
+ "use_cache": kwargs.get("use_cache"),
1302
+ "attention_mask": attention_mask,
1303
+ }
1304
+ )
1305
+ return model_inputs
1306
+
1307
+ @staticmethod
1308
+ def _reorder_cache(past_key_values, beam_idx):
1309
+ reordered_past = ()
1310
+ for layer_past in past_key_values:
1311
+ reordered_past += (
1312
+ tuple(
1313
+ past_state.index_select(0, beam_idx.to(past_state.device))
1314
+ for past_state in layer_past
1315
+ ),
1316
+ )
1317
+ return reordered_past
1318
+
1319
+ # Setup Symmetric Memory for MoE token shuffle.
1320
+ # Supports inference currently.
1321
+ def setup_symm_mem(self, dtype: torch.dtype, device: torch.device):
1322
+ for layer in self.model.layers.values():
1323
+ if not isinstance(layer.mlp, MoE):
1324
+ continue
1325
+ layer.mlp.setup_symm_mem(dtype, device)
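A hedged usage note (not part of the diff): symmetric memory for the MoE token shuffle has to be enabled on the assembled model before running inference; `train.py` below leaves the call commented out because training support is still in progress. A minimal sketch, assuming `config` and a CUDA `device` already exist:

```python
# Minimal sketch: enable symmetric-memory MoE token shuffle (inference-only for now).
model = DeepseekForCausalLM(config)
model.setup_symm_mem(torch.bfloat16, device)
```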
torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_barrier.py ADDED
@@ -0,0 +1,159 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import triton
8
+ import triton.language as tl
9
+
10
+ from .triton_utils import get_flat_bid, get_flat_tid
11
+
12
+
13
+ @triton.jit
14
+ def send_signal(addrs, sem: tl.constexpr):
15
+ if sem == "relaxed":
16
+ tl.inline_asm_elementwise(
17
+ """
18
+ {
19
+ .reg .u32 %tmp32_<1>;
20
+ .reg .pred %p<1>;
21
+
22
+ send_signal:
23
+ atom.global.relaxed.sys.cas.b32 %tmp32_0, [$1], 0, 1;
24
+ setp.eq.u32 %p0, %tmp32_0, 0;
25
+ @!%p0 bra send_signal;
26
+ }
27
+ """,
28
+ "=r, l",
29
+ [addrs],
30
+ dtype=tl.int32,
31
+ is_pure=False,
32
+ pack=1,
33
+ )
34
+ elif sem == "acq_rel":
35
+ tl.inline_asm_elementwise(
36
+ """
37
+ {
38
+ .reg .u32 %tmp32_<1>;
39
+ .reg .pred %p<1>;
40
+
41
+ send_signal:
42
+ atom.global.release.sys.cas.b32 %tmp32_0, [$1], 0, 1;
43
+ setp.eq.u32 %p0, %tmp32_0, 0;
44
+ @!%p0 bra send_signal;
45
+ }
46
+ """,
47
+ "=r, l",
48
+ [addrs],
49
+ dtype=tl.int32,
50
+ is_pure=False,
51
+ pack=1,
52
+ )
53
+ else:
54
+ raise RuntimeError(f"Unrecognized sem: {sem}")
55
+
56
+
57
+ @triton.jit
58
+ def wait_signal(addrs, sem: tl.constexpr):
59
+ if sem == "relaxed":
60
+ tl.inline_asm_elementwise(
61
+ """
62
+ {
63
+ .reg .u32 %tmp32_<1>;
64
+ .reg .pred %p<1>;
65
+
66
+ wait_signal:
67
+ atom.global.sys.relaxed.cas.b32 %tmp32_0, [$1], 1, 0;
68
+ setp.eq.u32 %p0, %tmp32_0, 1;
69
+ @!%p0 bra wait_signal;
70
+ }
71
+ """,
72
+ "=r, l",
73
+ [addrs],
74
+ dtype=tl.int32,
75
+ is_pure=False,
76
+ pack=1,
77
+ )
78
+ elif sem == "acq_rel":
79
+ tl.inline_asm_elementwise(
80
+ """
81
+ {
82
+ .reg .u32 %tmp32_<1>;
83
+ .reg .pred %p<1>;
84
+
85
+ wait_signal:
86
+ atom.global.sys.acquire.cas.b32 %tmp32_0, [$1], 1, 0;
87
+ setp.eq.u32 %p0, %tmp32_0, 1;
88
+ @!%p0 bra wait_signal;
89
+ }
90
+ """,
91
+ "=r, l",
92
+ [addrs],
93
+ dtype=tl.int32,
94
+ is_pure=False,
95
+ pack=1,
96
+ )
97
+ else:
98
+ raise RuntimeError(f"Unrecognized sem: {sem}")
99
+
100
+
101
+ @triton.jit
102
+ def blockwise_barrier(
103
+ signal_pad_ptrs,
104
+ block_id,
105
+ rank: tl.constexpr,
106
+ world_size: tl.constexpr,
107
+ sem: tl.constexpr,
108
+ ):
109
+ """
110
+ Synchronizes blocks with matching block_id across participating devices.
111
+
112
+ Note: the function itself is not a system level barrier/fence. It is a
113
+ building block for expressing different synchronization patterns.
114
+
115
+ Pattern 0: Ensures that all writes to symm_mem buffers from previous
116
+ kernels across all devices are visible to the current kernel:
117
+
118
+ blockwise_barrier(..., sem="relaxed")
119
+ sync_threads()
120
+
121
+ Pattern 1: Ensures that all writes to symm_mem buffers from the current
122
+ block are visible to all remote blocks with matching blockIdx:
123
+
124
+ sync_threads()
125
+ blockwise_barrier(..., sem="acq_rel")
126
+ sync_threads()
127
+
128
+ Pattern 2: Ensures that symm_mem buffers read by the current kernel are safe
129
+ for writing by subsequent kernels across all devices.
130
+
131
+ sync_threads()
132
+ blockwise_barrier(..., sem="relaxed")
133
+
134
+ CUDA graph friendliness:
135
+
136
+ This barrier operates through atomic operations on a zero-filled signal
137
+ pad, which resets to a zero-filled state after each successful
138
+ synchronization. This design eliminates the need for incrementing a
139
+ flag from host.
140
+ """
141
+ if block_id is None:
142
+ block_id = get_flat_bid()
143
+ flat_tid = get_flat_tid()
144
+
145
+ remote_ranks = tl.arange(0, world_size)
146
+ signal_pad_ptrs = signal_pad_ptrs.to(tl.pointer_type(tl.uint64))
147
+ remote_signal_pad_addrs = tl.load(signal_pad_ptrs + remote_ranks).to(
148
+ tl.pointer_type(tl.uint32)
149
+ )
150
+ send_addrs = remote_signal_pad_addrs + block_id * world_size + rank
151
+
152
+ local_signal_pad_addr = tl.load(signal_pad_ptrs + rank).to(
153
+ tl.pointer_type(tl.uint32)
154
+ )
155
+ wait_addrs = local_signal_pad_addr + block_id * world_size + remote_ranks
156
+
157
+ if flat_tid < world_size:
158
+ send_signal(send_addrs, sem)
159
+ wait_signal(wait_addrs, sem)
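A minimal sketch (not part of the diff) of "Pattern 0" from the docstring above, assuming `blockwise_barrier` and `sync_threads` are imported from this package; the kernel name and buffer arguments are hypothetical. `on_device_all_to_all_v_kernel` in the next file uses the same idiom.

```python
import triton
import triton.language as tl

@triton.jit
def read_peers_kernel(buf_ptrs, signal_pad_ptrs, rank: tl.constexpr, world_size: tl.constexpr):
    # Pattern 0: make writes from previous kernels on all devices visible to this kernel.
    blockwise_barrier(signal_pad_ptrs, None, rank, world_size, sem="relaxed")
    sync_threads()
    # ... loads from peer buffers via buf_ptrs would go here ...
```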
torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_on_device_all_to_all_v.py ADDED
@@ -0,0 +1,260 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.distributed as dist
9
+ import torch.distributed._symmetric_memory as symm_mem
10
+ import triton
11
+ import triton.language as tl
12
+
13
+ from .triton_barrier import blockwise_barrier
14
+ from .triton_utils import sync_threads
15
+
16
+
17
+ @triton.jit
18
+ def _exchange_row_offsets(
19
+ split_sizes_ptrs,
20
+ rank: tl.constexpr,
21
+ world_size: tl.constexpr,
22
+ BLOCKS_PER_REMOTE_RANK: tl.constexpr,
23
+ ):
24
+ remote_rank = tl.program_id(0) // BLOCKS_PER_REMOTE_RANK
25
+
26
+ # split_sizes_ptr for all ranks
27
+ # All these vector stacks into split_sizes_matrix
28
+ split_sizes_ptrs = split_sizes_ptrs.to(tl.pointer_type(tl.uint64))
29
+
30
+ # split_sizes_matrix[remote_rank, :]
31
+ input_split_sizes_ptr = tl.load(split_sizes_ptrs + remote_rank).to(
32
+ tl.pointer_type(tl.int64)
33
+ )
34
+
35
+ offsets_ = tl.arange(0, world_size)
36
+ input_split_sizes = tl.load(
37
+ input_split_sizes_ptr + offsets_, mask=offsets_ <= rank, other=0
38
+ )
39
+
40
+ num_rows = tl.load(input_split_sizes_ptr + rank)
41
+ input_row_offset = tl.sum(input_split_sizes) - num_rows
42
+
43
+ # split_sizes_matrix[:, rank]
44
+ output_split_sizes_ptrs = (
45
+ tl.load(split_sizes_ptrs + offsets_).to(tl.pointer_type(tl.int64)) + rank
46
+ )
47
+ output_split_sizes = tl.load(
48
+ output_split_sizes_ptrs, mask=offsets_ <= remote_rank, other=0
49
+ )
50
+ output_row_offset = tl.sum(output_split_sizes) - num_rows
51
+
52
+ return input_row_offset, output_row_offset, num_rows
53
+
54
+
55
+ @triton.jit
56
+ def on_device_all_to_all_v_kernel(
57
+ output_ptr,
58
+ output_splits_ptr,
59
+ input_ptrs,
60
+ input_splits_ptr,
61
+ signal_pad_ptrs,
62
+ dim: tl.constexpr, # Separate dim for easier vectorization
63
+ rank: tl.constexpr,
64
+ world_size: tl.constexpr,
65
+ BLOCKS_PER_REMOTE_RANK: tl.constexpr,
66
+ UNROLL_FACTOR: tl.constexpr,
67
+ BLOCK_SIZE: tl.constexpr,
68
+ ):
69
+ blockwise_barrier(signal_pad_ptrs, None, rank, world_size, sem="relaxed")
70
+ sync_threads()
71
+
72
+ remote_rank = tl.program_id(0) // BLOCKS_PER_REMOTE_RANK
73
+ block_offset = tl.program_id(0) % BLOCKS_PER_REMOTE_RANK
74
+
75
+ input_row_offset, output_row_offset, num_rows = _exchange_row_offsets(
76
+ input_splits_ptr, rank, world_size, BLOCKS_PER_REMOTE_RANK
77
+ )
78
+
79
+ output_splits_ptr = output_splits_ptr.to(tl.pointer_type(tl.uint64))
80
+ if block_offset == 0:
81
+ # Update output_splits
82
+ tl.store(output_splits_ptr + remote_rank, num_rows)
83
+
84
+ input_ptr = (
85
+ tl.load(input_ptrs.to(tl.pointer_type(tl.uint64)) + remote_rank).to(
86
+ tl.pointer_type(tl.bfloat16)
87
+ )
88
+ + input_row_offset * dim
89
+ )
90
+ output_ptr = output_ptr + output_row_offset * dim
91
+
92
+ outer_loop_step = BLOCK_SIZE * UNROLL_FACTOR
93
+ outer_loop_iters_per_rank = tl.cdiv(
94
+ tl.cdiv(num_rows * dim, outer_loop_step), BLOCKS_PER_REMOTE_RANK
95
+ )
96
+ numel_per_rank = outer_loop_step * outer_loop_iters_per_rank
97
+ offset = numel_per_rank * block_offset
98
+ end = tl.minimum(numel_per_rank * (block_offset + 1), num_rows * dim)
99
+
100
+ unroll_region_size = (end - offset) // outer_loop_step * outer_loop_step
101
+ for i in tl.range(offset, offset + unroll_region_size, outer_loop_step):
102
+ datas = []
103
+ for j in tl.range(
104
+ i,
105
+ i + outer_loop_step,
106
+ BLOCK_SIZE,
107
+ loop_unroll_factor=UNROLL_FACTOR,
108
+ ):
109
+ offsets = j + tl.arange(0, BLOCK_SIZE)
110
+ data = tl.load(input_ptr + offsets)
111
+ tl.store(output_ptr + offsets, data)
112
+
113
+ offset += unroll_region_size
114
+ while offset < end:
115
+ offsets = offset + tl.arange(0, BLOCK_SIZE)
116
+ mask = offsets < num_rows * dim
117
+ data = tl.load(input_ptr + offsets, mask=mask)
118
+ tl.store(output_ptr + offsets, data, mask=mask)
119
+ offset += BLOCK_SIZE
120
+
121
+ sync_threads()
122
+ blockwise_barrier(signal_pad_ptrs, None, rank, world_size, sem="relaxed")
123
+ return
124
+
125
+
126
+ def _on_device_all_to_all_v(
127
+ output: torch.Tensor,
128
+ output_splits: torch.Tensor,
129
+ input: torch.Tensor,
130
+ input_splits: torch.Tensor,
131
+ group: dist.ProcessGroup = dist.group.WORLD,
132
+ BLOCKS_PER_REMOTE_RANK=8,
133
+ UNROLL_FACTOR: int = 8,
134
+ BLOCK_SIZE: int = 16384,
135
+ ):
136
+ assert output.dim() == 2, f"{output.shape}"
137
+ assert input.dim() == 2, f"{input.shape}"
138
+ assert output.shape[1] == input.shape[1]
139
+
140
+ dim = output.shape[1]
141
+ input_hdl = symm_mem.rendezvous(input, group=group)
142
+ input_splits_hdl = symm_mem.rendezvous(input_splits, group=group)
143
+
144
+ num_blocks = input_hdl.world_size * BLOCKS_PER_REMOTE_RANK
145
+ kernel = on_device_all_to_all_v_kernel[(num_blocks, 1, 1)](
146
+ output,
147
+ output_splits,
148
+ input_hdl.buffer_ptrs_dev,
149
+ input_splits_hdl.buffer_ptrs_dev,
150
+ input_hdl.signal_pad_ptrs_dev,
151
+ dim=dim,
152
+ rank=input_hdl.rank,
153
+ world_size=input_hdl.world_size,
154
+ BLOCKS_PER_REMOTE_RANK=BLOCKS_PER_REMOTE_RANK,
155
+ UNROLL_FACTOR=UNROLL_FACTOR,
156
+ BLOCK_SIZE=BLOCK_SIZE,
157
+ num_warps=16,
158
+ )
159
+ # log_triton_kernel(kernel)
160
+ return output
161
+
162
+
163
+ class OnDeviceAllToAllV(torch.autograd.Function):
164
+ # A symmetric memory holding the grad_output during backward
165
+ grad_output_buf = None
166
+ # A symmetric memory for exchanging split sizes during both forward and backward
167
+ splits_buf = None
168
+ # Maximum output length (need to be set before use of OnDeviceAllToAllV)
169
+ max_output_len = None
170
+
171
+ @staticmethod
172
+ def forward(
173
+ ctx,
174
+ input: torch.Tensor,
175
+ input_splits: torch.Tensor,
176
+ group: dist.ProcessGroup = dist.group.WORLD,
177
+ ):
178
+ """
179
+ Args:
180
+ input: input tensor with data for all ranks concatenated.
181
+ input_splits: input splits of shape (group.world_size,)
182
+ group: process group to scope the collective.
183
+ """
184
+ # Initialize input splits buffer (one time only)
185
+ if OnDeviceAllToAllV.splits_buf is None:
186
+ OnDeviceAllToAllV.splits_buf = symm_mem.empty(
187
+ *input_splits.shape,
188
+ dtype=input_splits.dtype,
189
+ device=input_splits.device,
190
+ )
191
+
192
+ if OnDeviceAllToAllV.max_output_len is None:
193
+ raise RuntimeError(
194
+ "Please set max output length via `OnDeviceAllToAllV.max_output_len = ...`"
195
+ )
196
+
197
+ # Allocate output buffer
198
+ output = input.new_empty(OnDeviceAllToAllV.max_output_len, *input.shape[1:])
199
+ # Allocate output splits tensor
200
+ output_splits = torch.empty_like(input_splits)
201
+ # Copy input splits to the buffer
202
+ OnDeviceAllToAllV.splits_buf.copy_(input_splits)
203
+
204
+ # Shuffle input to output
205
+ _on_device_all_to_all_v(
206
+ output, output_splits, input, OnDeviceAllToAllV.splits_buf, group=group
207
+ )
208
+
209
+ # Output splits in forward is the input splits in backward
210
+ ctx.save_for_backward(output_splits)
211
+ ctx.group = group
212
+ ctx.input_shape = input.shape
213
+ return output, output_splits
214
+
215
+ @staticmethod
216
+ def backward(ctx, grad_output, grad_splits):
217
+ """
218
+ Backward is implemented as a shuffle of the output's gradients to the input.
219
+ Args:
220
+ `grad_output`: output's gradients passed from the downstream.
221
+ `grad_splits`: unused.
222
+ """
223
+
224
+ # Initialize grad_output buffer (one time only)
225
+ if OnDeviceAllToAllV.grad_output_buf is None:
226
+ assert (
227
+ OnDeviceAllToAllV.max_output_len is not None
228
+ ), "`max_output_len` not set"
229
+ OnDeviceAllToAllV.grad_output_buf = symm_mem.empty(
230
+ OnDeviceAllToAllV.max_output_len,
231
+ *grad_output.shape[1:],
232
+ dtype=grad_output.dtype,
233
+ device=grad_output.device,
234
+ )
235
+
236
+ # TODO: is there a way to tell autograd to feed grad_output directly to
237
+ # our symm_mem buffer?
238
+ OnDeviceAllToAllV.grad_output_buf.narrow(0, 0, grad_output.shape[0]).copy_(
239
+ grad_output
240
+ )
241
+
242
+ # Size info
243
+ (grad_output_splits,) = ctx.saved_tensors
244
+ OnDeviceAllToAllV.splits_buf.copy_(grad_output_splits)
245
+ grad_input_splits = torch.empty_like(grad_output_splits) # unused
246
+ grad_input = grad_output.new_empty(*ctx.input_shape)
247
+
248
+ # Shuffle gradients back to the input
249
+ _on_device_all_to_all_v(
250
+ grad_input,
251
+ grad_input_splits,
252
+ OnDeviceAllToAllV.grad_output_buf,
253
+ OnDeviceAllToAllV.splits_buf,
254
+ group=ctx.group,
255
+ )
256
+ return grad_input, None, None
257
+
258
+
259
+ # Alias
260
+ on_device_all_to_all_v = OnDeviceAllToAllV.apply
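A hedged usage sketch (not part of the diff): the input must be allocated in symmetric memory, since it is rendezvous'd inside `_on_device_all_to_all_v`, and `OnDeviceAllToAllV.max_output_len` must be set before the first call. All sizes and names below are illustrative.

```python
import torch
import torch.distributed as dist
import torch.distributed._symmetric_memory as symm_mem

# Assumes the default process group is already initialized and the CUDA device is set.
world_size = dist.get_world_size()
dim, num_rows, max_len = 1024, 4096, 8192  # illustrative sizes

OnDeviceAllToAllV.max_output_len = max_len  # required before the first call

inp = symm_mem.empty(num_rows, dim, dtype=torch.bfloat16, device="cuda")
inp.normal_()
splits = torch.full((world_size,), num_rows // world_size, dtype=torch.int64, device="cuda")

out, out_splits = on_device_all_to_all_v(inp, splits)  # rows shuffled across ranks
```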
torchtitan/experiments/deepseek_v3/symm_mem_recipes/triton_utils.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import triton
8
+ import triton.language as tl
9
+
10
+
11
+ @triton.jit
12
+ def get_tid():
13
+ return tl.inline_asm_elementwise(
14
+ """
15
+ mov.u32 $0, %tid.x;
16
+ mov.u32 $1, %tid.y;
17
+ mov.u32 $2, %tid.z;
18
+ """,
19
+ "=r,=r,=r",
20
+ [],
21
+ dtype=(tl.uint32, tl.uint32, tl.uint32),
22
+ is_pure=True,
23
+ pack=1,
24
+ )
25
+
26
+
27
+ @triton.jit
28
+ def get_ntid():
29
+ return tl.inline_asm_elementwise(
30
+ """
31
+ mov.u32 $0, %ntid.x;
32
+ mov.u32 $1, %ntid.y;
33
+ mov.u32 $2, %ntid.z;
34
+ """,
35
+ "=r,=r,=r",
36
+ [],
37
+ dtype=(tl.uint32, tl.uint32, tl.uint32),
38
+ is_pure=True,
39
+ pack=1,
40
+ )
41
+
42
+
43
+ @triton.jit
44
+ def get_flat_tid():
45
+ tid_x, tid_y, tid_z = get_tid()
46
+ ntid_x, ntid_y, _ = get_ntid()
47
+ return tid_z * ntid_y * ntid_x + tid_y * ntid_x + tid_x
48
+
49
+
50
+ @triton.jit
51
+ def get_flat_bid():
52
+ return (
53
+ tl.program_id(2) * tl.num_programs(1) * tl.num_programs(0)
54
+ + tl.program_id(1) * tl.num_programs(0)
55
+ + tl.program_id(0)
56
+ )
57
+
58
+
59
+ @triton.jit
60
+ def sync_threads():
61
+ tl.inline_asm_elementwise(
62
+ "bar.sync 0;", "=r", [], dtype=tl.int32, is_pure=False, pack=1
63
+ )
torchtitan/experiments/deepseek_v3/train.py ADDED
@@ -0,0 +1,142 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # torchrun --standalone --nproc-per-node 8 run.py
8
+ import torch
9
+ import torch.distributed as dist
10
+ from checkpoint import load_weights_from_hf
11
+ from model import DeepseekForCausalLM
12
+ from model_config import deepseek_config_registry
13
+
14
+ from torch.distributed.device_mesh import DeviceMesh
15
+ from torch.distributed.fsdp import fully_shard
16
+ from torch.distributed.pipelining import PipelineStage, Schedule1F1B
17
+
18
+
19
+ # Use DeepSeek-V2-Lite as a proxy
20
+ model_id = "deepseek-ai/DeepSeek-V2-Lite"
21
+
22
+
23
+ # Run full model
24
+ def run_full_model(
25
+ mesh: DeviceMesh,
26
+ ):
27
+ rank = dist.get_rank()
28
+ device_count = torch.cuda.device_count()
29
+ device = torch.device("cuda", rank % device_count)
30
+
31
+ pp_mesh = mesh["pp"]
32
+ ep_mesh = mesh["ep"]
33
+ pp_rank = pp_mesh.get_local_rank()
34
+ ep_rank = ep_mesh.get_local_rank()
35
+ pp_size = pp_mesh.size()
36
+ ep_size = ep_mesh.size()
37
+
38
+ # Get model configs
39
+ model_args = deepseek_config_registry[model_id]
40
+ # [Note]: I am making the model smaller for testing / avoiding OOM. If you
41
+ # have sufficient GPUs for model parallelism, you can remove this line.
42
+ model_args.num_hidden_layers = 16
43
+
44
+ # Apply model parallelism
45
+ model_args.ep_size = ep_size
46
+ model_args.num_stages = pp_size
47
+ model_args.stage_idx = pp_rank
48
+ print(model_args)
49
+
50
+ # Instantiate model
51
+ with device, mesh:
52
+ model = DeepseekForCausalLM(model_args)
53
+
54
+ # Load weights
55
+ load_weights_from_hf(model, model_id, device)
56
+ model.train()
57
+
58
+ # Apply data parallelism
59
+ fsdp_mesh = mesh["fsdp"]
60
+ hsdp_mesh = mesh["ep", "fsdp"]
61
+ # Using `reshard_after_forward=False` to implement Zero-2, i.e. sharding the
62
+ # optimizer (Zero-1) and gradients (Zero-2), but not the model weights.
63
+ # Reason: the MoE is "sparsely activated" compared to the dense model, thus
64
+ # it would be uneconomical to re-gather the weights.
65
+ for layer in model.model.layers.values():
66
+ # Apply FSDP to experts
67
+ if hasattr(layer.mlp, "experts"):
68
+ for expert in layer.mlp.experts.values():
69
+ fully_shard(expert, mesh=fsdp_mesh, reshard_after_forward=False)
70
+ # Apply HSDP to other parts such as attention, layernorm, because they
71
+ # are doing DDP on EP dimension
72
+ fully_shard(layer, mesh=hsdp_mesh, reshard_after_forward=False)
73
+
74
+ # Apply HSDP on root model (lm_head, embeddings, etc)
75
+ fully_shard(model, mesh=hsdp_mesh, reshard_after_forward=False)
76
+
77
+ # Synthetic setting
78
+ microbatches = pp_size * 2
79
+
80
+ # Use Symmetric Memory for MoE token shuffle.
81
+ # TODO: we are rewriting `moe_on_device` function. `setup_symm_mem` is
82
+ # currently supported for forward only. See `generate.py`.
83
+ # model.setup_symm_mem(torch.bfloat16, device)
84
+
85
+ # Example inputs
86
+ torch.manual_seed(ep_rank)
87
+ bs = 4
88
+ seqlen = 128
89
+ x = torch.randint(model_args.vocab_size, (microbatches * bs, seqlen), device=device)
90
+ label = torch.rand(microbatches * bs, seqlen, model_args.vocab_size, device=device)
91
+
92
+ # Create loss function
93
+ loss_fn = torch.nn.functional.cross_entropy
94
+
95
+ # Run forward and backward
96
+ steps = 2
97
+ for _ in range(steps):
98
+ if pp_size > 1:
99
+ # Create pipeline stage
100
+ stage = PipelineStage(
101
+ model,
102
+ pp_rank,
103
+ pp_size,
104
+ device,
105
+ group=pp_mesh.get_group(),
106
+ )
107
+
108
+ # Create pipeline schedule
109
+ losses = []
110
+ pp_schedule = Schedule1F1B(stage, microbatches, loss_fn=loss_fn)
111
+
112
+ if pp_rank == 0:
113
+ y = pp_schedule.step(x)
114
+ elif pp_rank == pp_size - 1:
115
+ y = pp_schedule.step(target=label, losses=losses)
116
+ loss = torch.mean(torch.stack(losses))
117
+ else:
118
+ pp_schedule.step()
119
+ else:
120
+ y = model(x)
121
+ loss = loss_fn(y, label)
122
+ loss.backward()
123
+
124
+ if pp_rank == pp_size - 1:
125
+ print(f"logits: {y.shape}")
126
+ print(f"{loss=}")
127
+
128
+ if pp_rank == 0:
129
+ param = model.get_parameter("model.layers.0.self_attn.q_proj.weight")
130
+ print(f"{torch.linalg.norm(param.grad)=}")
131
+
132
+ model.zero_grad()
133
+
134
+ print("Backward done")
135
+
136
+
137
+ if __name__ == "__main__":
138
+ mesh = dist.init_device_mesh("cuda", (2, 2, 2), mesh_dim_names=("pp", "ep", "fsdp"))
139
+
140
+ run_full_model(mesh)
141
+
142
+ dist.destroy_process_group()
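A hedged launch example (not part of the diff): the hard-coded `(2, 2, 2)` mesh over `("pp", "ep", "fsdp")` needs 8 ranks, matching the torchrun hint at the top of the file (which still refers to the script as `run.py`).

```bash
torchrun --standalone --nproc-per-node 8 torchtitan/experiments/deepseek_v3/train.py
```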
torchtitan/experiments/flux/README.md ADDED
@@ -0,0 +1,23 @@
1
+ # FLUX model in torchtitan
2
+
3
+ ## Overview
4
+
5
+ ## Usage
6
+ First, download the autoencoder model from HuggingFace with your own access token:
7
+ ```bash
8
+ python torchtitan/experiments/flux/scripts/download_autoencoder.py --repo_id black-forest-labs/FLUX.1-dev --ae_path ae.safetensors --hf_token <your_access_token>
9
+ ```
10
+ This step will download the autoencoder model from HuggingFace and save it to the `torchtitan/experiments/flux/assets/autoencoder/ae.safetensors` file.
11
+
12
+ Run the following command to train the model on a single GPU:
13
+ ```bash
14
+ PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True torchrun --nproc_per_node=1 torchtitan/experiments/flux/train.py --job.config_file torchtitan/experiments/flux/train_configs/debug_model.toml
15
+ ```
16
+
17
+ ## TODO
18
+ - [ ] Support for multiple GPUs is coming soon (FSDP, etc.)
19
+ - [ ] Implement CI test cases for the FLUX model and add more unit tests (e.g., for the preprocessor)
20
+ - [ ] More parallelism support (Tensor Parallelism, Context Parallelism, etc.)
21
+ - [ ] Support for distributed checkpointing and loading
22
+ - [ ] Implement init_weights() function to initialize the model weights
23
+ - [ ] Implement the num_flops_per_token calculation in get_nparams_and_flops() function
torchtitan/experiments/flux/dataset/flux_dataset.py ADDED
@@ -0,0 +1,267 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ import random
9
+ from dataclasses import dataclass
10
+ from typing import Any, Callable, Optional
11
+
12
+ import numpy as np
13
+
14
+ import torch
15
+
16
+ from datasets import Dataset, load_dataset
17
+ from datasets.distributed import split_dataset_by_node
18
+ from PIL import Image
19
+
20
+ from torch.distributed.checkpoint.stateful import Stateful
21
+
22
+ from torch.utils.data import IterableDataset
23
+ from torchtitan.components.dataloader import ParallelAwareDataloader
24
+
25
+ from torchtitan.config_manager import JobConfig
26
+ from torchtitan.experiments.flux.dataset.tokenizer import FluxTokenizer
27
+ from torchtitan.tools.logging import logger
28
+
29
+
30
+ def _process_cc12m_image(
31
+ img: Image.Image,
32
+ output_size: int = 256,
33
+ ) -> Optional[torch.Tensor]:
34
+ """Process CC12M image to the desired size."""
35
+
36
+ width, height = img.size
37
+ # Skip low resolution images
38
+ if width < output_size or height < output_size:
39
+ return None
40
+
41
+ if width >= height:
42
+ # resize height to be equal to output_size, then crop
43
+ new_width, new_height = math.ceil(output_size / height * width), output_size
44
+ img = img.resize((new_width, new_height))
45
+ left = random.randint(0, new_width - output_size)
46
+ resized_img = img.crop((left, 0, left + output_size, output_size))
47
+ else:
48
+ # resize width to be equal to output_size, then crop
49
+ new_width, new_height = (
50
+ output_size,
51
+ math.ceil(output_size / width * height),
52
+ )
53
+ img = img.resize((new_width, new_height))
54
+ lower = random.randint(0, new_width - output_size)
55
+ resized_img = img.crop((0, lower, output_size, lower + output_size))
56
+
57
+ assert resized_img.size[0] == resized_img.size[1] == output_size
58
+
59
+ # Skip grayscale images
60
+ if resized_img.mode == "L":
61
+ return None
62
+
63
+ np_img = np.array(resized_img).transpose((2, 0, 1))
64
+ tensor_img = torch.tensor(np_img).float() / 255.0
65
+
66
+ # NOTE: The following commented code is an alternative way
67
+ # img_transform = transforms.Compose(
68
+ # [
69
+ # transforms.Resize(max(output_size, output_size)),
70
+ # transforms.CenterCrop((output_size, output_size)),
71
+ # transforms.ToTensor(),
72
+ # ]
73
+ # )
74
+ # tensor_img = img_transform(img)
75
+
76
+ return tensor_img
77
+
78
+
79
+ def _flux_data_processor(
80
+ sample: dict[str, Any],
81
+ t5_tokenizer: FluxTokenizer,
82
+ clip_tokenizer: FluxTokenizer,
83
+ output_size: int = 256,
84
+ ) -> dict[str, Any]:
85
+ """
86
+ Preprocess CC12M dataset sample image and text for Flux model.
87
+
88
+ Args:
89
+ sample: A sample from dataset
90
+ t5_tokenizer: T5 tokenizer used to encode the caption
91
+ clip_tokenizer: CLIP tokenizer used to encode the caption
92
+ output_size: The output image size
93
+
94
+ """
95
+ img = _process_cc12m_image(sample["jpg"], output_size=output_size)
96
+ t5_tokens = t5_tokenizer.encode(sample["txt"])
97
+ clip_tokens = clip_tokenizer.encode(sample["txt"])
98
+
99
+ return {
100
+ "image": img,
101
+ "clip_tokens": clip_tokens, # type: List[int]
102
+ "t5_tokens": t5_tokens, # type: List[int]
103
+ }
104
+
105
+
106
+ @dataclass
107
+ class TextToImageDatasetConfig:
108
+ path: str
109
+ loader: Callable
110
+ data_processor: Callable
111
+
112
+
113
+ DATASETS = {
114
+ "cc12m": TextToImageDatasetConfig(
115
+ path="pixparse/cc12m-wds",
116
+ loader=lambda path: load_dataset(path, split="train", streaming=True),
117
+ data_processor=_flux_data_processor,
118
+ ),
119
+ }
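+ # A hypothetical example of registering another dataset (not an actual entry):
+ # DATASETS["my_dataset"] = TextToImageDatasetConfig(
+ #     path="org/my-dataset",
+ #     loader=lambda path: load_dataset(path, split="train", streaming=True),
+ #     data_processor=_flux_data_processor,
+ # )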
120
+
121
+
122
+ def _validate_dataset(
123
+ dataset_name: str, dataset_path: Optional[str] = None
124
+ ) -> tuple[str, Callable, Callable]:
125
+ """Validate dataset name and path."""
126
+ if dataset_name not in DATASETS:
127
+ raise ValueError(
128
+ f"Dataset {dataset_name} is not supported. "
129
+ f"Supported datasets are: {list(DATASETS.keys())}"
130
+ )
131
+
132
+ config = DATASETS[dataset_name]
133
+ path = dataset_path or config.path
134
+ logger.info(f"Preparing {dataset_name} dataset from {path}")
135
+ return path, config.loader, config.data_processor
136
+
137
+
138
+ class FluxDataset(IterableDataset, Stateful):
139
+ """Dataset for FLUX text-to-image model.
140
+
141
+ Args:
142
+ dataset_name (str): Name of the dataset.
143
+ dataset_path (str): Path to the dataset.
144
+ model_transform (Transform): Callable that applies model-specific preprocessing to the sample.
145
+ dp_rank (int): Data parallel rank.
146
+ dp_world_size (int): Data parallel world size.
147
+ infinite (bool): Whether to loop over the dataset infinitely.
148
+ """
149
+
150
+ def __init__(
151
+ self,
152
+ dataset_name: str,
153
+ dataset_path: Optional[str],
154
+ t5_tokenizer: FluxTokenizer,
155
+ clip_tokenizer: FluxTokenizer,
156
+ job_config: Optional[JobConfig] = None,
157
+ dp_rank: int = 0,
158
+ dp_world_size: int = 1,
159
+ infinite: bool = False,
160
+ ) -> None:
161
+
162
+ # Force lowercase for consistent comparison
163
+ dataset_name = dataset_name.lower()
164
+
165
+ path, dataset_loader, data_processor = _validate_dataset(
166
+ dataset_name, dataset_path
167
+ )
168
+ ds = dataset_loader(path)
169
+
170
+ self.dataset_name = dataset_name
171
+ self._data = split_dataset_by_node(ds, dp_rank, dp_world_size)
172
+
173
+ self._t5_tokenizer = t5_tokenizer
174
+ self._clip_tokenizer = clip_tokenizer
175
+ self._data_processor = data_processor
176
+ self.job_config = job_config
177
+
178
+ self.infinite = infinite
179
+
180
+ # Variables for checkpointing
181
+ self._sample_idx = 0
182
+ self._all_samples: list[dict[str, Any]] = []
183
+
184
+ def _get_data_iter(self):
185
+ if isinstance(self._data, Dataset) and self._sample_idx == len(self._data):
186
+ return iter([])
187
+
188
+ it = iter(self._data)
189
+ for _ in range(self._sample_idx):
190
+ next(it)
191
+ return it
192
+
193
+ def __iter__(self):
194
+ while True:
195
+ for sample in self._get_data_iter():
196
+ # Use the dataset-specific preprocessor
197
+ sample_dict = self._data_processor(
198
+ sample, self._t5_tokenizer, self._clip_tokenizer, output_size=256
199
+ )
200
+
201
+ # skip low quality image or image with color channel = 1
202
+ if sample_dict["image"] is None:
203
+ logger.warning(
204
+ f"Low quality image {sample['__key__']} is skipped in Flux Dataloader"
205
+ )
206
+ continue
207
+
208
+ self._all_samples.append(sample_dict)
209
+ self._sample_idx += 1
210
+
211
+ labels = sample_dict.pop("image")
212
+ yield sample_dict, labels
213
+
214
+ if not self.infinite:
215
+ logger.warning(f"Dataset {self.dataset_name} has run out of data")
216
+ break
217
+ else:
218
+ # Reset offset for the next iteration
219
+ self._sample_idx = 0
220
+ logger.warning(f"Dataset {self.dataset_name} is being re-looped")
221
+
222
+ def load_state_dict(self, state_dict):
223
+ self._sample_idx = state_dict["sample_idx"]
224
+ self._all_samples = state_dict["all_samples"]
225
+
226
+ def state_dict(self):
227
+ return {
228
+ "all_samples": self._all_samples,
229
+ "sample_idx": self._sample_idx,
230
+ }
231
+
232
+
233
+ def build_flux_dataloader(
234
+ dp_world_size: int,
235
+ dp_rank: int,
236
+ job_config: JobConfig,
237
+ # This parameter is not used, keep it for compatibility
238
+ tokenizer: FluxTokenizer | None,
239
+ infinite: bool = True,
240
+ ) -> ParallelAwareDataloader:
241
+ """Build a data loader for HuggingFace datasets."""
242
+ dataset_name = job_config.training.dataset
243
+ dataset_path = job_config.training.dataset_path
244
+ batch_size = job_config.training.batch_size
245
+
246
+ t5_encoder_name = job_config.encoder.t5_encoder
247
+ clip_encoder_name = job_config.encoder.clip_encoder
248
+ max_t5_encoding_len = job_config.encoder.max_t5_encoding_len
249
+
250
+ ds = FluxDataset(
251
+ dataset_name=dataset_name,
252
+ dataset_path=dataset_path,
253
+ t5_tokenizer=FluxTokenizer(t5_encoder_name, max_length=max_t5_encoding_len),
254
+ clip_tokenizer=FluxTokenizer(
255
+ clip_encoder_name, max_length=77
256
+ ), # fix max_length for CLIP
257
+ dp_rank=dp_rank,
258
+ dp_world_size=dp_world_size,
259
+ infinite=infinite,
260
+ )
261
+
262
+ return ParallelAwareDataloader(
263
+ dataset=ds,
264
+ dp_rank=dp_rank,
265
+ dp_world_size=dp_world_size,
266
+ batch_size=batch_size,
267
+ )
torchtitan/experiments/flux/model/autoencoder.py ADDED
@@ -0,0 +1,388 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import os
8
+ from dataclasses import dataclass
9
+
10
+ import torch
11
+ from einops import rearrange
12
+ from safetensors.torch import load_file as load_sft
13
+ from torch import nn, Tensor
14
+
15
+
16
+ @dataclass
17
+ class AutoEncoderParams:
18
+ resolution: int = 256
19
+ in_channels: int = 3
20
+ ch: int = 128
21
+ out_ch: int = 3
22
+ ch_mult: tuple[int] = (1, 2, 4, 4)
23
+ num_res_blocks: int = 2
24
+ z_channels: int = 16
25
+ scale_factor: float = 0.3611
26
+ shift_factor: float = 0.1159
27
+
28
+
29
+ def swish(x: Tensor) -> Tensor:
30
+ return x * torch.sigmoid(x)
31
+
32
+
33
+ class AttnBlock(nn.Module):
34
+ def __init__(self, in_channels: int):
35
+ super().__init__()
36
+ self.in_channels = in_channels
37
+
38
+ self.norm = nn.GroupNorm(
39
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
40
+ )
41
+
42
+ self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
43
+ self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
44
+ self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
45
+ self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
46
+
47
+ def attention(self, h_: Tensor) -> Tensor:
48
+ h_ = self.norm(h_)
49
+ q = self.q(h_)
50
+ k = self.k(h_)
51
+ v = self.v(h_)
52
+
53
+ b, c, h, w = q.shape
54
+ q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
55
+ k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
56
+ v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
57
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
58
+
59
+ return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
60
+
61
+ def forward(self, x: Tensor) -> Tensor:
62
+ return x + self.proj_out(self.attention(x))
63
+
64
+
65
+ class ResnetBlock(nn.Module):
66
+ def __init__(self, in_channels: int, out_channels: int):
67
+ super().__init__()
68
+ self.in_channels = in_channels
69
+ out_channels = in_channels if out_channels is None else out_channels
70
+ self.out_channels = out_channels
71
+
72
+ self.norm1 = nn.GroupNorm(
73
+ num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
74
+ )
75
+ self.conv1 = nn.Conv2d(
76
+ in_channels, out_channels, kernel_size=3, stride=1, padding=1
77
+ )
78
+ self.norm2 = nn.GroupNorm(
79
+ num_groups=32, num_channels=out_channels, eps=1e-6, affine=True
80
+ )
81
+ self.conv2 = nn.Conv2d(
82
+ out_channels, out_channels, kernel_size=3, stride=1, padding=1
83
+ )
84
+ if self.in_channels != self.out_channels:
85
+ self.nin_shortcut = nn.Conv2d(
86
+ in_channels, out_channels, kernel_size=1, stride=1, padding=0
87
+ )
88
+
89
+ def forward(self, x):
90
+ h = x
91
+ h = self.norm1(h)
92
+ h = swish(h)
93
+ h = self.conv1(h)
94
+
95
+ h = self.norm2(h)
96
+ h = swish(h)
97
+ h = self.conv2(h)
98
+
99
+ if self.in_channels != self.out_channels:
100
+ x = self.nin_shortcut(x)
101
+
102
+ return x + h
103
+
104
+
105
+ class Downsample(nn.Module):
106
+ def __init__(self, in_channels: int):
107
+ super().__init__()
108
+ # no asymmetric padding in torch conv, must do it ourselves
109
+ self.conv = nn.Conv2d(
110
+ in_channels, in_channels, kernel_size=3, stride=2, padding=0
111
+ )
112
+
113
+ def forward(self, x: Tensor):
114
+ pad = (0, 1, 0, 1)
115
+ x = nn.functional.pad(x, pad, mode="constant", value=0)
116
+ x = self.conv(x)
117
+ return x
118
+
119
+
120
+ class Upsample(nn.Module):
121
+ def __init__(self, in_channels: int):
122
+ super().__init__()
123
+ self.conv = nn.Conv2d(
124
+ in_channels, in_channels, kernel_size=3, stride=1, padding=1
125
+ )
126
+
127
+ def forward(self, x: Tensor):
128
+ x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
129
+ x = self.conv(x)
130
+ return x
131
+
132
+
133
+ class Encoder(nn.Module):
134
+ def __init__(
135
+ self,
136
+ resolution: int,
137
+ in_channels: int,
138
+ ch: int,
139
+ ch_mult: list[int],
140
+ num_res_blocks: int,
141
+ z_channels: int,
142
+ ):
143
+ super().__init__()
144
+ self.ch = ch
145
+ self.num_resolutions = len(ch_mult)
146
+ self.num_res_blocks = num_res_blocks
147
+ self.resolution = resolution
148
+ self.in_channels = in_channels
149
+ # downsampling
150
+ self.conv_in = nn.Conv2d(
151
+ in_channels, self.ch, kernel_size=3, stride=1, padding=1
152
+ )
153
+
154
+ curr_res = resolution
155
+ in_ch_mult = (1,) + tuple(ch_mult)
156
+ self.in_ch_mult = in_ch_mult
157
+ self.down = nn.ModuleList()
158
+ block_in = self.ch
159
+ for i_level in range(self.num_resolutions):
160
+ block = nn.ModuleList()
161
+ attn = nn.ModuleList()
162
+ block_in = ch * in_ch_mult[i_level]
163
+ block_out = ch * ch_mult[i_level]
164
+ for _ in range(self.num_res_blocks):
165
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
166
+ block_in = block_out
167
+ down = nn.Module()
168
+ down.block = block
169
+ down.attn = attn
170
+ if i_level != self.num_resolutions - 1:
171
+ down.downsample = Downsample(block_in)
172
+ curr_res = curr_res // 2
173
+ self.down.append(down)
174
+
175
+ # middle
176
+ self.mid = nn.Module()
177
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
178
+ self.mid.attn_1 = AttnBlock(block_in)
179
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
180
+
181
+ # end
182
+ self.norm_out = nn.GroupNorm(
183
+ num_groups=32, num_channels=block_in, eps=1e-6, affine=True
184
+ )
185
+ self.conv_out = nn.Conv2d(
186
+ block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1
187
+ )
188
+
189
+ def forward(self, x: Tensor) -> Tensor:
190
+ # downsampling
191
+ hs = [self.conv_in(x)]
192
+ for i_level in range(self.num_resolutions):
193
+ for i_block in range(self.num_res_blocks):
194
+ h = self.down[i_level].block[i_block](hs[-1])
195
+ if len(self.down[i_level].attn) > 0:
196
+ h = self.down[i_level].attn[i_block](h)
197
+ hs.append(h)
198
+ if i_level != self.num_resolutions - 1:
199
+ hs.append(self.down[i_level].downsample(hs[-1]))
200
+
201
+ # middle
202
+ h = hs[-1]
203
+ h = self.mid.block_1(h)
204
+ h = self.mid.attn_1(h)
205
+ h = self.mid.block_2(h)
206
+ # end
207
+ h = self.norm_out(h)
208
+ h = swish(h)
209
+ h = self.conv_out(h)
210
+ return h
211
+
212
+
213
+ class Decoder(nn.Module):
214
+ def __init__(
215
+ self,
216
+ ch: int,
217
+ out_ch: int,
218
+ ch_mult: list[int],
219
+ num_res_blocks: int,
220
+ in_channels: int,
221
+ resolution: int,
222
+ z_channels: int,
223
+ ):
224
+ super().__init__()
225
+ self.ch = ch
226
+ self.num_resolutions = len(ch_mult)
227
+ self.num_res_blocks = num_res_blocks
228
+ self.resolution = resolution
229
+ self.in_channels = in_channels
230
+ self.ffactor = 2 ** (self.num_resolutions - 1)
231
+
232
+ # compute in_ch_mult, block_in and curr_res at lowest res
233
+ block_in = ch * ch_mult[self.num_resolutions - 1]
234
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
235
+ self.z_shape = (1, z_channels, curr_res, curr_res)
236
+
237
+ # z to block_in
238
+ self.conv_in = nn.Conv2d(
239
+ z_channels, block_in, kernel_size=3, stride=1, padding=1
240
+ )
241
+
242
+ # middle
243
+ self.mid = nn.Module()
244
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
245
+ self.mid.attn_1 = AttnBlock(block_in)
246
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
247
+
248
+ # upsampling
249
+ self.up = nn.ModuleList()
250
+ for i_level in reversed(range(self.num_resolutions)):
251
+ block = nn.ModuleList()
252
+ attn = nn.ModuleList()
253
+ block_out = ch * ch_mult[i_level]
254
+ for _ in range(self.num_res_blocks + 1):
255
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
256
+ block_in = block_out
257
+ up = nn.Module()
258
+ up.block = block
259
+ up.attn = attn
260
+ if i_level != 0:
261
+ up.upsample = Upsample(block_in)
262
+ curr_res = curr_res * 2
263
+ self.up.insert(0, up) # prepend to get consistent order
264
+
265
+ # end
266
+ self.norm_out = nn.GroupNorm(
267
+ num_groups=32, num_channels=block_in, eps=1e-6, affine=True
268
+ )
269
+ self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
270
+
271
+ def forward(self, z: Tensor) -> Tensor:
272
+ # get dtype for proper tracing
273
+ upscale_dtype = next(self.up.parameters()).dtype
274
+
275
+ # z to block_in
276
+ h = self.conv_in(z)
277
+
278
+ # middle
279
+ h = self.mid.block_1(h)
280
+ h = self.mid.attn_1(h)
281
+ h = self.mid.block_2(h)
282
+
283
+ # cast to proper dtype
284
+ h = h.to(upscale_dtype)
285
+ # upsampling
286
+ for i_level in reversed(range(self.num_resolutions)):
287
+ for i_block in range(self.num_res_blocks + 1):
288
+ h = self.up[i_level].block[i_block](h)
289
+ if len(self.up[i_level].attn) > 0:
290
+ h = self.up[i_level].attn[i_block](h)
291
+ if i_level != 0:
292
+ h = self.up[i_level].upsample(h)
293
+
294
+ # end
295
+ h = self.norm_out(h)
296
+ h = swish(h)
297
+ h = self.conv_out(h)
298
+ return h
299
+
300
+
301
+ class DiagonalGaussian(nn.Module):
302
+ def __init__(self, sample: bool = True, chunk_dim: int = 1):
303
+ super().__init__()
304
+ self.sample = sample
305
+ self.chunk_dim = chunk_dim
306
+
307
+ def forward(self, z: Tensor) -> Tensor:
308
+ mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
309
+ if self.sample:
310
+ std = torch.exp(0.5 * logvar)
311
+ return mean + std * torch.randn_like(mean)
312
+ else:
313
+ return mean
314
+
315
+
316
+ class AutoEncoder(nn.Module):
317
+ def __init__(self, params: AutoEncoderParams):
318
+ super().__init__()
319
+ self.params = params
320
+ self.encoder = Encoder(
321
+ resolution=params.resolution,
322
+ in_channels=params.in_channels,
323
+ ch=params.ch,
324
+ ch_mult=params.ch_mult,
325
+ num_res_blocks=params.num_res_blocks,
326
+ z_channels=params.z_channels,
327
+ )
328
+ self.decoder = Decoder(
329
+ resolution=params.resolution,
330
+ in_channels=params.in_channels,
331
+ ch=params.ch,
332
+ out_ch=params.out_ch,
333
+ ch_mult=params.ch_mult,
334
+ num_res_blocks=params.num_res_blocks,
335
+ z_channels=params.z_channels,
336
+ )
337
+ self.reg = DiagonalGaussian()
338
+
339
+ self.scale_factor = params.scale_factor
340
+ self.shift_factor = params.shift_factor
341
+
342
+ def encode(self, x: Tensor) -> Tensor:
343
+ z = self.reg(self.encoder(x))
344
+ z = self.scale_factor * (z - self.shift_factor)
345
+ return z
346
+
347
+ def decode(self, z: Tensor) -> Tensor:
348
+ z = z / self.scale_factor + self.shift_factor
349
+ return self.decoder(z)
350
+
351
+ def forward(self, x: Tensor) -> Tensor:
352
+ return self.decode(self.encode(x))
353
+
354
+
355
+ def load_ae(
356
+ ckpt_path: str,
357
+ autoencoder_params: AutoEncoderParams,
358
+ device: str | torch.device = "cuda",
359
+ dtype=torch.bfloat16,
360
+ ) -> AutoEncoder:
361
+ """
362
+ Load the autoencoder from the given checkpoint path.
363
+ Args:
364
+ ckpt_path (str): Path to the autoencoder checkpoint (a safetensors file).
+ autoencoder_params (AutoEncoderParams): Architecture parameters for the autoencoder.
365
+ device (str or torch.device): The device to load the autoencoder to.
366
+ Returns:
367
+ AutoEncoder: The loaded autoencoder.
368
+ """
369
+ # Loading the autoencoder
370
+ print("Init AE")
371
+ with torch.device(device):
372
+ ae = AutoEncoder(autoencoder_params)
373
+
374
+ if not os.path.exists(ckpt_path):
375
+ raise ValueError(
376
+ f"Autoencoder path {ckpt_path} does not exist. Please download it first."
377
+ )
378
+
379
+ if ckpt_path is not None:
380
+ sd = load_sft(ckpt_path, device=str(device))
381
+ missing, unexpected = ae.load_state_dict(sd, strict=False, assign=True)
382
+ if len(missing) > 0:
383
+ print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
384
+ if len(unexpected) > 0:
385
+ print(
386
+ f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected)
387
+ )
388
+ return ae.to(dtype=dtype)
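A minimal usage sketch for the autoencoder API above, assuming the default AutoEncoderParams and that ae.safetensors has already been downloaded (the image batch shape here is illustrative):

    # Sketch only: round-trip a batch of images through the Flux autoencoder.
    import torch

    from torchtitan.experiments.flux.model.autoencoder import AutoEncoderParams, load_ae

    ae = load_ae(
        ckpt_path="assets/autoencoder/ae.safetensors",  # assumed download location
        autoencoder_params=AutoEncoderParams(),
        device="cuda",
        dtype=torch.bfloat16,
    )
    images = torch.randn(2, 3, 256, 256, device="cuda", dtype=torch.bfloat16)
    latents = ae.encode(images)   # DiagonalGaussian sample, then scale/shift
    recon = ae.decode(latents)    # inverse shift/scale, then Decoder
    print(latents.shape, recon.shape)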
torchtitan/experiments/flux/model/hf_embedder.py ADDED
@@ -0,0 +1,40 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from torch import nn, Tensor
8
+ from transformers import CLIPTextModel, T5EncoderModel
9
+
10
+
11
+ class FluxEmbedder(nn.Module):
12
+ def __init__(self, version: str, **hf_kwargs):
13
+ super().__init__()
14
+ self.is_clip = version.startswith("openai")
15
+ self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
16
+
17
+ if self.is_clip:
18
+ self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(
19
+ version, **hf_kwargs
20
+ )
21
+ else:
22
+ self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(
23
+ version, **hf_kwargs
24
+ )
25
+
26
+ self.hf_module = self.hf_module.eval().requires_grad_(False)
27
+
28
+ def forward(self, batch_tokens: Tensor) -> Tensor:
29
+ """
30
+ batch_tokens: [bsz, embedding_length]
31
+
32
+ For the T5 encoder, embedding_length is 768
33
+ For CLIP, embedding_length is 256
34
+ """
35
+ outputs = self.hf_module(
36
+ input_ids=batch_tokens.to(self.hf_module.device),
37
+ attention_mask=None,
38
+ output_hidden_states=False,
39
+ )
40
+ return outputs[self.output_key]
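A short sketch of how FluxEmbedder is used for the two text encoders; the encoder names follow the debug config, and the token ids and lengths below are dummies:

    # Sketch only: pooled CLIP vector vs. per-token T5 hidden states.
    import torch

    from torchtitan.experiments.flux.model.hf_embedder import FluxEmbedder

    clip_encoder = FluxEmbedder(version="openai/clip-vit-large-patch14").to("cuda", dtype=torch.bfloat16)
    t5_encoder = FluxEmbedder(version="google/t5-v1_1-small").to("cuda", dtype=torch.bfloat16)

    clip_tokens = torch.randint(0, 1000, (2, 77), device="cuda")  # dummy token ids
    t5_tokens = torch.randint(0, 1000, (2, 512), device="cuda")

    clip_vec = clip_encoder(clip_tokens)  # "pooler_output": one vector per prompt
    t5_seq = t5_encoder(t5_tokens)        # "last_hidden_state": one vector per token
    print(clip_vec.shape, t5_seq.shape)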
torchtitan/experiments/flux/model/layers.py ADDED
@@ -0,0 +1,286 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # imported from black-forest-labs/FLUX
8
+ import math
9
+ from dataclasses import dataclass
10
+
11
+ import torch
12
+ from einops import rearrange
13
+ from torch import nn, Tensor
14
+
15
+ from torchtitan.experiments.flux.model.math import attention, rope
16
+
17
+
18
+ class EmbedND(nn.Module):
19
+ def __init__(self, dim: int, theta: int, axes_dim: list[int]):
20
+ super().__init__()
21
+ self.dim = dim
22
+ self.theta = theta
23
+ self.axes_dim = axes_dim
24
+
25
+ def forward(self, ids: Tensor) -> Tensor:
26
+ n_axes = ids.shape[-1]
27
+ emb = torch.cat(
28
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
29
+ dim=-3,
30
+ )
31
+
32
+ return emb.unsqueeze(1)
33
+
34
+
35
+ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
36
+ """
37
+ Create sinusoidal timestep embeddings.
38
+ :param t: a 1-D Tensor of N indices, one per batch element.
39
+ These may be fractional.
40
+ :param dim: the dimension of the output.
41
+ :param max_period: controls the minimum frequency of the embeddings.
42
+ :return: an (N, D) Tensor of positional embeddings.
43
+ """
44
+ t = time_factor * t
45
+ half = dim // 2
46
+ freqs = torch.exp(
47
+ -math.log(max_period)
48
+ * torch.arange(start=0, end=half, dtype=torch.float32)
49
+ / half
50
+ ).to(t.device)
51
+
52
+ args = t[:, None].float() * freqs[None]
53
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
54
+ if dim % 2:
55
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
56
+ if torch.is_floating_point(t):
57
+ embedding = embedding.to(t)
58
+ return embedding
59
+
60
+
61
+ class MLPEmbedder(nn.Module):
62
+ def __init__(self, in_dim: int, hidden_dim: int):
63
+ super().__init__()
64
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
65
+ self.silu = nn.SiLU()
66
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
67
+
68
+ def forward(self, x: Tensor) -> Tensor:
69
+ return self.out_layer(self.silu(self.in_layer(x)))
70
+
71
+
72
+ class RMSNorm(torch.nn.Module):
73
+ def __init__(self, dim: int):
74
+ super().__init__()
75
+ self.scale = nn.Parameter(torch.ones(dim))
76
+
77
+ def forward(self, x: Tensor):
78
+ x_dtype = x.dtype
79
+ x = x.float()
80
+ rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
81
+ return (x * rrms).to(dtype=x_dtype) * self.scale
82
+
83
+
84
+ class QKNorm(torch.nn.Module):
85
+ def __init__(self, dim: int):
86
+ super().__init__()
87
+ self.query_norm = RMSNorm(dim) # TODO(jianiw): switch to pytorch nn.RMSNorm
88
+ self.key_norm = RMSNorm(dim)
89
+
90
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
91
+ q = self.query_norm(q)
92
+ k = self.key_norm(k)
93
+ return q.to(v), k.to(v)
94
+
95
+
96
+ class SelfAttention(nn.Module):
97
+ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
98
+ super().__init__()
99
+ self.num_heads = num_heads
100
+ head_dim = dim // num_heads
101
+
102
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
103
+ self.norm = QKNorm(head_dim)
104
+ self.proj = nn.Linear(dim, dim)
105
+
106
+ def forward(self, x: Tensor, pe: Tensor) -> Tensor:
107
+ qkv = self.qkv(x)
108
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
109
+ q, k = self.norm(q, k, v)
110
+ x = attention(q, k, v, pe=pe)
111
+ x = self.proj(x)
112
+ return x
113
+
114
+
115
+ @dataclass
116
+ class ModulationOut:
117
+ shift: Tensor
118
+ scale: Tensor
119
+ gate: Tensor
120
+
121
+
122
+ class Modulation(nn.Module):
123
+ def __init__(self, dim: int, double: bool):
124
+ super().__init__()
125
+ self.is_double = double
126
+ self.multiplier = 6 if double else 3
127
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
128
+
129
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
130
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(
131
+ self.multiplier, dim=-1
132
+ )
133
+
134
+ return (
135
+ ModulationOut(*out[:3]),
136
+ ModulationOut(*out[3:]) if self.is_double else None,
137
+ )
138
+
139
+
140
+ class DoubleStreamBlock(nn.Module):
141
+ def __init__(
142
+ self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False
143
+ ):
144
+ super().__init__()
145
+
146
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
147
+ self.num_heads = num_heads
148
+ self.hidden_size = hidden_size
149
+ self.img_mod = Modulation(hidden_size, double=True)
150
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
151
+ self.img_attn = SelfAttention(
152
+ dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
153
+ )
154
+
155
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
156
+ self.img_mlp = nn.Sequential(
157
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
158
+ nn.GELU(approximate="tanh"),
159
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
160
+ )
161
+
162
+ self.txt_mod = Modulation(hidden_size, double=True)
163
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
164
+ self.txt_attn = SelfAttention(
165
+ dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias
166
+ )
167
+
168
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
169
+ self.txt_mlp = nn.Sequential(
170
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
171
+ nn.GELU(approximate="tanh"),
172
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
173
+ )
174
+
175
+ def forward(
176
+ self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor
177
+ ) -> tuple[Tensor, Tensor]:
178
+ img_mod1, img_mod2 = self.img_mod(vec)
179
+ txt_mod1, txt_mod2 = self.txt_mod(vec)
180
+
181
+ # prepare image for attention
182
+ img_modulated = self.img_norm1(img)
183
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
184
+ img_qkv = self.img_attn.qkv(img_modulated)
185
+ img_q, img_k, img_v = rearrange(
186
+ img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
187
+ )
188
+ img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
189
+
190
+ # prepare txt for attention
191
+ txt_modulated = self.txt_norm1(txt)
192
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
193
+ txt_qkv = self.txt_attn.qkv(txt_modulated)
194
+ txt_q, txt_k, txt_v = rearrange(
195
+ txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
196
+ )
197
+ txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
198
+
199
+ # run actual attention
200
+ q = torch.cat((txt_q, img_q), dim=2)
201
+ k = torch.cat((txt_k, img_k), dim=2)
202
+ v = torch.cat((txt_v, img_v), dim=2)
203
+
204
+ attn = attention(q, k, v, pe=pe)
205
+ txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]
206
+
207
+ # calculate the img blocks
208
+ img = img + img_mod1.gate * self.img_attn.proj(img_attn)
209
+ img = img + img_mod2.gate * self.img_mlp(
210
+ (1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift
211
+ )
212
+
213
+ # calculate the txt blocks
214
+ txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
215
+ txt = txt + txt_mod2.gate * self.txt_mlp(
216
+ (1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift
217
+ )
218
+ return img, txt
219
+
220
+
221
+ class SingleStreamBlock(nn.Module):
222
+ """
223
+ A DiT block with parallel linear layers as described in
224
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
225
+ """
226
+
227
+ def __init__(
228
+ self,
229
+ hidden_size: int,
230
+ num_heads: int,
231
+ mlp_ratio: float = 4.0,
232
+ qk_scale: float | None = None,
233
+ ):
234
+ super().__init__()
235
+ self.hidden_dim = hidden_size
236
+ self.num_heads = num_heads
237
+ head_dim = hidden_size // num_heads
238
+ self.scale = qk_scale or head_dim**-0.5
239
+
240
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
241
+ # qkv and mlp_in
242
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
243
+ # proj and mlp_out
244
+ self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
245
+
246
+ self.norm = QKNorm(head_dim)
247
+
248
+ self.hidden_size = hidden_size
249
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
250
+
251
+ self.mlp_act = nn.GELU(approximate="tanh")
252
+ self.modulation = Modulation(hidden_size, double=False)
253
+
254
+ def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
255
+ mod, _ = self.modulation(vec)
256
+ x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
257
+ qkv, mlp = torch.split(
258
+ self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1
259
+ )
260
+
261
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
262
+ q, k = self.norm(q, k, v)
263
+
264
+ # compute attention
265
+ attn = attention(q, k, v, pe=pe)
266
+ # compute activation in mlp stream, cat again and run second linear layer
267
+ output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
268
+ return x + mod.gate * output
269
+
270
+
271
+ class LastLayer(nn.Module):
272
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
273
+ super().__init__()
274
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
275
+ self.linear = nn.Linear(
276
+ hidden_size, patch_size * patch_size * out_channels, bias=True
277
+ )
278
+ self.adaLN_modulation = nn.Sequential(
279
+ nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
280
+ )
281
+
282
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
283
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
284
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
285
+ x = self.linear(x)
286
+ return x
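To make the modulation plumbing above concrete, a small sketch (sizes are illustrative) of how timestep_embedding, MLPEmbedder and Modulation compose into the shift/scale/gate applied inside the blocks:

    # Sketch only: build a conditioning vector and modulate a token sequence with it.
    import torch

    from torchtitan.experiments.flux.model.layers import (
        MLPEmbedder,
        Modulation,
        timestep_embedding,
    )

    hidden = 3072
    t = torch.rand(4)                                            # one timestep per sample
    vec = MLPEmbedder(256, hidden)(timestep_embedding(t, 256))   # (4, hidden)
    mod, _ = Modulation(hidden, double=False)(vec)               # shift/scale/gate, each (4, 1, hidden)
    x = torch.randn(4, 128, hidden)                              # (batch, tokens, hidden)
    x_mod = (1 + mod.scale) * x + mod.shift                      # same shaping used by the blocks
    print(vec.shape, x_mod.shape)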
torchtitan/experiments/flux/model/model.py ADDED
@@ -0,0 +1,177 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from dataclasses import dataclass, field
8
+
9
+ import torch
10
+
11
+ from torch import nn, Tensor
12
+ from torchtitan.components.tokenizer import Tokenizer
13
+ from torchtitan.config_manager import JobConfig
14
+
15
+ from torchtitan.experiments.flux.model.autoencoder import AutoEncoderParams
16
+ from torchtitan.experiments.flux.model.layers import (
17
+ DoubleStreamBlock,
18
+ EmbedND,
19
+ LastLayer,
20
+ MLPEmbedder,
21
+ SingleStreamBlock,
22
+ timestep_embedding,
23
+ )
24
+
25
+ from torchtitan.protocols.train_spec import BaseModelArgs, ModelProtocol
26
+ from torchtitan.tools.logging import logger
27
+
28
+
29
+ @dataclass
30
+ class FluxModelArgs(BaseModelArgs):
31
+ in_channels: int = 64
32
+ out_channels: int = 64
33
+ vec_in_dim: int = 768
34
+ context_in_dim: int = 512
35
+ hidden_size: int = 3072
36
+ mlp_ratio: float = 4.0
37
+ num_heads: int = 24
38
+ depth: int = 19
39
+ depth_single_blocks: int = 38
40
+ axes_dim: tuple = (16, 56, 56)
41
+ theta: int = 10_000
42
+ qkv_bias: bool = True
43
+ guidance_embed: bool = True
44
+ autoencoder_params: AutoEncoderParams = field(default_factory=AutoEncoderParams)
45
+
46
+ def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
47
+ # context_in_dim is the same as the T5 embedding dimension
48
+ self.context_in_dim = job_config.encoder.max_t5_encoding_len
49
+
50
+ def get_nparams_and_flops(self, model: nn.Module, seq_len: int) -> tuple[int, int]:
51
+ # TODO(jianiw): Add the number of flops for the autoencoder
52
+ nparams = sum(p.numel() for p in model.parameters())
53
+ logger.warning("FLUX model has not implemented get_nparams_and_flops() yet")
54
+ return nparams, 1
55
+
56
+
57
+ class FluxModel(nn.Module, ModelProtocol):
58
+ """
59
+ Transformer model for flow matching on sequences.
60
+
61
+ Args:
62
+ model_args: FluxModelArgs.
63
+
64
+ Attributes:
65
+ model_args (FluxModelArgs): Model configuration arguments.
66
+ """
67
+
68
+ def __init__(self, model_args: FluxModelArgs):
69
+ super().__init__()
70
+
71
+ self.model_args = model_args
72
+ self.in_channels = model_args.in_channels
73
+ self.out_channels = model_args.out_channels
74
+ if model_args.hidden_size % model_args.num_heads != 0:
75
+ raise ValueError(
76
+ f"Hidden size {model_args.hidden_size} must be divisible by num_heads {model_args.num_heads}"
77
+ )
78
+ pe_dim = model_args.hidden_size // model_args.num_heads
79
+ if sum(model_args.axes_dim) != pe_dim:
80
+ raise ValueError(
81
+ f"Got {model_args.axes_dim} but expected positional dim {pe_dim}"
82
+ )
83
+ self.hidden_size = model_args.hidden_size
84
+ self.num_heads = model_args.num_heads
85
+ self.pe_embedder = EmbedND(
86
+ dim=pe_dim, theta=model_args.theta, axes_dim=model_args.axes_dim
87
+ )
88
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
89
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
90
+ self.vector_in = MLPEmbedder(model_args.vec_in_dim, self.hidden_size)
91
+ self.guidance_in = (
92
+ MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
93
+ if model_args.guidance_embed
94
+ else nn.Identity()
95
+ )
96
+ self.txt_in = nn.Linear(model_args.context_in_dim, self.hidden_size)
97
+
98
+ self.double_blocks = nn.ModuleList(
99
+ [
100
+ DoubleStreamBlock(
101
+ self.hidden_size,
102
+ self.num_heads,
103
+ mlp_ratio=model_args.mlp_ratio,
104
+ qkv_bias=model_args.qkv_bias,
105
+ )
106
+ for _ in range(model_args.depth)
107
+ ]
108
+ )
109
+
110
+ self.single_blocks = nn.ModuleList(
111
+ [
112
+ SingleStreamBlock(
113
+ self.hidden_size, self.num_heads, mlp_ratio=model_args.mlp_ratio
114
+ )
115
+ for _ in range(model_args.depth_single_blocks)
116
+ ]
117
+ )
118
+
119
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
120
+
121
+ def init_weights(self, buffer_device=None):
122
+ # TODO(jianiw): replace placeholder with real weight init
123
+ for param in self.parameters():
124
+ param.data.uniform_(0, 0.1)
125
+
126
+ def forward(
127
+ self,
128
+ img: Tensor,
129
+ img_ids: Tensor,
130
+ txt: Tensor,
131
+ txt_ids: Tensor,
132
+ timesteps: Tensor,
133
+ y: Tensor,
134
+ guidance: Tensor | None = None,
135
+ ) -> Tensor:
136
+ if img.ndim != 3 or txt.ndim != 3:
137
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
138
+
139
+ # running on img sequences
140
+ img = self.img_in(img)
141
+ vec = self.time_in(timestep_embedding(timesteps, 256))
142
+ if self.model_args.guidance_embed:
143
+ if guidance is None:
144
+ raise ValueError(
145
+ "Didn't get guidance strength for guidance distilled model."
146
+ )
147
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
148
+ vec = vec + self.vector_in(y)
149
+ txt = self.txt_in(txt)
150
+
151
+ ids = torch.cat((txt_ids, img_ids), dim=1)
152
+ pe = self.pe_embedder(ids)
153
+
154
+ for block in self.double_blocks:
155
+ img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
156
+
157
+ img = torch.cat((txt, img), 1)
158
+ for block in self.single_blocks:
159
+ img = block(img, vec=vec, pe=pe)
160
+ img = img[:, txt.shape[1] :, ...]
161
+
162
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
163
+ return img
164
+
165
+ @classmethod
166
+ def from_model_args(cls, model_args: FluxModelArgs) -> "FluxModel":
167
+ """
168
+ Initialize a Flux model from a FluxModelArgs object.
169
+
170
+ Args:
171
+ model_args (FluxModelArgs): Model configuration arguments.
172
+
173
+ Returns:
174
+ FluxModel: FluxModel model.
175
+
176
+ """
177
+ return cls(model_args)
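A toy forward pass showing the expected input shapes for FluxModel; the configuration below is deliberately tiny and is not the real flux-dev setup:

    # Sketch only: run FluxModel end to end on random inputs to check shapes.
    import torch

    from torchtitan.experiments.flux.model.model import FluxModel, FluxModelArgs

    args = FluxModelArgs(
        in_channels=16, out_channels=16, vec_in_dim=8, context_in_dim=32,
        hidden_size=256, num_heads=4, depth=1, depth_single_blocks=1,
        axes_dim=(16, 24, 24),  # must sum to hidden_size // num_heads = 64
    )
    model = FluxModel(args)

    bsz, img_len, txt_len = 2, 64, 8
    out = model(
        img=torch.randn(bsz, img_len, 16),      # packed latent patches
        img_ids=torch.zeros(bsz, img_len, 3),   # 3-d position ids
        txt=torch.randn(bsz, txt_len, 32),      # T5-style context
        txt_ids=torch.zeros(bsz, txt_len, 3),
        timesteps=torch.rand(bsz),
        y=torch.randn(bsz, 8),                  # CLIP-style pooled vector
        guidance=torch.full((bsz,), 4.0),       # required since guidance_embed=True
    )
    print(out.shape)  # (bsz, img_len, out_channels)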
torchtitan/experiments/flux/parallelize_flux.py ADDED
@@ -0,0 +1,26 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # This file applies the PT-D parallelisms (except pipeline parallelism) and various
8
+ # training techniques (e.g. activation checkpointing and compile) to the Flux model.
9
+
10
+
11
+ import torch.nn as nn
12
+
13
+ from torch.distributed.device_mesh import DeviceMesh
14
+
15
+ from torchtitan.config_manager import JobConfig
16
+ from torchtitan.distributed import ParallelDims
17
+
18
+
19
+ def parallelize_flux(
20
+ model: nn.Module,
21
+ world_mesh: DeviceMesh,
22
+ parallel_dims: ParallelDims,
23
+ job_config: JobConfig,
24
+ ):
25
+ # TODO: Add model parallel strategy here
26
+ return model
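Since parallelize_flux is still a stub, here is a purely hypothetical sketch of one way the TODO might be filled in, using the composable FSDP2 API from PyTorch; this is not the project's actual strategy, and the helper name and dp_mesh argument are made up:

    # Hypothetical sketch only: shard each transformer block, then the root module.
    from torch.distributed._composable.fsdp import fully_shard

    def _apply_fsdp_sketch(model: nn.Module, dp_mesh: DeviceMesh) -> nn.Module:
        # dp_mesh would be the data-parallel slice of world_mesh in a real setup
        for block in list(model.double_blocks) + list(model.single_blocks):
            fully_shard(block, mesh=dp_mesh)
        fully_shard(model, mesh=dp_mesh)
        return model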
torchtitan/experiments/flux/requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ transformers
2
+ einops
torchtitan/experiments/flux/tests/test_flux_dataloader.py ADDED
@@ -0,0 +1,103 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import sys
8
+
9
+ from torchtitan.config_manager import JobConfig
10
+ from torchtitan.experiments.flux.dataset.flux_dataset import build_flux_dataloader
11
+ from torchtitan.tools.profiling import (
12
+ maybe_enable_memory_snapshot,
13
+ maybe_enable_profiling,
14
+ )
15
+
16
+
17
+ class TestFluxDataLoader:
18
+ def test_flux_dataloader(self):
19
+ dataset_name = "cc12m"
20
+ batch_size = 32
21
+ world_size = 4
22
+ rank = 0
23
+
24
+ num_steps = 10
25
+
26
+ path = "torchtitan.experiments.flux.flux_argparser"
27
+ sys.argv.append(f"--experimental.custom_args_module={path}")
28
+ config = JobConfig()
29
+ config.maybe_add_custom_args()
30
+ config.parse_args(
31
+ [
32
+ # Profiling options
33
+ # "--profiling.enable_profiling",
34
+ # "--profiling.profile_freq",
35
+ # "5",
36
+ # "--profiling.enable_memory_snapshot",
37
+ # "--profiling.save_memory_snapshot_folder",
38
+ # "memory_snapshot_flux",
39
+ "--training.dataset",
40
+ dataset_name,
41
+ "--training.batch_size",
42
+ str(batch_size),
43
+ "--encoder.t5_encoder",
44
+ "google/t5-v1_1-small",
45
+ "--encoder.clip_encoder",
46
+ "openai/clip-vit-large-patch14",
47
+ "--encoder.max_t5_encoding_len",
48
+ "512",
49
+ ]
50
+ )
51
+
52
+ with maybe_enable_profiling(
53
+ config, global_step=0
54
+ ) as torch_profiler, maybe_enable_memory_snapshot(
55
+ config, global_step=0
56
+ ) as memory_profiler:
57
+ dl = self._build_dataloader(
58
+ config,
59
+ world_size,
60
+ rank,
61
+ )
62
+ dl = iter(dl)
63
+
64
+ for i in range(0, num_steps):
65
+ input_data, labels = next(dl)
66
+ print(f"Step {i} image size: {labels.shape}")
67
+ if torch_profiler:
68
+ torch_profiler.step()
69
+ if memory_profiler:
70
+ memory_profiler.step()
71
+
72
+ print(len(input_data["clip_tokens"]))
73
+ for k, v in input_data.items():
74
+ print(f"Step {i} {k} value: {type(v), v.shape}")
75
+
76
+ assert len(input_data) == 2 # (clip_tokens, t5_tokens)
77
+ assert labels.shape == (batch_size, 3, 256, 256)
78
+ # assert input_data["clip_tokens"].shape[0] == batch_size
79
+ # assert input_data["t5_tokens"].shape == (batch_size, 512, 512)
80
+
81
+ if torch_profiler:
82
+ torch_profiler.step()
83
+ if memory_profiler:
84
+ memory_profiler.step(exit_ctx=True)
85
+
86
+ def test_preprocess(self):
87
+ # TODO
88
+ pass
89
+
90
+ def _build_dataloader(
91
+ self,
92
+ job_config,
93
+ world_size,
94
+ rank,
95
+ ):
96
+
97
+ return build_flux_dataloader(
98
+ dp_world_size=world_size,
99
+ dp_rank=rank,
100
+ job_config=job_config,
101
+ tokenizer=None,
102
+ infinite=False,
103
+ )
torchtitan/experiments/flux/tests/test_generate_image.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ import os
9
+ import time
10
+ from typing import Callable
11
+
12
+ import torch
13
+ from einops import rearrange
14
+
15
+ from PIL import ExifTags, Image
16
+
17
+ from torch import Tensor
18
+
19
+ from torchtitan.experiments.flux.dataset.tokenizer import FluxTokenizer
20
+
21
+ from torchtitan.experiments.flux.model.autoencoder import (
22
+ AutoEncoder,
23
+ AutoEncoderParams,
24
+ load_ae,
25
+ )
26
+ from torchtitan.experiments.flux.model.hf_embedder import FluxEmbedder
27
+
28
+ from torchtitan.experiments.flux.model.model import FluxModel, FluxModelArgs
29
+ from torchtitan.experiments.flux.utils import (
30
+ create_position_encoding_for_latents,
31
+ generate_noise_latent,
32
+ pack_latents,
33
+ preprocess_flux_data,
34
+ unpack_latents,
35
+ )
36
+
37
+
38
+ def time_shift(mu: float, sigma: float, t: Tensor):
39
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
40
+
41
+
42
+ def get_lin_function(
43
+ x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
44
+ ) -> Callable[[float], float]:
45
+ m = (y2 - y1) / (x2 - x1)
46
+ b = y1 - m * x1
47
+ return lambda x: m * x + b
48
+
49
+
50
+ def get_schedule(
51
+ num_steps: int,
52
+ image_seq_len: int,
53
+ base_shift: float = 0.5,
54
+ max_shift: float = 1.15,
55
+ shift: bool = True,
56
+ ) -> list[float]:
57
+ # extra step for zero
58
+ timesteps = torch.linspace(1, 0, num_steps + 1)
59
+
60
+ # shifting the schedule to favor high timesteps for higher signal images
61
+ if shift:
62
+ # estimate mu based on linear estimation between two points
63
+ mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
64
+ timesteps = time_shift(mu, 1.0, timesteps)
65
+
66
+ return timesteps.tolist()
67
+
68
+
69
+ class TestGenerateImage:
70
+ def test_generate_image(self):
71
+ """
72
+ Run a forward pass of flux model to generate an image.
73
+ """
74
+ name = "flux-dev"
75
+ img_width = 512
76
+ img_height = 512
77
+ seed = None
78
+ prompt = (
79
+ "a photo of a forest with mist swirling around the tree trunks. The word "
80
+ '"FLUX" is painted over it in big, red brush strokes with visible texture'
81
+ )
82
+ device = "cuda"
83
+ num_steps = None
84
+ loop = False
85
+ guidance = 3.5
86
+ output_dir = "output"
87
+ add_sampling_metadata = True
88
+
89
+ prompt = prompt.split("|")
90
+ if len(prompt) == 1:
91
+ prompt = prompt[0]
92
+ additional_prompts = None
93
+ else:
94
+ additional_prompts = prompt[1:]
95
+ prompt = prompt[0]
96
+
97
+ assert not (
98
+ (additional_prompts is not None) and loop
99
+ ), "Do not provide additional prompts and set loop to True"
100
+
101
+ torch_device = torch.device(device)
102
+ if num_steps is None:
103
+ num_steps = 30
104
+
105
+ # allow for packing and conversion to latent space
106
+ img_height = 16 * (img_height // 16)
107
+ img_width = 16 * (img_width // 16)
108
+
109
+ # init all components
110
+ model = FluxModel(FluxModelArgs()).to(device=torch_device, dtype=torch.bfloat16)
111
+
112
+ ae = load_ae(
113
+ ckpt_path="assets/autoencoder/ae.safetensors",
114
+ autoencoder_params=AutoEncoderParams(),
115
+ device=torch_device,
116
+ dtype=torch.bfloat16,
117
+ )
118
+ clip_tokenizer = FluxTokenizer(
119
+ model_path="openai/clip-vit-large-patch14", max_length=77
120
+ )
121
+ t5_tokenizer = FluxTokenizer(model_path="google/t5-v1_1-small", max_length=512)
122
+ clip_encoder = FluxEmbedder(version="openai/clip-vit-large-patch14").to(
123
+ torch_device, dtype=torch.bfloat16
124
+ )
125
+ t5_encoder = FluxEmbedder(version="google/t5-v1_1-small").to(
126
+ torch_device, dtype=torch.bfloat16
127
+ )
128
+
129
+ rng = torch.Generator(device="cpu")
130
+
131
+ if seed is None:
132
+ seed = rng.seed()
133
+ print(f"Generating with seed {seed}:\n{prompt}")
134
+ t0 = time.perf_counter()
135
+ output_name = os.path.join(output_dir, f"img_{seed}.jpg")
136
+
137
+ # Tokenize the prompt, on CPU
138
+ clip_tokens = clip_tokenizer.encode(prompt)
139
+ t5_tokens = t5_tokenizer.encode(prompt)
140
+
141
+ batch = preprocess_flux_data(
142
+ device=torch_device,
143
+ dtype=torch.bfloat16,
144
+ autoencoder=None,
145
+ clip_encoder=clip_encoder,
146
+ t5_encoder=t5_encoder,
147
+ batch={
148
+ "clip_tokens": clip_tokens,
149
+ "t5_tokens": t5_tokens,
150
+ },
151
+ )
152
+
153
+ img = self._generate_images(
154
+ device=torch_device,
155
+ dtype=torch.bfloat16,
156
+ model=model,
157
+ decoder=ae,
158
+ img_width=img_width,
159
+ img_height=img_height,
160
+ denoising_steps=num_steps,
161
+ seed=seed,
162
+ clip_encodings=batch["clip_encodings"],
163
+ t5_encodings=batch["t5_encodings"],
164
+ guidance=guidance,
165
+ )
166
+
167
+ if torch.cuda.is_available():
168
+ torch.cuda.synchronize()
169
+ t1 = time.perf_counter()
170
+
171
+ print(f"Done in {t1 - t0:.1f}s.")
172
+
173
+ self._save_image(name, output_name, img, add_sampling_metadata, prompt)
174
+
175
+ def _generate_images(
176
+ self,
177
+ device: torch.device,
178
+ dtype: torch.dtype,
179
+ model: FluxModel,
180
+ decoder: AutoEncoder,
181
+ # image params:
182
+ img_width: int,
183
+ img_height: int,
184
+ # sampling params:
185
+ denoising_steps: int,
186
+ seed: int,
187
+ clip_encodings: torch.Tensor,
188
+ t5_encodings: torch.Tensor,
189
+ guidance: float = 4.0,
190
+ ):
191
+
192
+ bsz = clip_encodings.shape[0]
193
+ latents = generate_noise_latent(bsz, img_height, img_width, device, dtype, seed)
194
+ _, latent_channels, latent_height, latent_width = latents.shape
195
+
196
+ # create denoising schedule
197
+ timesteps = get_schedule(denoising_steps, latent_channels, shift=True)
198
+
199
+ # create positional encodings
200
+ POSITION_DIM = 3 # constant for Flux flow model
201
+ latent_pos_enc = create_position_encoding_for_latents(
202
+ bsz, latent_height, latent_width, POSITION_DIM
203
+ ).to(latents)
204
+ text_pos_enc = torch.zeros(bsz, t5_encodings.shape[1], POSITION_DIM).to(latents)
205
+
206
+ # convert img-like latents into sequences of patches
207
+ latents = pack_latents(latents)
208
+
209
+ # this is ignored for schnell
210
+ guidance_vec = torch.full((bsz,), guidance, device=device, dtype=dtype)
211
+ for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
212
+ t_vec = torch.full((bsz,), t_curr, dtype=dtype, device=device)
213
+ pred = model(
214
+ img=latents,
215
+ img_ids=latent_pos_enc,
216
+ txt=t5_encodings,
217
+ txt_ids=text_pos_enc,
218
+ y=clip_encodings,
219
+ timesteps=t_vec,
220
+ guidance=guidance_vec,
221
+ )
222
+
223
+ latents = latents + (t_prev - t_curr) * pred
224
+
225
+ # convert sequences of patches into img-like latents
226
+ latents = unpack_latents(latents, latent_height, latent_width)
227
+
228
+ img = decoder.decode(latents)
229
+ return img
230
+
231
+ def _save_image(
232
+ self,
233
+ name: str,
234
+ output_name: str,
235
+ x: torch.Tensor,
236
+ add_sampling_metadata: bool,
237
+ prompt: str,
238
+ ):
239
+ print(f"Saving {output_name}")
240
+ # bring into PIL format and save
241
+ x = x.clamp(-1, 1)
242
+ x = rearrange(x[0], "c h w -> h w c")
243
+
244
+ img = Image.fromarray((127.5 * (x + 1.0)).cpu().byte().numpy())
245
+
246
+ exif_data = Image.Exif()
247
+ exif_data[ExifTags.Base.Software] = "AI generated;txt2img;flux"
248
+ exif_data[ExifTags.Base.Make] = "Black Forest Labs"
249
+ exif_data[ExifTags.Base.Model] = name
250
+ if add_sampling_metadata:
251
+ exif_data[ExifTags.Base.ImageDescription] = prompt
252
+ img.save(output_name, exif=exif_data, quality=95, subsampling=0)
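For orientation, a tiny sketch of the schedule helpers defined at the top of this test (assuming get_schedule is in scope):

    # Sketch only: inspect the shifted schedule that drives the Euler loop above.
    schedule = get_schedule(num_steps=4, image_seq_len=256, shift=True)
    print(schedule)  # num_steps + 1 values, monotonically decreasing from 1.0 to 0.0
    # _generate_images then steps with: latents = latents + (t_prev - t_curr) * pred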
torchtitan/experiments/flux/train_configs/debug_model.toml ADDED
@@ -0,0 +1,68 @@
1
+
2
+ [job]
3
+ dump_folder = "./outputs"
4
+ description = "Flux debug model"
5
+ print_args = false
6
+ use_for_integration_test = true
7
+
8
+ [profiling]
9
+ enable_profiling = false
10
+ save_traces_folder = "profile_trace"
11
+ profile_freq = 10
12
+ enable_memory_snapshot = false
13
+ save_memory_snapshot_folder = "memory_snapshot"
14
+
15
+ [metrics]
16
+ log_freq = 1
17
+ disable_color_printing = false
18
+ enable_tensorboard = false
19
+ save_tb_folder = "tb"
20
+ enable_wandb = false
21
+
22
+ [model]
23
+ name = "flux"
24
+ flavor = "flux-debug"
25
+ norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm
26
+ # test tokenizer.model, for debug purpose only
27
+ # tokenizer_path = "./tests/assets/test_tiktoken.model"
28
+ # converters = "float8"
29
+
30
+
31
+ [optimizer]
32
+ name = "AdamW"
33
+ lr = 8e-4
34
+ eps = 1e-8
35
+
36
+ [lr_scheduler]
37
+ warmup_steps = 2 # lr scheduler warm up, normally 20% of the train steps
38
+ decay_ratio = 0.8 # lr scheduler decay ratio, 80% of the train steps
39
+ decay_type = "linear"
40
+ lr_min = 0.0
41
+
42
+ [training]
43
+ batch_size = 32
44
+ seq_len = 512
45
+ max_norm = 1.0 # grad norm clipping
46
+ steps = 10
47
+ compile = false
48
+ dataset = "cc12m"
49
+ guidance = 3.5
50
+ seed = 0
51
+
52
+ [encoder]
53
+ t5_encoder="google/t5-v1_1-small"
54
+ clip_encoder="openai/clip-vit-large-patch14"
55
+ max_t5_encoding_len=512
56
+ auto_encoder_path="torchtitan/experiments/flux/assets/autoencoder/ae.safetensors" # Autoencoder to use for image
57
+
58
+ [parallelism]
59
+ data_parallel_replicate_degree = 1
60
+ data_parallel_shard_degree = 1
61
+ fsdp_reshard_after_forward = "default" # default / never / always
62
+ tensor_parallel_degree = 1
63
+ enable_async_tensor_parallel = false
64
+ pipeline_parallel_degree = 1
65
+ context_parallel_degree = 1
66
+
67
+ [experimental]
68
+ custom_args_module = "torchtitan.experiments.flux.flux_argparser"
torchtitan/experiments/kernels/triton_mg_group_gemm/benchmark.py ADDED
@@ -0,0 +1,630 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
8
+ # All rights reserved.
9
+ #
10
+ # Benchmark comparing reference PyTorch vs optimized M*G group GEMM implementation
11
+
12
+ import argparse
13
+ import logging
14
+ import time
15
+
16
+ # from typing import Dict, List, Optional, Tuple
17
+
18
+ import matplotlib.pyplot as plt
19
+ import numpy as np
20
+ import torch
21
+ import triton
22
+
23
+ # import triton.language as tl
24
+
25
+ # Configure logging
26
+ logging.basicConfig(
27
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
28
+ )
29
+
30
+ # Try to import the optimized implementations
31
+ try:
32
+ from torchao_pr.mg_grouped_gemm import grouped_gemm_forward
33
+
34
+ except ImportError:
35
+ logging.error(
36
+ "Error importing MG grouped GEMM modules. Make sure the implementation files are in the correct path."
37
+ )
38
+ raise
39
+
40
+
41
+ def compute_reference_forward(x, w, m_sizes):
42
+ """
43
+ Reference PyTorch implementation of M*G grouped GEMM forward pass.
44
+
45
+ Args:
46
+ x (torch.Tensor): Input tensor of shape (M, K)
47
+ w (torch.Tensor): Weight tensor of shape (N, K)
48
+ m_sizes (torch.Tensor): Group sizes tensor of shape (G)
49
+
50
+ Returns:
51
+ torch.Tensor: Output tensor of shape (M, N)
52
+ """
53
+ result = torch.zeros((x.shape[0], w.shape[0]), dtype=x.dtype, device=x.device)
54
+
55
+ m_start = 0
56
+ for g in range(len(m_sizes)):
57
+ m_size = m_sizes[g].item()
58
+ if m_size > 0:
59
+ m_end = m_start + m_size
60
+
61
+ # Extract group input
62
+ x_g = x[m_start:m_end]
63
+
64
+ # Compute group output
65
+ y_g = torch.matmul(x_g, w.T)
66
+
67
+ # Store result
68
+ result[m_start:m_end] = y_g
69
+
70
+ # Update start index
71
+ m_start = m_end
72
+
73
+ return result
74
+
75
+
76
+ @triton.testing.perf_report(
77
+ triton.testing.Benchmark(
78
+ x_names=["N"], # We'll vary the output dimension
79
+ x_vals=[1024, 2048, 4096, 8192, 16384], # Different output dimensions to test
80
+ # x_vals=[8192, 16384],
81
+ line_arg="provider", # We'll compare different providers
82
+ line_vals=["pytorch_reference", "M*G grouped GEMM"],
83
+ line_names=["PyTorch Reference", "M*G grouped Kernel"],
84
+ styles=[("blue", "-"), ("red", "-")],
85
+ ylabel="TFLOPS", # We'll measure TFLOPS
86
+ plot_name="mg_grouped_gemm_comparison",
87
+ args={
88
+ "M": 8192, # Batch dimension, fixed for all tests
89
+ "K": 7168, # Hidden dimension, fixed for all tests
90
+ "G": 8, # Number of groups
91
+ "dtype": torch.float16,
92
+ "device": "cuda",
93
+ },
94
+ )
95
+ )
96
+ def benchmark_forward(M, K, N, G, provider, dtype=torch.float16, device="cuda"):
97
+ """
98
+ Benchmark the forward pass of the grouped GEMM implementation.
99
+
100
+ Args:
101
+ M (int): Total batch size dimension
102
+ K (int): Hidden dimension
103
+ N (int): Output dimension
104
+ G (int): Number of groups
105
+ provider (str): Provider to use ('pytorch_reference' or 'M*G grouped GEMM')
106
+ dtype (torch.dtype): Data type to use
107
+ device (str): Device to use
108
+
109
+ Returns:
110
+ float: Performance in TFLOPS
111
+ """
112
+ # Create group sizes for M dimension (balanced across groups)
113
+ base_size = M // G
114
+ remainder = M % G
115
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
116
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
117
+
118
+ print(f"N: {N}, M: {M}, K: {K}, G: {G}, dtype: {dtype}, device: {device}")
119
+
120
+ # Create input and weight tensors
121
+ x = torch.randn(M, K, dtype=dtype, device=device)
122
+ w = torch.randn(N, K, dtype=dtype, device=device)
123
+
124
+ # Pre-compute for PyTorch reference to ensure fair comparison
125
+ if provider == "pytorch_reference":
126
+ # Warmup
127
+ torch.cuda.synchronize()
128
+ compute_reference_forward(x, w, m_sizes)
129
+ torch.cuda.synchronize()
130
+
131
+ # Benchmark
132
+ start_time = time.time()
133
+ for _ in range(10): # Average over 10 runs
134
+ compute_reference_forward(x, w, m_sizes)
135
+ torch.cuda.synchronize()
136
+ end_time = time.time()
137
+ else: # Optimized kernel
138
+ # Warmup
139
+ torch.cuda.synchronize()
140
+ grouped_gemm_forward(x, w, m_sizes)
141
+ torch.cuda.synchronize()
142
+
143
+ # Benchmark
144
+ start_time = time.time()
145
+ for _ in range(10): # Average over 10 runs
146
+ grouped_gemm_forward(x, w, m_sizes)
147
+ torch.cuda.synchronize()
148
+ end_time = time.time()
149
+
150
+ # Calculate FLOPs
151
+ # For GEMM: 2 * M * N * K FLOPs (multiply-add counts as 2 FLOPs)
152
+ flops = 2 * M * N * K
153
+
154
+ # Convert to TFLOPS (tera-FLOPS)
155
+ avg_time = (end_time - start_time) / 10 # Average time per run
156
+ tflops = flops / avg_time / 1e12
157
+
158
+ return tflops
159
+
160
+
161
+ @triton.testing.perf_report(
162
+ triton.testing.Benchmark(
163
+ x_names=["G"], # We'll vary the number of groups
164
+ x_vals=[1, 2, 4, 8, 16], # Different numbers of groups to test
165
+ line_arg="provider", # We'll compare different providers
166
+ line_vals=["pytorch_reference", "optimized_kernel"],
167
+ line_names=["PyTorch Reference", "Optimized Kernel"],
168
+ styles=[("blue", "-"), ("red", "-")],
169
+ ylabel="TFLOPS", # We'll measure TFLOPS
170
+ plot_name="mg_grouped_gemm_group_scaling",
171
+ args={
172
+ "M": 8192, # Batch dimension, fixed for all tests
173
+ "K": 4096, # Hidden dimension, fixed for all tests
174
+ "N": 8192, # Output dimension, fixed for all tests
175
+ "dtype": torch.float16,
176
+ "device": "cuda",
177
+ },
178
+ )
179
+ )
180
+ def benchmark_forward_groups(M, K, N, G, provider, dtype=torch.float16, device="cuda"):
181
+ """
182
+ Benchmark how performance scales with number of groups.
183
+
184
+ Args:
185
+ M (int): Total batch size dimension
186
+ K (int): Hidden dimension
187
+ N (int): Output dimension
188
+ G (int): Number of groups
189
+ provider (str): Provider to use ('pytorch_reference' or 'optimized_kernel')
190
+ dtype (torch.dtype): Data type to use
191
+ device (str): Device to use
192
+
193
+ Returns:
194
+ float: Performance in TFLOPS
195
+ """
196
+ # Create group sizes for M dimension (balanced across groups)
197
+ base_size = M // G
198
+ remainder = M % G
199
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
200
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
201
+
202
+ # Create input and weight tensors
203
+ x = torch.randn(M, K, dtype=dtype, device=device)
204
+ w = torch.randn(N, K, dtype=dtype, device=device)
205
+
206
+ # Benchmark logic - same as previous function
207
+ if provider == "pytorch_reference":
208
+ torch.cuda.synchronize()
209
+ compute_reference_forward(x, w, m_sizes)
210
+ torch.cuda.synchronize()
211
+
212
+ start_time = time.time()
213
+ for _ in range(10):
214
+ compute_reference_forward(x, w, m_sizes)
215
+ torch.cuda.synchronize()
216
+ end_time = time.time()
217
+ else:
218
+ torch.cuda.synchronize()
219
+ grouped_gemm_forward(x, w, m_sizes)
220
+ torch.cuda.synchronize()
221
+
222
+ start_time = time.time()
223
+ for _ in range(10):
224
+ grouped_gemm_forward(x, w, m_sizes)
225
+ torch.cuda.synchronize()
226
+ end_time = time.time()
227
+
228
+ # Calculate FLOPs and TFLOPS
229
+ flops = 2 * M * N * K
230
+ avg_time = (end_time - start_time) / 10
231
+ tflops = flops / avg_time / 1e12
232
+
233
+ return tflops
234
+
235
+
236
+ @triton.testing.perf_report(
237
+ triton.testing.Benchmark(
238
+ x_names=["group_balance"], # We'll vary the group balance factor
239
+ x_vals=[
240
+ 0.0,
241
+ 0.25,
242
+ 0.5,
243
+ 0.75,
244
+ 0.9,
245
+ ], # Different imbalance factors (0 = balanced, 1 = max imbalance)
246
+ line_arg="provider", # We'll compare different providers
247
+ line_vals=["pytorch_reference", "optimized_kernel"],
248
+ line_names=["PyTorch Reference", "Optimized Kernel"],
249
+ styles=[("blue", "-"), ("red", "-")],
250
+ ylabel="TFLOPS", # We'll measure TFLOPS
251
+ plot_name="mg_grouped_gemm_imbalance",
252
+ args={
253
+ "M": 8192, # Batch dimension, fixed for all tests
254
+ "K": 4096, # Hidden dimension, fixed for all tests
255
+ "N": 8192, # Output dimension, fixed for all tests
256
+ "G": 4, # Number of groups
257
+ "dtype": torch.float16,
258
+ "device": "cuda",
259
+ },
260
+ )
261
+ )
262
+ def benchmark_imbalance(
263
+ M, K, N, G, group_balance, provider, dtype=torch.float16, device="cuda"
264
+ ):
265
+ """
266
+ Benchmark how performance is affected by imbalanced group sizes.
267
+
268
+ Args:
269
+ M (int): Total batch size dimension
270
+ K (int): Hidden dimension
271
+ N (int): Output dimension
272
+ G (int): Number of groups
273
+ group_balance (float): Balance factor from 0 to 1 (0 = balanced, 1 = max imbalance)
274
+ provider (str): Provider to use ('pytorch_reference' or 'optimized_kernel')
275
+ dtype (torch.dtype): Data type to use
276
+ device (str): Device to use
277
+
278
+ Returns:
279
+ float: Performance in TFLOPS
280
+ """
281
+ # Create imbalanced group sizes for M dimension
282
+ if group_balance == 0:
283
+ # Balanced case
284
+ base_size = M // G
285
+ remainder = M % G
286
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
287
+ else:
288
+ # Imbalanced case
289
+ # First group gets more elements, last group gets fewer
290
+ # The imbalance is controlled by the group_balance factor
291
+ remaining = M
292
+ M_sizes = []
293
+ for g in range(G):
294
+ # Interpolate from balanced to imbalanced based on group_balance
295
+ # For balanced (group_balance=0), each group gets M/G
296
+ # For imbalanced (group_balance=1), first group gets much more than last group
297
+ balanced_size = remaining // (G - g)
298
+
299
+ # Adjusting size based on position and imbalance factor
300
+ # First groups get more, last groups get less
301
+ if g < G // 2:
302
+ # First half of groups get more
303
+ adjustment = int(balanced_size * group_balance * (1 - g / (G - 1)))
304
+ size = balanced_size + adjustment
305
+ else:
306
+ # Second half of groups get less
307
+ adjustment = int(balanced_size * group_balance * ((g / (G - 1)) - 0.5))
308
+ size = balanced_size - adjustment
309
+
310
+ # Ensure we don't go below 1 or take more than remaining
311
+ size = max(1, min(size, remaining))
312
+ M_sizes.append(size)
313
+ remaining -= size
314
+
315
+ # Handle any remaining elements
316
+ if remaining > 0:
317
+ M_sizes[-1] += remaining
318
+
319
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
320
+
321
+ # Create input and weight tensors
322
+ x = torch.randn(M, K, dtype=dtype, device=device)
323
+ w = torch.randn(N, K, dtype=dtype, device=device)
324
+
325
+ # Benchmark logic
326
+ if provider == "pytorch_reference":
327
+ torch.cuda.synchronize()
328
+ compute_reference_forward(x, w, m_sizes)
329
+ torch.cuda.synchronize()
330
+
331
+ start_time = time.time()
332
+ for _ in range(10):
333
+ compute_reference_forward(x, w, m_sizes)
334
+ torch.cuda.synchronize()
335
+ end_time = time.time()
336
+ else:
337
+ torch.cuda.synchronize()
338
+ grouped_gemm_forward(x, w, m_sizes)
339
+ torch.cuda.synchronize()
340
+
341
+ start_time = time.time()
342
+ for _ in range(10):
343
+ grouped_gemm_forward(x, w, m_sizes)
344
+ torch.cuda.synchronize()
345
+ end_time = time.time()
346
+
347
+ # Calculate FLOPs and TFLOPS
348
+ flops = 2 * M * N * K
349
+ avg_time = (end_time - start_time) / 10
350
+ tflops = flops / avg_time / 1e12
351
+
352
+ return tflops
353
+
354
+
355
+ def benchmark_model_configs():
356
+ """
357
+ Benchmark common model configurations used in DeepSeek-like models.
358
+ """
359
+ # Model configurations: (M, K, N, G)
360
+ configs = [
361
+ (8192, 7168, 4096, 4), # Config 1
362
+ (8192, 2048, 7168, 4), # Config 2
363
+ (4096, 7168, 4096, 8), # Config 3
364
+ (4096, 2048, 7168, 8), # Config 4
365
+ ]
366
+
367
+ results = []
368
+
369
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
370
+ dtype = torch.float16
371
+
372
+ for config_idx, (M, K, N, G) in enumerate(configs):
373
+ logging.info(f"\n===== Benchmarking DeepSeek Config {config_idx + 1} =====")
374
+ logging.info(f"M={M}, K={K}, N={N}, G={G}")
375
+
376
+ # Create group sizes for M dimension
377
+ base_size = M // G
378
+ remainder = M % G
379
+ M_sizes = [base_size + (1 if i < remainder else 0) for i in range(G)]
380
+ m_sizes = torch.tensor(M_sizes, device=device, dtype=torch.int32)
381
+
382
+ # Create tensors
383
+ x = torch.randn(M, K, dtype=dtype, device=device)
384
+ w = torch.randn(N, K, dtype=dtype, device=device)
385
+
386
+ # Benchmark PyTorch reference
387
+ torch.cuda.synchronize()
388
+ compute_reference_forward(x, w, m_sizes) # Warmup
389
+ torch.cuda.synchronize()
390
+
391
+ logging.info("Benchmarking PyTorch reference...")
392
+ torch.cuda.reset_peak_memory_stats()
393
+ start_time = time.time()
394
+ for _ in range(10):
395
+ compute_reference_forward(x, w, m_sizes)
396
+ torch.cuda.synchronize()
397
+ end_time = time.time()
398
+ pt_time = (end_time - start_time) / 10
399
+ pt_memory = torch.cuda.max_memory_allocated() / (1024**2) # MB
400
+
401
+ # Benchmark optimized kernel
402
+ torch.cuda.synchronize()
403
+ grouped_gemm_forward(x, w, m_sizes) # Warmup
404
+ torch.cuda.synchronize()
405
+
406
+ logging.info("Benchmarking optimized kernel...")
407
+ torch.cuda.reset_peak_memory_stats()
408
+ start_time = time.time()
409
+ for _ in range(10):
410
+ grouped_gemm_forward(x, w, m_sizes)
411
+ torch.cuda.synchronize()
412
+ end_time = time.time()
413
+ opt_time = (end_time - start_time) / 10
414
+ opt_memory = torch.cuda.max_memory_allocated() / (1024**2) # MB
415
+
416
+ # Calculate FLOPs and speedup
417
+ flops = 2 * M * N * K
418
+ pt_tflops = flops / pt_time / 1e12
419
+ opt_tflops = flops / opt_time / 1e12
420
+ speedup = pt_time / opt_time
421
+
422
+ # Store results
423
+ results.append(
424
+ {
425
+ "config": f"Config {config_idx + 1}",
426
+ "dimensions": f"M={M}, K={K}, N={N}, G={G}",
427
+ "pt_time_ms": pt_time * 1000,
428
+ "opt_time_ms": opt_time * 1000,
429
+ "pt_tflops": pt_tflops,
430
+ "opt_tflops": opt_tflops,
431
+ "speedup": speedup,
432
+ "pt_memory_mb": pt_memory,
433
+ "opt_memory_mb": opt_memory,
434
+ "memory_savings": (
435
+ (pt_memory - opt_memory) / pt_memory * 100 if pt_memory > 0 else 0
436
+ ),
437
+ }
438
+ )
439
+
440
+ logging.info(
441
+ f"PyTorch Reference: {pt_time * 1000:.2f} ms, {pt_tflops:.2f} TFLOPS, {pt_memory:.2f} MB"
442
+ )
443
+ logging.info(
444
+ f"Optimized Kernel: {opt_time * 1000:.2f} ms, {opt_tflops:.2f} TFLOPS, {opt_memory:.2f} MB"
445
+ )
446
+ logging.info(
447
+ f"Speedup: {speedup:.2f}x, Memory savings: {results[-1]['memory_savings']:.2f}%"
448
+ )
449
+
450
+ # Print summary table
451
+ logging.info("\n===== Benchmark Results Summary =====")
452
+ logging.info(
453
+ f"{'Config':<10} | {'Time (ms)':<20} | {'TFLOPS':<20} | {'Speedup':<10} | {'Memory (MB)':<20} | {'Memory Saved':<12}"
454
+ )
455
+ logging.info(
456
+ f"{'':<10} | {'PyTorch':<9} {'Kernel':<9} | {'PyTorch':<9} {'Kernel':<9} | {'':<10} | "
457
+ f"{'PyTorch':<9} {'Kernel':<9} | {'':<12}"
458
+ )
459
+ logging.info("-" * 100)
460
+
461
+ for result in results:
462
+ logging.info(
463
+ f"{result['config']:<10} | "
464
+ f"{result['pt_time_ms']:<9.2f} {result['opt_time_ms']:<9.2f} | "
465
+ f"{result['pt_tflops']:<9.2f} {result['opt_tflops']:<9.2f} | "
466
+ f"{result['speedup']:<10.2f} | "
467
+ f"{result['pt_memory_mb']:<9.2f} {result['opt_memory_mb']:<9.2f} | "
468
+ f"{result['memory_savings']:<12.2f}%"
469
+ )
470
+
471
+ return results
472
+
473
+
474
+ def plot_benchmark_results(results):
475
+ """
476
+ Plot benchmark results as bar charts.
477
+ """
478
+ # Extract data
479
+ configs = [r["config"] for r in results]
480
+ pt_tflops = [r["pt_tflops"] for r in results]
481
+ opt_tflops = [r["opt_tflops"] for r in results]
482
+ speedups = [r["speedup"] for r in results]
483
+
484
+ # Create figure with subplots
485
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
486
+
487
+ # Plot TFLOPS comparison
488
+ x = np.arange(len(configs))
489
+ width = 0.35
490
+ ax1.bar(x - width / 2, pt_tflops, width, label="PyTorch Reference")
491
+ ax1.bar(x + width / 2, opt_tflops, width, label="Optimized Kernel")
492
+ ax1.set_xlabel("Model Configuration")
493
+ ax1.set_ylabel("TFLOPS")
494
+ ax1.set_title("Performance Comparison (Higher is Better)")
495
+ ax1.set_xticks(x)
496
+ ax1.set_xticklabels(configs)
497
+ ax1.legend()
498
+ ax1.grid(axis="y", linestyle="--", alpha=0.7)
499
+
500
+ # Plot speedup
501
+ ax2.bar(x, speedups, width=0.6, color="green")
502
+ ax2.set_xlabel("Model Configuration")
503
+ ax2.set_ylabel("Speedup (x)")
504
+ ax2.set_title("Speedup Factor (Higher is Better)")
505
+ ax2.set_xticks(x)
506
+ ax2.set_xticklabels(configs)
507
+ ax2.grid(axis="y", linestyle="--", alpha=0.7)
508
+
509
+ # Add speedup values on top of bars
510
+ for i, v in enumerate(speedups):
511
+ ax2.text(i, v + 0.1, f"{v:.2f}x", ha="center")
512
+
513
+ plt.tight_layout()
514
+ plt.savefig("mg_grouped_gemm_benchmark_results.png")
515
+ logging.info(
516
+ "Benchmark results plot saved to 'mg_grouped_gemm_benchmark_results.png'"
517
+ )
518
+
519
+
520
+ def compare_mg_implementations():
521
+ """
522
+ Combine the M*G and N*G benchmark results for comparison.
523
+ """
524
+ # Only run this if both NG and MG benchmarks have been run
525
+ try:
526
+ import pandas as pd
527
+
528
+ # Try to load previous benchmark results
529
+ mg_results = pd.read_csv("mg_grouped_gemm_benchmark_results.csv")
530
+ ng_results = pd.read_csv("ng_grouped_gemm_benchmark_results.csv")
531
+
532
+ # Create comparison plot
533
+ fig, axes = plt.subplots(1, 2, figsize=(14, 6))
534
+
535
+ # Plot speedup comparison
536
+ configs = mg_results["config"].unique()
537
+ mg_speedups = mg_results.groupby("config")["speedup"].mean()
538
+ ng_speedups = ng_results.groupby("config")["speedup"].mean()
539
+
540
+ x = np.arange(len(configs))
541
+ width = 0.35
542
+
543
+ axes[0].bar(x - width / 2, mg_speedups, width, label="M*G Grouping")
544
+ axes[0].bar(x + width / 2, ng_speedups, width, label="N*G Grouping")
545
+ axes[0].set_xlabel("Model Configuration")
546
+ axes[0].set_ylabel("Speedup (x)")
547
+ axes[0].set_title("Speedup Comparison: M*G vs N*G")
548
+ axes[0].set_xticks(x)
549
+ axes[0].set_xticklabels(configs)
550
+ axes[0].legend()
551
+ axes[0].grid(axis="y", linestyle="--", alpha=0.7)
552
+
553
+ # Plot TFLOPS comparison for optimized kernels
554
+ mg_tflops = (
555
+ mg_results[mg_results["implementation"] == "optimized"]
556
+ .groupby("config")["tflops"]
557
+ .mean()
558
+ )
559
+ ng_tflops = (
560
+ ng_results[ng_results["implementation"] == "optimized"]
561
+ .groupby("config")["tflops"]
562
+ .mean()
563
+ )
564
+
565
+ axes[1].bar(x - width / 2, mg_tflops, width, label="M*G Grouping")
566
+ axes[1].bar(x + width / 2, ng_tflops, width, label="N*G Grouping")
567
+ axes[1].set_xlabel("Model Configuration")
568
+ axes[1].set_ylabel("TFLOPS")
569
+ axes[1].set_title("Performance Comparison: M*G vs N*G")
570
+ axes[1].set_xticks(x)
571
+ axes[1].set_xticklabels(configs)
572
+ axes[1].legend()
573
+ axes[1].grid(axis="y", linestyle="--", alpha=0.7)
574
+
575
+ plt.tight_layout()
576
+ plt.savefig("mg_vs_ng_comparison.png")
577
+ logging.info("Comparison plot saved to 'mg_vs_ng_comparison.png'")
578
+
579
+ except Exception as e:
580
+ logging.error(f"Could not create comparison plot: {e}")
581
+ logging.info(
582
+ "Run both M*G and N*G benchmarks first to generate comparison plots"
583
+ )
584
+
585
+
586
+ if __name__ == "__main__":
587
+ parser = argparse.ArgumentParser(
588
+ description="Benchmark M*G Grouped GEMM implementations"
589
+ )
590
+ parser.add_argument("--run-all", action="store_true", help="Run all benchmarks")
591
+ parser.add_argument(
592
+ "--triton-bench", action="store_true", help="Run Triton performance reports"
593
+ )
594
+ parser.add_argument(
595
+ "--model-configs", action="store_true", help="Benchmark model configurations"
596
+ )
597
+ parser.add_argument(
598
+ "--compare-mg-ng",
599
+ action="store_true",
600
+ help="Compare M*G and N*G implementations",
601
+ )
602
+ args = parser.parse_args()
603
+
604
+ # Check if CUDA is available
605
+ if not torch.cuda.is_available():
606
+ logging.error(
607
+ "CUDA is not available. This benchmark requires a CUDA-capable GPU."
608
+ )
609
+ exit(1)
610
+
611
+ if args.run_all or args.model_configs:
612
+ # Benchmark model configurations
613
+ logging.info("Running benchmark for model configurations...")
614
+ results = benchmark_model_configs()
615
+ plot_benchmark_results(results)
616
+
617
+ if args.run_all or args.triton_bench:
618
+ # Run Triton performance reports
619
+ logging.info("Running Triton performance reports...")
620
+ benchmark_forward.run(save_path="mg_grouped_gemm_benchmark_results")
621
+ benchmark_forward_groups.run(save_path="mg_grouped_gemm_benchmark_results")
622
+ benchmark_imbalance.run(save_path="mg_grouped_gemm_benchmark_results")
623
+ logging.info(
624
+ "Triton performance reports saved to 'mg_grouped_gemm_benchmark_results' directory"
625
+ )
626
+
627
+ if args.run_all or args.compare_mg_ng:
628
+ # Compare M*G and N*G implementations
629
+ logging.info("Comparing M*G and N*G implementations...")
630
+ compare_mg_implementations()
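Before timing anything, a small correctness sketch comparing the reference path and the Triton kernel; it assumes grouped_gemm_forward accepts these small, evenly divided sizes, and the expected difference is only fp16 rounding noise:

    # Sketch only: sanity-check grouped_gemm_forward against the PyTorch reference.
    import torch

    from torchao_pr.mg_grouped_gemm import grouped_gemm_forward

    M, K, N, G = 512, 256, 128, 4
    m_sizes = torch.full((G,), M // G, device="cuda", dtype=torch.int32)
    x = torch.randn(M, K, device="cuda", dtype=torch.float16)
    w = torch.randn(N, K, device="cuda", dtype=torch.float16)

    ref = compute_reference_forward(x, w, m_sizes)  # helper defined above
    out = grouped_gemm_forward(x, w, m_sizes)
    print((ref.float() - out.float()).abs().max())  # expect a small fp16-level difference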
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .mg_grouped_gemm import grouped_gemm_forward
8
+ from .tma_autotuning import ALIGN_SIZE_M
9
+
10
+ __all__ = [
11
+ "grouped_gemm_forward",
12
+ "ALIGN_SIZE_M",
13
+ ]
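For orientation, a minimal usage sketch of the two exported names. The shapes mirror the unit tests further down (stacked M*G inputs, stacked N*G weights, int32 per-group row counts); the `torchao_pr` import path and the alignment check are assumptions based on the directory and constant names, not documented requirements.

```python
# Hedged usage sketch of the exported API (illustrative shapes and import path).
import torch
from torchao_pr import ALIGN_SIZE_M, grouped_gemm_forward

G, M, N, K = 4, 512, 4096, 7168
assert M % ALIGN_SIZE_M == 0              # assumption: per-group rows respect the M alignment

a = torch.randn(M * G, K, device="cuda", dtype=torch.bfloat16)    # stacked group inputs
b = torch.randn(N * G, K, device="cuda", dtype=torch.bfloat16)    # stacked group weights
m_sizes = torch.full((G,), M, device="cuda", dtype=torch.int32)   # rows per group

out = grouped_gemm_forward(a, b, m_sizes)                         # -> [M*G, N]
print(out.shape)
```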
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/tma_autotuning.py ADDED
@@ -0,0 +1,240 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # credit - the TMA helper class and autotuning utilities are derived from FBGEMM:
8
+ # https://github.com/pytorch/FBGEMM/blob/main/fbgemm_gpu/experimental/gemm/triton_gemm
9
+
10
+ # pyre-unsafe
11
+ import functools
12
+
13
+ import os
14
+ import sys
15
+ from typing import Any, Dict, Optional, Tuple
16
+
17
+ import torch
18
+
19
+ import triton
20
+ import triton.language as tl
21
+ from triton import Config as TConfig
22
+
23
+ from triton.runtime import driver # @manual
24
+
25
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
26
+
27
+
28
+ # ===== Supporting utils, CUDA and TMA =====
29
+
30
+
31
+ class CudaUtils:
32
+ @staticmethod
33
+ def is_cuda() -> bool:
34
+ """Check if Triton is running on CUDA backend."""
35
+ return driver.active.get_current_target().backend == "cuda"
36
+
37
+ @staticmethod
38
+ def verify_tma() -> bool:
39
+ """Check if TMA is supported on the current device."""
40
+ return (
41
+ CudaUtils.is_cuda()
42
+ and torch.cuda.is_available()
43
+ and torch.cuda.get_device_capability()[0] >= 9
44
+ )
45
+
46
+ @staticmethod
47
+ def get_num_sms() -> int:
48
+ """Get the number of streaming multiprocessors on the current device."""
49
+ if not CudaUtils.is_cuda():
50
+ raise RuntimeError("Triton is not running on CUDA backend")
51
+ if not torch.cuda.is_available():
52
+ raise RuntimeError("CUDA is not available")
53
+ return torch.cuda.get_device_properties("cuda").multi_processor_count
54
+
55
+
56
+ class TmaDescriptorHelper:
57
+ """Helper class for managing TMA descriptors in Triton kernels."""
58
+
59
+ class KernelParamWrapper:
60
+ """Wrapper to implement the TmaDescKernelParam interface."""
61
+
62
+ def __init__(self, desc: torch.Tensor):
63
+ self.desc = desc
64
+
65
+ def tma_desc_cpu_ptr(self) -> int:
66
+ """Return the CPU pointer to the TMA descriptor."""
67
+ return self.desc.data_ptr()
68
+
69
+ def __init__(self, tma_size: int = 128):
70
+ """Initialize the TMA descriptor helper.
71
+
72
+ Args:
73
+ tma_size: Size of the TMA descriptor in bytes
74
+ """
75
+ if not CudaUtils.verify_tma():
76
+ raise RuntimeError(
77
+ "TMA not supported on this device (requires Hopper or newer)"
78
+ )
79
+ if "nv_tma_desc_type" not in dir(tl):
80
+ raise RuntimeError(
81
+ "TMA grid constant descriptors not supported in your Triton version"
82
+ )
83
+
84
+ self.tma_size = tma_size
85
+ self.fill_1d_tma_descriptor_inner = driver.active.utils.fill_1d_tma_descriptor
86
+ self.fill_2d_tma_descriptor_inner = driver.active.utils.fill_2d_tma_descriptor
87
+ self.descriptors: Dict[str, torch.Tensor] = {}
88
+
89
+ def init_tma_descriptor(self, name: str) -> None:
90
+ """Initialize a TMA descriptor with the given name.
91
+
92
+ Call this method outside of the lambda function for grid size.
93
+ """
94
+ self.descriptors[name] = torch.empty(
95
+ self.tma_size, device="cpu", dtype=torch.int8
96
+ )
97
+
98
+ def fill_1d_tma_descriptor(
99
+ self, name: str, ptr: int, dim: int, block_dim: int, element_size: int
100
+ ) -> None:
101
+ """Fill a 1D TMA descriptor.
102
+
103
+ Call this method inside the lambda function for grid size.
104
+ """
105
+ if name not in self.descriptors:
106
+ raise ValueError(f"TMA descriptor '{name}' not initialized")
107
+
108
+ desc_x = self.descriptors[name]
109
+ if desc_x.data_ptr() % 64 != 0:
110
+ raise ValueError("TMA descriptor must be 64-byte aligned")
111
+ self.fill_1d_tma_descriptor_inner(
112
+ ptr, dim, block_dim, element_size, desc_x.data_ptr()
113
+ )
114
+
115
+ def fill_2d_tma_descriptor(
116
+ self,
117
+ name: str,
118
+ ptr: int,
119
+ dim1: int,
120
+ dim0: int,
121
+ block_dim1: int,
122
+ block_dim0: int,
123
+ element_size: int,
124
+ ) -> None:
125
+ """Fill a 2D TMA descriptor.
126
+
127
+ Call this method inside the lambda function for grid size.
128
+ """
129
+ if name not in self.descriptors:
130
+ raise ValueError(f"TMA descriptor '{name}' not initialized")
131
+
132
+ desc_x = self.descriptors[name]
133
+ if desc_x.data_ptr() % 64 != 0:
134
+ raise ValueError("TMA descriptor must be 64-byte aligned")
135
+ self.fill_2d_tma_descriptor_inner(
136
+ ptr, dim1, dim0, block_dim1, block_dim0, element_size, desc_x.data_ptr()
137
+ )
138
+
139
+ def get_tma_descriptor_kernel_param(self, name: str) -> KernelParamWrapper:
140
+ """Get the TMA descriptor kernel parameter for the given name."""
141
+ if name not in self.descriptors or self.descriptors[name] is None:
142
+ raise ValueError(f"TMA descriptor '{name}' not initialized")
143
+ return self.KernelParamWrapper(self.descriptors[name])
144
+
145
+
146
+ # ====== Autotuning utilities ======
147
+ ALIGN_SIZE_M = 128
148
+
149
+ _NV_CONFIGS = [
150
+ triton.Config(
151
+ {
152
+ "BLOCK_SIZE_M": block_size_m,
153
+ "BLOCK_SIZE_N": block_size_n,
154
+ "BLOCK_SIZE_K": block_size_k,
155
+ },
156
+ num_stages=num_stages,
157
+ num_warps=num_warps,
158
+ num_ctas=num_ctas,
159
+ )
160
+ for block_size_m in [ALIGN_SIZE_M]
161
+ for block_size_n in [64, 128, 256]
162
+ for block_size_k in [64, 128, 256]
163
+ for num_stages in [3, 4]
164
+ for num_warps in [4, 8]
165
+ for num_ctas in [1]
166
+ ]
167
+
168
+
169
+ def early_config_prune(configs, named_args, dtsize=None, dtype=None, **kwargs):
170
+ device = torch.cuda.current_device()
171
+ # Check for all possible pointer parameter names
172
+ if "grad_input_ptr" in named_args:
173
+ ptr_name = "grad_input_ptr"
174
+ elif "c_ptr" in named_args:
175
+ ptr_name = "c_ptr"
176
+ elif "grad_weight_ptr" in named_args:
177
+ ptr_name = "grad_weight_ptr"
178
+ else:
179
+ raise KeyError("No recognized pointer parameter found in kernel arguments")
180
+
181
+ if dtsize is None:
182
+ dtsize = named_args[ptr_name].element_size()
183
+ if dtype is None:
184
+ dtype = named_args[ptr_name].dtype
185
+
186
+ pruned_configs = []
187
+ for config in configs:
188
+ kw = config.kwargs
189
+ BLOCK_M, BLOCK_N, BLOCK_K, num_stages = (
190
+ kw["BLOCK_SIZE_M"],
191
+ kw["BLOCK_SIZE_N"],
192
+ kw["BLOCK_SIZE_K"],
193
+ config.num_stages,
194
+ )
195
+ G, M, N, K = (
196
+ named_args["G"],
197
+ named_args["M_BUCKET"],
198
+ named_args["N"],
199
+ named_args["K"],
200
+ )
201
+
202
+ # 1. make sure we have enough smem
203
+ max_shared_memory = driver.active.utils.get_device_properties(device)[
204
+ "max_shared_mem"
205
+ ]
206
+
207
+ required_shared_memory = (BLOCK_M + BLOCK_N) * BLOCK_K * num_stages * dtsize
208
+ if required_shared_memory > max_shared_memory:
209
+ continue
210
+
211
+ M_PER_GROUP = M // G
212
+ MIN_M_TILES = 64
213
+ # 2. make sure we don't load M tiles that are too big
214
+ if BLOCK_M > MIN_M_TILES and BLOCK_M > (M_PER_GROUP * 2):
215
+ continue
216
+ # 3. make sure we don't load N tiles that are too small
217
+ if BLOCK_M < 128 and BLOCK_M < (M_PER_GROUP // 2):
218
+ continue
219
+
220
+ num_sm = driver.active.utils.get_device_properties(device)[
221
+ "multiprocessor_count"
222
+ ]
223
+ N_TILES = N // BLOCK_N
224
+ MIN_N_TILES = 64
225
+ # 4. make sure we don't load N tiles that are too big
226
+ if BLOCK_N > MIN_N_TILES and M * N_TILES < num_sm:
227
+ continue
228
+ # 5. make sure we don't load N tiles that are too small
229
+ if BLOCK_N < 128 and M * N_TILES > 2 * num_sm:
230
+ continue
231
+ # 6. make sure K can be evenly divided
232
+ if K % BLOCK_K != 0:
233
+ continue
234
+
235
+ pruned_configs.append(config)
236
+
237
+ return pruned_configs
238
+
239
+
240
+ # ======== End Autotuning utilities ========
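The `_NV_CONFIGS` list and `early_config_prune` above are meant to be attached to the grouped-GEMM kernels (defined in `mg_grouped_gemm.py`, not shown in this commit excerpt) through Triton's autotuner. A minimal sketch of that wiring follows; the tuning `key` and the placeholder kernel signature are assumptions, only the hookup mechanism is the point.

```python
# Hedged sketch: wiring _NV_CONFIGS and early_config_prune into triton.autotune.
# The kernel body and tuning key are placeholders, not the real mg_grouped_gemm kernel.
import triton
import triton.language as tl


@triton.autotune(
    configs=_NV_CONFIGS,
    key=["G", "M_BUCKET", "N", "K"],
    prune_configs_by={"early_config_prune": early_config_prune},
)
@triton.jit
def _example_grouped_kernel(
    c_ptr,                       # output tensor; name matches early_config_prune's lookup
    G, M_BUCKET, N, K,           # sizes read by early_config_prune via named_args
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
):
    pass  # body intentionally omitted; only the autotuning hookup is illustrated
```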
torchtitan/experiments/kernels/triton_mg_group_gemm/torchao_pr/unit_test_forwards.py ADDED
@@ -0,0 +1,82 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # pyre-unsafe
8
+ import logging
9
+ import unittest
10
+ from typing import Tuple
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+
15
+ from mg_grouped_gemm import grouped_gemm_forward
16
+
17
+
18
+ class TestMG_GroupedGEMM(unittest.TestCase):
19
+ def setUp(self) -> None:
20
+ torch.manual_seed(2020)
21
+
22
+ def _run_grouped_gemm_test(
23
+ self,
24
+ shape: Tuple[int, int, int, int],
25
+ device: torch.device,
26
+ dtype: torch.dtype = torch.bfloat16,
27
+ atol: float = 1e-5,
28
+ rtol: float = 1.6e-2,
29
+ ) -> None:
30
+ G, M, N, K = shape
31
+ # In M*G grouping, input is [M*G, K] and weights are [N*G, K]
32
+ a = torch.randn(M * G, K, dtype=dtype, device=device)
33
+ b = torch.randn(N * G, K, dtype=dtype, device=device)
34
+
35
+ # Create equal-sized groups for simplicity
36
+ m_size = M
37
+ m_sizes = torch.full((G,), m_size, device=device, dtype=torch.int32)
38
+
39
+ result = grouped_gemm_forward(a, b, m_sizes)
40
+ self.assertTrue(result.shape == (M * G, N))
41
+
42
+ expected_result = torch.zeros(M * G, N, dtype=dtype, device=device)
43
+ m_start = 0
44
+ for g in range(G):
45
+ m_end = m_start + m_sizes[g]
46
+ b_slice = b[N * g : N * (g+1), :]
47
+ expected_result[m_start:m_end, :] = a[m_start:m_end, :] @ b_slice.T
48
+ m_start = m_end
49
+
50
+ # Convert result to match input dtype if needed
51
+ result = result.to(dtype)
52
+ torch.testing.assert_close(result, expected_result, atol=atol, rtol=rtol)
53
+
54
+ def test_MG_grouped_gemm_bf16(self) -> None:
55
+ for G in (1, 4, 16):
56
+ for M in (128, 512, 1024):
57
+ print(f"Testing BF16 M*G GroupGeMM with G={G}, M={M}")
58
+ self._run_grouped_gemm_test(
59
+ (G, M, 1024, 1024),
60
+ torch.device("cuda"),
61
+ dtype=torch.bfloat16,
62
+ atol=1e-5,
63
+ rtol=1.6e-2,
64
+ )
65
+
66
+ def test_MG_grouped_gemm_deepseek_shapes(self) -> None:
67
+ """Test with shapes from Deepseek model."""
68
+ deepseek_shapes = [
69
+ (4, 2048, 4096, 7168), # G, M, N, K
70
+ (4, 2048, 7168, 2048),
71
+ (8, 512, 4096, 7168),
72
+ (8, 512, 7168, 2048),
73
+ ]
74
+
75
+ device = torch.device("cuda")
76
+
77
+ for shape in deepseek_shapes:
78
+ G, M, N, K = shape
79
+ print(f"Testing BF16 M*G Deepseek shape: G={G}, M={M}, N={N}, K={K}")
80
+ self._run_grouped_gemm_test(
81
+ shape, device, dtype=torch.bfloat16, atol=1e-5, rtol=1.6e-2
82
+ )
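The reference loop inside `_run_grouped_gemm_test` can be read as a standalone eager baseline for the M*G layout. A small sketch of that baseline, factored into a function; this is purely illustrative and not part of the test suite.

```python
# Hedged sketch: an eager PyTorch reference for M*G grouped GEMM,
# mirroring the loop used in the test above.
import torch


def grouped_gemm_reference(
    a: torch.Tensor,        # [sum(m_sizes), K]
    b: torch.Tensor,        # [N * G, K]
    m_sizes: torch.Tensor,  # [G], int32 rows per group
) -> torch.Tensor:
    G = m_sizes.numel()
    N = b.shape[0] // G
    out = torch.empty(a.shape[0], N, dtype=a.dtype, device=a.device)
    m_start = 0
    for g in range(G):
        m_end = m_start + int(m_sizes[g])
        out[m_start:m_end] = a[m_start:m_end] @ b[N * g : N * (g + 1)].T
        m_start = m_end
    return out
```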
torchtitan/experiments/llama4/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.96 kB)
torchtitan/experiments/llama4/infra/expert_parallel.py ADDED
@@ -0,0 +1,145 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ from functools import partial
9
+ from typing import Optional, Tuple
10
+
11
+ import torch.nn as nn
12
+ from torch.distributed.tensor import (
13
+ DeviceMesh,
14
+ distribute_module,
15
+ distribute_tensor,
16
+ DTensor,
17
+ Partial,
18
+ Replicate,
19
+ Shard,
20
+ )
21
+ from torch.distributed.tensor.parallel import ParallelStyle
22
+ from torch.distributed.tensor.placement_types import Placement
23
+
24
+
25
+ # implementation of Tensor Parallel on the non-shared experts in MoE
26
+ class TensorParallel(ParallelStyle):
27
+ def __init__(
28
+ self,
29
+ *,
30
+ input_layouts: Optional[Tuple[Optional[Placement]]] = None,
31
+ output_layout: Optional[Placement] = None,
32
+ use_local_output: bool = True,
33
+ ):
34
+ super().__init__()
35
+ self.input_layouts = input_layouts or (Replicate(), None)
36
+ self.output_layout = output_layout or Partial()
37
+ self.desired_input_layouts = (Replicate(), None)
38
+ self.use_local_output = use_local_output
39
+
40
+ @staticmethod
41
+ def _prepare_input_fn(
42
+ input_layouts, desired_input_layouts, mod, inputs, device_mesh
43
+ ):
44
+ # TODO: figure out dynamo support for instance method and switch this to instance method
45
+
46
+ # annotate module input placements/sharding with input_layouts
47
+ input_tensor, input_layout, desired_input_layout = (
48
+ inputs[0],
49
+ input_layouts[0],
50
+ desired_input_layouts[0],
51
+ )
52
+ if not isinstance(input_tensor, DTensor):
53
+ input_tensor = DTensor.from_local(
54
+ input_tensor, device_mesh, (input_layout,), run_check=False
55
+ )
56
+
57
+ if input_layouts != desired_input_layouts:
58
+ input_tensor = input_tensor.redistribute(
59
+ placements=(desired_input_layout,), async_op=True
60
+ )
61
+ return (input_tensor, *inputs[1:])
62
+
63
+ def _partition_fn(self, name, module, device_mesh):
64
+ module.register_parameter(
65
+ "w1", nn.Parameter(distribute_tensor(module.w1, device_mesh, [Shard(2)]))
66
+ ) # Column-wise sharding
67
+ module.register_parameter(
68
+ "w2",
69
+ nn.Parameter(distribute_tensor(module.w2, device_mesh, [Shard(1)])),
70
+ ) # Row-wise sharding
71
+ module.register_parameter(
72
+ "w3",
73
+ nn.Parameter(distribute_tensor(module.w3, device_mesh, [Shard(2)])),
74
+ ) # Column-wise sharding
75
+
76
+ @staticmethod
77
+ def _prepare_output_fn(output_layout, use_local_output, mod, outputs, device_mesh):
78
+ if outputs.placements != (output_layout,):
79
+ outputs = outputs.redistribute(placements=(output_layout,), async_op=True)
80
+ # back to local tensor
81
+ return outputs.to_local() if use_local_output else outputs
82
+
83
+ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
84
+ return distribute_module(
85
+ module,
86
+ device_mesh,
87
+ self._partition_fn,
88
+ partial(
89
+ self._prepare_input_fn, self.input_layouts, self.desired_input_layouts
90
+ ),
91
+ partial(self._prepare_output_fn, self.output_layout, self.use_local_output),
92
+ )
93
+
94
+
95
+ # NOTE: This is to achieve replicate computation on the gate module in the MoE router.
96
+ # It does nothing other than (1) setting the module parameters as DTensors on the given mesh
97
+ # and (2) inserting hooks to module boundary to change torch.Tensor to DTensor and back.
98
+ # TODO: The reason we need this wrapping is to ensure all parameters are on the same 1D/2D mesh,
99
+ # which is assumed by (1) gradient norm clipping, and (2) optimizer fused implementation.
100
+ class NoParallel(ParallelStyle):
101
+ def __init__(
102
+ self,
103
+ *,
104
+ input_layout: Optional[Placement] = None,
105
+ output_layout: Optional[Placement] = None,
106
+ use_local_output: bool = True,
107
+ ):
108
+ super().__init__()
109
+ self.input_layout = input_layout or Replicate()
110
+ self.output_layout = output_layout or Replicate()
111
+ self.desired_input_layout = Replicate()
112
+ self.use_local_output = use_local_output
113
+
114
+ @staticmethod
115
+ def _prepare_input_fn(input_layout, desired_input_layout, mod, inputs, device_mesh):
116
+ # annotate module input placements/sharding with input_layouts
117
+ input_tensor = inputs[0]
118
+ if not isinstance(input_tensor, DTensor):
119
+ input_tensor = DTensor.from_local(
120
+ input_tensor, device_mesh, (input_layout,), run_check=False
121
+ )
122
+
123
+ if input_layout != desired_input_layout:
124
+ input_tensor = input_tensor.redistribute(
125
+ placements=(desired_input_layout,), async_op=True
126
+ )
127
+ return (input_tensor, *inputs[1:])
128
+
129
+ @staticmethod
130
+ def _prepare_output_fn(output_layout, use_local_output, mod, outputs, device_mesh):
131
+ if outputs.placements != (output_layout,):
132
+ outputs = outputs.redistribute(placements=(output_layout,), async_op=True)
133
+ # back to local tensor
134
+ return outputs.to_local() if use_local_output else outputs
135
+
136
+ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
137
+ return distribute_module(
138
+ module,
139
+ device_mesh,
140
+ None,
141
+ partial(
142
+ self._prepare_input_fn, self.input_layout, self.desired_input_layout
143
+ ),
144
+ partial(self._prepare_output_fn, self.output_layout, self.use_local_output),
145
+ )
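In `_partition_fn` above, `Shard(2)` on the `(num_experts, dim, hidden_dim)`-shaped `w1`/`w3` splits the hidden dimension across the TP mesh (column-wise), and `Shard(1)` on `w2` splits its hidden dimension as well (row-wise). A small standalone sketch of what those placements do to the local shards, assuming a CPU/Gloo mesh launched with `torchrun`; the shapes are illustrative only.

```python
# Hedged sketch: what Shard(2) / Shard(1) mean for the grouped expert weights.
# Run with e.g. `torchrun --nproc_per_node=2 shard_demo.py`; shapes are illustrative.
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import distribute_tensor, Shard

dist.init_process_group("gloo")
mesh = init_device_mesh("cpu", (dist.get_world_size(),), mesh_dim_names=("tp",))

torch.manual_seed(0)                             # same global tensor on every rank
num_experts, dim, hidden_dim = 4, 8, 16
w1 = torch.randn(num_experts, dim, hidden_dim)   # like GroupedExperts.w1
w2 = torch.randn(num_experts, hidden_dim, dim)   # like GroupedExperts.w2

w1_d = distribute_tensor(w1, mesh, [Shard(2)])   # column-wise: hidden_dim split across ranks
w2_d = distribute_tensor(w2, mesh, [Shard(1)])   # row-wise: hidden_dim split across ranks

print(w1_d.to_local().shape)  # (4, 8, 16 // world_size)
print(w2_d.to_local().shape)  # (4, 16 // world_size, 8)
dist.destroy_process_group()
```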
torchtitan/experiments/llama4/infra/parallelize_llama.py ADDED
@@ -0,0 +1,159 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from torch.distributed.device_mesh import DeviceMesh
11
+
12
+ from torchtitan.config_manager import JobConfig, TORCH_DTYPE_MAP
13
+ from torchtitan.distributed import ParallelDims
14
+
15
+ from torchtitan.models.llama3.parallelize_llama import (
16
+ apply_ac,
17
+ apply_compile,
18
+ apply_ddp,
19
+ apply_fsdp,
20
+ apply_tp,
21
+ )
22
+ from torchtitan.tools.logging import logger
23
+
24
+
25
+ def parallelize_llama(
26
+ model: nn.Module,
27
+ world_mesh: DeviceMesh,
28
+ parallel_dims: ParallelDims,
29
+ job_config: JobConfig,
30
+ ):
31
+ """
32
+ Apply tensor parallelism, activation checkpointing, torch.compile, and data
33
+ parallelism to the model.
34
+
35
+ NOTE: The passed-in model preferably should be on meta device. Otherwise,
36
+ the model must fit on GPU or CPU memory.
37
+ """
38
+
39
+ if parallel_dims.tp_enabled:
40
+ if (
41
+ job_config.parallelism.enable_async_tensor_parallel
42
+ and not job_config.training.compile
43
+ ):
44
+ raise RuntimeError("Async TP requires --training.compile")
45
+
46
+ enable_float8_linear = "float8" in job_config.model.converters
47
+ float8_is_rowwise = job_config.float8.recipe_name in (
48
+ "rowwise",
49
+ "rowwise_with_gw_hp",
50
+ )
51
+
52
+ # For now, float8 all-gather with TP is only supported for tensorwise
53
+ # float8 scaling recipes. For rowwise recipes, we use regular TP and
54
+ # all-gather happens in high precision.
55
+ enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise
56
+
57
+ apply_tp(
58
+ model,
59
+ world_mesh["tp"],
60
+ loss_parallel=parallel_dims.loss_parallel_enabled,
61
+ enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
62
+ enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
63
+ )
64
+
65
+ apply_moe_tp(model, world_mesh["tp"])
66
+
67
+ if job_config.activation_checkpoint.mode != "none":
68
+ if (
69
+ job_config.activation_checkpoint.mode == "selective"
70
+ and job_config.model.use_flex_attn
71
+ ):
72
+ raise ValueError(
73
+ "FlexAttention is not compatible with selective AC yet. "
74
+ "See https://github.com/pytorch/pytorch/issues/147879"
75
+ )
76
+ apply_ac(model, job_config.activation_checkpoint)
77
+
78
+ # turn on per-TransformerBlock compile after AC wrapping and before FSDP
79
+ if job_config.training.compile:
80
+ apply_compile(model)
81
+
82
+ # NOTE: needed for torch.compile to work with dynamic shapes in token-choice MoE
83
+ torch._dynamo.config.capture_scalar_outputs = True
84
+
85
+ if (
86
+ parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled
87
+ ): # apply FSDP or HSDP, potentially with Context Parallel
88
+ if parallel_dims.dp_replicate_enabled:
89
+ dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
90
+ else:
91
+ dp_mesh_dim_names = ("dp_shard_cp",)
92
+
93
+ apply_fsdp(
94
+ model,
95
+ world_mesh[tuple(dp_mesh_dim_names)],
96
+ param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
97
+ reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
98
+ pp_enabled=parallel_dims.pp_enabled,
99
+ cpu_offload=job_config.training.enable_cpu_offload,
100
+ reshard_after_forward_policy=job_config.parallelism.fsdp_reshard_after_forward,
101
+ )
102
+
103
+ if parallel_dims.dp_replicate_enabled:
104
+ logger.info("Applied HSDP to the model")
105
+ else:
106
+ logger.info("Applied FSDP to the model")
107
+
108
+ if parallel_dims.cp_enabled:
109
+ logger.info("Applied Context Parallel to the model")
110
+
111
+ if job_config.training.enable_cpu_offload:
112
+ logger.info("Applied CPU Offloading to the model")
113
+ elif parallel_dims.dp_replicate_enabled:
114
+ if world_mesh.ndim > 1:
115
+ raise RuntimeError("DDP does not support > 1D parallelism")
116
+ apply_ddp(
117
+ model,
118
+ world_mesh,
119
+ enable_compile=job_config.training.compile,
120
+ enable_compiled_autograd=job_config.parallelism.enable_compiled_autograd,
121
+ )
122
+
123
+ return model
124
+
125
+
126
+ def apply_moe_tp(
127
+ model: nn.Module,
128
+ tp_mesh: DeviceMesh,
129
+ ):
130
+ from torch.distributed.tensor import Partial, Replicate, Shard
131
+ from torch.distributed.tensor.parallel import (
132
+ parallelize_module,
133
+ PrepareModuleInputOutput,
134
+ )
135
+
136
+ from .expert_parallel import NoParallel, TensorParallel
137
+
138
+ for _, transformer_block in model.layers.items():
139
+ moe_layer_plan = {
140
+ # input / output sharding on the seqlen dim
141
+ # all-gather for input, reduce-scatter for output
142
+ "moe": PrepareModuleInputOutput(
143
+ input_layouts=(Shard(1),),
144
+ desired_input_layouts=(Replicate(),),
145
+ use_local_input=True,
146
+ output_layouts=(Partial(),),
147
+ desired_output_layouts=(Shard(1),),
148
+ ),
149
+ # replicate computation for the router
150
+ "moe.router.gate": NoParallel(),
151
+ # input Replicate, output Partial
152
+ "moe.experts": TensorParallel(),
153
+ "moe.shared_expert": TensorParallel(),
154
+ }
155
+ parallelize_module(
156
+ module=transformer_block,
157
+ device_mesh=tp_mesh,
158
+ parallelize_plan=moe_layer_plan,
159
+ )
torchtitan/experiments/llama4/model/__pycache__/model.cpython-311.pyc ADDED
Binary file (23.8 kB)
torchtitan/experiments/llama4/model/model.py ADDED
@@ -0,0 +1,466 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from torch import nn
11
+
12
+ from torchtitan.models.attention import build_attention, init_attention_mask
13
+ from torchtitan.models.norms import build_norm
14
+ from torchtitan.protocols.train_spec import ModelProtocol
15
+
16
+ from .args import TransformerModelArgs
17
+ from .moe import MoE
18
+
19
+
20
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0) -> torch.Tensor:
21
+ """
22
+ Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
23
+
24
+ This function calculates a frequency tensor with complex exponentials using the given dimension 'dim'
25
+ and the end index 'end'. The 'theta' parameter scales the frequencies.
26
+ The returned tensor contains complex values in complex64 data type.
27
+
28
+ Args:
29
+ dim (int): Dimension of the frequency tensor.
30
+ end (int): End index for precomputing frequencies.
31
+ theta (float, optional): Scaling factor for frequency computation. Defaults to 10000.0.
32
+
33
+ Returns:
34
+ torch.Tensor: Precomputed frequency tensor with complex exponentials.
35
+ """
36
+ freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
37
+ t = torch.arange(end, device=freqs.device)
38
+ freqs = torch.outer(t, freqs).float()
39
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
40
+ return freqs_cis
41
+
42
+
43
+ def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
44
+ """
45
+ Reshape frequency tensor for broadcasting it with another tensor.
46
+
47
+ This function reshapes the frequency tensor to have the same shape as the target tensor 'x'
48
+ for the purpose of broadcasting the frequency tensor during element-wise operations.
49
+
50
+ The input freqs_cis tensor is assumed to be of shape (max_seqlen, dim),
51
+ and the first seqlen elements will be sliced, but dim must match x.
52
+
53
+ Args:
54
+ freqs_cis (torch.Tensor): Frequency tensor to be reshaped.
55
+ x (torch.Tensor): Target tensor for broadcasting compatibility.
56
+
57
+ Returns:
58
+ torch.Tensor: Reshaped frequency tensor.
59
+ """
60
+ ndim = x.ndim
61
+ assert ndim > 1
62
+ seqlen = x.shape[1]
63
+ freqs_cis = freqs_cis[0:seqlen]
64
+ assert freqs_cis.shape == (seqlen, x.shape[-1])
65
+ shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
66
+ return freqs_cis.view(*shape)
67
+
68
+
69
+ def apply_rotary_emb(
70
+ xq: torch.Tensor,
71
+ xk: torch.Tensor,
72
+ freqs_cis: torch.Tensor,
73
+ ) -> tuple[torch.Tensor, torch.Tensor]:
74
+ """
75
+ Apply rotary embeddings to input tensors using the given frequency tensor.
76
+
77
+ This function applies rotary embeddings to the given query 'xq' and key 'xk' tensors using the provided
78
+ frequency tensor 'freqs_cis'. The input tensors are reshaped as complex numbers, and the frequency tensor
79
+ is reshaped for broadcasting compatibility. The resulting tensors contain rotary embeddings and are
80
+ returned as real tensors.
81
+
82
+ Args:
83
+ xq (torch.Tensor): Query tensor to apply rotary embeddings.
84
+ xk (torch.Tensor): Key tensor to apply rotary embeddings.
85
+ freqs_cis (torch.Tensor): Precomputed frequency tensor for complex exponentials.
86
+
87
+ Returns:
88
+ tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
89
+ """
90
+ xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
91
+ xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
92
+ freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
93
+ xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
94
+ xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
95
+ return xq_out.type_as(xq), xk_out.type_as(xk)
96
+
97
+
98
+ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
99
+ """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
100
+ bs, slen, n_kv_heads, head_dim = x.shape
101
+ if n_rep == 1:
102
+ return x
103
+ return (
104
+ torch.unsqueeze(x, dim=3)
105
+ .expand(bs, slen, n_kv_heads, n_rep, head_dim)
106
+ .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
107
+ )
108
+
109
+
110
+ class Attention(nn.Module):
111
+ """
112
+ Multi-head attention module.
113
+
114
+ Args:
115
+ model_args (TransformerModelArgs): Model configuration arguments.
116
+
117
+ Attributes:
118
+ n_kv_heads (int): Number of key and value heads.
119
+ n_heads (int): Number of query heads.
120
+ n_rep (int): Number of repetitions for local heads.
121
+ head_dim (int): Dimension size of each attention head.
122
+ wq (Linear): Linear transformation for queries.
123
+ wk (Linear): Linear transformation for keys.
124
+ wv (Linear): Linear transformation for values.
125
+ wo (Linear): Linear transformation for output.
126
+
127
+ """
128
+
129
+ def __init__(self, model_args: TransformerModelArgs):
130
+ super().__init__()
131
+ self.n_heads = model_args.n_heads
132
+ self.n_kv_heads = (
133
+ model_args.n_heads
134
+ if model_args.n_kv_heads is None
135
+ else model_args.n_kv_heads
136
+ )
137
+ self.n_rep = self.n_heads // self.n_kv_heads
138
+ self.head_dim = model_args.dim // model_args.n_heads
139
+
140
+ self.wq = nn.Linear(
141
+ model_args.dim, model_args.n_heads * self.head_dim, bias=False
142
+ )
143
+ self.wk = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False)
144
+ self.wv = nn.Linear(model_args.dim, self.n_kv_heads * self.head_dim, bias=False)
145
+ self.wo = nn.Linear(
146
+ model_args.n_heads * self.head_dim, model_args.dim, bias=False
147
+ )
148
+ self.sdpa = build_attention(model_args.use_flex_attn, model_args.attn_mask_type)
149
+
150
+ def init_weights(self, init_std: float):
151
+ for linear in (self.wq, self.wk, self.wv):
152
+ nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02)
153
+ nn.init.trunc_normal_(self.wo.weight, mean=0.0, std=init_std)
154
+
155
+ def forward(
156
+ self,
157
+ x: torch.Tensor,
158
+ freqs_cis: torch.Tensor,
159
+ ):
160
+ """
161
+ Forward pass of the attention module.
162
+
163
+ Args:
164
+ x (torch.Tensor): Input tensor.
165
+ freqs_cis (torch.Tensor): Precomputed frequency tensor.
166
+
167
+ Returns:
168
+ torch.Tensor: Output tensor after attention.
169
+
170
+ """
171
+
172
+ bs, seqlen, _ = x.shape
173
+ xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
174
+
175
+ # Use -1 instead of `n_heads` (or `n_kv_heads`) to infer the actual
176
+ # local heads from sizes of xq, xk, and xv as TP may have sharded them
177
+ # after the above linear ops.
178
+ xq = xq.view(bs, seqlen, -1, self.head_dim)
179
+ xk = xk.view(bs, seqlen, -1, self.head_dim)
180
+ xv = xv.view(bs, seqlen, -1, self.head_dim)
181
+
182
+ xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
183
+
184
+ # repeat k/v heads if n_kv_heads < n_heads
185
+ keys = repeat_kv(xk, self.n_rep) # (bs, seqlen, n_local_heads, head_dim)
186
+ values = repeat_kv(xv, self.n_rep) # (bs, seqlen, n_local_heads, head_dim)
187
+
188
+ xq = xq.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)
189
+ xk = keys.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)
190
+ xv = values.transpose(1, 2) # (bs, n_local_heads, seqlen, head_dim)
191
+
192
+ output = self.sdpa(xq, xk, xv)
193
+
194
+ output = output.transpose(
195
+ 1, 2
196
+ ).contiguous() # (bs, seqlen, n_local_heads, head_dim)
197
+ output = output.view(bs, seqlen, -1)
198
+ return self.wo(output)
199
+
200
+
201
+ class FeedForward(nn.Module):
202
+ """
203
+ FeedForward module
204
+
205
+ Args:
206
+ dim (int): Input dimension.
207
+ hidden_dim (int): Hidden dimension of the feedforward layer.
208
+ multiple_of (int): Value to ensure hidden dimension is a multiple of this value.
209
+ ffn_dim_multiplier (float | None): Custom multiplier for hidden dimension. Defaults to None.
210
+
211
+ Attributes:
212
+ w1 (Linear): Linear transformation for the first layer.
213
+ w2 (Linear): Linear transformation for the second layer.
214
+ w3 (Linear): Linear transformation for the third layer.
215
+
216
+ """
217
+
218
+ def __init__(
219
+ self,
220
+ dim: int,
221
+ hidden_dim: int,
222
+ multiple_of: int,
223
+ ffn_dim_multiplier: float | None,
224
+ ):
225
+ super().__init__()
226
+ hidden_dim = int(2 * hidden_dim / 3)
227
+ # custom dim factor multiplier
228
+ if ffn_dim_multiplier is not None:
229
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
230
+ hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
231
+
232
+ self.w1 = nn.Linear(dim, hidden_dim, bias=False)
233
+ self.w2 = nn.Linear(hidden_dim, dim, bias=False)
234
+ self.w3 = nn.Linear(dim, hidden_dim, bias=False)
235
+
236
+ def forward(self, x):
237
+ return self.w2(F.silu(self.w1(x)) * self.w3(x))
238
+
239
+ def init_weights(self, init_std: float):
240
+ nn.init.trunc_normal_(self.w1.weight, mean=0.0, std=0.02)
241
+ for linear in (self.w2, self.w3):
242
+ nn.init.trunc_normal_(linear.weight, mean=0.0, std=init_std)
243
+
244
+
245
+ class TransformerBlock(nn.Module):
246
+ """
247
+ TransformerBlock Module
248
+
249
+ Args:
250
+ layer_id (int): Identifier for the layer.
251
+ model_args (TransformerModelArgs): Model configuration arguments.
252
+
253
+ Attributes:
254
+ n_heads (int): Number of attention heads.
255
+ dim (int): Dimension size of the model.
256
+ head_dim (int): Dimension size of each attention head.
257
+ attention (Attention): Attention module.
258
+ feed_forward (FeedForward): FeedForward module.
259
+ layer_id (int): Identifier for the layer.
260
+ attention_norm (RMSNorm): Layer normalization for attention output.
261
+ ffn_norm (RMSNorm): Layer normalization for feedforward output.
262
+
263
+ """
264
+
265
+ def __init__(self, layer_id: int, model_args: TransformerModelArgs):
266
+ super().__init__()
267
+ self.n_heads = model_args.n_heads
268
+ self.dim = model_args.dim
269
+ self.attention = Attention(model_args)
270
+
271
+ # use MoE layer for every interleave_moe_layer_step FFN layers
272
+ self.moe_enabled = (
273
+ model_args.moe_enabled
274
+ and (layer_id + 1) % model_args.interleave_moe_layer_step == 0
275
+ )
276
+ if self.moe_enabled:
277
+ self.moe = MoE(model_args)
278
+ else:
279
+ self.feed_forward = FeedForward(
280
+ dim=model_args.dim,
281
+ hidden_dim=4 * model_args.dim,
282
+ multiple_of=model_args.multiple_of,
283
+ ffn_dim_multiplier=model_args.ffn_dim_multiplier,
284
+ )
285
+
286
+ self.layer_id = layer_id
287
+ self.num_layers = model_args.n_layers
288
+
289
+ self.attention_norm = build_norm(
290
+ model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps
291
+ )
292
+ self.ffn_norm = build_norm(
293
+ model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps
294
+ )
295
+
296
+ if model_args.depth_init:
297
+ self.weight_init_std = 0.02 / (2 * (self.layer_id + 1)) ** 0.5
298
+ else:
299
+ self.weight_init_std = 0.02 / (2 * self.num_layers) ** 0.5
300
+
301
+ def forward(
302
+ self,
303
+ x: torch.Tensor,
304
+ freqs_cis: torch.Tensor,
305
+ ):
306
+ """
307
+ Perform a forward pass through the TransformerBlock.
308
+
309
+ Args:
310
+ x (torch.Tensor): Input tensor.
311
+ freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
312
+
313
+ Returns:
314
+ torch.Tensor: Output tensor after applying attention and feedforward layers.
315
+
316
+ """
317
+ h = x + self.attention(self.attention_norm(x), freqs_cis)
318
+ if self.moe_enabled:
319
+ out = h + self.moe(self.ffn_norm(h))
320
+ else:
321
+ out = h + self.feed_forward(self.ffn_norm(h))
322
+ return out
323
+
324
+ def init_weights(self):
325
+ for norm in (self.attention_norm, self.ffn_norm):
326
+ norm.reset_parameters()
327
+ self.attention.init_weights(self.weight_init_std)
328
+ if self.moe_enabled:
329
+ self.moe.init_weights(self.weight_init_std)
330
+ else:
331
+ self.feed_forward.init_weights(self.weight_init_std)
332
+
333
+
334
+ class Transformer(nn.Module, ModelProtocol):
335
+ """
336
+ Transformer Module
337
+
338
+ Args:
339
+ model_args (TransformerModelArgs): Model configuration arguments.
340
+
341
+ Attributes:
342
+ model_args (TransformerModelArgs): Model configuration arguments.
343
+ vocab_size (int): Vocabulary size.
344
+ n_layers (int): Number of layers in the model.
345
+ tok_embeddings (ParallelEmbedding): Token embeddings.
346
+ layers (torch.nn.ModuleList): List of Transformer blocks.
347
+ norm (RMSNorm): Layer normalization for the model output.
348
+ output (ColumnParallelLinear): Linear layer for final output.
349
+ freqs_cis (torch.Tensor): Precomputed cosine and sine frequencies.
350
+
351
+ """
352
+
353
+ def __init__(self, model_args: TransformerModelArgs):
354
+ super().__init__()
355
+ self.model_args = model_args
356
+ self.vocab_size = model_args.vocab_size
357
+ self.n_layers = model_args.n_layers
358
+ self.eos_id = model_args.eos_id
359
+
360
+ self.tok_embeddings = nn.Embedding(model_args.vocab_size, model_args.dim)
361
+
362
+ # TODO persistent should be set to false, since this buffer can be recomputed.
363
+ # however, we set it to true for 2 reasons. (1) due to pytorch/pytorch#123411,
364
+ # compile or pipeline-tracer will not correctly handle non-persistent buffers,
365
+ # so we need to fix that. (2) if we initialize pipeline-parallel models from
366
+ # a seed checkpoint rather than calling init_weights, we need freqs_cis to be
367
+ # initialized by the checkpoint, or we need to add a separate initializer for
368
+ # just the non-persistent buffers that is called after loading checkpoints.
369
+ self.register_buffer("freqs_cis", self._precompute_freqs_cis(), persistent=True)
370
+
371
+ self.layers = torch.nn.ModuleDict()
372
+ for layer_id in range(model_args.n_layers):
373
+ self.layers[str(layer_id)] = TransformerBlock(layer_id, model_args)
374
+
375
+ self.norm = build_norm(
376
+ model_args.norm_type, dim=model_args.dim, eps=model_args.norm_eps
377
+ )
378
+
379
+ self.output = nn.Linear(model_args.dim, model_args.vocab_size, bias=False)
380
+ self.init_weights()
381
+
382
+ def init_weights(
383
+ self,
384
+ buffer_device: torch.device | None = None,
385
+ ):
386
+ """
387
+ [Note: On ``init_weights`` vs. ``reset_parameters``]
388
+ Modules may define ``reset_parameters`` to initialize parameter values.
389
+ ``reset_parameters`` is meant to only initialize directly owned
390
+ parameters/buffers, not those of their child modules, and it can be
391
+ used to give the initial values for these tensors.
392
+ Separately, users may want custom initialization for their modules,
393
+ different from that in ``reset_parameters``. For this, we define
394
+ ``init_weights``. We only call it in the constructor of this
395
+ ``Transformer`` root module to avoid reinitializing tensors.
396
+ """
397
+ buffer_device = buffer_device or self.freqs_cis.device
398
+ with torch.device(buffer_device):
399
+ self.freqs_cis = self._precompute_freqs_cis()
400
+ if self.tok_embeddings is not None:
401
+ nn.init.normal_(self.tok_embeddings.weight)
402
+ for layer in self.layers.values():
403
+ if layer is not None:
404
+ layer.init_weights()
405
+ if self.norm is not None:
406
+ self.norm.reset_parameters()
407
+ final_out_std = self.model_args.dim**-0.5
408
+ cutoff_factor = 3
409
+ if self.output is not None:
410
+ nn.init.trunc_normal_(
411
+ self.output.weight,
412
+ mean=0.0,
413
+ std=final_out_std,
414
+ a=-cutoff_factor * final_out_std,
415
+ b=cutoff_factor * final_out_std,
416
+ )
417
+
418
+ def _precompute_freqs_cis(self) -> torch.Tensor:
419
+ return precompute_freqs_cis(
420
+ self.model_args.dim // self.model_args.n_heads,
421
+ # Need to compute until at least the max token limit for generation
422
+ # TODO: explain in docs/composability.md why we removed the 2x
423
+ # relaxing in our CP enablement PR
424
+ self.model_args.max_seq_len,
425
+ self.model_args.rope_theta,
426
+ )
427
+
428
+ def forward(self, tokens: torch.Tensor):
429
+ """
430
+ Perform a forward pass through the Transformer model.
431
+
432
+ Args:
433
+ tokens (torch.Tensor): Input token indices.
434
+
435
+ Returns:
436
+ torch.Tensor: Output logits after applying the Transformer model.
437
+
438
+ """
439
+ # TODO: We will need to change the forward() signature to allow tokens to
440
+ # always be passed in.
441
+ if self.model_args.use_flex_attn:
442
+ init_attention_mask(tokens, eos_id=self.eos_id)
443
+
444
+ # passthrough for nonexistent layers, allows easy configuration of pipeline parallel stages
445
+ h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens
446
+
447
+ for layer in self.layers.values():
448
+ h = layer(h, self.freqs_cis)
449
+
450
+ h = self.norm(h) if self.norm else h
451
+ output = self.output(h) if self.output else h
452
+ return output
453
+
454
+ @classmethod
455
+ def from_model_args(cls, model_args: TransformerModelArgs) -> "Transformer":
456
+ """
457
+ Initialize a Transformer model from a TransformerModelArgs object.
458
+
459
+ Args:
460
+ model_args (TransformerModelArgs): Model configuration arguments.
461
+
462
+ Returns:
463
+ Transformer: Transformer model.
464
+
465
+ """
466
+ return cls(model_args)
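As a quick sanity check on the RoPE helpers defined at the top of this file, a minimal sketch exercising `precompute_freqs_cis` and `apply_rotary_emb` with toy shapes; the import path is an assumption based on the file location, and the numbers are illustrative.

```python
# Hedged sketch: shape sanity check for the RoPE helpers above.
# Assumes the module is importable at this path; adjust as needed.
import torch
from torchtitan.experiments.llama4.model.model import apply_rotary_emb, precompute_freqs_cis

bs, seqlen, n_heads, head_dim = 2, 16, 4, 8
max_seqlen = 32

freqs_cis = precompute_freqs_cis(head_dim, max_seqlen)  # (max_seqlen, head_dim // 2), complex64
xq = torch.randn(bs, seqlen, n_heads, head_dim)
xk = torch.randn(bs, seqlen, n_heads, head_dim)

xq_rot, xk_rot = apply_rotary_emb(xq, xk, freqs_cis)
assert freqs_cis.shape == (max_seqlen, head_dim // 2)
assert xq_rot.shape == xq.shape and xk_rot.shape == xk.shape
```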
torchtitan/experiments/llama4/model/moe.py ADDED
@@ -0,0 +1,228 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from torch import nn
10
+
11
+ from .args import TransformerModelArgs
12
+
13
+
14
+ class GroupedExperts(nn.Module):
15
+ def __init__(
16
+ self,
17
+ dim: int,
18
+ hidden_dim: int,
19
+ num_experts: int,
20
+ ):
21
+ super().__init__()
22
+ self.num_experts = num_experts
23
+ self.w1 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
24
+ self.w2 = nn.Parameter(torch.empty(num_experts, hidden_dim, dim))
25
+ self.w3 = nn.Parameter(torch.empty(num_experts, dim, hidden_dim))
26
+
27
+ def forward(
28
+ self,
29
+ x: torch.Tensor,
30
+ num_local_tokens_per_expert: torch.Tensor | None = None,
31
+ ) -> torch.Tensor:
32
+ if num_local_tokens_per_expert is not None:
33
+ # a tuple of tensors indexed by experts
34
+ # each with shape (tokens_per_expert(varying), dim)
35
+ x = torch.split(
36
+ x,
37
+ split_size_or_sections=num_local_tokens_per_expert.tolist(),
38
+ dim=0,
39
+ )
40
+ out_experts_splits = []
41
+ for expert_idx, x_expert in enumerate(x):
42
+ w1, w2, w3 = (
43
+ self.w1[expert_idx],
44
+ self.w2[expert_idx],
45
+ self.w3[expert_idx],
46
+ )
47
+ h = F.silu(torch.matmul(x_expert, w1))
48
+ h = h * torch.matmul(x_expert, w3)
49
+ h = torch.matmul(h, w2)
50
+ # h shape (tokens_per_expert(varying), dim)
51
+ out_experts_splits.append(h)
52
+ out = torch.cat(out_experts_splits, dim=0)
53
+
54
+ # TODO: optimize with GroupedGEMM
55
+ # https://github.com/pytorch/pytorch/pull/150374
56
+ # _grouped_mm requires shapes to be a multiple of 8
57
+ # offsets = torch.cumsum(num_local_tokens_per_expert, dim=0, dtype=torch.int32)
58
+ # h = F.silu(torch._grouped_mm(x, self.w1.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16))
59
+ # h = h * torch._grouped_mm(x, self.w3.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16)
60
+ # out = torch._grouped_mm(h, self.w2.transpose(-2, -1), offs=offsets, out_dtype=torch.bfloat16)
61
+ else:
62
+ # x shape (num_experts, tokens_per_expert, dim)
63
+ h = F.silu(torch.bmm(x, self.w1))
64
+ h = h * torch.bmm(x, self.w3)
65
+ # out shape (num_experts, tokens_per_expert, dim)
66
+ out = torch.bmm(h, self.w2)
67
+ return out
68
+
69
+ def init_weights(self, init_std: float):
70
+ nn.init.trunc_normal_(self.w1, mean=0.0, std=0.02)
71
+ nn.init.trunc_normal_(self.w2, mean=0.0, std=init_std)
72
+ nn.init.trunc_normal_(self.w3, mean=0.0, std=init_std)
73
+
74
+
75
+ class TokenChoiceTopKRouter(nn.Module):
76
+ """This class implements token-choice routing. In token-choice top-K routing, each token is
77
+ routed to top K experts based on the router scores.
78
+
79
+ Args:
80
+ gate (nn.Module): Gate module to calculate the scores, typically nn.Linear(dim, num_experts).
81
+ dim (int): Dimension of input tokens.
82
+ num_experts (int): Number of experts in each moe layer.
83
+ top_k (int): Number of experts each token will be routed to in token-choice routing.
84
+ use_sigmoid (bool): Whether to use sigmoid or softmax for router scores. Default is False.
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ dim: int,
90
+ num_experts: int,
91
+ top_k: int,
92
+ use_sigmoid: bool = False,
93
+ ):
94
+ super().__init__()
95
+ self.gate = nn.Linear(dim, num_experts, bias=False)
96
+ self.num_experts = num_experts
97
+ self.top_k = top_k
98
+ self.use_sigmoid = use_sigmoid
99
+
100
+ def forward(
101
+ self, x: torch.Tensor
102
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
103
+ """
104
+ Args:
105
+ x (torch.Tensor): Input tensor with shape ``(bs*slen, dim)``.
106
+
107
+ Returns:
108
+ routed_input (torch.Tensor):
109
+ Tokens grouped together by experts indices with shape ``(bs*slen*top_k,)``.
110
+ token_indices (torch.Tensor):
111
+ Token indices for routed_input with shape ``(bs*slen*top_k,)``.
112
+ num_local_tokens_per_expert (torch.Tensor):
113
+ Number of tokens assigned to each expert with shape ``(num_experts,)``.
114
+ """
115
+ # scores shape (bs*slen, num_experts)
116
+ scores = self.gate(x)
117
+
118
+ # By default, sigmoid or softmax is performed in float32 to avoid loss explosion
119
+ if self.use_sigmoid:
120
+ scores = torch.sigmoid(scores.to(torch.float32)).to(x.dtype)
121
+ else:
122
+ scores = F.softmax(scores.to(torch.float32), dim=1).to(x.dtype)
123
+
124
+ # top scores shape (bs*slen, top_k)
125
+ top_scores, selected_experts_indices = torch.topk(scores, k=self.top_k, dim=1)
126
+ # top_scores /= top_scores.sum(dim=-1, keepdim=True).to(x.dtype)
127
+
128
+ # group tokens together by expert indices from 0 to num_experts and pass that to experts forward
129
+ num_local_tokens_per_expert = torch.histc(
130
+ selected_experts_indices.view(-1),
131
+ bins=self.num_experts,
132
+ min=0,
133
+ max=self.num_experts,
134
+ )
135
+ # token_indices_experts_sorted shape (bs*slen*top_k,)
136
+ token_indices_experts_sorted = torch.argsort(
137
+ selected_experts_indices.view(-1), stable=True
138
+ )
139
+ top_scores = top_scores.view(-1)[token_indices_experts_sorted]
140
+ token_indices_experts_sorted = token_indices_experts_sorted // self.top_k
141
+
142
+ return top_scores, token_indices_experts_sorted, num_local_tokens_per_expert
143
+
144
+ def init_weights(self, init_std: float):
145
+ nn.init.trunc_normal_(self.gate.weight, mean=0.0, std=init_std)
146
+
147
+
148
+ # TODO: implement load balancing auxiliary loss for token-choice routing
149
+ class MoE(nn.Module):
150
+ def __init__(self, model_args: TransformerModelArgs):
151
+ super().__init__()
152
+ dim = model_args.dim
153
+ hidden_dim = 4 * model_args.dim
154
+ ffn_dim_multiplier = model_args.ffn_dim_multiplier
155
+ hidden_dim = int(2 * hidden_dim / 3)
156
+ if ffn_dim_multiplier is not None:
157
+ hidden_dim = int(ffn_dim_multiplier * hidden_dim)
158
+
159
+ num_experts = model_args.num_experts
160
+
161
+ hidden_dim_denom = 1
162
+ if model_args.auto_scale_hidden_dim:
163
+ hidden_dim_denom = model_args.top_k + int(model_args.use_shared_expert)
164
+
165
+ if model_args.auto_scale_hidden_dim:
166
+ hidden_dim = int(hidden_dim / hidden_dim_denom)
167
+ hidden_dim += -hidden_dim % model_args.multiple_of
168
+
169
+ self.experts = GroupedExperts(
170
+ dim=dim, hidden_dim=hidden_dim, num_experts=num_experts
171
+ )
172
+ self.router = TokenChoiceTopKRouter(
173
+ dim=dim, num_experts=num_experts, top_k=model_args.top_k
174
+ )
175
+ self.shared_expert = (
176
+ GroupedExperts(dim=dim, hidden_dim=hidden_dim, num_experts=1)
177
+ if model_args.use_shared_expert
178
+ else None
179
+ )
180
+
181
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
182
+ """
183
+ Args:
184
+ x (torch.Tensor): Input tensor with shape ``(bs, slen, dim)``.
185
+
186
+ Returns:
187
+ out (torch.Tensor): Output tensor with shape ``(bs, slen, dim)``.
188
+ """
189
+ bs, slen, dim = x.shape
190
+ # top_scores and selected_indices shape (bs*slen*top_k,)
191
+ # num_local_tokens_per_expert shape (num_experts,)
192
+ (
193
+ top_scores,
194
+ token_indices,
195
+ num_local_tokens_per_expert,
196
+ ) = self.router(x.reshape(bs * slen, dim))
197
+
198
+ # shape (bs*slen*top_k, dim)
199
+ token_indices = token_indices.reshape(-1, 1).expand(-1, dim)
200
+
201
+ # shape (bs*slen*top_k, dim)
202
+ routed_input = torch.gather(
203
+ x.view(-1, dim),
204
+ dim=0,
205
+ index=token_indices,
206
+ )
207
+ routed_input = routed_input * top_scores.reshape(-1, 1)
208
+
209
+ # shape (bs*slen*top_k, dim)
210
+ routed_output = self.experts(routed_input, num_local_tokens_per_expert)
211
+
212
+ # shared expert
213
+ if self.shared_expert is not None:
214
+ out = self.shared_expert(x.reshape(1, bs * slen, dim)).reshape(
215
+ bs * slen, dim
216
+ )
217
+ else:
218
+ out = torch.zeros_like(x.reshape(bs * slen, dim))
219
+
220
+ out = out.scatter_add(dim=0, index=token_indices, src=routed_output)
221
+ out = out.reshape(bs, slen, dim)
222
+ return out
223
+
224
+ def init_weights(self, init_std: float):
225
+ self.experts.init_weights(init_std)
226
+ self.router.init_weights(init_std)
227
+ if self.shared_expert is not None:
228
+ self.shared_expert.init_weights(init_std)
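The router's grouping step above (histc, stable argsort, integer division by `top_k`) is easiest to see on a toy example. A small trace sketch with made-up expert assignments; the `.float()` cast is only there so `torch.histc` runs on CPU.

```python
# Hedged sketch: tracing TokenChoiceTopKRouter's grouping on a toy example.
import torch

num_experts, top_k = 4, 2
# pretend 3 tokens were each routed to top_k experts
selected_experts_indices = torch.tensor(
    [[1, 3],   # token 0 -> experts 1, 3
     [0, 1],   # token 1 -> experts 0, 1
     [3, 2]],  # token 2 -> experts 3, 2
)

num_local_tokens_per_expert = torch.histc(
    selected_experts_indices.view(-1).float(),
    bins=num_experts, min=0, max=num_experts,
)
token_indices_experts_sorted = torch.argsort(
    selected_experts_indices.view(-1), stable=True
)
token_indices = token_indices_experts_sorted // top_k  # back to original token ids

print(num_local_tokens_per_expert)  # tensor([1., 2., 1., 2.]) -> tokens per expert
print(token_indices)                # tensor([1, 0, 1, 2, 0, 2]) -> token ids grouped by expert
```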
torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.sh ADDED
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/bash
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+
5
+ # This source code is licensed under the BSD-style license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ set -ex
9
+
10
+ # use envs as local overrides for convenience
11
+ # e.g.
12
+ # LOG_RANK=0,1 NGPU=4 ./convert_meta_to_dcp_with_gpus.sh
13
+ NGPU=${NGPU:-"8"}
14
+ LOG_RANK=${LOG_RANK:-0,1,2,3,4,5,6,7}
15
+ CONFIG_FILE=${CONFIG_FILE:-"../train_configs/llama4_17bx16e.toml"}
16
+
17
+ overrides=""
18
+ if [ $# -ne 0 ]; then
19
+ overrides="$*"
20
+ fi
21
+
22
+ PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True" \
23
+ torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
24
+ --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
25
+ convert_meta_to_dcp_with_gpus_meta.py --job.config_file ${CONFIG_FILE} $overrides
torchtitan/experiments/llama4/train_configs/llama4_17bx128e.toml ADDED
@@ -0,0 +1,65 @@
1
+ # TODO: this toml config is still under development
2
+
3
+ [job]
4
+ dump_folder = "./outputs"
5
+ description = "Llama 4 Maverick 17Bx128E training"
6
+
7
+ [profiling]
8
+ enable_profiling = false
9
+ save_traces_folder = "profile_trace"
10
+ profile_freq = 100
11
+
12
+ [metrics]
13
+ log_freq = 10
14
+ enable_tensorboard = false
15
+ save_tb_folder = "tb"
16
+
17
+ [model]
18
+ name = "llama4"
19
+ flavor = "17bx128e"
20
+ norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm
21
+ tokenizer_path = "./assets/tokenizer/tokenizer.model"
22
+ # converters = "float8"
23
+
24
+ [optimizer]
25
+ name = "AdamW"
26
+ lr = 4e-3
27
+ eps = 1e-15
28
+
29
+ [lr_scheduler]
30
+ warmup_steps = 600
31
+ lr_min = 0.1
32
+
33
+ [training]
34
+ batch_size = 1
35
+ seq_len = 8192
36
+ max_norm = 1.0 # grad norm clipping
37
+ steps = 3000
38
+ compile = false
39
+ dataset = "c4"
40
+
41
+ [parallelism]
42
+ data_parallel_replicate_degree = 1
43
+ data_parallel_shard_degree = -1
44
+ tensor_parallel_degree = 8
45
+ enable_async_tensor_parallel = false
46
+ pipeline_parallel_degree = 4
47
+ # pipeline_parallel_schedule = "interleaved1f1b"
48
+ # pipeline_parallel_microbatches = 2
49
+ context_parallel_degree = 1
50
+
51
+ [checkpoint]
52
+ enable_checkpoint = false
53
+ folder = "checkpoint"
54
+ interval = 500
55
+ model_weights_only = false
56
+ export_dtype = "float32"
57
+ async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
58
+
59
+ [activation_checkpoint]
60
+ mode = 'full' # ['none', 'selective', 'full']
61
+
62
+ [float8]
63
+ enable_fsdp_float8_all_gather = false
64
+ precompute_float8_dynamic_scale_for_fsdp = false
65
+ filter_fqns = "output,router.gate"
torchtitan/experiments/multimodal/mm_collator.py ADDED
@@ -0,0 +1,227 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved.
8
+
9
+ from dataclasses import dataclass
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ import torch
13
+ import torch.nn.functional as F
14
+
15
+ from tokenizer.tiktoken import IGNORE_INDEX
16
+
17
+ from torch.nn.utils.rnn import pad_sequence
18
+
19
+
20
+ def padded_collate(
21
+ batch: List[Dict[str, List[int]]],
22
+ padding_idx: int = 0,
23
+ ignore_idx: int = -100,
24
+ ) -> Dict[str, torch.Tensor]:
25
+ """Pad a batch of sequences to the longest sequence length in the batch, and
26
+ convert integer lists to tensors.
27
+
28
+ Args:
29
+ batch (List[Dict[str, List[int]]]): A list of dictionaries containing input, label pairs.
30
+ padding_idx (int): Padding index for input ids. Defaults to 0.
31
+ ignore_idx (int): Padding index for labels. Defaults to -100.
32
+
33
+ Returns:
34
+ Dict[str, torch.Tensor]: Collated input and label tensors.
35
+
36
+ Example:
37
+ >>> token_pairs = [
38
+ >>> {"input_ids": [1, 2, 3], "labels": [4, 5, 6]},
39
+ >>> {"input_ids": [7,], "labels": [10,]},
40
+ >>> ]
41
+ >>> collated = padded_collate(
42
+ >>> batch=token_pairs,
43
+ >>> padding_idx=padding_idx,
44
+ >>> ignore_idx=ignore_idx,
45
+ >>> )
46
+ >>> collated["input_ids"]
47
+ >>> tensor([[1, 2, 3], [7, 0, 0]])
48
+ >>> collated["labels"]
49
+ >>> tensor([[4, 5, 6], [10, -100, -100]])
50
+ """
51
+ input_ids = pad_sequence(
52
+ [x["input_ids"] for x in batch],
53
+ batch_first=True,
54
+ padding_value=padding_idx,
55
+ )
56
+ labels = pad_sequence(
57
+ [x["labels"] for x in batch],
58
+ batch_first=True,
59
+ padding_value=ignore_idx,
60
+ )
61
+
62
+ input_ids_seq_len = input_ids.shape[-1]
63
+ labels_seq_len = labels.shape[-1]
64
+
65
+ # Hack to pad correctly and not use max_seq_len, which is costly
66
+ if input_ids_seq_len > labels_seq_len:
67
+ labels = F.pad(
68
+ labels, (0, input_ids_seq_len - labels_seq_len), value=ignore_idx
69
+ )
70
+ elif labels_seq_len > input_ids_seq_len:
71
+ input_ids = F.pad(
72
+ input_ids,
73
+ (0, labels_seq_len - input_ids_seq_len),
74
+ value=padding_idx,
75
+ )
76
+ return {"input_ids": input_ids, "labels": labels}
77
+
78
+
79
+ # NOTE: Inspired by torchtune.data._collate.py
80
+ @dataclass
81
+ class MultiModalCollator:
82
+ padding_idx: int = 128004
83
+ ignore_idx: int = IGNORE_INDEX
84
+ pad_max_tiles: Optional[int] = None
85
+ pad_max_images: Optional[int] = None
86
+
87
+ def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
88
+ """Pad a batch of text sequences, tiled image tensors, aspect ratios,
89
+ and cross attention masks. This can be used for both training and inference.
90
+
91
+ ``batch`` is expected to be a list of sample dicts containing the following::
92
+ - "input_ids": List[int] of length text_seq_len, varies across samples
93
+ - "labels": List[int] of length text_seq_len, varies across samples
94
+ - "encoder_input": Dict[str, List[torch.Tensor]]
95
+ - "images": List[torch.Tensor], each with shape (n_tiles, c, h, w)
96
+ - "aspect_ratio": List[torch.Tensor], each with shape (2, ) to indicate h_ratio, w_ratio
97
+
98
+ Shape notation:
99
+ - c = channel dim
100
+ - h = height dim
101
+ - w = width dim
102
+
103
+ Note:
104
+ For each element in the batch, ``len(images) == len(aspect_ratio)``.
105
+
106
+ This collator does the following:
107
+ (1) Pad text sequences to the longest sequence length in the batch
108
+ (2) Pad image tensors in the tile dimension with zeros to the largest number
109
+ of tiles in the batch
110
+ (3) Add empty images of zeros to samples up to max number of images in the batch
111
+ (4) Pad aspect ratios with (1,1) for all added padding images
112
+
113
+ Args:
114
+ batch (List[Dict[str, Any]]): A list of sample dicts containing input_ids,
115
+ labels, images, and aspect_ratio.
116
+ padding_idx (int): Padding index for input token ids. Defaults to 128004.
117
+ ignore_idx (int): Padding index for labels. Defaults to -100.
118
+ pad_max_tiles (Optional[int]): Maximum number of tiles to pad to. If None, will pad to the largest number of tiles
119
+ in the batch. Defaults to None.
120
+ pad_max_images (Optional[int]): Maximum number of images to pad to. If None, will pad to the largest number of images
121
+ in the batch. Defaults to None.
122
+
123
+ Returns:
124
+ Dict[str, Tensor]: Collated tokens, labels, images, aspect_ratio tensors.
125
+ - tokens: Tensor of shape (bsz, max_seq_len)
126
+ - labels: Tensor of shape (bsz, max_seq_len)
127
+ - images: Tensor of shape (bsz, max_num_images, max_num_tiles, c, h, w)
128
+ - aspect_ratio: Tensor of shape (bsz, max_num_images, 2)
129
+
130
+ Example:
131
+ >>> image_id = 1
132
+ >>> tokens_per_tile = 5
133
+ >>> c, h, w = 1, 1, 1
134
+ >>> batch = [
135
+ ... {
136
+ ... "input_ids": [1, 2, 1, 3], "labels": [4, 5, 6, 7],
137
+ ... "encoder_input": {
138
+ ... # One image with two tiles, one image with three tiles
139
+ ... "images": [torch.ones(2, c, h, w), torch.ones(3, c, h, w)],
140
+ ... "aspect_ratio": [torch.tensor([1, 2]), torch.tensor([1, 3])],
141
+ ... },
142
+ ... },
143
+ ... {
144
+ ... "input_ids": [1, 4], "labels": [8, 9],
145
+ ... "encoder_input": {
146
+ ... # One image with four tiles
147
+ ... "images": [torch.ones(4, c, h, w)],
148
+ ... "aspect_ratio": [torch.tensor([2, 2])],
149
+ ... },
150
+ ... },
151
+ ... ]
152
+ ... collator = MultiModalCollator(pad_max_tiles=4)
153
+ >>> model_inputs = collator(batch=batch)
154
+ >>> print(model_inputs["input_ids"])
155
+ tensor([[1, 2, 1, 3],
156
+ [1, 4, 0, 0]])
157
+ >>> print(model_inputs["labels"])
158
+ tensor([[4, 5, 6, 7],
159
+ [8, 9, -100, -100]])
160
+ >>> print(model_inputs["encoder_input"]["images"].shape) # (bsz, max_num_images, max_num_tiles, c, h, w)
161
+ torch.Size([2, 2, 4, 1, 1, 1])
162
+ >>> print(model_inputs["encoder_input"]["aspect_ratio"].shape) # (bsz, max_num_images, 2)
163
+ torch.Size([2, 2, 2])
164
+ >>> print(model_inputs["encoder_input"]["images"][0, 0, ...]) # Image with two tiles got padded to four
165
+ tensor([[[[1.]]], [[[1.]]], [[[0.]]], [[[0.]]]])
166
+ >>> print(model_inputs["encoder_input"]["images"][0, 1, ...]) # Image with three tiles got padded to four
167
+ tensor([[[[1.]]], [[[1.]]], [[[1.]]], [[[0.]]]])
168
+ >>> print(model_inputs["encoder_input"]["images"][1, 0, ...]) # Image with four tiles did not get padded
169
+ tensor([[[[1.]]], [[[1.]]], [[[1.]]], [[[1.]]]])
170
+ >>> print(model_inputs["encoder_input"]["images"][1, 1, ...]) # Extra padding image was added to second sample
171
+ tensor([[[[0.]]], [[[0.]]], [[[0.]]], [[[0.]]]])
172
+ """
173
+ # Text tokens can be handled independently by existing collaters
174
+ text_only = [
175
+ {"input_ids": sample["input_ids"], "labels": sample["labels"]}
176
+ for sample in batch
177
+ ]
178
+ collated_text = padded_collate(text_only, self.padding_idx, self.ignore_idx)
179
+
180
+ if self.pad_max_tiles is None:
181
+ # Get max number of tiles in batch
182
+ max_num_tiles = max(img.shape[0] for sample in batch for img in sample["encoder_input"]["images"])
183
+ else:
184
+ max_num_tiles = self.pad_max_tiles
185
+
186
+ # Pad images and aspect ratios to max number of tiles
187
+ batch_images = []
188
+ batch_aspect_ratios = []
189
+
190
+ for sample in batch:
191
+ sample_images = []
192
+ for image in sample["encoder_input"]["images"]:
193
+ # Single image in each sample has shape (n_tiles, c, h, w)
194
+ n_tiles = image.shape[0]
195
+ # Compute how many zero-padding tiles are needed to bring this image up to max_num_tiles
196
+ # (only images and aspect ratios are collated here; no cross-attention masks are built)
197
+ padding_tiles = max_num_tiles - n_tiles
198
+
199
+ # Image should now have shape (max_num_tiles, c, h, w)
200
+ padded_image = F.pad(
201
+ image, (0, 0, 0, 0, 0, 0, 0, padding_tiles), value=0
202
+ )
203
+
204
+ sample_images.append(padded_image)
205
+ # Stack multiple images and masks per sample in num_images dimension
206
+ batch_images.append(torch.stack(sample_images))
207
+ batch_aspect_ratios.append(
208
+ torch.stack(sample["encoder_input"]["aspect_ratio"])
209
+ )
210
+ # Finally, pad images and aspect ratios to the max number of images in the batch
211
+ # (bsz, max_num_images, max_num_tiles, c, h, w)
212
+ collated_images = pad_sequence(batch_images, batch_first=True, padding_value=0)
213
+ # (bsz, max_num_images, 2)
214
+ collated_aspect_ratios = pad_sequence(
215
+ batch_aspect_ratios, batch_first=True, padding_value=1
216
+ )
217
+
218
+ batch_dict = {
219
+ "input_ids": collated_text["input_ids"],
220
+ "labels": collated_text["labels"],
221
+ "encoder_input": {
222
+ "images": collated_images,
223
+ "aspect_ratio": collated_aspect_ratios,
224
+ },
225
+ }
226
+
227
+ return batch_dict
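
To make the collation steps above easier to follow in isolation, here is a minimal sketch of the tile- and image-dimension padding using plain `torch` only. It mirrors the logic of `MultiModalCollator.__call__` but is not the module itself; shapes and values are illustrative.

```python
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

c, h, w = 3, 8, 8
# Sample 1 has two images (2 and 3 tiles); sample 2 has one image (4 tiles).
batch = [
    {"images": [torch.ones(2, c, h, w), torch.ones(3, c, h, w)]},
    {"images": [torch.ones(4, c, h, w)]},
]

max_num_tiles = max(img.shape[0] for s in batch for img in s["images"])

per_sample = []
for s in batch:
    padded = [
        # pad the tile (first) dimension of each (n_tiles, c, h, w) image with zeros
        F.pad(img, (0, 0, 0, 0, 0, 0, 0, max_num_tiles - img.shape[0]), value=0)
        for img in s["images"]
    ]
    per_sample.append(torch.stack(padded))  # (n_images, max_num_tiles, c, h, w)

# pad_sequence then pads the image dimension across samples with all-zero images
images = pad_sequence(per_sample, batch_first=True, padding_value=0)
print(images.shape)  # torch.Size([2, 2, 4, 3, 8, 8])
```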
torchtitan/experiments/multimodal/mm_dataset.py ADDED
@@ -0,0 +1,268 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Any, Callable, Dict, List, Optional, Union
9
+
10
+ import torch
11
+
12
+ from datasets import Dataset, load_dataset
13
+ from datasets.distributed import split_dataset_by_node
14
+
15
+ from mm_collator import MultiModalCollator
16
+ from tokenizer.tiktoken import IGNORE_INDEX, Tokenizer
17
+ from torch.distributed.checkpoint.stateful import Stateful
18
+ from torch.utils.data import IterableDataset
19
+ from transform import CLIPTransform
20
+ from utils import load_image
21
+
22
+ from torchtitan.components.dataloader import ParallelAwareDataloader
23
+ from torchtitan.config_manager import JobConfig
24
+ from torchtitan.tools.logging import logger
25
+
26
+
27
+ def _load_obelics_dataset(dataset_path: str):
28
+ """Load the OBELICS dataset in streaming mode."""
29
+ return load_dataset(dataset_path, split="train", streaming=True)
30
+
31
+
32
+ def _process_obelics_sample(
33
+ sample: dict[str, Any], image_token: str = "<|image|>"
34
+ ) -> Dict[str, Union[str, List[torch.Tensor]]]:
35
+ """
36
+ This function formats samples from the OBELICS dataset
37
+ Returns:
38
+ Dict[str, Any]: The transformed sample with the following fields:
39
+ - images: List[torch.Tensor] with the loaded images
40
+ - text: str with the text of the sample ready to be tokenized including the image tokens
41
+ Example:
42
+ >>> formatted_sample = _process_obelics_sample(sample, image_token="<|image|>")
43
+ >>> print(formatted_sample["text"])
44
+ ... "<|image|><|image|><|image|> The elephant look cute!<|image|><|image|> The cats are sad :("
45
+ """
46
+ sample_images = [image for image in sample["images"] if image is not None]
47
+ sample_text = [
48
+ text if text is not None else image_token for text in sample["texts"]
49
+ ]
50
+ return {
51
+ "images": [load_image(image) for image in sample_images],
52
+ "text": "".join(map(str, sample_text)),
53
+ }
54
+
55
+
56
+ @dataclass
57
+ class DatasetConfig:
58
+ path: str
59
+ loader: Callable
60
+ sample_processor: Callable
61
+
62
+
63
+ # Add your dataset here - more information at docs/datasets.md
64
+ MM_DATASETS = {
65
+ "obelics": DatasetConfig(
66
+ path="HuggingFaceM4/OBELICS",
67
+ loader=_load_obelics_dataset,
68
+ sample_processor=_process_obelics_sample,
69
+ ),
70
+ }
71
+
72
+
73
+ def _validate_mm_dataset(
74
+ dataset_name: str, dataset_path: Optional[str] = None
75
+ ) -> tuple[str, Callable, Callable]:
76
+ """Validate dataset name and path."""
77
+ if dataset_name not in MM_DATASETS:
78
+ raise ValueError(
79
+ f"Dataset {dataset_name} is not supported. "
80
+ f"Supported datasets are: {list(MM_DATASETS.keys())}"
81
+ )
82
+
83
+ config = MM_DATASETS[dataset_name]
84
+ path = dataset_path or config.path
85
+ logger.info(f"Preparing {dataset_name} dataset from {path}")
86
+ return path, config.loader, config.sample_processor
87
+
88
+
89
+ class MultiModalDataset(IterableDataset, Stateful):
90
+ """PyTorch MultiModal Dataset.
91
+
92
+ Args:
93
+ dataset_name (str): name of the dataset to load
94
+ tokenizer (Tokenizer):
95
+ Tokenizer used to encode data. It must implement an `encode` and a `decode` method.
96
+ dp_world_size (int): number of data parallel processes participating in training
97
+ dp_rank (int): rank of the current data parallel process
98
+ infinite (bool): whether to loop infinitely over the dataset
99
+
100
+ We currently ONLY support the OBELICS dataset
101
+
102
+ Example use:
103
+ >>> ds = MultiModalDataset(dataset_name="OBELICS", tokenizer=tokenizer)
104
+ >>> for batch in Dataloader(ds, batch_size=8):
105
+ print(f"Batch size: {len(batch)}")
106
+ Batch size: 8
107
+ """
108
+
109
+ def __init__(
110
+ self,
111
+ dataset_name: str,
112
+ dataset_path: Optional[str],
113
+ tokenizer: Tokenizer,
114
+ image_token: str = "<|image|>",
115
+ tile_size: int = 448,
116
+ max_num_tiles: int = 4,
117
+ seq_len: int = 2048,
118
+ dp_rank: int = 0,
119
+ dp_world_size: int = 1,
120
+ infinite: bool = False,
121
+ ) -> None:
122
+ # Force lowercase for consistent comparison
123
+ dataset_name = dataset_name.lower()
124
+
125
+ path, dataset_loader, sample_processor = _validate_mm_dataset(
126
+ dataset_name, dataset_path
127
+ )
128
+ ds = dataset_loader(path)
129
+
130
+ # TODO: support shuffling
131
+ self.dataset_name = dataset_name
132
+ self._data = split_dataset_by_node(ds, dp_rank, dp_world_size)
133
+ self._tokenizer = tokenizer
134
+ self.seq_len = seq_len
135
+ self.infinite = infinite
136
+ self._sample_processor = sample_processor
137
+ self.image_token = (
138
+ image_token # TODO(tj.solergibert) Add `image_token` to JobConfig
139
+ )
140
+ # TODO(tj.solergibert) Add `tile_size` & `max_num_tiles` to JobConfig
141
+ self.transform_image = CLIPTransform(
142
+ image_mean=(
143
+ 0.48145466,
144
+ 0.4578275,
145
+ 0.40821073,
146
+ ), # TODO(tj.solergibert) What should we do with `image_mean` & `image_std`?
147
+ image_std=(0.26862954, 0.26130258, 0.27577711),
148
+ tile_size=tile_size,
149
+ possible_resolutions=None,
150
+ max_num_tiles=max_num_tiles,
151
+ resample="bilinear",
152
+ resize_to_max_canvas=False,
153
+ )
154
+
155
+ # variables for checkpointing
156
+ self._sample_idx = 0
157
+
158
+ def __iter__(self):
159
+
160
+ while True:
161
+ for sample in self._get_data_iter():
162
+ try:
163
+ sample = self._sample_processor(
164
+ sample, image_token=self.image_token
165
+ )
166
+ except Exception:
167
+ continue
168
+ self._sample_idx += 1
169
+
170
+ # CLIP Transform
171
+ encoder_input = {"images": [], "aspect_ratio": []}
172
+ for image in sample["images"]:
173
+ out = self.transform_image(image)
174
+ encoder_input["images"].append(out["image"])
175
+ encoder_input["aspect_ratio"].append(out["aspect_ratio"])
176
+ sample["encoder_input"] = encoder_input
177
+
178
+ # Tokenize
179
+ tokens = self._tokenizer.encode(
180
+ sample["text"],
181
+ bos=True,
182
+ eos=True,
183
+ allowed_special=set(["<|image|>"]),
184
+ )
185
+ sample["input_ids"] = torch.LongTensor(tokens[:-1])
186
+ sample["labels"] = torch.LongTensor(tokens[1:])
187
+ # Mask BOS, EOS & image tokens from the loss
188
+ sample["labels"] = torch.where(
189
+ torch.isin(
190
+ sample["labels"],
191
+ torch.LongTensor(
192
+ [
193
+ self._tokenizer.bos_id,
194
+ self._tokenizer.eos_id,
195
+ self._tokenizer.image_id,
196
+ ]
197
+ ),
198
+ ),
199
+ IGNORE_INDEX,
200
+ sample["labels"],
201
+ )
202
+ # Truncate
203
+ sample["input_ids"], sample["labels"] = (
204
+ sample["input_ids"][: self.seq_len],
205
+ sample["labels"][: self.seq_len],
206
+ )
207
+ yield sample
208
+
209
+ if not self.infinite:
210
+ logger.warning(f"Dataset {self.dataset_name} has run out of data")
211
+ break
212
+ else:
213
+ # Reset offset for the next iteration
214
+ self._sample_idx = 0
215
+ logger.warning(f"Dataset {self.dataset_name} is being re-looped")
216
+
217
+ def _get_data_iter(self):
218
+ if isinstance(self._data, Dataset) and self._sample_idx == len(self._data):
219
+ return iter([])
220
+
221
+ it = iter(self._data)
222
+ for _ in range(self._sample_idx):
223
+ next(it)
224
+ return it
225
+
226
+ def load_state_dict(self, state_dict):
227
+ self._sample_idx = state_dict["sample_idx"]
228
+
229
+ def state_dict(self):
230
+ return {"sample_idx": self._sample_idx}
231
+
232
+
233
+ def build_mm_dataloader(
234
+ dp_world_size: int,
235
+ dp_rank: int,
236
+ tokenizer: Tokenizer,
237
+ job_config: JobConfig,
238
+ infinite: bool = True,
239
+ ) -> ParallelAwareDataloader:
240
+ """Build a data loader for HuggingFace datasets."""
241
+ dataset_name = job_config.training.dataset
242
+ dataset_path = job_config.training.dataset_path
243
+ batch_size = job_config.training.batch_size
244
+ seq_len = job_config.training.seq_len
245
+ pad_max_tiles = 4 # TODO(tj.solergibert) Add `pad_max_tiles` to JobConfig
246
+ padding_idx = 128004 # TODO(tj.solergibert) Add `padding_idx` to JobConfig
247
+
248
+ hf_ds = MultiModalDataset(
249
+ dataset_name=dataset_name,
250
+ dataset_path=dataset_path,
251
+ tokenizer=tokenizer,
252
+ seq_len=seq_len,
253
+ dp_rank=dp_rank,
254
+ dp_world_size=dp_world_size,
255
+ infinite=infinite,
256
+ )
257
+
258
+ collate_fn = MultiModalCollator(
259
+ padding_idx=padding_idx, pad_max_tiles=pad_max_tiles
260
+ )
261
+
262
+ return ParallelAwareDataloader(
263
+ dataset=hf_ds,
264
+ dp_rank=dp_rank,
265
+ dp_world_size=dp_world_size,
266
+ batch_size=batch_size,
267
+ collate_fn=collate_fn,
268
+ )
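
The label construction in `__iter__` above (next-token shift followed by masking of BOS, EOS, and image tokens) can be illustrated with a small self-contained sketch; the token ids below are made up, and `-100` stands in for `IGNORE_INDEX`.

```python
import torch

IGNORE_INDEX = -100                 # assumed value of tokenizer.tiktoken.IGNORE_INDEX
bos_id, eos_id, image_id = 1, 2, 9  # hypothetical special-token ids

tokens = torch.LongTensor([bos_id, image_id, image_id, 5, 6, 7, eos_id])

input_ids = tokens[:-1]  # the model sees every token except the last one
labels = tokens[1:]      # and is trained to predict the next token at each position

special = torch.LongTensor([bos_id, eos_id, image_id])
labels = torch.where(torch.isin(labels, special), IGNORE_INDEX, labels)

print(input_ids.tolist())  # [1, 9, 9, 5, 6, 7]
print(labels.tolist())     # [-100, -100, 5, 6, 7, -100]
```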
torchtitan/experiments/multimodal/tests/test_multimodal_model.py ADDED
@@ -0,0 +1,128 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import pytest
8
+ import torch
9
+
10
+ from torchtitan.experiments.llama_multimodal import (
11
+ ModelArgs,
12
+ MultimodalDecoder,
13
+ VisionEncoder,
14
+ )
15
+
16
+ from .test_utils import fixed_init_model, fixed_init_tensor
17
+
18
+
19
+ @pytest.fixture
20
+ def encoder_config():
21
+ return ModelArgs(
22
+ encoder_embed_dim=32,
23
+ encoder_num_layers=2,
24
+ encoder_num_heads=4,
25
+ tile_size=49,
26
+ patch_size=9,
27
+ max_num_tiles=4,
28
+ in_channels=3,
29
+ return_intermediates=[0, 1],
30
+ num_layers_projection=2,
31
+ decoder_embed_dim=128,
32
+ )
33
+
34
+
35
+ @pytest.fixture
36
+ def decoder_config():
37
+ return ModelArgs(
38
+ decoder_embed_dim=512,
39
+ vocab_size=10000,
40
+ fusion_interval=2,
41
+ num_special_tokens=3,
42
+ decoder_num_layers=6,
43
+ decoder_num_heads=8,
44
+ decoder_num_kv_heads=4,
45
+ max_seq_len=512,
46
+ rope_theta=50000.0,
47
+ )
48
+
49
+
50
+ class TestMultimodalModelVisionEncoder:
51
+ @pytest.fixture(autouse=True)
52
+ def setup_class(self, encoder_config):
53
+ self.model_args = encoder_config
54
+ self.batch_size = 1
55
+ self.num_imgs = 2
56
+ self.num_tiles = 4
57
+ self.aspect_ratio = torch.tensor([[1, 3], [2, 2]]).reshape(
58
+ self.batch_size, self.num_imgs, 2
59
+ )
60
+ image = torch.rand(
61
+ (
62
+ self.batch_size,
63
+ self.num_imgs,
64
+ self.num_tiles,
65
+ self.model_args.in_channels,
66
+ self.model_args.tile_size,
67
+ self.model_args.tile_size,
68
+ )
69
+ )
70
+ self.image = fixed_init_tensor(image.shape, min_val=-1, max_val=1)
71
+
72
+ def test_llama_mm_vision_encoder(self):
73
+ model = VisionEncoder(self.model_args)
74
+ fixed_init_model(model, min_val=-1, max_val=1)
75
+ output = model(self.image, self.aspect_ratio)
76
+ expected_shape = (
77
+ self.batch_size,
78
+ self.num_imgs * self.num_tiles * (model.vit.patches_per_tile + 1),
79
+ self.model_args.decoder_embed_dim,
80
+ )
81
+ assert (
82
+ output.shape == expected_shape
83
+ ), f"Expected shape {expected_shape}, but got {output.shape}"
84
+
85
+ # TODO: Need to ensure numerical stability before doing convergence test.
86
+ # output.mean() = 3.994, we need to debug why it is not close to 5.28800, which is
87
+ # the test value from the original torch tune test
88
+ # assert torch.allclose(
89
+ # output.mean(), torch.tensor(5.28800), atol=1e-3, rtol=1e-3
90
+ # )
91
+
92
+
93
+ class TestMultimodalModelDecoder:
94
+ @pytest.fixture(autouse=True)
95
+ def setup_class(self, decoder_config):
96
+ self.model_args = decoder_config
97
+ self.batch_size = 1
98
+ self.decoder_embed_dim = self.model_args.decoder_embed_dim
99
+ self.vocab_size = self.model_args.vocab_size
100
+ self.seq_len = 128
101
+ self.input = {
102
+ "tokens": torch.arange(self.batch_size * self.seq_len).reshape(
103
+ self.batch_size, self.seq_len
104
+ ),
105
+ "encoder_input": fixed_init_tensor(
106
+ (self.batch_size, self.seq_len, self.decoder_embed_dim),
107
+ min_val=-1,
108
+ max_val=1,
109
+ ),
110
+ "encoder_mask": None,
111
+ }
112
+
113
+ @torch.no_grad()
114
+ def test_llama_mm_decoder(self):
115
+ model = MultimodalDecoder(self.model_args)
116
+ fixed_init_model(model, min_val=-1, max_val=1)
117
+ output = model(**self.input)
118
+ expected_shape = (self.batch_size, self.seq_len, self.vocab_size)
119
+ assert (
120
+ output.shape == expected_shape
121
+ ), f"Expected shape {expected_shape}, but got {output.shape}"
122
+
123
+ # TODO: Need to ensure numerical stability before doing convergence test.
124
+ # output.mean() = -0.0134, we need to debug why it is not close to -9.47548e-5, which is
125
+ # the test value from the original torch tune test
126
+ # assert torch.allclose(
127
+ # output.mean(), torch.tensor(-9.47548e-5), atol=1e-3, rtol=1e-3
128
+ # )
torchtitan/experiments/multimodal/utils.py ADDED
@@ -0,0 +1,437 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+
9
+ from collections import defaultdict
10
+
11
+ from pathlib import Path
12
+ from typing import List, Optional, Set, Tuple, Union
13
+ from urllib import request
14
+
15
+ import torch
16
+ import torchvision
17
+ from torchvision.transforms.v2 import functional as F
18
+
19
+ # NOTE Copied from torchtune.modules.transforms.vision_utils.tile_crop.py
20
+ def tile_crop(image: torch.Tensor, tile_size: int) -> torch.Tensor:
21
+ """
22
+ Divides a tensor into equally sized tiles. The tensor's height and width must be divisible by tile_size.
23
+
24
+ Args:
25
+ image (torch.Tensor): Input image to crop into tiles.
26
+ tile_size (int): Size of each tile.
27
+
28
+ Returns:
29
+ torch.Tensor: torch.Tensor of shape [num_tiles, channel_size, tile_size, tile_size]
30
+
31
+ Examples:
32
+ >>> image = torch.rand(3, 200, 300)
33
+ >>> tiles = tile_crop(image, tile_size=50)
34
+ >>> tiles.shape # 4x6 = 24 tiles
35
+ torch.Size([24, 3, 50, 50])
36
+
37
+ >>> image = torch.rand(3, 400, 600)
38
+ >>> tiles = tile_crop(image, tile_size=200)
39
+ >>> tiles.shape # 2x3 = 6 tiles
40
+ torch.Size([6, 3, 200, 200])
41
+ """
42
+
43
+ channel_size, height, width = image.shape
44
+
45
+ # assert sizes are divisible
46
+ assert (
47
+ height % tile_size == 0 and width % tile_size == 0
48
+ ), f"Image size {height}x{width} is not divisible by tile size {tile_size}"
49
+
50
+ # Reshape to split height and width into tile_size blocks
51
+ tiles_height = height // tile_size
52
+ tiles_width = width // tile_size
53
+
54
+ reshaped = image.view(channel_size, tiles_height, tile_size, tiles_width, tile_size)
55
+
56
+ # Transpose to bring tiles together
57
+ # We want [tiles_height, tiles_width, channel_size, tile_size, tile_size]
58
+ transposed = reshaped.permute(1, 3, 0, 2, 4)
59
+
60
+ # Flatten the tiles
61
+ tiles = transposed.contiguous().view(
62
+ tiles_height * tiles_width, channel_size, tile_size, tile_size
63
+ )
64
+
65
+ return tiles
66
+
67
+
68
+ # NOTE Copied from torchtune.modules.transforms.vision_utils.resize_with_pad.py
69
+ def resize_with_pad(
70
+ image: torch.Tensor,
71
+ target_size: Tuple[int, int],
72
+ resample: torchvision.transforms.InterpolationMode,
73
+ max_size: Optional[int] = None,
74
+ ) -> torch.Tensor:
75
+ """
76
+ Resizes and pads an image to target_size without causing distortion.
77
+ The user can set max_size to limit upscaling when target_size exceeds image_size.
78
+
79
+ Args:
80
+ image (torch.Tensor): The input image tensor in the format [..., H, W].
81
+ target_size (Tuple[int, int]): The desired resolution to fit the image into in the format [height, width].
82
+ resample (torchvision.transforms.InterpolationMode): Resampling method used when resizing images.
83
+ Supports torchvision.transforms.InterpolationMode.NEAREST, InterpolationMode.NEAREST_EXACT,
84
+ InterpolationMode.BILINEAR and InterpolationMode.BICUBIC.
85
+ max_size (Optional[int]): The maximum size to upscale the image to.
86
+ If None, will upscale up to target_size.
87
+
88
+ Returns:
89
+ torch.Tensor: The resized and padded image tensor in the format [..., H, W].
90
+
91
+ Examples:
92
+
93
+ Example 1: The image will be upscaled from (300, 800) to (448, 1194), since 448 is the limiting side,
94
+ and then padded from (448, 1194) to (448, 1344).
95
+
96
+ >>> max_size = None
97
+ >>> image = torch.rand([3, 300, 800])
98
+ >>> target_size = (448, 1344)
99
+ >>> resample = torchvision.transforms.InterpolationMode.BILINEAR
100
+ >>> output = resize_with_pad(image, target_size, resample, max_size)
101
+
102
+ Example 2: The image will stay as is, since 800 > 600, and then padded from (300, 800) to (448, 1344).
103
+
104
+ >>> max_size = 600
105
+ >>> image = torch.rand([3, 300, 800])
106
+ >>> target_size = (448, 1344)
107
+ >>> resample = torchvision.transforms.InterpolationMode.BILINEAR
108
+ >>> output = resize_with_pad(image, target_size, resample, max_size)
109
+
110
+ Example 3: The image will be downscaled from (500, 1000) to (224, 448),
111
+ and padded from (224, 448) to (448, 448).
112
+
113
+ >>> max_size = 600
114
+ >>> image = torch.rand([3, 500, 1000])
115
+ >>> target_size = (448, 488)
116
+ >>> resample = torchvision.transforms.InterpolationMode.BILINEAR
117
+ >>> output = resize_with_pad(image, target_size, resample, max_size)
118
+
119
+ """
120
+
121
+ image_height, image_width = image.shape[-2:]
122
+ image_size = (image_height, image_width)
123
+
124
+ # If target_size requires upscaling, we might want to limit the upscaling to max_size
125
+ if max_size is not None:
126
+ new_target_height = min(max(image_height, max_size), target_size[0])
127
+ new_target_width = min(max(image_width, max_size), target_size[1])
128
+ target_size_resize = (new_target_height, new_target_width)
129
+ else:
130
+ target_size_resize = target_size
131
+
132
+ # resize to target_size while preserving aspect ratio
133
+ new_size_preserving_aspect_ratio = _get_max_res_without_distortion(
134
+ image_size=image_size,
135
+ target_size=target_size_resize,
136
+ )
137
+
138
+ image = F.resize(
139
+ inpt=image,
140
+ size=list(new_size_preserving_aspect_ratio),
141
+ interpolation=resample,
142
+ antialias=True,
143
+ )
144
+
145
+ image = _pad_image_top_left(image=image, target_size=target_size)
146
+
147
+ return image
148
+
149
+
150
+ # NOTE Copied from torchtune.modules.transforms.vision_utils.resize_with_pad.py
151
+ def _pad_image_top_left(
152
+ image: torch.Tensor,
153
+ target_size: Tuple[int, int],
154
+ ) -> torch.Tensor:
155
+ """
156
+ Places the image at the top left of the canvas and pads with 0 the right and bottom
157
+ to fit to the target resolution. If target_size < image_size, it will crop the image.
158
+
159
+ Args:
160
+ image (torch.Tensor): The input image tensor in the format [..., H, W].
161
+ target_size (Tuple[int, int]): The desired resolution to fit the image into in the format [height, width].
162
+
163
+ Returns:
164
+ torch.Tensor: The padded image tensor in the format [..., H, W].
165
+ """
166
+
167
+ image_size = image.shape[-2:]
168
+
169
+ height, width = image_size
170
+ target_height, target_width = target_size
171
+
172
+ pad_x = target_width - width
173
+ pad_y = target_height - height
174
+
175
+ padding = [0, 0, pad_x, pad_y]
176
+ return F.pad(inpt=image, padding=padding)
177
+
178
+
179
+ # NOTE Copied from torchtune.modules.transforms.vision_utils.resize_with_pad.py
180
+ def _get_max_res_without_distortion(
181
+ image_size: Tuple[int, int],
182
+ target_size: Tuple[int, int],
183
+ ) -> Tuple[int, int]:
184
+ """
185
+ Determines the maximum resolution to which an image can be resized to without distorting its
186
+ aspect ratio, based on the target resolution.
187
+
188
+ For example, if image_size = (200,400) and target_size = (600,800),
189
+ scale_h = 600/200 = 3
190
+ scale_w = 800/400 = 2
191
+ So the maximum that we can upscale without distortion is min(scale_h, scale_w) = 2
192
+
193
+ Since scale_w is the limiting side, then new_w = target_w, and new_h = old_h*scale_w
194
+
195
+ Args:
196
+ image_size (Tuple[int, int]): The original resolution of the image.
197
+ target_size (Tuple[int, int]): The desired resolution to fit the image into.
198
+ Returns:
199
+ Tuple[int, int]: The optimal dimensions to which the image should be resized.
200
+ Examples:
201
+ >>> _get_max_res_without_distortion([200, 300], target_size = (450, 200))
202
+ (133, 200)
203
+ >>> _get_max_res_without_distortion([800, 600], target_size = (450, 1300))
204
+ (450, 337)
205
+ """
206
+
207
+ original_height, original_width = image_size
208
+ target_height, target_width = target_size
209
+
210
+ scale_w = target_width / original_width
211
+ scale_h = target_height / original_height
212
+
213
+ if scale_w < scale_h:
214
+ new_width = target_width
215
+ new_height = min(math.floor(original_height * scale_w), target_height)
216
+ else:
217
+ new_height = target_height
218
+ new_width = min(math.floor(original_width * scale_h), target_width)
219
+
220
+ return new_height, new_width
221
+
222
+
223
+ # NOTE Copied from torchtune.modules.transforms.vision_utils.get_canvas_best_fit.py
224
+ def _get_factors(n: int) -> Set[int]:
225
+ """
226
+ Calculate all factors of a given number, i.e. a divisor that leaves no remainder.
227
+
228
+ Args:
229
+ n (int): The number to find factors for.
230
+
231
+ Returns:
232
+ set: A set containing all factors of the number.
233
+
234
+ Examples:
235
+ >>> _get_factors(n=12)
236
+ {1, 2, 3, 4, 6, 12}
237
+ """
238
+ factors_set = set()
239
+
240
+ for i in range(1, int(n**0.5) + 1):
241
+ if n % i == 0:
242
+ factors_set.add(i)
243
+ factors_set.add(n // i)
244
+ return factors_set
245
+
246
+
247
+ # NOTE Copied from torchtune.modules.transforms.vision_utils.get_canvas_best_fit.py
248
+ def get_canvas_best_fit(
249
+ image: torch.Tensor, possible_resolutions: torch.Tensor, resize_to_max_canvas: bool
250
+ ) -> Tuple[int, int]:
251
+ """
252
+ Determines the best canvas possible from a list of possible resolutions to
253
+ resize an image to, without distortion.
254
+
255
+ For each possible resolution, calculates the scaling factors for
256
+ width and height, and selects the smallest one, which is the limiting side.
257
+ E.g. if to match a canvas shape you have to upscale an image's height by 2x, and width by 1.5x,
258
+ then the maximum upscaling without distortion is min(2, 1.5) = 1.5.
259
+
260
+ If there are multiple canvases that satisfy the conditions,
261
+ we pick the one with the lowest area to minimize padding.
262
+
263
+ Args:
264
+ image (torch.Tensor): The image we want to fit into a canvas.
265
+ possible_resolutions (torch.Tensor): A tensor of shape (N, 2) where each
266
+ row represents a possible canvas.
267
+ resize_to_max_canvas (bool): If True, pick the canvas that allows maximum scaling.
268
+ If False, pick the canvas that minimizes downscaling, including no downscaling at all.
269
+
270
+ Returns:
271
+ Tuple[int, int]: The best resolution to fit the image into.
272
+
273
+ Examples:
274
+ >>> image = torch.rand(3, 200, 300)
275
+ >>> possible_resolutions = torch.tensor([
276
+ ... [224, 672],
277
+ ... [672, 224],
278
+ ... [224, 448],
279
+ ... [448, 224],
280
+ ... [224, 224]
281
+ ... ])
282
+ >>> get_canvas_best_fit(image, possible_resolutions, resize_to_max_canvas=False)
283
+ (224, 448)
284
+
285
+ In the example above, we calculate the scaling factors for each possible resolution
286
+
287
+ >>> scale_height = torch.tensor([1.1200, 3.3600, 1.1200, 2.2400, 1.1200])
288
+ >>> scale_width = torch.tensor([2.2400, 0.7467, 1.4933, 0.7467, 0.7467])
289
+ >>> scales = torch.tensor([1.1200, 0.7467, 1.1200, 0.7467, 0.7467])
290
+
291
+ Two options have scaling_factor > 1, since resize_to_max_canvas is False, we pick the smallest
292
+
293
+ >>> upscaling_options = torch.tensor([1.1200, 1.1200])
294
+ >>> selected_scale = torch.tensor(1.1200)
295
+
296
+ There are two possible options, so we pick the one with the smallest area
297
+
298
+ >>> areas = torch.tensor([150528, 100352]) # for resolutions [672, 224] and [224, 448], respectively
299
+ >>> optimal_canvas = torch.tensor([224, 448]) # resolution with the smallest area
300
+ """
301
+
302
+ original_height, original_width = image.shape[-2:]
303
+
304
+ # possible resolutions heights/widths
305
+ target_heights, target_widths = (
306
+ possible_resolutions[:, 0],
307
+ possible_resolutions[:, 1],
308
+ )
309
+
310
+ # scaling factors to resize the image without distortion
311
+ scale_w = target_widths / original_width
312
+ scale_h = target_heights / original_height
313
+
314
+ # get limiting side scaling -> no distortion
315
+ scales = torch.where(scale_w > scale_h, scale_h, scale_w)
316
+
317
+ # filter only scales that allow upscaling
318
+ upscaling_options = scales[scales >= 1]
319
+ if len(upscaling_options) > 0:
320
+ if resize_to_max_canvas:
321
+ selected_scale = torch.max(upscaling_options)
322
+ else:
323
+ selected_scale = torch.min(upscaling_options)
324
+ else:
325
+ # no upscaling possible,
326
+ # get the minimum downscaling (max scale for scales<1)
327
+ downscaling_options = scales[scales < 1]
328
+ selected_scale = torch.max(downscaling_options)
329
+
330
+ # get all resolutions that support this scaling factor,
331
+ # e.g. you can upscale to 224x224, 224x448, 224x672 without distortion
332
+ chosen_canvas = possible_resolutions[scales == selected_scale]
333
+
334
+ # if there are multiple resolutions,
335
+ # get the one with minimum area to reduce padding
336
+ if len(chosen_canvas) > 1:
337
+ areas = chosen_canvas[:, 0] * chosen_canvas[:, 1]
338
+ optimal_idx = torch.argmin(areas)
339
+ optimal_canvas = chosen_canvas[optimal_idx]
340
+ else:
341
+ optimal_canvas = chosen_canvas[0]
342
+
343
+ return tuple(optimal_canvas.tolist())
344
+
345
+
346
+ # NOTE Copied from torchtune.modules.transforms.vision_utils.get_canvas_best_fit.py
347
+ def find_supported_resolutions(
348
+ max_num_tiles: int, tile_size: int
349
+ ) -> List[Tuple[int, int]]:
350
+ """
351
+ Computes all combinations of resolutions, multiple of tile_size,
352
+ that contain up to max_num_tiles. Useful for when dividing an image into tiles.
353
+
354
+ For example, if we want at most 2 tiles per image, then we can support the
355
+ following resolutions: (1x1, 1x2, 2x1) * tile_size
356
+
357
+ Args:
358
+ max_num_tiles (int): Maximum number of tiles.
359
+ tile_size (int): Size of the side of the tile.
360
+
361
+ Returns:
362
+ List[Tuple[int, int]]: List of possible resolutions as tuples (height, width).
363
+
364
+ Examples:
365
+
366
+ >>> max_num_tiles = 4
367
+ >>> tile_size = 224
368
+ >>> find_supported_resolutions(max_num_tiles, tile_size)
369
+ [(224, 896), (448, 448), (224, 224), (896, 224), (224, 672), (672, 224), (224, 448), (448, 224)]
370
+ """
371
+
372
+ # create dictionary {aspect_ratio: [resolution1, ..., resolution n]}
373
+ # example {0.25: [(1,4)], 1.0: [(2,2), (1,1)], 4.0: [(4,1)]}
374
+ asp_dict = defaultdict(list)
375
+ for _tile_size in range(max_num_tiles, 0, -1):
376
+ factors = sorted(_get_factors(_tile_size))
377
+ asp_ratios = [(factor, _tile_size // factor) for factor in factors]
378
+ for height, width in asp_ratios:
379
+ ratio_float = height / width
380
+ asp_dict[ratio_float].append((height, width))
381
+
382
+ # get the resolutions multiplied by the tile_size
383
+ possible_resolutions = []
384
+ for ar, resolution in asp_dict.items():
385
+ for height, width in resolution:
386
+ possible_resolutions.append((height * tile_size, width * tile_size))
387
+
388
+ return possible_resolutions
389
+
390
+
391
+ # NOTE Copied from torchtune.data._utils.py
392
+ def load_image(image_loc: Union[Path, str]) -> torch.Tensor:
393
+ """
394
+ Convenience method to load an image in torch.Tensor format from a local file path or remote source.
395
+
396
+ Args:
397
+ image_loc (Union[Path, str]): Local file path or remote source pointing to the image
398
+ which will be loaded in PIL format.
399
+
400
+ Note:
401
+ If loading an image from a remote source, the function expects the URL provided in ``image_loc``
402
+ to start with "http" or "https" e.g. "https://www.wikipedia.org/en/bird.jpg".
403
+
404
+ Raises:
405
+ ValueError: If the image cannot be loaded from remote source, **or**
406
+ if the image cannot be opened as a :class:`~torch.Tensor`.
407
+
408
+ Examples:
409
+ >>> # Load from remote source
410
+ >>> image = load_image("https://www.wikipedia.org/en/bird.jpg")
411
+
412
+ >>> # Load from local file path
413
+ >>> image = load_image(Path("/home/user/bird.jpg"))
414
+
415
+ Returns:
416
+ torch.Tensor: The loaded image.
417
+ """
418
+
419
+ # If pointing to remote source, try to load to local
420
+ if isinstance(image_loc, str) and image_loc.startswith("http"):
421
+ try:
422
+ image_loc = request.urlopen(image_loc).read()
423
+ image = torchvision.io.decode_image(
424
+ torch.frombuffer(image_loc, dtype=torch.uint8),
425
+ mode="RGB",
426
+ )
427
+ except Exception as e:
428
+ raise ValueError("Failed to load remote image as torch.Tensor") from e
429
+
430
+ # Open the local image as a Tensor image
431
+ else:
432
+ try:
433
+ image = torchvision.io.decode_image(image_loc, mode="RGB")
434
+ except Exception as e:
435
+ raise ValueError("Failed to load local image as torch.Tensor") from e
436
+
437
+ return image
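
The helpers above compose into the canvas-fit and tiling pipeline used by the CLIP transform. The following sketch assumes the functions defined in this file are in scope; the input size and tile size are illustrative.

```python
import torch
import torchvision

image = torch.rand(3, 300, 800)

# All tile grids with at most 4 tiles of size 224, as (height, width) resolutions
resolutions = torch.tensor(find_supported_resolutions(max_num_tiles=4, tile_size=224))
canvas = get_canvas_best_fit(image, resolutions, resize_to_max_canvas=False)

# Resize without distortion, pad top-left to the chosen canvas, then cut into tiles
resized = resize_with_pad(
    image,
    target_size=canvas,
    resample=torchvision.transforms.InterpolationMode.BILINEAR,
)
tiles = tile_crop(resized, tile_size=224)
print(canvas, tiles.shape)  # (224, 672) torch.Size([3, 3, 224, 224])
```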
torchtitan/experiments/simple_fsdp/__init__.py ADDED
@@ -0,0 +1,33 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ #
7
+ # Copyright (c) Meta Platforms, Inc. All Rights Reserved.
8
+
9
+ from torchtitan.components.loss import build_cross_entropy_loss
10
+ from torchtitan.components.lr_scheduler import build_lr_schedulers
11
+ from torchtitan.components.optimizer import build_optimizers
12
+ from torchtitan.datasets.hf_datasets import build_hf_dataloader
13
+ from torchtitan.datasets.tokenizer.tiktoken import build_tiktoken_tokenizer
14
+ from torchtitan.models.llama3 import llama3_configs, pipeline_llama
15
+ from torchtitan.protocols.train_spec import register_train_spec, TrainSpec
16
+
17
+ from .model import SimpleFSDPTransformer
18
+ from .parallelize_llama import parallelize_llama
19
+
20
+ register_train_spec(
21
+ TrainSpec(
22
+ name="llama3_simple_fsdp",
23
+ cls=SimpleFSDPTransformer,
24
+ config=llama3_configs,
25
+ parallelize_fn=parallelize_llama,
26
+ pipelining_fn=pipeline_llama,
27
+ build_optimizers_fn=build_optimizers,
28
+ build_lr_schedulers_fn=build_lr_schedulers,
29
+ build_dataloader_fn=build_hf_dataloader,
30
+ build_tokenizer_fn=build_tiktoken_tokenizer,
31
+ build_loss_fn=build_cross_entropy_loss,
32
+ )
33
+ )
torchtitan/experiments/simple_fsdp/__pycache__/parallelize_llama.cpython-311.pyc ADDED
Binary file (2.67 kB). View file
 
torchtitan/experiments/simple_fsdp/__pycache__/simple_fsdp.cpython-311.pyc ADDED
Binary file (7.22 kB). View file
 
torchtitan/experiments/simple_fsdp/model.py ADDED
@@ -0,0 +1,18 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from torchtitan.models.llama3 import Transformer, TransformerModelArgs
8
+ from .simple_fsdp import disable_data_parallel
9
+
10
+
11
+ class SimpleFSDPTransformer(Transformer):
12
+ def __init__(self, model_args: TransformerModelArgs):
13
+ super().__init__(model_args)
14
+ self.init_weights()
15
+
16
+ def init_weights(self, *args, **kwargs):
17
+ with disable_data_parallel():
18
+ super().init_weights(*args, **kwargs)
torchtitan/experiments/simple_fsdp/parallelize_llama.py ADDED
@@ -0,0 +1,98 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ from torch.distributed import DeviceMesh
11
+
12
+ from torchtitan.config_manager import JobConfig, TORCH_DTYPE_MAP
13
+ from torchtitan.distributed import ParallelDims
14
+ from torchtitan.models.llama3.parallelize_llama import apply_ac
15
+ from torchtitan.tools.logging import logger
16
+
17
+ from .simple_fsdp import data_parallel, MixedPrecisionPolicy
18
+
19
+
20
+ def parallelize_llama(
21
+ model: nn.Module,
22
+ world_mesh: DeviceMesh,
23
+ parallel_dims: ParallelDims,
24
+ job_config: JobConfig,
25
+ ):
26
+ """
27
+ Apply activation checkpointing, torch.compile, and data parallelism to the
28
+ model. Tensor parallelism is not yet supported here (see the TODO below).
29
+
30
+ NOTE: The passed-in model preferably should be on meta device. Otherwise,
31
+ the model must fit on GPU or CPU memory.
32
+ """
33
+ # TODO(ruisizhang123): Add support for TP (on-going)
34
+ # if parallel_dims.tp_enabled:
35
+ # if (
36
+ # job_config.parallelism.enable_async_tensor_parallel
37
+ # and not job_config.training.compile
38
+ # ):
39
+ # raise RuntimeError("Async TP requires --training.compile")
40
+
41
+ # enable_float8_linear = "float8" in job_config.model.converters
42
+ # float8_is_rowwise = job_config.float8.recipe_name in (
43
+ # "rowwise",
44
+ # "rowwise_with_gw_hp",
45
+ # )
46
+
47
+ # # For now, float8 all-gather with TP is only supported for tensorwise
48
+ # # float8 scaling recipes. For rowwise recipes, we use regular TP and
49
+ # # all-gather happens in high precision.
50
+ # enable_float8_tensorwise_tp = enable_float8_linear and not float8_is_rowwise
51
+
52
+ # apply_tp(
53
+ # model,
54
+ # world_mesh["tp"],
55
+ # loss_parallel=parallel_dims.loss_parallel_enabled,
56
+ # enable_float8_tensorwise_tp=enable_float8_tensorwise_tp,
57
+ # enable_async_tp=job_config.parallelism.enable_async_tensor_parallel,
58
+ # )
59
+
60
+ if job_config.activation_checkpoint.mode != "none":
61
+ apply_ac(model, job_config.activation_checkpoint)
62
+
63
+ # apply data parallel
64
+ if (
65
+ parallel_dims.dp_replicate_enabled
66
+ or parallel_dims.dp_shard_enabled
67
+ or parallel_dims.cp_enabled
68
+ ):
69
+ if parallel_dims.dp_replicate_enabled:
70
+ if parallel_dims.dp_shard_enabled or parallel_dims.cp_enabled:
71
+ dp_mesh_dim_names = ("dp_replicate", "dp_shard_cp")
72
+ dp_mode = "hybrid_shard"
73
+ else:
74
+ dp_mesh_dim_names = ("dp_replicate",)
75
+ dp_mode = "replicate"
76
+ else:
77
+ dp_mesh_dim_names = ("dp_shard_cp",)
78
+ dp_mode = "fully_shard"
79
+
80
+ mp_policy = MixedPrecisionPolicy(
81
+ param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
82
+ reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
83
+ )
84
+
85
+ model = data_parallel(
86
+ model,
87
+ world_mesh[tuple(dp_mesh_dim_names)],
88
+ mode=dp_mode,
89
+ ac_mode=job_config.activation_checkpoint.mode,
90
+ mp_policy=mp_policy,
91
+ )
92
+ logger.info("Applied Data Parallel (dp mode=%s) to the model", dp_mode)
93
+
94
+ if job_config.training.compile:
95
+ torch._inductor.config.reorder_for_peak_memory = False
96
+ model = torch.compile(model, fullgraph=True)
97
+
98
+ return model
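
For readability, the mesh-name and mode selection in the data-parallel branch above boils down to the following pure-Python restatement (a documentation sketch only, valid when at least one of the three flags is enabled; it is not part of the module):

```python
def pick_dp_mode(dp_replicate: bool, dp_shard: bool, cp: bool):
    """Return (mesh dim names, dp mode) as chosen by parallelize_llama above."""
    if dp_replicate:
        if dp_shard or cp:
            return ("dp_replicate", "dp_shard_cp"), "hybrid_shard"
        return ("dp_replicate",), "replicate"
    return ("dp_shard_cp",), "fully_shard"


assert pick_dp_mode(True, True, False) == (("dp_replicate", "dp_shard_cp"), "hybrid_shard")
assert pick_dp_mode(False, True, False) == (("dp_shard_cp",), "fully_shard")
```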
torchtitan/models/llama3/train_configs/llama3_8b.toml ADDED
@@ -0,0 +1,63 @@
1
+ # torchtitan Config.toml
2
+ # NOTE: this toml config is a preset for 64 A100 GPUs.
3
+
4
+ [job]
5
+ dump_folder = "./outputs"
6
+ description = "Llama 3 8B training"
7
+
8
+ [profiling]
9
+ enable_profiling = true
10
+ save_traces_folder = "profile_trace"
11
+ profile_freq = 100
12
+
13
+ [metrics]
14
+ log_freq = 10
15
+ enable_tensorboard = true
16
+ save_tb_folder = "tb"
17
+
18
+ [model]
19
+ name = "llama3"
20
+ flavor = "8B"
21
+ norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm
22
+ tokenizer_path = "./assets/tokenizer/original/tokenizer.model"
23
+ # converters = "float8"
24
+
25
+ [optimizer]
26
+ name = "AdamW"
27
+ lr = 3e-4
28
+ eps = 1e-8
29
+
30
+ [lr_scheduler]
31
+ warmup_steps = 200 # lr scheduler warm up
32
+
33
+ [training]
34
+ batch_size = 1
35
+ seq_len = 8192
36
+ max_norm = 1.0 # grad norm clipping
37
+ steps = 1000
38
+ compile = false
39
+ dataset = "c4"
40
+
41
+ [parallelism]
42
+ data_parallel_replicate_degree = 1
43
+ data_parallel_shard_degree = -1
44
+ tensor_parallel_degree = 1
45
+ pipeline_parallel_degree = 1
46
+ context_parallel_degree = 1
47
+
48
+ [checkpoint]
49
+ enable_checkpoint = false
50
+ folder = "checkpoint"
51
+ interval = 500
52
+ model_weights_only = false
53
+ export_dtype = "float32"
54
+ async_mode = "disabled" # ["disabled", "async", "async_with_pinned_mem"]
55
+
56
+ [activation_checkpoint]
57
+ mode = 'selective' # ['none', 'selective', 'full']
58
+ selective_ac_option = 'op' # a positive int = AC every n-th layer; 'op' = AC based on the ops policy
59
+
60
+ [float8]
61
+ enable_fsdp_float8_all_gather = false
62
+ precompute_float8_dynamic_scale_for_fsdp = false
63
+ filter_fqns = "output"
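
At train time torchtitan parses presets like this through its `JobConfig`; if you just want to inspect or tweak a preset programmatically, the standard library is enough. A sketch, assuming Python 3.11+ and the preset's path in this repo:

```python
import tomllib

with open("torchtitan/models/llama3/train_configs/llama3_8b.toml", "rb") as f:
    cfg = tomllib.load(f)

print(cfg["model"]["flavor"])       # 8B
print(cfg["training"]["seq_len"])   # 8192
print(cfg["parallelism"]["data_parallel_shard_degree"])  # -1 (shard degree inferred from the remaining world size)
```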
torchtitan/models/norms.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch.nn as nn
8
+
9
+
10
+ def build_norm(norm_type: str, dim: int, eps: float = 1e-6):
11
+ """
12
+ Builds the specified normalization layer based on the norm_type.
13
+
14
+ Args:
15
+ norm_type (str): The type of normalization layer to build.
16
+ Supported types: layernorm, np_layernorm, rmsnorm
17
+ dim (int): The dimension of the normalization layer.
18
+ eps (float, optional): The epsilon value for numerical stability. Defaults to 1e-6.
19
+
20
+ Returns:
21
+ The built normalization layer.
22
+
23
+ Raises:
24
+ NotImplementedError: If an unknown norm_type is provided.
25
+ """
26
+ norm_type = norm_type.lower() # Normalize to lowercase
27
+
28
+ if norm_type == "layernorm":
29
+ return nn.LayerNorm(dim, eps=eps, bias=False)
30
+ elif norm_type == "np_layernorm":
31
+ return nn.LayerNorm(dim, eps=eps, elementwise_affine=False, bias=False)
32
+ elif norm_type == "rmsnorm":
33
+ return nn.RMSNorm(dim, eps=eps)
34
+ else:
35
+ raise NotImplementedError(f"Unknown norm_type: '{norm_type}'")
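
A quick usage sketch for `build_norm`; the import path assumes this file is importable as `torchtitan.models.norms`, and `nn.RMSNorm` requires a reasonably recent PyTorch:

```python
import torch
from torchtitan.models.norms import build_norm  # assumed import path for this file

norm = build_norm("rmsnorm", dim=4096)
x = torch.randn(2, 16, 4096)
y = norm(x)  # normalizes over the last dimension, shape is preserved
print(type(norm).__name__, y.shape)  # RMSNorm torch.Size([2, 16, 4096])
```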
torchtitan/protocols/__pycache__/model_converter.cpython-311.pyc ADDED
Binary file (5.05 kB). View file
 
torchtitan/tools/utils.py ADDED
@@ -0,0 +1,143 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import gc
8
+ import subprocess
9
+ import time
10
+ from dataclasses import dataclass
11
+ from typing import Optional
12
+
13
+ import torch
14
+ from torch._utils import _get_available_device_type, _get_device_module
15
+
16
+ from torchtitan.tools.logging import logger
17
+
18
+
19
+ def get_device_info():
20
+ device_type = _get_available_device_type()
21
+ if device_type is None:
22
+ device_type = "cuda" # default device_type: cuda
23
+ device_module = _get_device_module(device_type) # default device_module:torch.cuda
24
+ return device_type, device_module
25
+
26
+
27
+ device_type, device_module = get_device_info()
28
+
29
+
30
+ # used to avoid stragglers in garbage collection
31
+ class GarbageCollection:
32
+ def __init__(self, gc_freq=1000):
33
+ assert gc_freq > 0, "gc_freq must be a positive integer"
34
+ self.gc_freq = gc_freq
35
+ gc.disable()
36
+ self.collect("Initial GC collection.")
37
+
38
+ def run(self, step_count):
39
+ if step_count > 1 and step_count % self.gc_freq == 0:
40
+ self.collect("Performing periodic GC collection.")
41
+
42
+ @staticmethod
43
+ def collect(reason: str):
44
+ begin = time.monotonic()
45
+ gc.collect(1)
46
+ logger.info("[GC] %s %.2f seconds.", reason, time.monotonic() - begin)
47
+
48
+
49
+ # hardcoded BF16 type peak flops for NVIDIA A100, H100, H200 GPU and AMD MI250, MI300X, AMD MI325X and Intel PVC
50
+ def get_peak_flops(device_name: str) -> int:
51
+ try:
52
+ # Run the lspci command and capture the output
53
+ result = subprocess.run(["lspci"], stdout=subprocess.PIPE, text=True)
54
+ # Filter the output for lines containing both "NVIDIA" and "H100"
55
+ filtered_lines = [
56
+ line
57
+ for line in result.stdout.splitlines()
58
+ if "NVIDIA" in line and "H100" in line
59
+ ]
60
+ # Join all filtered lines into a single string
61
+ device_name = " ".join(filtered_lines) or device_name
62
+ except FileNotFoundError as e:
63
+ logger.warning(f"Error running lspci: {e}, fallback to use device_name")
64
+ if "A100" in device_name:
65
+ # data from https://www.nvidia.com/en-us/data-center/a100/
66
+ return 312e12
67
+ elif "H100" in device_name:
68
+ # data from https://www.nvidia.com/en-us/data-center/h100/
69
+ # NOTE: the values below are dense (without sparsity); datasheet specs with sparsity are 2x higher.
70
+ if "NVL" in device_name:
71
+ return 835e12
72
+ elif "PCIe" in device_name:
73
+ return 756e12
74
+ else: # for H100 SXM and other variants
75
+ return 989e12
76
+ elif "H200" in device_name:
77
+ # data from https://www.nvidia.com/en-us/data-center/h200/
78
+ return 989e12
79
+ elif "MI300X" in device_name or "MI325X" in device_name:
80
+ # MI300X data from https://www.amd.com/en/products/accelerators/instinct/mi300/mi300x.html
81
+ # MI325X data from https://www.amd.com/en/products/accelerators/instinct/mi300/mi325x.html
82
+ return 1300e12
83
+ elif "MI250X" in device_name:
84
+ # data from https://www.amd.com/en/products/accelerators/instinct/mi200/mi250x.html (per GCD)
85
+ return 191.5e12
86
+ elif "Data Center GPU Max 1550" in device_name:
87
+ # Also known as Ponte Vecchio (PVC).
88
+ # data from https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html
89
+ # Dot Product Accumulate Systolic (DPAS):
90
+ # - Freq: 1300MHz
91
+ # - #ops: 512
92
+ # Full EU mode (i.e. 512 max compute units): 340.8 TFLOPS (BF16)
93
+ # Standard EU mode (i.e. 448 max compute units): 298.2 TFLOPS (BF16)
94
+ max_comp_units = torch.xpu.get_device_properties("xpu").max_compute_units
95
+ return 512 * max_comp_units * 1300 * 10**6
96
+ else: # for other GPU types, assume A100
97
+ logger.warning(f"Peak flops undefined for: {device_name}, fallback to A100")
98
+ return 312e12
99
+
100
+
101
+ @dataclass(frozen=True)
102
+ class Color:
103
+ black = "\033[30m"
104
+ red = "\033[31m"
105
+ green = "\033[32m"
106
+ yellow = "\033[33m"
107
+ blue = "\033[34m"
108
+ magenta = "\033[35m"
109
+ cyan = "\033[36m"
110
+ white = "\033[37m"
111
+ reset = "\033[39m"
112
+
113
+
114
+ @dataclass(frozen=True)
115
+ class NoColor:
116
+ black = ""
117
+ red = ""
118
+ green = ""
119
+ yellow = ""
120
+ blue = ""
121
+ magenta = ""
122
+ cyan = ""
123
+ white = ""
124
+ reset = ""
125
+
126
+
127
+ def check_if_feature_in_pytorch(
128
+ feature_name: str,
129
+ pull_request: str,
130
+ min_nightly_version: Optional[str] = None,
131
+ ) -> None:
132
+ if "git" in torch.__version__: # pytorch is built from source
133
+ # notify users to check if the pull request is included in their pytorch
134
+ logger.warning(
135
+ "detected that PyTorch is built from source. Please make sure the PR "
136
+ f"({pull_request}) is included in PyTorch for correct {feature_name}."
137
+ )
138
+ elif min_nightly_version is not None and torch.__version__ < min_nightly_version:
139
+ logger.warning(
140
+ f"detected that the pytorch version {torch.__version__} is older than "
141
+ f"{min_nightly_version}. Please upgrade to a newer version to include the "
142
+ f"change in ({pull_request}) for correct {feature_name}."
143
+ )
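
As a closing illustration, `get_peak_flops` is typically combined with a measured FLOP rate to report model FLOPS utilization (MFU). A hedged sketch, assuming a CUDA device is visible and that the achieved rate comes from the training loop's own accounting (the number below is made up):

```python
import torch

device_name = torch.cuda.get_device_name(0)
peak_flops = get_peak_flops(device_name)  # defined above in this file

achieved_flops_per_sec = 400e12  # illustrative value, not a measurement
mfu = 100 * achieved_flops_per_sec / peak_flops
print(f"{device_name}: peak {peak_flops / 1e12:.0f} TFLOPS, MFU {mfu:.1f}%")
```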