import torch
from datasets import load_dataset
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.utils import dispatch_for_generation

# NOTE: Requires a minimum of transformers 4.57.0
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"

# Load model and processor, then swap in calibration-friendly module
# implementations for the oneshot pass.
model = Qwen3VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = replace_modules_for_calibration(model)

DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")


def preprocess_function(example):
    # Re-wrap each message's plain-text content in the structured format
    # expected by the chat template, then tokenize.
    messages = []
    for message in example["messages"]:
        messages.append(
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}],
            }
        )

    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )


ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)


def data_collator(batch):
    # Calibration runs with batch size 1; cast pixel values to bfloat16 to
    # match the model dtype and drop their leading batch dimension.
    assert len(batch) == 1
    return {
        key: (
            torch.tensor(value)
            if key != "pixel_values"
            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        )
        for key, value in batch[0].items()
    }


# Configure the quantization algorithm and scheme.
# In this case, we:
#   * quantize the weights to fp4 with group-wise scales
#   * quantize the activations to fp4 with dynamic group-wise scales
# The lm_head, the vision tower, and the MoE gates are left unquantized.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4",
    ignore=[
        "re:.*lm_head",
        "re:visual.*",
        "re:model.visual.*",
        "re:.*mlp.gate$",
    ],
)

# Apply quantization.
oneshot(
    model=model,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    dataset=ds,
    data_collator=data_collator,
)

# Confirm that generations from the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
print("==========================================")

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"
model.save_pretrained(SAVE_DIR)
processor.save_pretrained(SAVE_DIR)
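
# Optionally, sanity-check the saved checkpoint with vLLM. This is a minimal
# sketch and not part of the quantization flow above; it assumes a vLLM build
# with compressed-tensors NVFP4 support is installed.
from vllm import LLM, SamplingParams

llm = LLM(model=SAVE_DIR)
outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=20))
print(outputs[0].outputs[0].text)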