lhoang8500
/

Qwen3-VL-8B-Instruct-NVFP4

8-bit precision

compressed-tensors

Model card Files Files and versions

Qwen3-VL-8B-Instruct-NVFP4 / quantize.py

lhoang8500's picture

Create quantize.py

b707c8c verified about 1 month ago

history blame contribute delete

2.87 kB

	import torch
	from datasets import load_dataset
	from transformers import AutoProcessor, Qwen3VLForConditionalGeneration

	from llmcompressor import oneshot
	from llmcompressor.modeling import replace_modules_for_calibration
	from llmcompressor.modifiers.quantization import QuantizationModifier
	from llmcompressor.utils import dispatch_for_generation

	# NOTE: Requires a minimum of transformers 4.57.0.

	# Hugging Face Hub id of the checkpoint to quantize.
	MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"


	# Load the model and its processor.
	# `dtype` is the current spelling of the deprecated `torch_dtype` keyword in
	# the transformers releases this script requires.
	model = Qwen3VLForConditionalGeneration.from_pretrained(MODEL_ID, dtype="auto")
	processor = AutoProcessor.from_pretrained(MODEL_ID)
	# NOTE(review): presumably swaps in calibration-friendly module variants where
	# llmcompressor defines them -- confirm against the llmcompressor docs.
	model = replace_modules_for_calibration(model)

	# Calibration data: the first NUM_CALIBRATION_SAMPLES rows of the train split
	# of the "LLM" configuration of DATASET_ID.
	DATASET_ID = "neuralmagic/calibration"
	NUM_CALIBRATION_SAMPLES = 256
	# Upper bound on tokens per calibration sample (used for truncation below).
	MAX_SEQUENCE_LENGTH = 8192

	ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")


	def preprocess_function(example):
	    """Tokenize one calibration conversation with the processor's chat template.

	    Every entry of ``example["messages"]`` keeps its ``role``; its ``content``
	    is wrapped as a one-element list holding a single text part, and the
	    resulting conversation is handed to ``processor.apply_chat_template``.

	    Args:
	        example: A dataset row with a ``"messages"`` list whose items have
	            ``"role"`` and ``"content"`` keys.

	    Returns:
	        Whatever ``processor.apply_chat_template`` returns for the
	        conversation (requested as a dict of PyTorch tensors, truncated to
	        ``MAX_SEQUENCE_LENGTH`` and without padding).
	    """
	    conversation = [
	        {
	            "role": turn["role"],
	            "content": [{"type": "text", "text": turn["content"]}],
	        }
	        for turn in example["messages"]
	    ]

	    template_options = {
	        "return_tensors": "pt",
	        "padding": False,
	        "truncation": True,
	        "max_length": MAX_SEQUENCE_LENGTH,
	        "tokenize": True,
	        "add_special_tokens": False,
	        "return_dict": True,
	        "add_generation_prompt": False,
	    }
	    return processor.apply_chat_template(conversation, **template_options)


	# Tokenize row by row and drop the raw columns so that only the fields
	# returned by preprocess_function remain in the dataset.
	ds = ds.map(
	    preprocess_function,
	    batched=False,
	    remove_columns=ds.column_names,
	)


	def data_collator(batch):
	    """Convert a single-sample batch into a dict of tensors.

	    Args:
	        batch: A sequence holding exactly one mapping of feature name to
	            tensor-convertible data (e.g. nested lists).

	    Returns:
	        A dict with the same keys as the sample. ``pixel_values`` is built as
	        a bfloat16 tensor and squeezed on dim 0; every other value is passed
	        through ``torch.tensor`` unchanged.

	    Raises:
	        AssertionError: If ``batch`` does not hold exactly one sample.
	    """
	    # Raise explicitly instead of using `assert`: an assert statement is
	    # stripped under `python -O`, which would silently drop extra samples.
	    if len(batch) != 1:
	        raise AssertionError(
	            f"data_collator expects exactly one sample per batch, got {len(batch)}"
	        )

	    collated = {}
	    for key, value in batch[0].items():
	        if key == "pixel_values":
	            collated[key] = torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
	        else:
	            collated[key] = torch.tensor(value)
	    return collated


	# Quantization recipe.
	# The "NVFP4" scheme is applied to every module targeted as "Linear":
	# * weights are quantized to fp4 with group-wise quantization
	# * activations are quantized to fp4 with dynamic group activations
	# Modules whose names match any regex in `ignore` are left unquantized.
	recipe = QuantizationModifier(
	    scheme="NVFP4",
	    targets="Linear",
	    ignore=[
	        "re:.*lm_head",
	        "re:visual.*",
	        "re:model.visual.*",
	        "re:.*mlp.gate$",
	    ],
	)

	# Run one-shot calibration and quantization of `model` with the recipe above,
	# feeding it the tokenized calibration dataset through the custom collator.
	oneshot(
	    model=model,
	    dataset=ds,
	    data_collator=data_collator,
	    recipe=recipe,
	    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
	    max_seq_length=MAX_SEQUENCE_LENGTH,
	)

	# Smoke test: generate a short continuation with the quantized model.
	print("========== SAMPLE GENERATION ==============")
	dispatch_for_generation(model)
	# Put the prompt on the model's own device instead of hard-coding "cuda", so
	# the inputs follow wherever dispatch_for_generation placed the model.
	input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to(
	    model.device
	)
	output = model.generate(input_ids, max_new_tokens=20)
	print(processor.decode(output[0]))
	print("==========================================")


	# Save to disk in compressed-tensors format.
	# The output directory is the last path component of MODEL_ID (ignoring any
	# trailing slash) with "-NVFP4" appended.
	SAVE_DIR = f"{MODEL_ID.rstrip('/').rsplit('/', 1)[-1]}-NVFP4"
	model.save_pretrained(SAVE_DIR)
	processor.save_pretrained(SAVE_DIR)