|
|
import torch |
|
|
from datasets import load_dataset |
|
|
from transformers import AutoProcessor, Qwen3VLForConditionalGeneration |
|
|
|
|
|
from llmcompressor import oneshot |
|
|
from llmcompressor.modeling import replace_modules_for_calibration |
|
|
from llmcompressor.modifiers.quantization import QuantizationModifier |
|
|
from llmcompressor.utils import dispatch_for_generation |
|
|
|
|
|
|
|
|
|
|
|
# Hugging Face identifier of the vision-language model to quantize.
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"


# Load model weights in the checkpoint's stored dtype ("auto") and the
# matching processor (tokenizer + image preprocessing).
model = Qwen3VLForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype="auto")


processor = AutoProcessor.from_pretrained(MODEL_ID)


# Swap selected submodules for calibration-friendly replacements before
# oneshot runs. NOTE(review): presumably needed so activation calibration
# can observe these modules correctly — confirm against llmcompressor docs.
model = replace_modules_for_calibration(model)


# Calibration configuration: text-only ("LLM") subset of the
# neuralmagic/calibration dataset.
DATASET_ID = "neuralmagic/calibration"


NUM_CALIBRATION_SAMPLES = 256


MAX_SEQUENCE_LENGTH = 8192


# Take only the first NUM_CALIBRATION_SAMPLES rows of the train split.
ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
|
|
|
|
|
|
|
|
def preprocess_function(example):
    """Tokenize one calibration example through the model's chat template.

    Each plain-text message in ``example["messages"]`` is rewrapped into the
    structured ``content`` format the processor expects, then templated and
    tokenized (no padding, truncated to ``MAX_SEQUENCE_LENGTH``).
    """
    conversation = [
        {
            "role": turn["role"],
            "content": [{"type": "text", "text": turn["content"]}],
        }
        for turn in example["messages"]
    ]

    return processor.apply_chat_template(
        conversation,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )
|
|
|
|
|
|
|
|
# Tokenize every calibration sample one at a time; drop the raw dataset
# columns so only the processor's outputs (input_ids, etc.) remain.
ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
|
|
|
|
|
|
|
|
def data_collator(batch):
    """Collate a single preprocessed sample into model-ready tensors.

    Expects a batch of exactly one sample (calibration runs sample-by-sample).
    ``pixel_values`` are cast to bfloat16 with their leading batch axis
    squeezed away; every other field becomes a plain tensor.
    """
    assert len(batch) == 1
    sample = batch[0]

    collated = {}
    for key, value in sample.items():
        if key == "pixel_values":
            collated[key] = torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        else:
            collated[key] = torch.tensor(value)
    return collated
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# NVFP4 quantization of all Linear layers. The LM head, the vision tower
# ("visual"/"model.visual"), and MoE router gates are excluded —
# NOTE(review): presumably to preserve accuracy, as is conventional in
# llmcompressor example recipes; confirm against upstream examples.
recipe = QuantizationModifier(
    targets="Linear",
    scheme="NVFP4",
    ignore=[
        "re:.*lm_head",
        "re:visual.*",
        "re:model.visual.*",
        "re:.*mlp.gate$",
    ],
)


# Run one-shot calibration + quantization over the tokenized dataset,
# feeding samples through the custom single-sample collator.
oneshot(
    model=model,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    dataset=ds,
    data_collator=data_collator,
)


# Smoke test: generate a few tokens on CUDA to confirm the quantized
# model still produces output.
print("========== SAMPLE GENERATION ==============")


dispatch_for_generation(model)


input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")


output = model.generate(input_ids, max_new_tokens=20)


print(processor.decode(output[0]))


print("==========================================")


# Save the compressed model and processor to a directory named after the
# model, e.g. "Qwen3-VL-8B-Instruct-NVFP4".
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4"


model.save_pretrained(SAVE_DIR)


processor.save_pretrained(SAVE_DIR)