Update README.md
Browse files
README.md
CHANGED
|
@@ -174,27 +174,6 @@ output_text = tokenizer.batch_decode(
|
|
| 174 |
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
| 175 |
)
|
| 176 |
print("Response:", output_text[0][len(prompt):])
|
| 177 |
-
|
| 178 |
-
# Local Benchmark
|
| 179 |
-
import torch.utils.benchmark as benchmark
|
| 180 |
-
from torchao.utils import benchmark_model
|
| 181 |
-
import torchao
|
| 182 |
-
|
| 183 |
-
def benchmark_fn(f, *args, **kwargs):
|
| 184 |
-
# Manual warmup
|
| 185 |
-
for _ in range(2):
|
| 186 |
-
f(*args, **kwargs)
|
| 187 |
-
|
| 188 |
-
t0 = benchmark.Timer(
|
| 189 |
-
stmt="f(*args, **kwargs)",
|
| 190 |
-
globals={"args": args, "kwargs": kwargs, "f": f},
|
| 191 |
-
num_threads=torch.get_num_threads(),
|
| 192 |
-
)
|
| 193 |
-
return f"{(t0.blocked_autorange().mean):.3f}"
|
| 194 |
-
|
| 195 |
-
torchao.quantization.utils.recommended_inductor_config_setter()
|
| 196 |
-
quantized_model = torch.compile(quantized_model, mode="max-autotune")
|
| 197 |
-
print(f"{save_to} model:", benchmark_fn(quantized_model.generate, **inputs, max_new_tokens=128))
|
| 198 |
```
|
| 199 |
|
| 200 |
# Model Quality
|
|
|
|
| 174 |
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
| 175 |
)
|
| 176 |
print("Response:", output_text[0][len(prompt):])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
```
|
| 178 |
|
| 179 |
# Model Quality
|