Add IQ2_KT
Browse files
- README.md +61 -0
- images/perplexity.png +2 -2
README.md
CHANGED
|
@@ -445,6 +445,67 @@ numactl -N "$SOCKET" -m "$SOCKET" \
|
|
| 445 |
|
| 446 |
</details>
|
| 447 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
## IQ2_KS 193.144 GiB (2.472 BPW)
|
| 449 |
Final estimate: PPL = 3.9583 +/- 0.02433
|
| 450 |
|
|
|
|
| 445 |
|
| 446 |
</details>
|
| 447 |
|
| 448 |
+
## IQ2_KT 204.592 GiB (2.619 BPW)
|
| 449 |
+
Final estimate: PPL = 3.8109 +/- 0.02294
|
| 450 |
+
|
| 451 |
+
Remember: the KT quants are better suited to full GPU offload, because computing the trellis on the CPU bottlenecks token generation.
|
| 452 |
+
|
| 453 |
+
<details>
|
| 454 |
+
|
| 455 |
+
<summary>👈 Secret Recipe</summary>
|
| 456 |
+
|
| 457 |
+
```bash
|
| 458 |
+
#!/usr/bin/env bash
|
| 459 |
+
|
| 460 |
+
custom="
|
| 461 |
+
## Attention [0-60] (GPU)
|
| 462 |
+
blk\..*\.attn_k_b\.weight=q8_0
|
| 463 |
+
blk\..*\.attn_v_b\.weight=q8_0
|
| 464 |
+
|
| 465 |
+
# Balance of attn tensors
|
| 466 |
+
blk\..*\.attn_kv_a_mqa\.weight=q8_0
|
| 467 |
+
blk\..*\.attn_q_a\.weight=q8_0
|
| 468 |
+
blk\..*\.attn_q_b\.weight=q8_0
|
| 469 |
+
blk\..*\.attn_output\.weight=q8_0
|
| 470 |
+
|
| 471 |
+
## First Three Dense Layers [0-2] (GPU)
|
| 472 |
+
blk\..*\.ffn_down\.weight=q8_0
|
| 473 |
+
blk\..*\.ffn_(gate|up)\.weight=q8_0
|
| 474 |
+
|
| 475 |
+
## Shared Expert [3-60] (GPU)
|
| 476 |
+
blk\..*\.ffn_down_shexp\.weight=q8_0
|
| 477 |
+
blk\..*\.ffn_(gate|up)_shexp\.weight=q8_0
|
| 478 |
+
|
| 479 |
+
## Routed Experts [3-60] (CPU)
|
| 480 |
+
blk\..*\.ffn_down_exps\.weight=iq3_kt
|
| 481 |
+
blk\..*\.ffn_(gate|up)_exps\.weight=iq2_kt
|
| 482 |
+
|
| 483 |
+
## Token embedding and output tensors (GPU)
|
| 484 |
+
token_embd\.weight=iq6_k
|
| 485 |
+
output\.weight=iq6_k
|
| 486 |
+
"
|
| 487 |
+
|
| 488 |
+
custom=$(
|
| 489 |
+
echo "$custom" | grep -v '^#' | \
|
| 490 |
+
sed -Ez 's:\n+:,:g;s:,$::;s:^,::'
|
| 491 |
+
)
|
| 492 |
+
|
| 493 |
+
SOCKET=0
|
| 494 |
+
|
| 495 |
+
numactl -N "$SOCKET" -m "$SOCKET" \
|
| 496 |
+
./build/bin/llama-quantize \
|
| 497 |
+
--custom-q "$custom" \
|
| 498 |
+
--imatrix /mnt/raid/models/ubergarm/DeepSeek-V3.1-GGUF/imatrix-DeepSeek-V3.1-Q8_0.dat \
|
| 499 |
+
/mnt/raid/models/ubergarm/DeepSeek-V3.1-GGUF/DeepSeek-V3.1-256x20B-safetensors-BF16-00001-of-00030.gguf \
|
| 500 |
+
/mnt/raid/models/ubergarm/DeepSeek-V3.1-GGUF/DeepSeek-V3.1-IQ2_KT.gguf \
|
| 501 |
+
IQ2_KT \
|
| 502 |
+
192
|
| 503 |
+
```
|
| 504 |
+
|
| 505 |
+
</details>
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
|
| 509 |
## IQ2_KS 193.144 GiB (2.472 BPW)
|
| 510 |
Final estimate: PPL = 3.9583 +/- 0.02433
|
| 511 |
|
images/perplexity.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|