GPTQ Quantization of YanoljaNEXT-Rosetta-27B-2511
On 1x3090, you can set the context length up to 49152 using vLLM.
Quantization method
Quantized using:
- Tool: llm-compressor (63c175b)
- System: 1x3090, DDR4 128GB
- Time taken: about 1 hour (wall time)
import sys
import random
import string
from transformers import AutoProcessor, AutoModelForCausalLM
from datasets import load_dataset
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
import torch
# Source model: local path or Hugging Face hub id (first CLI argument).
MODEL_ID = sys.argv[1]
# Number of samples fed to GPTQ calibration; matches the dataset slice below.
NUM_CALIBRATION_SAMPLES=1024
# Per-sample token cap applied during chat-template tokenization.
MAX_SEQUENCE_LENGTH=2048
# dtype="auto" keeps the checkpoint's native precision (presumably bf16 — confirm).
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
# GPTQ Hessian dampening fraction; higher values trade accuracy for stability.
dampening_frac=0.07
# English→Japanese parallel corpus; only the first 1024 rows are needed
# for calibration, so slice at load time rather than downloading the split lazily.
ds = load_dataset("Helsinki-NLP/opus-100", "en-ja", split="train[:1024]")
def preprocess_function(example):
    """Turn one opus-100 translation pair into tokenized chat-format inputs.

    Wraps the English source and Japanese target in the model's XML segment
    format (with a random 8-character segment id), builds a full
    system/user/assistant conversation, and tokenizes it via the chat
    template. Reads the module-level ``processor`` and
    ``MAX_SEQUENCE_LENGTH``.
    """
    pair = example["translation"]
    source_text = pair.get("en", "")
    target_text = pair.get("ja", "")
    # Random id makes each calibration segment unique, mimicking real usage.
    seg_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))
    conversation = [
        {"role": "system", "content": "Translate the user's text to Japanese.\nOutput format:XML\nProvide the final translation immediately without any other text."},
        {"role": "user", "content": f'<seg id="{seg_id}" type="calib"><field key="content">{source_text}</field></seg>'},
        {"role": "assistant", "content": f'<seg id="{seg_id}" type="calib"><field key="content">{target_text}</field></seg>'},
    ]
    return processor.apply_chat_template(
        conversation,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )
# Tokenize every calibration sample one at a time; drop the raw text columns
# so only model inputs (e.g. input_ids / attention_mask) remain.
ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)
def data_collator(batch):
    """Collate a single preprocessed sample into a dict of tensors.

    Calibration runs with batch size 1, so exactly one sample is expected.
    A ``pixel_values`` entry — if present — is cast to bfloat16 and has its
    leading batch dimension squeezed away; every other entry becomes a
    plain ``torch.tensor``.
    """
    assert len(batch) == 1
    sample = batch[0]
    collated = {}
    for field, raw in sample.items():
        if field == "pixel_values":
            collated[field] = torch.tensor(raw, dtype=torch.bfloat16).squeeze(0)
        else:
            collated[field] = torch.tensor(raw)
    return collated
# Single-modifier recipe: GPTQ with 4-bit weights / 16-bit activations on
# every Linear layer. The lm_head is excluded so the output projection
# stays in full precision.
recipe = [
    GPTQModifier(
        targets="Linear",
        scheme="W4A16",
        ignore=["lm_head"],
        dampening_frac=dampening_frac,
    )
]
# Output directory for the compressed checkpoint (second CLI argument).
SAVE_DIR = sys.argv[2]
# Run one-shot (no fine-tuning) GPTQ calibration over the prepared dataset.
oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
    # Quantize one decoder layer at a time — presumably to bound GPU memory
    # on the single 3090 mentioned above (TODO confirm intent).
    sequential_targets=["Gemma3DecoderLayer"],
    tie_word_embeddings=True,
)
# save_compressed=True writes the quantized weights in compressed-tensors
# format; the processor is saved alongside so the directory is self-contained.
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
- Downloads last month
- 15
Model tree for Bedovyy/YanoljaNEXT-Rosetta-27B-2511-W4A16-G128
Base model
yanolja/YanoljaNEXT-Rosetta-27B-2511