import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

# NOTE: "mistral-8x7B" is not a valid Hugging Face Hub ID; the 8x7B
# mixture-of-experts model is published as "mistralai/Mixtral-8x7B-v0.1".
MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"
SAFETENSORS_PATH = "path_to_your_model.safetensors"

# The tokenizer is small, so it can be loaded normally on the CPU.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Build the model skeleton on the "meta" device so no weight memory is
# allocated yet. from_config() must be used here: from_pretrained()
# would download and materialize the weights, defeating the purpose of
# init_empty_weights().
config = AutoConfig.from_pretrained(MODEL_NAME)
with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config)

# No separate safetensors.torch.load_file() call is needed:
# load_checkpoint_and_dispatch() below reads the checkpoint itself, and
# pre-loading tens of gigabytes of weights into CPU RAM here would
# defeat the point of the empty-weights initialization.

# Load the checkpoint into the empty model and shard it across the
# available devices. "MixtralDecoderLayer" is the transformers class
# name for this architecture's decoder blocks ("MistralLayer" does not
# exist); modules listed here are kept whole on a single device.
model = load_checkpoint_and_dispatch(
    model,
    SAFETENSORS_PATH,
    device_map="auto",  # Automatically handles GPU/CPU offloading
    no_split_module_classes=["MixtralDecoderLayer"],  # Layers never split across devices
    dtype=torch.float16,  # Use half precision for memory efficiency
)
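
# Optional sanity check, a minimal sketch assuming accelerate has
# recorded the final placement on the model: dispatch_model (which
# load_checkpoint_and_dispatch calls internally) attaches the mapping
# as hf_device_map. getattr() keeps this safe if the attribute is absent.
print(getattr(model, "hf_device_map", "no device map attached"))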

# Do NOT call model.to(device) here: the model is already dispatched
# across devices, and moving a dispatched model breaks or errors out.
# Inputs only need to land on the device of the first shard, which
# model.device reports.

# Tokenize the prompt and move it to the device of the model's first
# parameters (the embedding layer); the dispatch hooks move activations
# between shards automatically after that.
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

# temperature/top_k/top_p only take effect with do_sample=True;
# without it, generate() silently falls back to greedy decoding.
# Passing **inputs supplies the attention_mask along with input_ids.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=50,  # Counts prompt tokens too; use max_new_tokens to bound new tokens only
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )

# Decode the full returned sequence (prompt + completion) to text.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", generated_text)