from dataclasses import dataclass

from utils import load_config

# The config file is read once at import time; every field default below is
# resolved from its 'training' section, falling back to the hard-coded value.
cfg = load_config()
trainer_cfg = cfg.get('training', {})
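
# Sketch of the expected config shape (an assumption; the actual schema is
# defined by utils.load_config, and the values shown are just the defaults
# used below):
#
#   training:
#     batch_size: 128
#     learning_rate: 4e-4
#     ...
#   outputs:
#     pretrained_models: sllama_main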


@dataclass
class TrainerConfig:
    """Training hyperparameters; each default can be overridden via the
    'training' section of the config file."""

    # Evaluation and logging cadence.
    eval_interval: int = trainer_cfg.get('eval_interval', 500)
    log_interval: int = trainer_cfg.get('log_interval', 10)
    eval_iters: int = trainer_cfg.get('eval_iters', 150)
    eval_only: bool = trainer_cfg.get('eval_only', False)
    always_save_checkpoint: bool = trainer_cfg.get('always_save_checkpoint', False)
    init_from: str = trainer_cfg.get('init_from', 'scratch')

    # Weights & Biases logging.
    wandb_log: bool = trainer_cfg.get('wandb_log', True)
    wandb_project: str = trainer_cfg.get('wandb_project', 'sllama')
    wandb_run_name: str = trainer_cfg.get('wandb_run_name', 'sllama')

    # Batching and model shape.
    gradient_accumulation_steps: int = trainer_cfg.get('gradient_accumulation_steps', 2)
    batch_size: int = trainer_cfg.get('batch_size', 128)
    block_size: int = trainer_cfg.get('block_size', 256)
    dropout: float = trainer_cfg.get('dropout', 0.1)
    bias: bool = trainer_cfg.get('bias', False)

    # Optimization settings.
    learning_rate: float = trainer_cfg.get('learning_rate', 4e-4)
    max_iters: int = trainer_cfg.get('max_iters', 3000)
    weight_decay: float = trainer_cfg.get('weight_decay', 0.0)
    layer_sharing: bool = trainer_cfg.get('layer_sharing', False)
    beta1: float = trainer_cfg.get('beta1', 0.9)
    beta2: float = trainer_cfg.get('beta2', 0.95)
    grad_clip: float = trainer_cfg.get('grad_clip', 1.0)

    # Learning-rate decay schedule.
    decay_lr: bool = trainer_cfg.get('decay_lr', True)
    warmup_iters: int = trainer_cfg.get('warmup_iters', 200)
    lr_decay_iters: int = trainer_cfg.get('lr_decay_iters', 5000)
    min_lr: float = trainer_cfg.get('min_lr', 4e-5)

    # Distributed (DDP) backend.
    backend: str = trainer_cfg.get('backend', 'nccl')

    # Runtime / system settings.
    device: str = trainer_cfg.get('device', 'cuda')
    dtype: str = trainer_cfg.get('dtype', 'bfloat16')
    compile: bool = trainer_cfg.get('compile', True)
    save_weights: bool = trainer_cfg.get('save_weights', False)
    train_or_dev: str = trainer_cfg.get('train_or_dev', 'train')

    # Output location: base_dir is a plain class attribute (unannotated, so it
    # is not a dataclass field); _out_dir holds the run subdirectory name.
    _out_dir: str = 'sllama_main'
    base_dir = cfg.get('outputs', {}).get('pretrained_models', 'sllama_main')

    @property
    def out_dir(self) -> str:
        # Full output path: <base_dir>/<run subdirectory>.
        return f"{self.base_dir}/{self._out_dir}"

    @out_dir.setter
    def out_dir(self, value: str):
        # Store only the run name; the getter prepends base_dir, so prepending
        # it here as well would duplicate it on read-back.
        self._out_dir = value
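
# Minimal usage sketch (illustrative; 'demo_run' is a made-up run name).
if __name__ == '__main__':
    config = TrainerConfig()
    config.out_dir = 'demo_run'  # stored as the run subdirectory
    print(config.out_dir)        # -> '<pretrained_models>/demo_run'
    # Tokens consumed per optimizer step:
    print(config.batch_size * config.block_size * config.gradient_accumulation_steps)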