{
  "data": {
    "cache_dir": "data",
    "group": true,
    "train": "openwebtext",
    "train_name": null,
    "valid": "wikitext103",
    "valid_name": null
  },
  "eval": {
    "batch_size": 512,
    "perplexity": true,
    "perplexity_batch_size": 32
  },
  "graph": {
    "file": "data",
    "report_all": false,
    "type": "uniform"
  },
  "model": {
    "cond_dim": 128,
    "dropout": 0.1,
    "hidden_size": 768,
    "length": 1024,
    "n_blocks": 12,
    "n_heads": 12,
    "name": "small",
    "scale_by_sigma": false,
    "type": "ddit"
  },
  "ngpus": 8,
  "noise": {
    "sigma_max": 20,
    "sigma_min": 0.0001,
    "type": "loglinear"
  },
  "optim": {
    "beta1": 0.9,
    "beta2": 0.999,
    "eps": 1e-08,
    "grad_clip": 1.0,
    "lr": 0.0003,
    "optimizer": "AdamW",
    "warmup": 2500,
    "weight_decay": 0
  },
  "sampling": {
    "noise_removal": true,
    "predictor": "euler",
    "steps": 128
  },
  "tokens": 50257,
  "training": {
    "accum": 4,
    "batch_size": 512,
    "ema": 0.9999,
    "eval_freq": 100,
    "log_freq": 50,
    "n_iters": 400000,
    "snapshot_freq": 4000,
    "snapshot_freq_for_preemption": 1000,
    "snapshot_sampling": true,
    "weight": "standard"
  },
  "wandb_name": "m_small-g_uniform-pretrain"
}