Zihan Min
committed on
Commit
·
8704f55
1
Parent(s):
fb1e189
upload 0.6+0.5 fuser
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- qwen3_0.6b+qwen2.5_0.5b_Fuser/config.json +57 -0
- aggregator_config.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/aggregator_config.json +0 -0
- projector_0.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_0.json +0 -0
- projector_0.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_0.pt +0 -0
- projector_1.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_1.json +0 -0
- projector_1.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_1.pt +0 -0
- projector_10.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_10.json +0 -0
- projector_10.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_10.pt +0 -0
- projector_11.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_11.json +0 -0
- projector_11.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_11.pt +0 -0
- projector_12.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_12.json +0 -0
- projector_12.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_12.pt +0 -0
- projector_13.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_13.json +0 -0
- projector_13.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_13.pt +0 -0
- projector_14.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_14.json +0 -0
- projector_14.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_14.pt +0 -0
- projector_15.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_15.json +0 -0
- projector_15.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_15.pt +0 -0
- projector_16.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_16.json +0 -0
- projector_16.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_16.pt +0 -0
- projector_17.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_17.json +0 -0
- projector_17.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_17.pt +0 -0
- projector_18.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_18.json +0 -0
- projector_18.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_18.pt +0 -0
- projector_19.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_19.json +0 -0
- projector_19.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_19.pt +0 -0
- projector_2.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_2.json +0 -0
- projector_2.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_2.pt +0 -0
- projector_20.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_20.json +0 -0
- projector_20.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_20.pt +0 -0
- projector_21.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_21.json +0 -0
- projector_21.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_21.pt +0 -0
- projector_22.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_22.json +0 -0
- projector_22.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_22.pt +0 -0
- projector_23.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_23.json +0 -0
- projector_23.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_23.pt +0 -0
- projector_24.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_24.json +0 -0
- projector_24.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_24.pt +0 -0
- projector_25.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_25.json +0 -0
- projector_25.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_25.pt +0 -0
- projector_26.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_26.json +0 -0
- projector_26.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_26.pt +0 -0
- projector_27.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_27.json +0 -0
- projector_27.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_27.pt +0 -0
- projector_3.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_3.json +0 -0
- projector_3.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_3.pt +0 -0
- projector_4.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_4.json +0 -0
- projector_4.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_4.pt +0 -0
- projector_5.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_5.json +0 -0
- projector_5.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_5.pt +0 -0
qwen3_0.6b+qwen2.5_0.5b_Fuser/config.json
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": {
|
| 3 |
+
"base_model": "Qwen/Qwen3-0.6B",
|
| 4 |
+
"teacher_model": "Qwen/Qwen2.5-0.5B-Instruct",
|
| 5 |
+
"include_response": false,
|
| 6 |
+
"is_do_alignment": false,
|
| 7 |
+
"alignment_strategy": "first",
|
| 8 |
+
"projector": {
|
| 9 |
+
"type": "C2CProjector",
|
| 10 |
+
"params": {
|
| 11 |
+
"hidden_dim": 1024,
|
| 12 |
+
"intermediate_dim": 1024,
|
| 13 |
+
"num_layers": 3,
|
| 14 |
+
"dropout": 0.1,
|
| 15 |
+
"initial_temperature": 1.0,
|
| 16 |
+
"final_temperature": 0.001,
|
| 17 |
+
"anneal_steps": 1929
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"mapping": "last_aligned"
|
| 21 |
+
},
|
| 22 |
+
"training": {
|
| 23 |
+
"learning_rate": 1e-4,
|
| 24 |
+
"weight_decay": 0.01,
|
| 25 |
+
"num_epochs": 1,
|
| 26 |
+
"max_length": 2048,
|
| 27 |
+
"device": "cuda",
|
| 28 |
+
"scheduler_type": "linear",
|
| 29 |
+
"warmup_ratio": 0.1,
|
| 30 |
+
"max_grad_norm": 1.0,
|
| 31 |
+
"gradient_accumulation_steps": 8,
|
| 32 |
+
"per_device_train_batch_size": 4,
|
| 33 |
+
"num_processes": 8,
|
| 34 |
+
"freeze": ["teacher","base"],
|
| 35 |
+
"seed": 42
|
| 36 |
+
},
|
| 37 |
+
"output": {
|
| 38 |
+
"output_dir": "local/checkpoints/0.6+0.5B_C2C_general_again_test",
|
| 39 |
+
"save_steps": 500,
|
| 40 |
+
"eval_steps": 100,
|
| 41 |
+
"wandb_config": {
|
| 42 |
+
"project": "Rosetta",
|
| 43 |
+
"mode": "offline",
|
| 44 |
+
"entity": "nics-efc",
|
| 45 |
+
"run_name": "0.6B+0.5B_C2C_general_OpenHermes_500k"
|
| 46 |
+
}
|
| 47 |
+
},
|
| 48 |
+
"data": {
|
| 49 |
+
"type": "OpenHermesChatDataset",
|
| 50 |
+
"kwargs": {
|
| 51 |
+
"split": "train",
|
| 52 |
+
"max_word_count": 2048,
|
| 53 |
+
"num_samples": 500000
|
| 54 |
+
},
|
| 55 |
+
"train_ratio": 0.99
|
| 56 |
+
}
|
| 57 |
+
}
|
aggregator_config.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/aggregator_config.json
RENAMED
|
File without changes
|
projector_0.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_0.json
RENAMED
|
File without changes
|
projector_0.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_0.pt
RENAMED
|
File without changes
|
projector_1.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_1.json
RENAMED
|
File without changes
|
projector_1.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_1.pt
RENAMED
|
File without changes
|
projector_10.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_10.json
RENAMED
|
File without changes
|
projector_10.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_10.pt
RENAMED
|
File without changes
|
projector_11.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_11.json
RENAMED
|
File without changes
|
projector_11.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_11.pt
RENAMED
|
File without changes
|
projector_12.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_12.json
RENAMED
|
File without changes
|
projector_12.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_12.pt
RENAMED
|
File without changes
|
projector_13.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_13.json
RENAMED
|
File without changes
|
projector_13.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_13.pt
RENAMED
|
File without changes
|
projector_14.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_14.json
RENAMED
|
File without changes
|
projector_14.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_14.pt
RENAMED
|
File without changes
|
projector_15.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_15.json
RENAMED
|
File without changes
|
projector_15.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_15.pt
RENAMED
|
File without changes
|
projector_16.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_16.json
RENAMED
|
File without changes
|
projector_16.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_16.pt
RENAMED
|
File without changes
|
projector_17.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_17.json
RENAMED
|
File without changes
|
projector_17.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_17.pt
RENAMED
|
File without changes
|
projector_18.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_18.json
RENAMED
|
File without changes
|
projector_18.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_18.pt
RENAMED
|
File without changes
|
projector_19.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_19.json
RENAMED
|
File without changes
|
projector_19.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_19.pt
RENAMED
|
File without changes
|
projector_2.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_2.json
RENAMED
|
File without changes
|
projector_2.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_2.pt
RENAMED
|
File without changes
|
projector_20.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_20.json
RENAMED
|
File without changes
|
projector_20.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_20.pt
RENAMED
|
File without changes
|
projector_21.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_21.json
RENAMED
|
File without changes
|
projector_21.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_21.pt
RENAMED
|
File without changes
|
projector_22.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_22.json
RENAMED
|
File without changes
|
projector_22.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_22.pt
RENAMED
|
File without changes
|
projector_23.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_23.json
RENAMED
|
File without changes
|
projector_23.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_23.pt
RENAMED
|
File without changes
|
projector_24.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_24.json
RENAMED
|
File without changes
|
projector_24.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_24.pt
RENAMED
|
File without changes
|
projector_25.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_25.json
RENAMED
|
File without changes
|
projector_25.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_25.pt
RENAMED
|
File without changes
|
projector_26.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_26.json
RENAMED
|
File without changes
|
projector_26.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_26.pt
RENAMED
|
File without changes
|
projector_27.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_27.json
RENAMED
|
File without changes
|
projector_27.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_27.pt
RENAMED
|
File without changes
|
projector_3.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_3.json
RENAMED
|
File without changes
|
projector_3.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_3.pt
RENAMED
|
File without changes
|
projector_4.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_4.json
RENAMED
|
File without changes
|
projector_4.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_4.pt
RENAMED
|
File without changes
|
projector_5.json → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_5.json
RENAMED
|
File without changes
|
projector_5.pt → qwen3_0.6b+qwen2.5_0.5b_Fuser/final/projector_5.pt
RENAMED
|
File without changes
|