{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 30,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.025078369905956112,
      "grad_norm": 0.43725547194480896,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.9437,
      "step": 2
    },
    {
      "epoch": 0.050156739811912224,
      "grad_norm": 0.6708689332008362,
      "learning_rate": 2.4e-05,
      "loss": 1.849,
      "step": 4
    },
    {
      "epoch": 0.07523510971786834,
      "grad_norm": 0.3925197422504425,
      "learning_rate": 4e-05,
      "loss": 1.8725,
      "step": 6
    },
    {
      "epoch": 0.10031347962382445,
      "grad_norm": 0.4107086956501007,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 1.7837,
      "step": 8
    },
    {
      "epoch": 0.12539184952978055,
      "grad_norm": 0.5341728329658508,
      "learning_rate": 7.2e-05,
      "loss": 1.8033,
      "step": 10
    },
    {
      "epoch": 0.15047021943573669,
      "grad_norm": 0.43938425183296204,
      "learning_rate": 8.800000000000001e-05,
      "loss": 1.5958,
      "step": 12
    },
    {
      "epoch": 0.1755485893416928,
      "grad_norm": 0.3650668263435364,
      "learning_rate": 0.00010400000000000001,
      "loss": 1.5566,
      "step": 14
    },
    {
      "epoch": 0.2006269592476489,
      "grad_norm": 0.3861461877822876,
      "learning_rate": 0.00012,
      "loss": 1.3631,
      "step": 16
    },
    {
      "epoch": 0.22570532915360503,
      "grad_norm": 0.3685659170150757,
      "learning_rate": 0.00013600000000000003,
      "loss": 1.3227,
      "step": 18
    },
    {
      "epoch": 0.2507836990595611,
      "grad_norm": 0.4720342457294464,
      "learning_rate": 0.000152,
      "loss": 1.1027,
      "step": 20
    },
    {
      "epoch": 0.27586206896551724,
      "grad_norm": 0.40199315547943115,
      "learning_rate": 0.000168,
      "loss": 1.018,
      "step": 22
    },
    {
      "epoch": 0.30094043887147337,
      "grad_norm": 0.22768579423427582,
      "learning_rate": 0.00018400000000000003,
      "loss": 0.9077,
      "step": 24
    },
    {
      "epoch": 0.32601880877742945,
      "grad_norm": 0.2388496696949005,
      "learning_rate": 0.0002,
      "loss": 0.7587,
      "step": 26
    },
    {
      "epoch": 0.3510971786833856,
      "grad_norm": 0.19000130891799927,
      "learning_rate": 0.0001998993710691824,
      "loss": 0.7766,
      "step": 28
    },
    {
      "epoch": 0.3761755485893417,
      "grad_norm": 0.1965627372264862,
      "learning_rate": 0.0001997987421383648,
      "loss": 0.7575,
      "step": 30
    },
    {
      "epoch": 0.4012539184952978,
      "grad_norm": 0.18852603435516357,
      "learning_rate": 0.00019969811320754718,
      "loss": 0.8292,
      "step": 32
    },
    {
      "epoch": 0.4263322884012539,
      "grad_norm": 0.16236507892608643,
      "learning_rate": 0.00019959748427672956,
      "loss": 0.7272,
      "step": 34
    },
    {
      "epoch": 0.45141065830721006,
      "grad_norm": 0.14128465950489044,
      "learning_rate": 0.00019949685534591195,
      "loss": 0.751,
      "step": 36
    },
    {
      "epoch": 0.47648902821316613,
      "grad_norm": 0.14623811841011047,
      "learning_rate": 0.00019939622641509434,
      "loss": 0.6661,
      "step": 38
    },
    {
      "epoch": 0.5015673981191222,
      "grad_norm": 0.14945009350776672,
      "learning_rate": 0.00019929559748427673,
      "loss": 0.7435,
      "step": 40
    },
    {
      "epoch": 0.5266457680250783,
      "grad_norm": 0.1485632061958313,
      "learning_rate": 0.00019919496855345915,
      "loss": 0.6466,
      "step": 42
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.14336936175823212,
      "learning_rate": 0.00019909433962264153,
      "loss": 0.7,
      "step": 44
    },
    {
      "epoch": 0.5768025078369906,
      "grad_norm": 0.1316949725151062,
      "learning_rate": 0.0001989937106918239,
      "loss": 0.6194,
      "step": 46
    },
    {
      "epoch": 0.6018808777429467,
      "grad_norm": 0.14485935866832733,
      "learning_rate": 0.00019889308176100629,
      "loss": 0.8044,
      "step": 48
    },
    {
      "epoch": 0.6269592476489029,
      "grad_norm": 0.12860289216041565,
      "learning_rate": 0.00019879245283018867,
      "loss": 0.5773,
      "step": 50
    },
    {
      "epoch": 0.6520376175548589,
      "grad_norm": 0.16889798641204834,
      "learning_rate": 0.0001986918238993711,
      "loss": 0.7281,
      "step": 52
    },
    {
      "epoch": 0.677115987460815,
      "grad_norm": 0.14104238152503967,
      "learning_rate": 0.00019859119496855348,
      "loss": 0.6654,
      "step": 54
    },
    {
      "epoch": 0.7021943573667712,
      "grad_norm": 0.1515515297651291,
      "learning_rate": 0.00019849056603773587,
      "loss": 0.6833,
      "step": 56
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 0.16043280065059662,
      "learning_rate": 0.00019838993710691826,
      "loss": 0.6599,
      "step": 58
    },
    {
      "epoch": 0.7523510971786834,
      "grad_norm": 0.15895870327949524,
      "learning_rate": 0.00019828930817610062,
      "loss": 0.686,
      "step": 60
    },
    {
      "epoch": 0.7774294670846394,
      "grad_norm": 0.3486965298652649,
      "learning_rate": 0.00019818867924528303,
      "loss": 0.6835,
      "step": 62
    },
    {
      "epoch": 0.8025078369905956,
      "grad_norm": 0.1274975836277008,
      "learning_rate": 0.00019808805031446542,
      "loss": 0.5425,
      "step": 64
    },
    {
      "epoch": 0.8275862068965517,
      "grad_norm": 0.1669531911611557,
      "learning_rate": 0.0001979874213836478,
      "loss": 0.766,
      "step": 66
    },
    {
      "epoch": 0.8526645768025078,
      "grad_norm": 0.14856377243995667,
      "learning_rate": 0.0001978867924528302,
      "loss": 0.5993,
      "step": 68
    },
    {
      "epoch": 0.877742946708464,
      "grad_norm": 0.13947483897209167,
      "learning_rate": 0.0001977861635220126,
      "loss": 0.7071,
      "step": 70
    },
    {
      "epoch": 0.9028213166144201,
      "grad_norm": 0.15276968479156494,
      "learning_rate": 0.00019768553459119498,
      "loss": 0.687,
      "step": 72
    },
    {
      "epoch": 0.9278996865203761,
      "grad_norm": 0.14595958590507507,
      "learning_rate": 0.00019758490566037737,
      "loss": 0.6495,
      "step": 74
    },
    {
      "epoch": 0.9529780564263323,
      "grad_norm": 0.17588546872138977,
      "learning_rate": 0.00019748427672955975,
      "loss": 0.6584,
      "step": 76
    },
    {
      "epoch": 0.9780564263322884,
      "grad_norm": 0.15688568353652954,
      "learning_rate": 0.00019738364779874214,
      "loss": 0.6769,
      "step": 78
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.1879902333021164,
      "learning_rate": 0.00019728301886792453,
      "loss": 0.5924,
      "step": 80
    },
    {
      "epoch": 1.025078369905956,
      "grad_norm": 0.15292422473430634,
      "learning_rate": 0.00019718238993710695,
      "loss": 0.5727,
      "step": 82
    },
    {
      "epoch": 1.0501567398119123,
      "grad_norm": 0.14542542397975922,
      "learning_rate": 0.00019708176100628934,
      "loss": 0.5528,
      "step": 84
    },
    {
      "epoch": 1.0752351097178683,
      "grad_norm": 0.15912258625030518,
      "learning_rate": 0.0001969811320754717,
      "loss": 0.5404,
      "step": 86
    },
    {
      "epoch": 1.1003134796238245,
      "grad_norm": 0.16443438827991486,
      "learning_rate": 0.00019688050314465409,
      "loss": 0.544,
      "step": 88
    },
    {
      "epoch": 1.1253918495297806,
      "grad_norm": 0.18315915763378143,
      "learning_rate": 0.00019677987421383647,
      "loss": 0.5768,
      "step": 90
    },
    {
      "epoch": 1.1504702194357366,
      "grad_norm": 0.16878078877925873,
      "learning_rate": 0.0001966792452830189,
      "loss": 0.6918,
      "step": 92
    },
    {
      "epoch": 1.1755485893416928,
      "grad_norm": 0.1652018129825592,
      "learning_rate": 0.00019657861635220128,
      "loss": 0.5903,
      "step": 94
    },
    {
      "epoch": 1.2006269592476488,
      "grad_norm": 0.181439608335495,
      "learning_rate": 0.00019647798742138367,
      "loss": 0.5917,
      "step": 96
    },
    {
      "epoch": 1.225705329153605,
      "grad_norm": 0.15887363255023956,
      "learning_rate": 0.00019637735849056606,
      "loss": 0.5631,
      "step": 98
    },
    {
      "epoch": 1.250783699059561,
      "grad_norm": 0.16309796273708344,
      "learning_rate": 0.00019627672955974842,
      "loss": 0.6206,
      "step": 100
    },
    {
      "epoch": 1.2758620689655173,
      "grad_norm": 0.19174307584762573,
      "learning_rate": 0.00019617610062893083,
      "loss": 0.5375,
      "step": 102
    },
    {
      "epoch": 1.3009404388714734,
      "grad_norm": 0.2240614891052246,
      "learning_rate": 0.00019607547169811322,
      "loss": 0.6407,
      "step": 104
    },
    {
      "epoch": 1.3260188087774294,
      "grad_norm": 0.1673874855041504,
      "learning_rate": 0.0001959748427672956,
      "loss": 0.631,
      "step": 106
    },
    {
      "epoch": 1.3510971786833856,
      "grad_norm": 0.16143380105495453,
      "learning_rate": 0.000195874213836478,
      "loss": 0.4985,
      "step": 108
    },
    {
      "epoch": 1.3761755485893417,
      "grad_norm": 0.18511593341827393,
      "learning_rate": 0.0001957735849056604,
      "loss": 0.5844,
      "step": 110
    },
    {
      "epoch": 1.4012539184952977,
      "grad_norm": 0.15226313471794128,
      "learning_rate": 0.00019567295597484278,
      "loss": 0.5237,
      "step": 112
    },
    {
      "epoch": 1.426332288401254,
      "grad_norm": 0.16536672413349152,
      "learning_rate": 0.00019557232704402517,
      "loss": 0.4945,
      "step": 114
    },
    {
      "epoch": 1.4514106583072102,
      "grad_norm": 0.17802752554416656,
      "learning_rate": 0.00019547169811320755,
      "loss": 0.632,
      "step": 116
    },
    {
      "epoch": 1.4764890282131662,
      "grad_norm": 0.18615730106830597,
      "learning_rate": 0.00019537106918238994,
      "loss": 0.6778,
      "step": 118
    },
    {
      "epoch": 1.5015673981191222,
      "grad_norm": 0.16418549418449402,
      "learning_rate": 0.00019527044025157233,
      "loss": 0.6141,
      "step": 120
    },
    {
      "epoch": 1.5266457680250785,
      "grad_norm": 0.16458265483379364,
      "learning_rate": 0.00019516981132075475,
      "loss": 0.5316,
      "step": 122
    },
    {
      "epoch": 1.5517241379310345,
      "grad_norm": 0.15842436254024506,
      "learning_rate": 0.0001950691823899371,
      "loss": 0.4761,
      "step": 124
    },
    {
      "epoch": 1.5768025078369905,
      "grad_norm": 0.15368856489658356,
      "learning_rate": 0.0001949685534591195,
      "loss": 0.471,
      "step": 126
    },
    {
      "epoch": 1.6018808777429467,
      "grad_norm": 0.16104081273078918,
      "learning_rate": 0.0001948679245283019,
      "loss": 0.572,
      "step": 128
    },
    {
      "epoch": 1.626959247648903,
      "grad_norm": 0.1696012020111084,
      "learning_rate": 0.00019476729559748428,
      "loss": 0.5293,
      "step": 130
    },
    {
      "epoch": 1.6520376175548588,
      "grad_norm": 0.16601622104644775,
      "learning_rate": 0.0001946666666666667,
      "loss": 0.6569,
      "step": 132
    },
    {
      "epoch": 1.677115987460815,
      "grad_norm": 0.15106241405010223,
      "learning_rate": 0.00019456603773584908,
      "loss": 0.5687,
      "step": 134
    },
    {
      "epoch": 1.7021943573667713,
      "grad_norm": 0.18189087510108948,
      "learning_rate": 0.00019446540880503147,
      "loss": 0.4965,
      "step": 136
    },
    {
      "epoch": 1.7272727272727273,
      "grad_norm": 0.15463034808635712,
      "learning_rate": 0.00019436477987421383,
      "loss": 0.6398,
      "step": 138
    },
    {
      "epoch": 1.7523510971786833,
      "grad_norm": 0.17576032876968384,
      "learning_rate": 0.00019426415094339622,
      "loss": 0.6122,
      "step": 140
    },
    {
      "epoch": 1.7774294670846396,
      "grad_norm": 0.14750592410564423,
      "learning_rate": 0.00019416352201257863,
      "loss": 0.5475,
      "step": 142
    },
    {
      "epoch": 1.8025078369905956,
      "grad_norm": 0.15765072405338287,
      "learning_rate": 0.00019406289308176102,
      "loss": 0.443,
      "step": 144
    },
    {
      "epoch": 1.8275862068965516,
      "grad_norm": 0.16872242093086243,
      "learning_rate": 0.0001939622641509434,
      "loss": 0.5061,
      "step": 146
    },
    {
      "epoch": 1.8526645768025078,
      "grad_norm": 0.16207149624824524,
      "learning_rate": 0.0001938616352201258,
      "loss": 0.572,
      "step": 148
    },
    {
      "epoch": 1.877742946708464,
      "grad_norm": 0.16720207035541534,
      "learning_rate": 0.0001937610062893082,
      "loss": 0.5329,
      "step": 150
    },
    {
      "epoch": 1.90282131661442,
      "grad_norm": 0.1653318852186203,
      "learning_rate": 0.00019366037735849058,
      "loss": 0.5361,
      "step": 152
    },
    {
      "epoch": 1.9278996865203761,
      "grad_norm": 0.1918332576751709,
      "learning_rate": 0.00019355974842767297,
      "loss": 0.6219,
      "step": 154
    },
    {
      "epoch": 1.9529780564263324,
      "grad_norm": 0.1535947322845459,
      "learning_rate": 0.00019345911949685536,
      "loss": 0.586,
      "step": 156
    },
    {
      "epoch": 1.9780564263322884,
      "grad_norm": 0.16222791373729706,
      "learning_rate": 0.00019335849056603774,
      "loss": 0.5228,
      "step": 158
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.24010032415390015,
      "learning_rate": 0.00019325786163522013,
      "loss": 0.5218,
      "step": 160
    },
    {
      "epoch": 2.0250783699059562,
      "grad_norm": 0.15844006836414337,
      "learning_rate": 0.00019315723270440255,
      "loss": 0.5111,
      "step": 162
    },
    {
      "epoch": 2.050156739811912,
      "grad_norm": 0.1755230873823166,
      "learning_rate": 0.0001930566037735849,
      "loss": 0.6063,
      "step": 164
    },
    {
      "epoch": 2.0752351097178683,
      "grad_norm": 0.2025759369134903,
      "learning_rate": 0.0001929559748427673,
      "loss": 0.5672,
      "step": 166
    },
    {
      "epoch": 2.1003134796238245,
      "grad_norm": 0.2058378905057907,
      "learning_rate": 0.0001928553459119497,
      "loss": 0.4629,
      "step": 168
    },
    {
      "epoch": 2.1253918495297803,
      "grad_norm": 0.1765381544828415,
      "learning_rate": 0.00019275471698113208,
      "loss": 0.5236,
      "step": 170
    },
    {
      "epoch": 2.1504702194357366,
      "grad_norm": 0.20618358254432678,
      "learning_rate": 0.0001926540880503145,
      "loss": 0.5029,
      "step": 172
    },
    {
      "epoch": 2.175548589341693,
      "grad_norm": 0.1737968772649765,
      "learning_rate": 0.00019255345911949688,
      "loss": 0.3964,
      "step": 174
    },
    {
      "epoch": 2.200626959247649,
      "grad_norm": 0.20385882258415222,
      "learning_rate": 0.00019245283018867927,
      "loss": 0.5188,
      "step": 176
    },
    {
      "epoch": 2.225705329153605,
      "grad_norm": 0.2051456868648529,
      "learning_rate": 0.00019235220125786163,
      "loss": 0.4548,
      "step": 178
    },
    {
      "epoch": 2.250783699059561,
      "grad_norm": 0.18826241791248322,
      "learning_rate": 0.00019225157232704402,
      "loss": 0.4515,
      "step": 180
    },
    {
      "epoch": 2.2758620689655173,
      "grad_norm": 0.18653476238250732,
      "learning_rate": 0.00019215094339622644,
      "loss": 0.5373,
      "step": 182
    },
    {
      "epoch": 2.300940438871473,
      "grad_norm": 0.179554283618927,
      "learning_rate": 0.00019205031446540882,
      "loss": 0.49,
      "step": 184
    },
    {
      "epoch": 2.3260188087774294,
      "grad_norm": 0.18949083983898163,
      "learning_rate": 0.0001919496855345912,
      "loss": 0.4795,
      "step": 186
    },
    {
      "epoch": 2.3510971786833856,
      "grad_norm": 0.21681569516658783,
      "learning_rate": 0.0001918490566037736,
      "loss": 0.4826,
      "step": 188
    },
    {
      "epoch": 2.376175548589342,
      "grad_norm": 0.20997639000415802,
      "learning_rate": 0.000191748427672956,
      "loss": 0.3699,
      "step": 190
    },
    {
      "epoch": 2.4012539184952977,
      "grad_norm": 0.3043127954006195,
      "learning_rate": 0.00019164779874213838,
      "loss": 0.5264,
      "step": 192
    },
    {
      "epoch": 2.426332288401254,
      "grad_norm": 0.19533301889896393,
      "learning_rate": 0.00019154716981132077,
      "loss": 0.4243,
      "step": 194
    },
    {
      "epoch": 2.45141065830721,
      "grad_norm": 0.20891591906547546,
      "learning_rate": 0.00019144654088050316,
      "loss": 0.4748,
      "step": 196
    },
    {
      "epoch": 2.476489028213166,
      "grad_norm": 0.1940625011920929,
      "learning_rate": 0.00019134591194968554,
      "loss": 0.456,
      "step": 198
    },
    {
      "epoch": 2.501567398119122,
      "grad_norm": 0.2169208973646164,
      "learning_rate": 0.00019124528301886793,
      "loss": 0.577,
      "step": 200
    },
    {
      "epoch": 2.5266457680250785,
      "grad_norm": 0.21462920308113098,
      "learning_rate": 0.00019114465408805032,
      "loss": 0.3583,
      "step": 202
    },
    {
      "epoch": 2.5517241379310347,
      "grad_norm": 0.22243842482566833,
      "learning_rate": 0.0001910440251572327,
      "loss": 0.5473,
      "step": 204
    },
    {
      "epoch": 2.5768025078369905,
      "grad_norm": 0.20357415080070496,
      "learning_rate": 0.0001909433962264151,
      "loss": 0.5596,
      "step": 206
    },
    {
      "epoch": 2.6018808777429467,
      "grad_norm": 0.21374137699604034,
      "learning_rate": 0.0001908427672955975,
      "loss": 0.6041,
      "step": 208
    },
    {
      "epoch": 2.626959247648903,
      "grad_norm": 0.22612103819847107,
      "learning_rate": 0.00019074213836477988,
      "loss": 0.4825,
      "step": 210
    },
    {
      "epoch": 2.652037617554859,
      "grad_norm": 0.182185098528862,
      "learning_rate": 0.0001906415094339623,
      "loss": 0.5961,
      "step": 212
    },
    {
      "epoch": 2.677115987460815,
      "grad_norm": 0.21316243708133698,
      "learning_rate": 0.00019054088050314468,
      "loss": 0.4892,
      "step": 214
    },
    {
      "epoch": 2.7021943573667713,
      "grad_norm": 0.20594292879104614,
      "learning_rate": 0.00019044025157232704,
      "loss": 0.5294,
      "step": 216
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.18579436838626862,
      "learning_rate": 0.00019033962264150943,
      "loss": 0.4867,
      "step": 218
    },
    {
      "epoch": 2.7523510971786833,
      "grad_norm": 0.20978513360023499,
      "learning_rate": 0.00019023899371069182,
      "loss": 0.5459,
      "step": 220
    },
    {
      "epoch": 2.7774294670846396,
      "grad_norm": 0.20766879618167877,
      "learning_rate": 0.00019013836477987424,
      "loss": 0.4467,
      "step": 222
    },
    {
      "epoch": 2.8025078369905954,
      "grad_norm": 0.2247876673936844,
      "learning_rate": 0.00019003773584905662,
      "loss": 0.4955,
      "step": 224
    },
    {
      "epoch": 2.8275862068965516,
      "grad_norm": 0.20031589269638062,
      "learning_rate": 0.00018993710691823901,
      "loss": 0.4274,
      "step": 226
    },
    {
      "epoch": 2.852664576802508,
      "grad_norm": 0.22423385083675385,
      "learning_rate": 0.0001898364779874214,
      "loss": 0.4741,
      "step": 228
    },
    {
      "epoch": 2.877742946708464,
      "grad_norm": 0.1920011192560196,
      "learning_rate": 0.00018973584905660376,
      "loss": 0.4802,
      "step": 230
    },
    {
      "epoch": 2.9028213166144203,
      "grad_norm": 0.1996566653251648,
      "learning_rate": 0.00018963522012578615,
      "loss": 0.5204,
      "step": 232
    },
    {
      "epoch": 2.927899686520376,
      "grad_norm": 0.18659324944019318,
      "learning_rate": 0.00018953459119496857,
      "loss": 0.563,
      "step": 234
    },
    {
      "epoch": 2.9529780564263324,
      "grad_norm": 0.20645543932914734,
      "learning_rate": 0.00018943396226415096,
      "loss": 0.5968,
      "step": 236
    },
    {
      "epoch": 2.978056426332288,
      "grad_norm": 0.20103755593299866,
      "learning_rate": 0.00018933333333333335,
      "loss": 0.4683,
      "step": 238
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.26790672540664673,
      "learning_rate": 0.00018923270440251573,
      "loss": 0.4214,
      "step": 240
    },
    {
      "epoch": 3.0250783699059562,
      "grad_norm": 0.24234965443611145,
      "learning_rate": 0.00018913207547169812,
      "loss": 0.4778,
      "step": 242
    },
    {
      "epoch": 3.050156739811912,
      "grad_norm": 0.2758055031299591,
      "learning_rate": 0.0001890314465408805,
      "loss": 0.4029,
      "step": 244
    },
    {
      "epoch": 3.0752351097178683,
      "grad_norm": 0.2382444590330124,
      "learning_rate": 0.0001889308176100629,
      "loss": 0.3801,
      "step": 246
    },
    {
      "epoch": 3.1003134796238245,
      "grad_norm": 0.24490897357463837,
      "learning_rate": 0.0001888301886792453,
      "loss": 0.3975,
      "step": 248
    },
    {
      "epoch": 3.1253918495297803,
      "grad_norm": 0.26239147782325745,
      "learning_rate": 0.00018872955974842768,
      "loss": 0.4038,
      "step": 250
    },
    {
      "epoch": 3.1504702194357366,
      "grad_norm": 0.24605032801628113,
      "learning_rate": 0.00018862893081761007,
      "loss": 0.4364,
      "step": 252
    },
    {
      "epoch": 3.175548589341693,
      "grad_norm": 0.28764280676841736,
      "learning_rate": 0.00018852830188679248,
      "loss": 0.3577,
      "step": 254
    },
    {
      "epoch": 3.200626959247649,
      "grad_norm": 0.23804618418216705,
      "learning_rate": 0.00018842767295597484,
      "loss": 0.4155,
      "step": 256
    },
    {
      "epoch": 3.225705329153605,
      "grad_norm": 0.25497862696647644,
      "learning_rate": 0.00018832704402515723,
      "loss": 0.4654,
      "step": 258
    },
    {
      "epoch": 3.250783699059561,
      "grad_norm": 0.23537839949131012,
      "learning_rate": 0.00018822641509433962,
      "loss": 0.4144,
      "step": 260
    },
    {
      "epoch": 3.2758620689655173,
      "grad_norm": 0.268036812543869,
      "learning_rate": 0.000188125786163522,
      "loss": 0.4546,
      "step": 262
    },
    {
      "epoch": 3.300940438871473,
      "grad_norm": 0.25395911931991577,
      "learning_rate": 0.00018802515723270443,
      "loss": 0.4604,
      "step": 264
    },
    {
      "epoch": 3.3260188087774294,
      "grad_norm": 0.3395281732082367,
      "learning_rate": 0.00018792452830188681,
      "loss": 0.5816,
      "step": 266
    },
    {
      "epoch": 3.3510971786833856,
      "grad_norm": 0.258900910615921,
      "learning_rate": 0.0001878238993710692,
      "loss": 0.4415,
      "step": 268
    },
    {
      "epoch": 3.376175548589342,
      "grad_norm": 0.24031828343868256,
      "learning_rate": 0.00018772327044025156,
      "loss": 0.469,
      "step": 270
    },
    {
      "epoch": 3.4012539184952977,
      "grad_norm": 0.26624906063079834,
      "learning_rate": 0.00018762264150943395,
      "loss": 0.4304,
      "step": 272
    },
    {
      "epoch": 3.426332288401254,
      "grad_norm": 0.2869020998477936,
      "learning_rate": 0.00018752201257861637,
      "loss": 0.4623,
      "step": 274
    },
    {
      "epoch": 3.45141065830721,
      "grad_norm": 0.2383798062801361,
      "learning_rate": 0.00018742138364779876,
      "loss": 0.3973,
      "step": 276
    },
    {
      "epoch": 3.476489028213166,
      "grad_norm": 0.25947991013526917,
      "learning_rate": 0.00018732075471698115,
      "loss": 0.4468,
      "step": 278
    },
    {
      "epoch": 3.501567398119122,
      "grad_norm": 0.21950559318065643,
      "learning_rate": 0.00018722012578616354,
      "loss": 0.3432,
      "step": 280
    },
    {
      "epoch": 3.5266457680250785,
      "grad_norm": 0.26003995537757874,
      "learning_rate": 0.00018711949685534592,
      "loss": 0.4664,
      "step": 282
    },
    {
      "epoch": 3.5517241379310347,
      "grad_norm": 0.2847505807876587,
      "learning_rate": 0.0001870188679245283,
      "loss": 0.4583,
      "step": 284
    },
    {
      "epoch": 3.5768025078369905,
      "grad_norm": 0.2824760973453522,
      "learning_rate": 0.0001869182389937107,
      "loss": 0.4735,
      "step": 286
    },
    {
      "epoch": 3.6018808777429467,
      "grad_norm": 0.268838107585907,
      "learning_rate": 0.0001868176100628931,
      "loss": 0.4071,
      "step": 288
    },
    {
      "epoch": 3.626959247648903,
      "grad_norm": 0.24519529938697815,
      "learning_rate": 0.00018671698113207548,
      "loss": 0.4178,
      "step": 290
    },
    {
      "epoch": 3.652037617554859,
      "grad_norm": 0.24740180373191833,
      "learning_rate": 0.00018661635220125787,
      "loss": 0.4716,
      "step": 292
    },
    {
      "epoch": 3.677115987460815,
      "grad_norm": 0.22623687982559204,
      "learning_rate": 0.00018651572327044026,
      "loss": 0.3645,
      "step": 294
    },
    {
      "epoch": 3.7021943573667713,
      "grad_norm": 0.2554280459880829,
      "learning_rate": 0.00018641509433962264,
      "loss": 0.4044,
      "step": 296
    },
    {
      "epoch": 3.7272727272727275,
      "grad_norm": 0.2251761108636856,
      "learning_rate": 0.00018631446540880503,
      "loss": 0.3663,
      "step": 298
    },
    {
      "epoch": 3.7523510971786833,
      "grad_norm": 0.20053140819072723,
      "learning_rate": 0.00018621383647798742,
      "loss": 0.4342,
      "step": 300
    },
    {
      "epoch": 3.7774294670846396,
      "grad_norm": 0.2692326605319977,
      "learning_rate": 0.0001861132075471698,
      "loss": 0.4268,
      "step": 302
    },
    {
      "epoch": 3.8025078369905954,
      "grad_norm": 0.23218081891536713,
      "learning_rate": 0.00018601257861635223,
      "loss": 0.4848,
      "step": 304
    },
    {
      "epoch": 3.8275862068965516,
      "grad_norm": 0.2571001648902893,
      "learning_rate": 0.00018591194968553462,
      "loss": 0.5391,
      "step": 306
    },
    {
      "epoch": 3.852664576802508,
      "grad_norm": 0.20899826288223267,
      "learning_rate": 0.00018581132075471698,
      "loss": 0.4183,
      "step": 308
    },
    {
      "epoch": 3.877742946708464,
      "grad_norm": 0.24893143773078918,
      "learning_rate": 0.00018571069182389937,
      "loss": 0.4314,
      "step": 310
    },
    {
      "epoch": 3.9028213166144203,
      "grad_norm": 0.26598888635635376,
      "learning_rate": 0.00018561006289308175,
      "loss": 0.4182,
      "step": 312
    },
    {
      "epoch": 3.927899686520376,
      "grad_norm": 0.24121470749378204,
      "learning_rate": 0.00018550943396226417,
      "loss": 0.4327,
      "step": 314
    },
    {
      "epoch": 3.9529780564263324,
      "grad_norm": 0.2874317467212677,
      "learning_rate": 0.00018540880503144656,
      "loss": 0.4616,
      "step": 316
    },
    {
      "epoch": 3.978056426332288,
      "grad_norm": 0.22735589742660522,
      "learning_rate": 0.00018530817610062895,
      "loss": 0.376,
      "step": 318
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.3424147069454193,
      "learning_rate": 0.00018520754716981134,
      "loss": 0.537,
      "step": 320
    },
    {
      "epoch": 4.025078369905956,
      "grad_norm": 0.27403339743614197,
      "learning_rate": 0.0001851069182389937,
      "loss": 0.3539,
      "step": 322
    },
    {
      "epoch": 4.0501567398119125,
      "grad_norm": 0.33905503153800964,
      "learning_rate": 0.0001850062893081761,
      "loss": 0.3367,
      "step": 324
    },
    {
      "epoch": 4.075235109717869,
      "grad_norm": 0.47853460907936096,
      "learning_rate": 0.0001849056603773585,
      "loss": 0.393,
      "step": 326
    },
    {
      "epoch": 4.100313479623824,
      "grad_norm": 0.3133102059364319,
      "learning_rate": 0.0001848050314465409,
      "loss": 0.3762,
      "step": 328
    },
    {
      "epoch": 4.12539184952978,
      "grad_norm": 0.22834369540214539,
      "learning_rate": 0.00018470440251572328,
      "loss": 0.3404,
      "step": 330
    },
    {
      "epoch": 4.150470219435737,
      "grad_norm": 0.2537166476249695,
      "learning_rate": 0.00018460377358490567,
      "loss": 0.3582,
      "step": 332
    },
    {
      "epoch": 4.175548589341693,
      "grad_norm": 0.35708144307136536,
      "learning_rate": 0.00018450314465408806,
      "loss": 0.369,
      "step": 334
    },
    {
      "epoch": 4.200626959247649,
      "grad_norm": 0.3224405348300934,
      "learning_rate": 0.00018440251572327045,
      "loss": 0.3458,
      "step": 336
    },
    {
      "epoch": 4.225705329153605,
      "grad_norm": 0.34621739387512207,
      "learning_rate": 0.00018430188679245283,
      "loss": 0.4176,
      "step": 338
    },
    {
      "epoch": 4.250783699059561,
      "grad_norm": 0.24818404018878937,
      "learning_rate": 0.00018420125786163522,
      "loss": 0.3205,
      "step": 340
    },
    {
      "epoch": 4.275862068965517,
      "grad_norm": 0.29599064588546753,
      "learning_rate": 0.0001841006289308176,
      "loss": 0.3696,
      "step": 342
    },
    {
      "epoch": 4.300940438871473,
      "grad_norm": 0.2980504333972931,
      "learning_rate": 0.00018400000000000003,
      "loss": 0.3256,
      "step": 344
    },
    {
      "epoch": 4.326018808777429,
      "grad_norm": 0.35454604029655457,
      "learning_rate": 0.0001838993710691824,
      "loss": 0.4957,
      "step": 346
    },
    {
      "epoch": 4.351097178683386,
      "grad_norm": 0.3497369885444641,
      "learning_rate": 0.00018379874213836478,
      "loss": 0.3941,
      "step": 348
    },
    {
      "epoch": 4.376175548589342,
      "grad_norm": 0.32460692524909973,
      "learning_rate": 0.00018369811320754717,
      "loss": 0.3615,
      "step": 350
    },
    {
      "epoch": 4.401253918495298,
      "grad_norm": 0.29358094930648804,
      "learning_rate": 0.00018359748427672955,
      "loss": 0.3749,
      "step": 352
    },
    {
      "epoch": 4.4263322884012535,
      "grad_norm": 0.2807920575141907,
      "learning_rate": 0.00018349685534591197,
      "loss": 0.3575,
      "step": 354
    },
    {
      "epoch": 4.45141065830721,
      "grad_norm": 0.2809455096721649,
      "learning_rate": 0.00018339622641509436,
      "loss": 0.3097,
      "step": 356
    },
    {
      "epoch": 4.476489028213166,
      "grad_norm": 0.3250884413719177,
      "learning_rate": 0.00018329559748427675,
      "loss": 0.4113,
      "step": 358
    },
    {
      "epoch": 4.501567398119122,
      "grad_norm": 0.29040804505348206,
      "learning_rate": 0.0001831949685534591,
      "loss": 0.4481,
      "step": 360
    },
    {
      "epoch": 4.5266457680250785,
      "grad_norm": 0.3208359479904175,
      "learning_rate": 0.0001830943396226415,
      "loss": 0.4438,
      "step": 362
    },
    {
      "epoch": 4.551724137931035,
      "grad_norm": 0.23080404102802277,
      "learning_rate": 0.00018299371069182391,
      "loss": 0.2099,
      "step": 364
    },
    {
      "epoch": 4.576802507836991,
      "grad_norm": 0.2984071373939514,
      "learning_rate": 0.0001828930817610063,
      "loss": 0.3339,
      "step": 366
    },
    {
      "epoch": 4.601880877742946,
      "grad_norm": 0.3299279808998108,
      "learning_rate": 0.0001827924528301887,
      "loss": 0.4208,
      "step": 368
    },
    {
      "epoch": 4.6269592476489025,
      "grad_norm": 0.3243483006954193,
      "learning_rate": 0.00018269182389937108,
      "loss": 0.4165,
      "step": 370
    },
    {
      "epoch": 4.652037617554859,
      "grad_norm": 0.29541853070259094,
      "learning_rate": 0.00018259119496855347,
      "loss": 0.3703,
      "step": 372
    },
    {
      "epoch": 4.677115987460815,
      "grad_norm": 0.3010431230068207,
      "learning_rate": 0.00018249056603773586,
      "loss": 0.4034,
      "step": 374
    },
    {
      "epoch": 4.702194357366771,
      "grad_norm": 0.2970607578754425,
      "learning_rate": 0.00018238993710691825,
      "loss": 0.3151,
      "step": 376
    },
    {
      "epoch": 4.7272727272727275,
      "grad_norm": 0.2794083058834076,
      "learning_rate": 0.00018228930817610063,
      "loss": 0.3605,
      "step": 378
    },
    {
      "epoch": 4.752351097178684,
      "grad_norm": 0.2949012219905853,
      "learning_rate": 0.00018218867924528302,
      "loss": 0.343,
      "step": 380
    },
    {
      "epoch": 4.777429467084639,
      "grad_norm": 0.28160709142684937,
      "learning_rate": 0.0001820880503144654,
      "loss": 0.4515,
      "step": 382
    },
    {
      "epoch": 4.802507836990595,
      "grad_norm": 0.296051561832428,
      "learning_rate": 0.00018198742138364783,
      "loss": 0.3908,
      "step": 384
    },
    {
      "epoch": 4.827586206896552,
      "grad_norm": 0.26115506887435913,
      "learning_rate": 0.0001818867924528302,
      "loss": 0.3312,
      "step": 386
    },
    {
      "epoch": 4.852664576802508,
      "grad_norm": 0.27632880210876465,
      "learning_rate": 0.00018178616352201258,
      "loss": 0.3179,
      "step": 388
    },
    {
      "epoch": 4.877742946708464,
      "grad_norm": 0.2973230481147766,
      "learning_rate": 0.00018168553459119497,
      "loss": 0.382,
      "step": 390
    },
    {
      "epoch": 4.90282131661442,
      "grad_norm": 0.2833520472049713,
      "learning_rate": 0.00018158490566037736,
      "loss": 0.3363,
      "step": 392
    },
    {
      "epoch": 4.927899686520377,
      "grad_norm": 0.30823326110839844,
      "learning_rate": 0.00018148427672955977,
      "loss": 0.3234,
      "step": 394
    },
    {
      "epoch": 4.952978056426332,
      "grad_norm": 0.2736763060092926,
      "learning_rate": 0.00018138364779874216,
      "loss": 0.415,
      "step": 396
    },
    {
      "epoch": 4.978056426332288,
      "grad_norm": 0.2832898199558258,
      "learning_rate": 0.00018128301886792455,
      "loss": 0.3755,
      "step": 398
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.3281781077384949,
      "learning_rate": 0.0001811823899371069,
      "loss": 0.2756,
      "step": 400
    }
  ],
  "logging_steps": 2,
  "max_steps": 4000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.25570189464994e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}