{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 124205, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020128014170121975, "grad_norm": 4.729732990264893, "learning_rate": 2.994e-05, "loss": 4.6998, "step": 500 }, { "epoch": 0.04025602834024395, "grad_norm": 5.319427967071533, "learning_rate": 2.9878986298047775e-05, "loss": 3.2594, "step": 1000 }, { "epoch": 0.06038404251036593, "grad_norm": 6.513505458831787, "learning_rate": 2.975773008366679e-05, "loss": 3.0934, "step": 1500 }, { "epoch": 0.0805120566804879, "grad_norm": 7.240413665771484, "learning_rate": 2.9636473869285803e-05, "loss": 3.0216, "step": 2000 }, { "epoch": 0.10064007085060987, "grad_norm": 7.327127933502197, "learning_rate": 2.9515217654904812e-05, "loss": 2.8993, "step": 2500 }, { "epoch": 0.12076808502073186, "grad_norm": 7.550909519195557, "learning_rate": 2.9393961440523828e-05, "loss": 2.7834, "step": 3000 }, { "epoch": 0.14089609919085383, "grad_norm": 9.492281913757324, "learning_rate": 2.927270522614284e-05, "loss": 2.6516, "step": 3500 }, { "epoch": 0.1610241133609758, "grad_norm": 13.708161354064941, "learning_rate": 2.9151449011761853e-05, "loss": 2.5647, "step": 4000 }, { "epoch": 0.18115212753109777, "grad_norm": 8.566939353942871, "learning_rate": 2.903019279738087e-05, "loss": 2.4627, "step": 4500 }, { "epoch": 0.20128014170121974, "grad_norm": 13.398711204528809, "learning_rate": 2.890893658299988e-05, "loss": 2.427, "step": 5000 }, { "epoch": 0.22140815587134174, "grad_norm": 9.740044593811035, "learning_rate": 2.878768036861889e-05, "loss": 2.3102, "step": 5500 }, { "epoch": 0.24153617004146372, "grad_norm": 13.687227249145508, "learning_rate": 2.8666424154237906e-05, "loss": 2.2774, "step": 6000 }, { "epoch": 0.26166418421158566, "grad_norm": 10.618597984313965, "learning_rate": 2.854516793985692e-05, "loss": 2.2404, "step": 6500 }, { "epoch": 0.28179219838170766, "grad_norm": 9.622278213500977, "learning_rate": 2.842391172547593e-05, "loss": 2.1845, "step": 7000 }, { "epoch": 0.30192021255182966, "grad_norm": 12.272369384765625, "learning_rate": 2.8302655511094947e-05, "loss": 2.1741, "step": 7500 }, { "epoch": 0.3220482267219516, "grad_norm": 8.921002388000488, "learning_rate": 2.818139929671396e-05, "loss": 2.1317, "step": 8000 }, { "epoch": 0.3421762408920736, "grad_norm": 11.55328369140625, "learning_rate": 2.8060143082332968e-05, "loss": 2.1168, "step": 8500 }, { "epoch": 0.36230425506219555, "grad_norm": 9.84124755859375, "learning_rate": 2.7938886867951984e-05, "loss": 2.1129, "step": 9000 }, { "epoch": 0.38243226923231755, "grad_norm": 9.949904441833496, "learning_rate": 2.7817630653570996e-05, "loss": 2.0368, "step": 9500 }, { "epoch": 0.4025602834024395, "grad_norm": 15.386507034301758, "learning_rate": 2.769637443919001e-05, "loss": 2.0455, "step": 10000 }, { "epoch": 0.4226882975725615, "grad_norm": 10.55031967163086, "learning_rate": 2.757511822480902e-05, "loss": 2.0155, "step": 10500 }, { "epoch": 0.4428163117426835, "grad_norm": 13.076041221618652, "learning_rate": 2.7453862010428037e-05, "loss": 1.9655, "step": 11000 }, { "epoch": 0.46294432591280543, "grad_norm": 9.13262939453125, "learning_rate": 2.7332605796047046e-05, "loss": 1.9698, "step": 11500 }, { "epoch": 0.48307234008292743, "grad_norm": 14.231966972351074, "learning_rate": 2.721134958166606e-05, "loss": 1.9137, "step": 12000 }, { "epoch": 0.5032003542530494, "grad_norm": 13.586121559143066, "learning_rate": 2.7090093367285074e-05, "loss": 1.8677, "step": 12500 }, { "epoch": 0.5233283684231713, "grad_norm": 16.752092361450195, "learning_rate": 2.6968837152904087e-05, "loss": 1.8927, "step": 13000 }, { "epoch": 0.5434563825932933, "grad_norm": 11.639701843261719, "learning_rate": 2.68475809385231e-05, "loss": 1.8398, "step": 13500 }, { "epoch": 0.5635843967634153, "grad_norm": 20.035966873168945, "learning_rate": 2.6726324724142115e-05, "loss": 1.8741, "step": 14000 }, { "epoch": 0.5837124109335373, "grad_norm": 12.169594764709473, "learning_rate": 2.6605068509761127e-05, "loss": 1.8125, "step": 14500 }, { "epoch": 0.6038404251036593, "grad_norm": 14.39586067199707, "learning_rate": 2.6483812295380136e-05, "loss": 1.7955, "step": 15000 }, { "epoch": 0.6239684392737812, "grad_norm": 9.263022422790527, "learning_rate": 2.6362556080999152e-05, "loss": 1.8026, "step": 15500 }, { "epoch": 0.6440964534439032, "grad_norm": 10.964536666870117, "learning_rate": 2.6241299866618165e-05, "loss": 1.7693, "step": 16000 }, { "epoch": 0.6642244676140252, "grad_norm": 17.373477935791016, "learning_rate": 2.6120043652237177e-05, "loss": 1.7218, "step": 16500 }, { "epoch": 0.6843524817841472, "grad_norm": 13.868760108947754, "learning_rate": 2.5998787437856193e-05, "loss": 1.7308, "step": 17000 }, { "epoch": 0.7044804959542692, "grad_norm": 16.622940063476562, "learning_rate": 2.5877531223475205e-05, "loss": 1.7091, "step": 17500 }, { "epoch": 0.7246085101243911, "grad_norm": 9.113160133361816, "learning_rate": 2.5756275009094214e-05, "loss": 1.7317, "step": 18000 }, { "epoch": 0.7447365242945131, "grad_norm": 24.89649200439453, "learning_rate": 2.563501879471323e-05, "loss": 1.6873, "step": 18500 }, { "epoch": 0.7648645384646351, "grad_norm": 11.15603256225586, "learning_rate": 2.5513762580332243e-05, "loss": 1.6772, "step": 19000 }, { "epoch": 0.7849925526347571, "grad_norm": 19.64437484741211, "learning_rate": 2.5392506365951255e-05, "loss": 1.6586, "step": 19500 }, { "epoch": 0.805120566804879, "grad_norm": 14.52999496459961, "learning_rate": 2.527125015157027e-05, "loss": 1.6505, "step": 20000 }, { "epoch": 0.825248580975001, "grad_norm": 13.304444313049316, "learning_rate": 2.5149993937189283e-05, "loss": 1.615, "step": 20500 }, { "epoch": 0.845376595145123, "grad_norm": 14.634563446044922, "learning_rate": 2.5028737722808292e-05, "loss": 1.6244, "step": 21000 }, { "epoch": 0.865504609315245, "grad_norm": 12.946802139282227, "learning_rate": 2.4907481508427308e-05, "loss": 1.6158, "step": 21500 }, { "epoch": 0.885632623485367, "grad_norm": 11.765786170959473, "learning_rate": 2.478622529404632e-05, "loss": 1.5787, "step": 22000 }, { "epoch": 0.9057606376554889, "grad_norm": 11.961956024169922, "learning_rate": 2.4664969079665333e-05, "loss": 1.5621, "step": 22500 }, { "epoch": 0.9258886518256109, "grad_norm": 13.635610580444336, "learning_rate": 2.454371286528435e-05, "loss": 1.5692, "step": 23000 }, { "epoch": 0.9460166659957329, "grad_norm": 13.10095500946045, "learning_rate": 2.442245665090336e-05, "loss": 1.5458, "step": 23500 }, { "epoch": 0.9661446801658549, "grad_norm": 11.790682792663574, "learning_rate": 2.430120043652237e-05, "loss": 1.5394, "step": 24000 }, { "epoch": 0.9862726943359769, "grad_norm": 11.249995231628418, "learning_rate": 2.4179944222141386e-05, "loss": 1.5485, "step": 24500 }, { "epoch": 1.0064007085060989, "grad_norm": 13.755157470703125, "learning_rate": 2.40586880077604e-05, "loss": 1.5134, "step": 25000 }, { "epoch": 1.0265287226762208, "grad_norm": 15.988091468811035, "learning_rate": 2.393743179337941e-05, "loss": 1.4833, "step": 25500 }, { "epoch": 1.0466567368463426, "grad_norm": 11.56142807006836, "learning_rate": 2.3816175578998427e-05, "loss": 1.4523, "step": 26000 }, { "epoch": 1.0667847510164647, "grad_norm": 10.849580764770508, "learning_rate": 2.369491936461744e-05, "loss": 1.4571, "step": 26500 }, { "epoch": 1.0869127651865866, "grad_norm": 17.24896240234375, "learning_rate": 2.3573663150236448e-05, "loss": 1.4886, "step": 27000 }, { "epoch": 1.1070407793567087, "grad_norm": 12.933219909667969, "learning_rate": 2.345240693585546e-05, "loss": 1.4485, "step": 27500 }, { "epoch": 1.1271687935268306, "grad_norm": 12.675749778747559, "learning_rate": 2.3331150721474476e-05, "loss": 1.4161, "step": 28000 }, { "epoch": 1.1472968076969525, "grad_norm": 21.270776748657227, "learning_rate": 2.320989450709349e-05, "loss": 1.4371, "step": 28500 }, { "epoch": 1.1674248218670746, "grad_norm": 17.078645706176758, "learning_rate": 2.30886382927125e-05, "loss": 1.3918, "step": 29000 }, { "epoch": 1.1875528360371965, "grad_norm": 23.501638412475586, "learning_rate": 2.2967382078331517e-05, "loss": 1.4278, "step": 29500 }, { "epoch": 1.2076808502073186, "grad_norm": 12.903084754943848, "learning_rate": 2.284612586395053e-05, "loss": 1.3752, "step": 30000 }, { "epoch": 1.2278088643774405, "grad_norm": 15.732855796813965, "learning_rate": 2.272486964956954e-05, "loss": 1.3931, "step": 30500 }, { "epoch": 1.2479368785475624, "grad_norm": 11.898987770080566, "learning_rate": 2.2603613435188554e-05, "loss": 1.3587, "step": 31000 }, { "epoch": 1.2680648927176845, "grad_norm": 18.970348358154297, "learning_rate": 2.2482357220807567e-05, "loss": 1.3706, "step": 31500 }, { "epoch": 1.2881929068878064, "grad_norm": 13.289978981018066, "learning_rate": 2.236110100642658e-05, "loss": 1.3378, "step": 32000 }, { "epoch": 1.3083209210579283, "grad_norm": 25.023792266845703, "learning_rate": 2.2239844792045595e-05, "loss": 1.3174, "step": 32500 }, { "epoch": 1.3284489352280504, "grad_norm": 12.036040306091309, "learning_rate": 2.2118588577664607e-05, "loss": 1.3293, "step": 33000 }, { "epoch": 1.3485769493981723, "grad_norm": 12.723782539367676, "learning_rate": 2.1997332363283616e-05, "loss": 1.3287, "step": 33500 }, { "epoch": 1.3687049635682944, "grad_norm": 11.143896102905273, "learning_rate": 2.1876076148902632e-05, "loss": 1.3125, "step": 34000 }, { "epoch": 1.3888329777384163, "grad_norm": 12.347333908081055, "learning_rate": 2.1754819934521645e-05, "loss": 1.3472, "step": 34500 }, { "epoch": 1.4089609919085384, "grad_norm": 20.10418701171875, "learning_rate": 2.1633563720140657e-05, "loss": 1.3052, "step": 35000 }, { "epoch": 1.4290890060786603, "grad_norm": 17.1345157623291, "learning_rate": 2.1512307505759673e-05, "loss": 1.2828, "step": 35500 }, { "epoch": 1.4492170202487822, "grad_norm": 17.451622009277344, "learning_rate": 2.1391051291378685e-05, "loss": 1.2787, "step": 36000 }, { "epoch": 1.4693450344189043, "grad_norm": 19.05263900756836, "learning_rate": 2.1269795076997694e-05, "loss": 1.3195, "step": 36500 }, { "epoch": 1.4894730485890262, "grad_norm": 12.999706268310547, "learning_rate": 2.114853886261671e-05, "loss": 1.2759, "step": 37000 }, { "epoch": 1.509601062759148, "grad_norm": 12.11323356628418, "learning_rate": 2.1027282648235722e-05, "loss": 1.2738, "step": 37500 }, { "epoch": 1.5297290769292702, "grad_norm": 10.93237018585205, "learning_rate": 2.0906026433854735e-05, "loss": 1.2742, "step": 38000 }, { "epoch": 1.5498570910993923, "grad_norm": 26.265893936157227, "learning_rate": 2.078477021947375e-05, "loss": 1.2193, "step": 38500 }, { "epoch": 1.569985105269514, "grad_norm": 17.12728500366211, "learning_rate": 2.0663514005092763e-05, "loss": 1.2355, "step": 39000 }, { "epoch": 1.590113119439636, "grad_norm": 15.962538719177246, "learning_rate": 2.0542257790711772e-05, "loss": 1.219, "step": 39500 }, { "epoch": 1.6102411336097582, "grad_norm": 19.7554931640625, "learning_rate": 2.0421001576330788e-05, "loss": 1.2393, "step": 40000 }, { "epoch": 1.63036914777988, "grad_norm": 17.81658363342285, "learning_rate": 2.02997453619498e-05, "loss": 1.22, "step": 40500 }, { "epoch": 1.650497161950002, "grad_norm": 16.0762882232666, "learning_rate": 2.0178489147568813e-05, "loss": 1.2359, "step": 41000 }, { "epoch": 1.670625176120124, "grad_norm": 13.652649879455566, "learning_rate": 2.005723293318783e-05, "loss": 1.244, "step": 41500 }, { "epoch": 1.690753190290246, "grad_norm": 8.598692893981934, "learning_rate": 1.993597671880684e-05, "loss": 1.2153, "step": 42000 }, { "epoch": 1.7108812044603678, "grad_norm": 15.637930870056152, "learning_rate": 1.981472050442585e-05, "loss": 1.1855, "step": 42500 }, { "epoch": 1.73100921863049, "grad_norm": 16.582963943481445, "learning_rate": 1.9693464290044866e-05, "loss": 1.1908, "step": 43000 }, { "epoch": 1.7511372328006118, "grad_norm": 16.173324584960938, "learning_rate": 1.957220807566388e-05, "loss": 1.1659, "step": 43500 }, { "epoch": 1.7712652469707337, "grad_norm": 15.524099349975586, "learning_rate": 1.945095186128289e-05, "loss": 1.1853, "step": 44000 }, { "epoch": 1.7913932611408558, "grad_norm": 11.66182804107666, "learning_rate": 1.9329695646901903e-05, "loss": 1.1679, "step": 44500 }, { "epoch": 1.811521275310978, "grad_norm": 10.504340171813965, "learning_rate": 1.920843943252092e-05, "loss": 1.155, "step": 45000 }, { "epoch": 1.8316492894810998, "grad_norm": 16.5634708404541, "learning_rate": 1.908718321813993e-05, "loss": 1.1544, "step": 45500 }, { "epoch": 1.8517773036512217, "grad_norm": 13.282904624938965, "learning_rate": 1.896592700375894e-05, "loss": 1.1892, "step": 46000 }, { "epoch": 1.8719053178213438, "grad_norm": 13.532590866088867, "learning_rate": 1.8844670789377956e-05, "loss": 1.1728, "step": 46500 }, { "epoch": 1.8920333319914657, "grad_norm": 15.26899242401123, "learning_rate": 1.872341457499697e-05, "loss": 1.1733, "step": 47000 }, { "epoch": 1.9121613461615876, "grad_norm": 14.551050186157227, "learning_rate": 1.860215836061598e-05, "loss": 1.156, "step": 47500 }, { "epoch": 1.9322893603317097, "grad_norm": 11.31080436706543, "learning_rate": 1.8480902146234997e-05, "loss": 1.1405, "step": 48000 }, { "epoch": 1.9524173745018316, "grad_norm": 19.817716598510742, "learning_rate": 1.835964593185401e-05, "loss": 1.1488, "step": 48500 }, { "epoch": 1.9725453886719535, "grad_norm": 13.350114822387695, "learning_rate": 1.823838971747302e-05, "loss": 1.1094, "step": 49000 }, { "epoch": 1.9926734028420756, "grad_norm": 13.383456230163574, "learning_rate": 1.8117133503092034e-05, "loss": 1.1162, "step": 49500 }, { "epoch": 2.0128014170121977, "grad_norm": 14.433093070983887, "learning_rate": 1.7995877288711047e-05, "loss": 1.0904, "step": 50000 }, { "epoch": 2.0329294311823194, "grad_norm": 5.4649338722229, "learning_rate": 1.787462107433006e-05, "loss": 1.0922, "step": 50500 }, { "epoch": 2.0530574453524415, "grad_norm": 14.124307632446289, "learning_rate": 1.7753364859949075e-05, "loss": 1.0895, "step": 51000 }, { "epoch": 2.0731854595225636, "grad_norm": 9.346240043640137, "learning_rate": 1.7632108645568087e-05, "loss": 1.0611, "step": 51500 }, { "epoch": 2.0933134736926853, "grad_norm": 16.641101837158203, "learning_rate": 1.7510852431187096e-05, "loss": 1.0945, "step": 52000 }, { "epoch": 2.1134414878628074, "grad_norm": 12.283844947814941, "learning_rate": 1.7389596216806112e-05, "loss": 1.078, "step": 52500 }, { "epoch": 2.1335695020329295, "grad_norm": 8.219395637512207, "learning_rate": 1.7268340002425125e-05, "loss": 1.0482, "step": 53000 }, { "epoch": 2.1536975162030516, "grad_norm": 15.403627395629883, "learning_rate": 1.7147083788044137e-05, "loss": 1.0433, "step": 53500 }, { "epoch": 2.1738255303731733, "grad_norm": 15.240696907043457, "learning_rate": 1.7025827573663153e-05, "loss": 1.0493, "step": 54000 }, { "epoch": 2.1939535445432954, "grad_norm": 19.72351837158203, "learning_rate": 1.6904571359282165e-05, "loss": 1.042, "step": 54500 }, { "epoch": 2.2140815587134175, "grad_norm": 21.067684173583984, "learning_rate": 1.6783315144901174e-05, "loss": 1.025, "step": 55000 }, { "epoch": 2.234209572883539, "grad_norm": 14.798884391784668, "learning_rate": 1.666205893052019e-05, "loss": 1.0143, "step": 55500 }, { "epoch": 2.2543375870536613, "grad_norm": 15.239629745483398, "learning_rate": 1.6540802716139202e-05, "loss": 1.0322, "step": 56000 }, { "epoch": 2.2744656012237834, "grad_norm": 14.908315658569336, "learning_rate": 1.6419546501758215e-05, "loss": 1.0826, "step": 56500 }, { "epoch": 2.294593615393905, "grad_norm": 13.52440071105957, "learning_rate": 1.629829028737723e-05, "loss": 1.0199, "step": 57000 }, { "epoch": 2.314721629564027, "grad_norm": 20.474451065063477, "learning_rate": 1.6177034072996243e-05, "loss": 1.0061, "step": 57500 }, { "epoch": 2.3348496437341493, "grad_norm": 15.805046081542969, "learning_rate": 1.6055777858615252e-05, "loss": 1.0141, "step": 58000 }, { "epoch": 2.3549776579042714, "grad_norm": 9.82214641571045, "learning_rate": 1.5934521644234268e-05, "loss": 1.0099, "step": 58500 }, { "epoch": 2.375105672074393, "grad_norm": 17.32090950012207, "learning_rate": 1.581326542985328e-05, "loss": 0.9851, "step": 59000 }, { "epoch": 2.395233686244515, "grad_norm": 27.325069427490234, "learning_rate": 1.5692009215472293e-05, "loss": 0.9991, "step": 59500 }, { "epoch": 2.4153617004146373, "grad_norm": 21.118209838867188, "learning_rate": 1.557075300109131e-05, "loss": 1.0082, "step": 60000 }, { "epoch": 2.435489714584759, "grad_norm": 14.355386734008789, "learning_rate": 1.544949678671032e-05, "loss": 0.9549, "step": 60500 }, { "epoch": 2.455617728754881, "grad_norm": 16.598129272460938, "learning_rate": 1.532824057232933e-05, "loss": 1.0101, "step": 61000 }, { "epoch": 2.475745742925003, "grad_norm": 21.729766845703125, "learning_rate": 1.5206984357948344e-05, "loss": 0.9704, "step": 61500 }, { "epoch": 2.495873757095125, "grad_norm": 16.548641204833984, "learning_rate": 1.5085728143567358e-05, "loss": 0.9893, "step": 62000 }, { "epoch": 2.516001771265247, "grad_norm": 14.282777786254883, "learning_rate": 1.496447192918637e-05, "loss": 0.9721, "step": 62500 }, { "epoch": 2.536129785435369, "grad_norm": 9.005020141601562, "learning_rate": 1.4843215714805385e-05, "loss": 0.98, "step": 63000 }, { "epoch": 2.5562577996054907, "grad_norm": 8.10714340209961, "learning_rate": 1.4721959500424397e-05, "loss": 0.9887, "step": 63500 }, { "epoch": 2.576385813775613, "grad_norm": 13.707820892333984, "learning_rate": 1.460070328604341e-05, "loss": 0.9805, "step": 64000 }, { "epoch": 2.596513827945735, "grad_norm": 20.182363510131836, "learning_rate": 1.4479447071662424e-05, "loss": 0.9837, "step": 64500 }, { "epoch": 2.6166418421158566, "grad_norm": 9.87313175201416, "learning_rate": 1.4358190857281435e-05, "loss": 0.9609, "step": 65000 }, { "epoch": 2.6367698562859787, "grad_norm": 12.288646697998047, "learning_rate": 1.4236934642900449e-05, "loss": 1.0035, "step": 65500 }, { "epoch": 2.656897870456101, "grad_norm": 18.152629852294922, "learning_rate": 1.4115678428519463e-05, "loss": 0.9494, "step": 66000 }, { "epoch": 2.677025884626223, "grad_norm": 16.326662063598633, "learning_rate": 1.3994422214138473e-05, "loss": 0.946, "step": 66500 }, { "epoch": 2.6971538987963446, "grad_norm": 18.14234733581543, "learning_rate": 1.3873165999757488e-05, "loss": 0.9504, "step": 67000 }, { "epoch": 2.7172819129664667, "grad_norm": 20.3934326171875, "learning_rate": 1.3751909785376502e-05, "loss": 0.9676, "step": 67500 }, { "epoch": 2.737409927136589, "grad_norm": 11.495948791503906, "learning_rate": 1.3630653570995512e-05, "loss": 0.9283, "step": 68000 }, { "epoch": 2.757537941306711, "grad_norm": 20.127979278564453, "learning_rate": 1.3509397356614527e-05, "loss": 0.9467, "step": 68500 }, { "epoch": 2.7776659554768326, "grad_norm": 13.345834732055664, "learning_rate": 1.338814114223354e-05, "loss": 0.9538, "step": 69000 }, { "epoch": 2.7977939696469547, "grad_norm": 9.327335357666016, "learning_rate": 1.3266884927852551e-05, "loss": 0.9437, "step": 69500 }, { "epoch": 2.817921983817077, "grad_norm": 12.741182327270508, "learning_rate": 1.3145628713471566e-05, "loss": 0.9291, "step": 70000 }, { "epoch": 2.8380499979871985, "grad_norm": 16.994661331176758, "learning_rate": 1.302437249909058e-05, "loss": 0.9147, "step": 70500 }, { "epoch": 2.8581780121573206, "grad_norm": 15.74470043182373, "learning_rate": 1.2903116284709592e-05, "loss": 0.9296, "step": 71000 }, { "epoch": 2.8783060263274427, "grad_norm": 13.54488754272461, "learning_rate": 1.2781860070328604e-05, "loss": 0.9482, "step": 71500 }, { "epoch": 2.8984340404975644, "grad_norm": 10.650059700012207, "learning_rate": 1.2660603855947619e-05, "loss": 0.9516, "step": 72000 }, { "epoch": 2.9185620546676865, "grad_norm": 12.577211380004883, "learning_rate": 1.2539347641566631e-05, "loss": 0.9173, "step": 72500 }, { "epoch": 2.9386900688378086, "grad_norm": 14.282366752624512, "learning_rate": 1.2418091427185643e-05, "loss": 0.9511, "step": 73000 }, { "epoch": 2.9588180830079303, "grad_norm": 14.529337882995605, "learning_rate": 1.2296835212804656e-05, "loss": 0.9302, "step": 73500 }, { "epoch": 2.9789460971780524, "grad_norm": 11.681228637695312, "learning_rate": 1.217557899842367e-05, "loss": 0.9097, "step": 74000 }, { "epoch": 2.9990741113481745, "grad_norm": 11.70090389251709, "learning_rate": 1.2054322784042682e-05, "loss": 0.9233, "step": 74500 }, { "epoch": 3.019202125518296, "grad_norm": 27.22252655029297, "learning_rate": 1.1933066569661695e-05, "loss": 0.8651, "step": 75000 }, { "epoch": 3.0393301396884183, "grad_norm": 14.896398544311523, "learning_rate": 1.1811810355280709e-05, "loss": 0.8639, "step": 75500 }, { "epoch": 3.0594581538585404, "grad_norm": 20.037960052490234, "learning_rate": 1.1690554140899721e-05, "loss": 0.8606, "step": 76000 }, { "epoch": 3.0795861680286625, "grad_norm": 16.03421974182129, "learning_rate": 1.1569297926518734e-05, "loss": 0.8639, "step": 76500 }, { "epoch": 3.099714182198784, "grad_norm": 14.802894592285156, "learning_rate": 1.1448041712137748e-05, "loss": 0.8875, "step": 77000 }, { "epoch": 3.1198421963689063, "grad_norm": 9.06533145904541, "learning_rate": 1.132678549775676e-05, "loss": 0.8877, "step": 77500 }, { "epoch": 3.1399702105390284, "grad_norm": 13.744263648986816, "learning_rate": 1.1205529283375773e-05, "loss": 0.8761, "step": 78000 }, { "epoch": 3.16009822470915, "grad_norm": 12.16555404663086, "learning_rate": 1.1084273068994787e-05, "loss": 0.8782, "step": 78500 }, { "epoch": 3.180226238879272, "grad_norm": 29.285688400268555, "learning_rate": 1.09630168546138e-05, "loss": 0.8579, "step": 79000 }, { "epoch": 3.2003542530493942, "grad_norm": 14.758946418762207, "learning_rate": 1.0841760640232812e-05, "loss": 0.878, "step": 79500 }, { "epoch": 3.220482267219516, "grad_norm": 12.481344223022461, "learning_rate": 1.0720504425851826e-05, "loss": 0.8383, "step": 80000 }, { "epoch": 3.240610281389638, "grad_norm": 11.378300666809082, "learning_rate": 1.0599248211470838e-05, "loss": 0.866, "step": 80500 }, { "epoch": 3.26073829555976, "grad_norm": 18.51228141784668, "learning_rate": 1.047799199708985e-05, "loss": 0.8727, "step": 81000 }, { "epoch": 3.2808663097298822, "grad_norm": 13.013883590698242, "learning_rate": 1.0356735782708865e-05, "loss": 0.8497, "step": 81500 }, { "epoch": 3.300994323900004, "grad_norm": 18.66629409790039, "learning_rate": 1.0235479568327876e-05, "loss": 0.8817, "step": 82000 }, { "epoch": 3.321122338070126, "grad_norm": 22.02678108215332, "learning_rate": 1.011422335394689e-05, "loss": 0.8207, "step": 82500 }, { "epoch": 3.341250352240248, "grad_norm": 21.1297550201416, "learning_rate": 9.992967139565904e-06, "loss": 0.834, "step": 83000 }, { "epoch": 3.36137836641037, "grad_norm": 15.060477256774902, "learning_rate": 9.871710925184914e-06, "loss": 0.8313, "step": 83500 }, { "epoch": 3.381506380580492, "grad_norm": 20.013944625854492, "learning_rate": 9.750454710803929e-06, "loss": 0.8628, "step": 84000 }, { "epoch": 3.401634394750614, "grad_norm": 11.168913841247559, "learning_rate": 9.629198496422943e-06, "loss": 0.8261, "step": 84500 }, { "epoch": 3.4217624089207357, "grad_norm": 15.372590065002441, "learning_rate": 9.507942282041953e-06, "loss": 0.8618, "step": 85000 }, { "epoch": 3.441890423090858, "grad_norm": 11.604378700256348, "learning_rate": 9.386686067660968e-06, "loss": 0.8239, "step": 85500 }, { "epoch": 3.46201843726098, "grad_norm": 9.609265327453613, "learning_rate": 9.265429853279982e-06, "loss": 0.8371, "step": 86000 }, { "epoch": 3.4821464514311016, "grad_norm": 15.69279956817627, "learning_rate": 9.144173638898994e-06, "loss": 0.8218, "step": 86500 }, { "epoch": 3.5022744656012237, "grad_norm": 14.74257755279541, "learning_rate": 9.022917424518007e-06, "loss": 0.8055, "step": 87000 }, { "epoch": 3.522402479771346, "grad_norm": 10.193700790405273, "learning_rate": 8.90166121013702e-06, "loss": 0.8566, "step": 87500 }, { "epoch": 3.5425304939414675, "grad_norm": 13.010785102844238, "learning_rate": 8.780404995756033e-06, "loss": 0.8443, "step": 88000 }, { "epoch": 3.5626585081115896, "grad_norm": 11.916807174682617, "learning_rate": 8.659148781375045e-06, "loss": 0.8272, "step": 88500 }, { "epoch": 3.5827865222817117, "grad_norm": 11.876017570495605, "learning_rate": 8.53789256699406e-06, "loss": 0.8518, "step": 89000 }, { "epoch": 3.602914536451834, "grad_norm": 21.5701847076416, "learning_rate": 8.416636352613072e-06, "loss": 0.8087, "step": 89500 }, { "epoch": 3.623042550621956, "grad_norm": 11.204216957092285, "learning_rate": 8.295380138232084e-06, "loss": 0.8279, "step": 90000 }, { "epoch": 3.6431705647920776, "grad_norm": 11.78646469116211, "learning_rate": 8.174123923851097e-06, "loss": 0.8316, "step": 90500 }, { "epoch": 3.6632985789621997, "grad_norm": 12.788416862487793, "learning_rate": 8.052867709470111e-06, "loss": 0.8332, "step": 91000 }, { "epoch": 3.683426593132322, "grad_norm": 14.306061744689941, "learning_rate": 7.931611495089123e-06, "loss": 0.823, "step": 91500 }, { "epoch": 3.7035546073024435, "grad_norm": 20.168163299560547, "learning_rate": 7.810355280708136e-06, "loss": 0.81, "step": 92000 }, { "epoch": 3.7236826214725656, "grad_norm": 20.580291748046875, "learning_rate": 7.68909906632715e-06, "loss": 0.822, "step": 92500 }, { "epoch": 3.7438106356426877, "grad_norm": 13.826583862304688, "learning_rate": 7.567842851946163e-06, "loss": 0.8378, "step": 93000 }, { "epoch": 3.7639386498128093, "grad_norm": 30.890518188476562, "learning_rate": 7.446586637565176e-06, "loss": 0.8311, "step": 93500 }, { "epoch": 3.7840666639829315, "grad_norm": 15.22163200378418, "learning_rate": 7.325330423184188e-06, "loss": 0.8138, "step": 94000 }, { "epoch": 3.8041946781530536, "grad_norm": 8.326911926269531, "learning_rate": 7.204074208803201e-06, "loss": 0.784, "step": 94500 }, { "epoch": 3.8243226923231752, "grad_norm": 31.577423095703125, "learning_rate": 7.082817994422215e-06, "loss": 0.8006, "step": 95000 }, { "epoch": 3.8444507064932973, "grad_norm": 15.388664245605469, "learning_rate": 6.961561780041227e-06, "loss": 0.8418, "step": 95500 }, { "epoch": 3.8645787206634195, "grad_norm": 21.28485107421875, "learning_rate": 6.84030556566024e-06, "loss": 0.7972, "step": 96000 }, { "epoch": 3.884706734833541, "grad_norm": 11.151982307434082, "learning_rate": 6.7190493512792536e-06, "loss": 0.8133, "step": 96500 }, { "epoch": 3.9048347490036632, "grad_norm": 11.545019149780273, "learning_rate": 6.597793136898266e-06, "loss": 0.8035, "step": 97000 }, { "epoch": 3.9249627631737853, "grad_norm": 11.109121322631836, "learning_rate": 6.476536922517279e-06, "loss": 0.7959, "step": 97500 }, { "epoch": 3.945090777343907, "grad_norm": 12.6671142578125, "learning_rate": 6.355280708136292e-06, "loss": 0.8132, "step": 98000 }, { "epoch": 3.965218791514029, "grad_norm": 11.02685260772705, "learning_rate": 6.234024493755305e-06, "loss": 0.7959, "step": 98500 }, { "epoch": 3.9853468056841512, "grad_norm": 11.704038619995117, "learning_rate": 6.112768279374318e-06, "loss": 0.7837, "step": 99000 }, { "epoch": 4.005474819854273, "grad_norm": 16.34335708618164, "learning_rate": 5.991512064993331e-06, "loss": 0.7851, "step": 99500 }, { "epoch": 4.0256028340243954, "grad_norm": 10.739608764648438, "learning_rate": 5.870255850612345e-06, "loss": 0.7684, "step": 100000 }, { "epoch": 4.045730848194517, "grad_norm": 15.17026424407959, "learning_rate": 5.748999636231357e-06, "loss": 0.7679, "step": 100500 }, { "epoch": 4.065858862364639, "grad_norm": 16.030241012573242, "learning_rate": 5.62774342185037e-06, "loss": 0.764, "step": 101000 }, { "epoch": 4.085986876534761, "grad_norm": 15.900766372680664, "learning_rate": 5.506487207469383e-06, "loss": 0.7666, "step": 101500 }, { "epoch": 4.106114890704883, "grad_norm": 13.20738410949707, "learning_rate": 5.385230993088396e-06, "loss": 0.7686, "step": 102000 }, { "epoch": 4.126242904875005, "grad_norm": 9.8963623046875, "learning_rate": 5.2639747787074086e-06, "loss": 0.7589, "step": 102500 }, { "epoch": 4.146370919045127, "grad_norm": 16.053571701049805, "learning_rate": 5.142718564326422e-06, "loss": 0.7676, "step": 103000 }, { "epoch": 4.166498933215249, "grad_norm": 12.643793106079102, "learning_rate": 5.021462349945435e-06, "loss": 0.7462, "step": 103500 }, { "epoch": 4.186626947385371, "grad_norm": 27.60247230529785, "learning_rate": 4.9002061355644475e-06, "loss": 0.7864, "step": 104000 }, { "epoch": 4.206754961555493, "grad_norm": 13.564982414245605, "learning_rate": 4.778949921183461e-06, "loss": 0.7693, "step": 104500 }, { "epoch": 4.226882975725615, "grad_norm": 20.11015510559082, "learning_rate": 4.657693706802474e-06, "loss": 0.7386, "step": 105000 }, { "epoch": 4.247010989895736, "grad_norm": 15.393072128295898, "learning_rate": 4.5364374924214865e-06, "loss": 0.7793, "step": 105500 }, { "epoch": 4.267139004065859, "grad_norm": 19.87403678894043, "learning_rate": 4.4151812780405e-06, "loss": 0.7779, "step": 106000 }, { "epoch": 4.287267018235981, "grad_norm": 9.388250350952148, "learning_rate": 4.293925063659512e-06, "loss": 0.7681, "step": 106500 }, { "epoch": 4.307395032406103, "grad_norm": 10.060807228088379, "learning_rate": 4.1726688492785255e-06, "loss": 0.7509, "step": 107000 }, { "epoch": 4.327523046576225, "grad_norm": 23.562870025634766, "learning_rate": 4.051412634897539e-06, "loss": 0.7833, "step": 107500 }, { "epoch": 4.3476510607463466, "grad_norm": 14.926592826843262, "learning_rate": 3.930156420516551e-06, "loss": 0.7446, "step": 108000 }, { "epoch": 4.367779074916469, "grad_norm": 11.940516471862793, "learning_rate": 3.808900206135565e-06, "loss": 0.754, "step": 108500 }, { "epoch": 4.387907089086591, "grad_norm": 14.217045783996582, "learning_rate": 3.6876439917545777e-06, "loss": 0.7731, "step": 109000 }, { "epoch": 4.408035103256712, "grad_norm": 9.447354316711426, "learning_rate": 3.5663877773735905e-06, "loss": 0.7597, "step": 109500 }, { "epoch": 4.428163117426835, "grad_norm": 19.97547149658203, "learning_rate": 3.4451315629926034e-06, "loss": 0.7657, "step": 110000 }, { "epoch": 4.448291131596957, "grad_norm": 15.066329956054688, "learning_rate": 3.3238753486116167e-06, "loss": 0.7619, "step": 110500 }, { "epoch": 4.468419145767078, "grad_norm": 12.446183204650879, "learning_rate": 3.2026191342306295e-06, "loss": 0.7656, "step": 111000 }, { "epoch": 4.488547159937201, "grad_norm": 32.365234375, "learning_rate": 3.0813629198496423e-06, "loss": 0.7575, "step": 111500 }, { "epoch": 4.5086751741073225, "grad_norm": 12.082524299621582, "learning_rate": 2.960106705468655e-06, "loss": 0.7502, "step": 112000 }, { "epoch": 4.528803188277444, "grad_norm": 20.70221519470215, "learning_rate": 2.8388504910876685e-06, "loss": 0.7638, "step": 112500 }, { "epoch": 4.548931202447567, "grad_norm": 22.083984375, "learning_rate": 2.717594276706681e-06, "loss": 0.7365, "step": 113000 }, { "epoch": 4.569059216617688, "grad_norm": 14.066744804382324, "learning_rate": 2.596338062325694e-06, "loss": 0.766, "step": 113500 }, { "epoch": 4.58918723078781, "grad_norm": 24.38865089416504, "learning_rate": 2.4750818479447074e-06, "loss": 0.7449, "step": 114000 }, { "epoch": 4.609315244957933, "grad_norm": 11.597355842590332, "learning_rate": 2.3538256335637203e-06, "loss": 0.7556, "step": 114500 }, { "epoch": 4.629443259128054, "grad_norm": 10.837632179260254, "learning_rate": 2.232569419182733e-06, "loss": 0.7501, "step": 115000 }, { "epoch": 4.649571273298177, "grad_norm": 20.56001853942871, "learning_rate": 2.111313204801746e-06, "loss": 0.7234, "step": 115500 }, { "epoch": 4.6696992874682985, "grad_norm": 14.60595703125, "learning_rate": 1.9900569904207592e-06, "loss": 0.7695, "step": 116000 }, { "epoch": 4.68982730163842, "grad_norm": 28.349151611328125, "learning_rate": 1.868800776039772e-06, "loss": 0.7661, "step": 116500 }, { "epoch": 4.709955315808543, "grad_norm": 10.647957801818848, "learning_rate": 1.747544561658785e-06, "loss": 0.7308, "step": 117000 }, { "epoch": 4.730083329978664, "grad_norm": 11.21895980834961, "learning_rate": 1.6262883472777982e-06, "loss": 0.7585, "step": 117500 }, { "epoch": 4.750211344148786, "grad_norm": 12.75427532196045, "learning_rate": 1.505032132896811e-06, "loss": 0.7553, "step": 118000 }, { "epoch": 4.770339358318909, "grad_norm": 9.93217658996582, "learning_rate": 1.383775918515824e-06, "loss": 0.7525, "step": 118500 }, { "epoch": 4.79046737248903, "grad_norm": 13.394769668579102, "learning_rate": 1.262519704134837e-06, "loss": 0.7493, "step": 119000 }, { "epoch": 4.810595386659152, "grad_norm": 8.94278335571289, "learning_rate": 1.1412634897538498e-06, "loss": 0.7575, "step": 119500 }, { "epoch": 4.8307234008292745, "grad_norm": 16.46908950805664, "learning_rate": 1.0200072753728628e-06, "loss": 0.7651, "step": 120000 }, { "epoch": 4.850851414999396, "grad_norm": 27.788360595703125, "learning_rate": 8.987510609918758e-07, "loss": 0.7472, "step": 120500 }, { "epoch": 4.870979429169518, "grad_norm": 7.398582458496094, "learning_rate": 7.774948466108889e-07, "loss": 0.7696, "step": 121000 }, { "epoch": 4.89110744333964, "grad_norm": 17.573110580444336, "learning_rate": 6.562386322299018e-07, "loss": 0.7445, "step": 121500 }, { "epoch": 4.911235457509762, "grad_norm": 5.554362773895264, "learning_rate": 5.349824178489148e-07, "loss": 0.7407, "step": 122000 }, { "epoch": 4.931363471679884, "grad_norm": 8.908127784729004, "learning_rate": 4.137262034679277e-07, "loss": 0.7352, "step": 122500 }, { "epoch": 4.951491485850006, "grad_norm": 17.096956253051758, "learning_rate": 2.924699890869407e-07, "loss": 0.7548, "step": 123000 }, { "epoch": 4.971619500020128, "grad_norm": 15.15579891204834, "learning_rate": 1.7121377470595367e-07, "loss": 0.7635, "step": 123500 }, { "epoch": 4.99174751419025, "grad_norm": 14.474600791931152, "learning_rate": 4.9957560324966654e-08, "loss": 0.748, "step": 124000 }, { "epoch": 5.0, "step": 124205, "total_flos": 2.789913716232192e+16, "train_loss": 1.1867824254838901, "train_runtime": 21629.6238, "train_samples_per_second": 91.877, "train_steps_per_second": 5.742 } ], "logging_steps": 500, "max_steps": 124205, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.789913716232192e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }