evolve_sft/sft/checkpoint-11000/trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.3722635608756604,
"eval_steps": 500,
"global_step": 11000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010783996549121105,
"grad_norm": 0.2076384574174881,
"learning_rate": 5.861244019138756e-06,
"loss": 1.0803,
"step": 50
},
{
"epoch": 0.02156799309824221,
"grad_norm": 0.25434058904647827,
"learning_rate": 1.1842105263157895e-05,
"loss": 1.0521,
"step": 100
},
{
"epoch": 0.03235198964736331,
"grad_norm": 0.24684220552444458,
"learning_rate": 1.7822966507177032e-05,
"loss": 1.0288,
"step": 150
},
{
"epoch": 0.04313598619648442,
"grad_norm": 0.3034498691558838,
"learning_rate": 2.380382775119617e-05,
"loss": 0.9972,
"step": 200
},
{
"epoch": 0.05391998274560552,
"grad_norm": 0.2725016176700592,
"learning_rate": 2.9784688995215314e-05,
"loss": 0.9555,
"step": 250
},
{
"epoch": 0.06470397929472663,
"grad_norm": 0.27356916666030884,
"learning_rate": 3.576555023923445e-05,
"loss": 0.9688,
"step": 300
},
{
"epoch": 0.07548797584384773,
"grad_norm": 0.2624454200267792,
"learning_rate": 4.174641148325359e-05,
"loss": 0.9699,
"step": 350
},
{
"epoch": 0.08627197239296884,
"grad_norm": 0.2676330506801605,
"learning_rate": 4.772727272727273e-05,
"loss": 0.9739,
"step": 400
},
{
"epoch": 0.09705596894208994,
"grad_norm": 0.24369767308235168,
"learning_rate": 4.999934880025785e-05,
"loss": 0.9833,
"step": 450
},
{
"epoch": 0.10783996549121104,
"grad_norm": 0.26960158348083496,
"learning_rate": 4.9995554200393156e-05,
"loss": 0.9677,
"step": 500
},
{
"epoch": 0.11862396204033215,
"grad_norm": 0.2564559578895569,
"learning_rate": 4.998837209058379e-05,
"loss": 0.9493,
"step": 550
},
{
"epoch": 0.12940795858945325,
"grad_norm": 0.23627087473869324,
"learning_rate": 4.9977803444181587e-05,
"loss": 0.9726,
"step": 600
},
{
"epoch": 0.14019195513857435,
"grad_norm": 0.22857290506362915,
"learning_rate": 4.996384969349704e-05,
"loss": 0.9653,
"step": 650
},
{
"epoch": 0.15097595168769545,
"grad_norm": 0.25175178050994873,
"learning_rate": 4.9946512729605226e-05,
"loss": 0.9725,
"step": 700
},
{
"epoch": 0.16175994823681655,
"grad_norm": 0.20284195244312286,
"learning_rate": 4.992579490208947e-05,
"loss": 0.968,
"step": 750
},
{
"epoch": 0.17254394478593768,
"grad_norm": 0.228809654712677,
"learning_rate": 4.990169901872295e-05,
"loss": 0.9338,
"step": 800
},
{
"epoch": 0.18332794133505878,
"grad_norm": 0.2436237633228302,
"learning_rate": 4.987422834508818e-05,
"loss": 0.9581,
"step": 850
},
{
"epoch": 0.19411193788417988,
"grad_norm": 0.2001142054796219,
"learning_rate": 4.9843386604134425e-05,
"loss": 0.9512,
"step": 900
},
{
"epoch": 0.20489593443330098,
"grad_norm": 0.20406965911388397,
"learning_rate": 4.980917797567315e-05,
"loss": 0.9479,
"step": 950
},
{
"epoch": 0.21567993098242208,
"grad_norm": 0.20756883919239044,
"learning_rate": 4.9771607095811565e-05,
"loss": 0.9552,
"step": 1000
},
{
"epoch": 0.22646392753154318,
"grad_norm": 0.23893098533153534,
"learning_rate": 4.9730679056324334e-05,
"loss": 0.9732,
"step": 1050
},
{
"epoch": 0.2372479240806643,
"grad_norm": 0.20374947786331177,
"learning_rate": 4.968639940396346e-05,
"loss": 0.961,
"step": 1100
},
{
"epoch": 0.2480319206297854,
"grad_norm": 0.20845109224319458,
"learning_rate": 4.963877413970663e-05,
"loss": 0.9481,
"step": 1150
},
{
"epoch": 0.2588159171789065,
"grad_norm": 0.23683245480060577,
"learning_rate": 4.958780971794388e-05,
"loss": 0.9558,
"step": 1200
},
{
"epoch": 0.2695999137280276,
"grad_norm": 0.18015944957733154,
"learning_rate": 4.953351304560292e-05,
"loss": 0.9367,
"step": 1250
},
{
"epoch": 0.2803839102771487,
"grad_norm": 0.21432434022426605,
"learning_rate": 4.947589148121301e-05,
"loss": 0.9289,
"step": 1300
},
{
"epoch": 0.2911679068262698,
"grad_norm": 0.217897430062294,
"learning_rate": 4.941495283390778e-05,
"loss": 0.9663,
"step": 1350
},
{
"epoch": 0.3019519033753909,
"grad_norm": 0.23911495506763458,
"learning_rate": 4.9350705362366836e-05,
"loss": 0.9534,
"step": 1400
},
{
"epoch": 0.312735899924512,
"grad_norm": 0.21729810535907745,
"learning_rate": 4.928315777369652e-05,
"loss": 0.9663,
"step": 1450
},
{
"epoch": 0.3235198964736331,
"grad_norm": 0.19448955357074738,
"learning_rate": 4.9212319222249914e-05,
"loss": 0.9203,
"step": 1500
},
{
"epoch": 0.3343038930227542,
"grad_norm": 0.20799997448921204,
"learning_rate": 4.913819930838616e-05,
"loss": 0.9426,
"step": 1550
},
{
"epoch": 0.34508788957187536,
"grad_norm": 0.1989525556564331,
"learning_rate": 4.906080807716941e-05,
"loss": 0.9544,
"step": 1600
},
{
"epoch": 0.35587188612099646,
"grad_norm": 0.21680687367916107,
"learning_rate": 4.898015601700745e-05,
"loss": 0.9666,
"step": 1650
},
{
"epoch": 0.36665588267011756,
"grad_norm": 0.2180759161710739,
"learning_rate": 4.889625405823027e-05,
"loss": 0.9441,
"step": 1700
},
{
"epoch": 0.37743987921923866,
"grad_norm": 0.19334350526332855,
"learning_rate": 4.880911357160877e-05,
"loss": 0.9415,
"step": 1750
},
{
"epoch": 0.38822387576835976,
"grad_norm": 0.19350044429302216,
"learning_rate": 4.871874636681366e-05,
"loss": 0.9534,
"step": 1800
},
{
"epoch": 0.39900787231748086,
"grad_norm": 0.23279784619808197,
"learning_rate": 4.862516469081505e-05,
"loss": 0.9578,
"step": 1850
},
{
"epoch": 0.40979186886660196,
"grad_norm": 0.2038542479276657,
"learning_rate": 4.852838122622264e-05,
"loss": 0.9416,
"step": 1900
},
{
"epoch": 0.42057586541572306,
"grad_norm": 0.21980704367160797,
"learning_rate": 4.842840908956692e-05,
"loss": 0.9359,
"step": 1950
},
{
"epoch": 0.43135986196484416,
"grad_norm": 0.20842380821704865,
"learning_rate": 4.832526182952156e-05,
"loss": 0.9495,
"step": 2000
},
{
"epoch": 0.44214385851396526,
"grad_norm": 0.2161971479654312,
"learning_rate": 4.821895342506724e-05,
"loss": 0.9388,
"step": 2050
},
{
"epoch": 0.45292785506308636,
"grad_norm": 0.2119661122560501,
"learning_rate": 4.8109498283597146e-05,
"loss": 0.9618,
"step": 2100
},
{
"epoch": 0.46371185161220746,
"grad_norm": 0.17877915501594543,
"learning_rate": 4.799691123896441e-05,
"loss": 0.9498,
"step": 2150
},
{
"epoch": 0.4744958481613286,
"grad_norm": 0.2198779135942459,
"learning_rate": 4.788120754947179e-05,
"loss": 0.9464,
"step": 2200
},
{
"epoch": 0.4852798447104497,
"grad_norm": 0.20385344326496124,
"learning_rate": 4.7762402895803763e-05,
"loss": 0.9423,
"step": 2250
},
{
"epoch": 0.4960638412595708,
"grad_norm": 0.21472816169261932,
"learning_rate": 4.764051337890143e-05,
"loss": 0.9295,
"step": 2300
},
{
"epoch": 0.5068478378086919,
"grad_norm": 0.21423693001270294,
"learning_rate": 4.7515555517780405e-05,
"loss": 0.9557,
"step": 2350
},
{
"epoch": 0.517631834357813,
"grad_norm": 0.2088768184185028,
"learning_rate": 4.7387546247292156e-05,
"loss": 0.9392,
"step": 2400
},
{
"epoch": 0.5284158309069341,
"grad_norm": 0.18323567509651184,
"learning_rate": 4.725650291582885e-05,
"loss": 0.9418,
"step": 2450
},
{
"epoch": 0.5391998274560552,
"grad_norm": 0.22341737151145935,
"learning_rate": 4.712244328297224e-05,
"loss": 0.9207,
"step": 2500
},
{
"epoch": 0.5499838240051763,
"grad_norm": 0.2024504542350769,
"learning_rate": 4.698538551708682e-05,
"loss": 0.9337,
"step": 2550
},
{
"epoch": 0.5607678205542974,
"grad_norm": 0.20455148816108704,
"learning_rate": 4.684534819285758e-05,
"loss": 0.9451,
"step": 2600
},
{
"epoch": 0.5715518171034185,
"grad_norm": 0.19093358516693115,
"learning_rate": 4.6702350288772626e-05,
"loss": 0.9468,
"step": 2650
},
{
"epoch": 0.5823358136525396,
"grad_norm": 0.1995963305234909,
"learning_rate": 4.6556411184551176e-05,
"loss": 0.9373,
"step": 2700
},
{
"epoch": 0.5931198102016607,
"grad_norm": 0.19664354622364044,
"learning_rate": 4.640755065851712e-05,
"loss": 0.9609,
"step": 2750
},
{
"epoch": 0.6039038067507818,
"grad_norm": 0.20155999064445496,
"learning_rate": 4.6255788884918595e-05,
"loss": 0.9221,
"step": 2800
},
{
"epoch": 0.6146878032999029,
"grad_norm": 0.2094108611345291,
"learning_rate": 4.610114643119382e-05,
"loss": 0.9665,
"step": 2850
},
{
"epoch": 0.625471799849024,
"grad_norm": 0.23038670420646667,
"learning_rate": 4.5943644255183785e-05,
"loss": 0.9223,
"step": 2900
},
{
"epoch": 0.6362557963981451,
"grad_norm": 0.22103433310985565,
"learning_rate": 4.5783303702291856e-05,
"loss": 0.9271,
"step": 2950
},
{
"epoch": 0.6470397929472662,
"grad_norm": 0.21444232761859894,
"learning_rate": 4.5620146502591065e-05,
"loss": 0.9553,
"step": 3000
},
{
"epoch": 0.6578237894963873,
"grad_norm": 0.20402322709560394,
"learning_rate": 4.5454194767879046e-05,
"loss": 0.9342,
"step": 3050
},
{
"epoch": 0.6686077860455084,
"grad_norm": 0.17598140239715576,
"learning_rate": 4.52854709886814e-05,
"loss": 0.9343,
"step": 3100
},
{
"epoch": 0.6793917825946296,
"grad_norm": 0.2235531210899353,
"learning_rate": 4.511399803120367e-05,
"loss": 0.9325,
"step": 3150
},
{
"epoch": 0.6901757791437507,
"grad_norm": 0.1978316605091095,
"learning_rate": 4.49397991342324e-05,
"loss": 0.9175,
"step": 3200
},
{
"epoch": 0.7009597756928718,
"grad_norm": 0.20724375545978546,
"learning_rate": 4.476289790598571e-05,
"loss": 0.9509,
"step": 3250
},
{
"epoch": 0.7117437722419929,
"grad_norm": 0.19276615977287292,
"learning_rate": 4.458331832091385e-05,
"loss": 0.9247,
"step": 3300
},
{
"epoch": 0.722527768791114,
"grad_norm": 0.2208387851715088,
"learning_rate": 4.440108471644997e-05,
"loss": 0.9409,
"step": 3350
},
{
"epoch": 0.7333117653402351,
"grad_norm": 0.21308571100234985,
"learning_rate": 4.421622178971193e-05,
"loss": 0.9267,
"step": 3400
},
{
"epoch": 0.7440957618893562,
"grad_norm": 0.2115100473165512,
"learning_rate": 4.4028754594155125e-05,
"loss": 0.933,
"step": 3450
},
{
"epoch": 0.7548797584384773,
"grad_norm": 0.21246980130672455,
"learning_rate": 4.383870853617721e-05,
"loss": 0.9422,
"step": 3500
},
{
"epoch": 0.7656637549875984,
"grad_norm": 0.2082446962594986,
"learning_rate": 4.364610937167485e-05,
"loss": 0.9204,
"step": 3550
},
{
"epoch": 0.7764477515367195,
"grad_norm": 0.22102369368076324,
"learning_rate": 4.345098320255321e-05,
"loss": 0.9226,
"step": 3600
},
{
"epoch": 0.7872317480858406,
"grad_norm": 0.19831791520118713,
"learning_rate": 4.325335647318848e-05,
"loss": 0.9327,
"step": 3650
},
{
"epoch": 0.7980157446349617,
"grad_norm": 0.2220238745212555,
"learning_rate": 4.3053255966844016e-05,
"loss": 0.9318,
"step": 3700
},
{
"epoch": 0.8087997411840828,
"grad_norm": 0.20910035073757172,
"learning_rate": 4.285070880204057e-05,
"loss": 0.9306,
"step": 3750
},
{
"epoch": 0.8195837377332039,
"grad_norm": 0.21745839715003967,
"learning_rate": 4.264574242888105e-05,
"loss": 0.9304,
"step": 3800
},
{
"epoch": 0.830367734282325,
"grad_norm": 0.24437028169631958,
"learning_rate": 4.2438384625330374e-05,
"loss": 0.9433,
"step": 3850
},
{
"epoch": 0.8411517308314461,
"grad_norm": 0.2319614738225937,
"learning_rate": 4.222866349345083e-05,
"loss": 0.9536,
"step": 3900
},
{
"epoch": 0.8519357273805672,
"grad_norm": 0.2375030517578125,
"learning_rate": 4.2016607455593624e-05,
"loss": 0.9421,
"step": 3950
},
{
"epoch": 0.8627197239296883,
"grad_norm": 0.2176317423582077,
"learning_rate": 4.1802245250546926e-05,
"loss": 0.9268,
"step": 4000
},
{
"epoch": 0.8735037204788094,
"grad_norm": 0.2226661890745163,
"learning_rate": 4.158560592964104e-05,
"loss": 0.925,
"step": 4050
},
{
"epoch": 0.8842877170279305,
"grad_norm": 0.2202196568250656,
"learning_rate": 4.136671885281124e-05,
"loss": 0.9465,
"step": 4100
},
{
"epoch": 0.8950717135770516,
"grad_norm": 0.20654049515724182,
"learning_rate": 4.114561368461884e-05,
"loss": 0.9251,
"step": 4150
},
{
"epoch": 0.9058557101261727,
"grad_norm": 0.23357035219669342,
"learning_rate": 4.092232039023084e-05,
"loss": 0.9417,
"step": 4200
},
{
"epoch": 0.9166397066752938,
"grad_norm": 0.20816297829151154,
"learning_rate": 4.069686923135896e-05,
"loss": 0.9225,
"step": 4250
},
{
"epoch": 0.9274237032244149,
"grad_norm": 0.20184196531772614,
"learning_rate": 4.04692907621584e-05,
"loss": 0.9212,
"step": 4300
},
{
"epoch": 0.938207699773536,
"grad_norm": 0.1984609067440033,
"learning_rate": 4.023961582508704e-05,
"loss": 0.9261,
"step": 4350
},
{
"epoch": 0.9489916963226572,
"grad_norm": 0.22444488108158112,
"learning_rate": 4.000787554672553e-05,
"loss": 0.9291,
"step": 4400
},
{
"epoch": 0.9597756928717783,
"grad_norm": 0.21115441620349884,
"learning_rate": 3.977410133355884e-05,
"loss": 0.9349,
"step": 4450
},
{
"epoch": 0.9705596894208994,
"grad_norm": 0.19569146633148193,
"learning_rate": 3.953832486771996e-05,
"loss": 0.9049,
"step": 4500
},
{
"epoch": 0.9813436859700205,
"grad_norm": 0.22996151447296143,
"learning_rate": 3.930057810269612e-05,
"loss": 0.894,
"step": 4550
},
{
"epoch": 0.9921276825191416,
"grad_norm": 0.19879557192325592,
"learning_rate": 3.906089325899841e-05,
"loss": 0.955,
"step": 4600
},
{
"epoch": 1.0028038391027714,
"grad_norm": 0.207550510764122,
"learning_rate": 3.8819302819795046e-05,
"loss": 0.9362,
"step": 4650
},
{
"epoch": 1.0135878356518926,
"grad_norm": 0.20435990393161774,
"learning_rate": 3.8575839526509105e-05,
"loss": 0.9217,
"step": 4700
},
{
"epoch": 1.0243718322010138,
"grad_norm": 0.22362500429153442,
"learning_rate": 3.833053637438128e-05,
"loss": 0.9342,
"step": 4750
},
{
"epoch": 1.0351558287501348,
"grad_norm": 0.18318387866020203,
"learning_rate": 3.8083426607998216e-05,
"loss": 0.8937,
"step": 4800
},
{
"epoch": 1.045939825299256,
"grad_norm": 0.20834890007972717,
"learning_rate": 3.783454371678705e-05,
"loss": 0.9103,
"step": 4850
},
{
"epoch": 1.056723821848377,
"grad_norm": 0.2138434648513794,
"learning_rate": 3.758392143047677e-05,
"loss": 0.9003,
"step": 4900
},
{
"epoch": 1.0675078183974982,
"grad_norm": 0.21266281604766846,
"learning_rate": 3.733159371452701e-05,
"loss": 0.9142,
"step": 4950
},
{
"epoch": 1.0782918149466192,
"grad_norm": 0.25879135727882385,
"learning_rate": 3.707759476552489e-05,
"loss": 0.8976,
"step": 5000
},
{
"epoch": 1.0890758114957404,
"grad_norm": 0.2042112946510315,
"learning_rate": 3.682195900655057e-05,
"loss": 0.9092,
"step": 5050
},
{
"epoch": 1.0998598080448614,
"grad_norm": 0.25018027424812317,
"learning_rate": 3.656472108251205e-05,
"loss": 0.8843,
"step": 5100
},
{
"epoch": 1.1106438045939826,
"grad_norm": 0.2371663898229599,
"learning_rate": 3.630591585544995e-05,
"loss": 0.8764,
"step": 5150
},
{
"epoch": 1.1214278011431036,
"grad_norm": 0.23503442108631134,
"learning_rate": 3.604557839981284e-05,
"loss": 0.9091,
"step": 5200
},
{
"epoch": 1.1322117976922248,
"grad_norm": 0.24042187631130219,
"learning_rate": 3.5783743997703824e-05,
"loss": 0.9206,
"step": 5250
},
{
"epoch": 1.1429957942413458,
"grad_norm": 0.25456419587135315,
"learning_rate": 3.5520448134098886e-05,
"loss": 0.8784,
"step": 5300
},
{
"epoch": 1.153779790790467,
"grad_norm": 0.23184941709041595,
"learning_rate": 3.5255726492037854e-05,
"loss": 0.8798,
"step": 5350
},
{
"epoch": 1.164563787339588,
"grad_norm": 0.24035029113292694,
"learning_rate": 3.498961494778851e-05,
"loss": 0.9039,
"step": 5400
},
{
"epoch": 1.1753477838887092,
"grad_norm": 0.24733129143714905,
"learning_rate": 3.4722149565984385e-05,
"loss": 0.9094,
"step": 5450
},
{
"epoch": 1.1861317804378302,
"grad_norm": 0.25908830761909485,
"learning_rate": 3.445336659473718e-05,
"loss": 0.9167,
"step": 5500
},
{
"epoch": 1.1969157769869514,
"grad_norm": 0.24497312307357788,
"learning_rate": 3.4183302460724246e-05,
"loss": 0.8919,
"step": 5550
},
{
"epoch": 1.2076997735360724,
"grad_norm": 0.24705035984516144,
"learning_rate": 3.391199376425188e-05,
"loss": 0.9018,
"step": 5600
},
{
"epoch": 1.2184837700851936,
"grad_norm": 0.2370757907629013,
"learning_rate": 3.363947727429507e-05,
"loss": 0.8925,
"step": 5650
},
{
"epoch": 1.2292677666343146,
"grad_norm": 0.24430540204048157,
"learning_rate": 3.336578992351442e-05,
"loss": 0.8834,
"step": 5700
},
{
"epoch": 1.2400517631834358,
"grad_norm": 0.20415450632572174,
"learning_rate": 3.3090968803250856e-05,
"loss": 0.9195,
"step": 5750
},
{
"epoch": 1.2508357597325568,
"grad_norm": 0.24224655330181122,
"learning_rate": 3.281505115849885e-05,
"loss": 0.8963,
"step": 5800
},
{
"epoch": 1.261619756281678,
"grad_norm": 0.263614684343338,
"learning_rate": 3.253807438285879e-05,
"loss": 0.9081,
"step": 5850
},
{
"epoch": 1.2724037528307992,
"grad_norm": 0.22934329509735107,
"learning_rate": 3.226007601346927e-05,
"loss": 0.8957,
"step": 5900
},
{
"epoch": 1.2831877493799202,
"grad_norm": 0.2595406770706177,
"learning_rate": 3.198109372591984e-05,
"loss": 0.8798,
"step": 5950
},
{
"epoch": 1.2939717459290412,
"grad_norm": 0.2610589861869812,
"learning_rate": 3.170677292377989e-05,
"loss": 0.9074,
"step": 6000
},
{
"epoch": 1.3047557424781624,
"grad_norm": 0.27022746205329895,
"learning_rate": 3.142595414578805e-05,
"loss": 0.9059,
"step": 6050
},
{
"epoch": 1.3155397390272836,
"grad_norm": 0.21983672678470612,
"learning_rate": 3.114426449358401e-05,
"loss": 0.9179,
"step": 6100
},
{
"epoch": 1.3263237355764046,
"grad_norm": 0.22227706015110016,
"learning_rate": 3.086174214301658e-05,
"loss": 0.8916,
"step": 6150
},
{
"epoch": 1.3371077321255256,
"grad_norm": 0.2406383454799652,
"learning_rate": 3.05784253827856e-05,
"loss": 0.8994,
"step": 6200
},
{
"epoch": 1.3478917286746468,
"grad_norm": 0.23662422597408295,
"learning_rate": 3.029435260925288e-05,
"loss": 0.893,
"step": 6250
},
{
"epoch": 1.358675725223768,
"grad_norm": 0.26936379075050354,
"learning_rate": 3.000956232123856e-05,
"loss": 0.9033,
"step": 6300
},
{
"epoch": 1.369459721772889,
"grad_norm": 0.253090500831604,
"learning_rate": 2.972409311480357e-05,
"loss": 0.8867,
"step": 6350
},
{
"epoch": 1.3802437183220102,
"grad_norm": 0.2847846746444702,
"learning_rate": 2.94379836780189e-05,
"loss": 0.8721,
"step": 6400
},
{
"epoch": 1.3910277148711312,
"grad_norm": 0.26056525111198425,
"learning_rate": 2.9151272785722466e-05,
"loss": 0.8913,
"step": 6450
},
{
"epoch": 1.4018117114202524,
"grad_norm": 0.23132337629795074,
"learning_rate": 2.8863999294264122e-05,
"loss": 0.9058,
"step": 6500
},
{
"epoch": 1.4125957079693734,
"grad_norm": 0.2190658152103424,
"learning_rate": 2.8576202136239688e-05,
"loss": 0.8906,
"step": 6550
},
{
"epoch": 1.4233797045184946,
"grad_norm": 0.26291966438293457,
"learning_rate": 2.8287920315214643e-05,
"loss": 0.9229,
"step": 6600
},
{
"epoch": 1.4341637010676156,
"grad_norm": 0.23218290507793427,
"learning_rate": 2.799919290043818e-05,
"loss": 0.9242,
"step": 6650
},
{
"epoch": 1.4449476976167368,
"grad_norm": 0.2565305233001709,
"learning_rate": 2.7710059021548344e-05,
"loss": 0.883,
"step": 6700
},
{
"epoch": 1.4557316941658578,
"grad_norm": 0.2470102459192276,
"learning_rate": 2.7420557863269043e-05,
"loss": 0.8949,
"step": 6750
},
{
"epoch": 1.466515690714979,
"grad_norm": 0.25169292092323303,
"learning_rate": 2.713072866009953e-05,
"loss": 0.9122,
"step": 6800
},
{
"epoch": 1.4772996872641002,
"grad_norm": 0.23668742179870605,
"learning_rate": 2.6840610690997182e-05,
"loss": 0.8919,
"step": 6850
},
{
"epoch": 1.4880836838132212,
"grad_norm": 0.2786126732826233,
"learning_rate": 2.655024327405422e-05,
"loss": 0.8883,
"step": 6900
},
{
"epoch": 1.4988676803623422,
"grad_norm": 0.25976258516311646,
"learning_rate": 2.6259665761169183e-05,
"loss": 0.9291,
"step": 6950
},
{
"epoch": 1.5096516769114634,
"grad_norm": 0.2566768229007721,
"learning_rate": 2.5968917532713743e-05,
"loss": 0.901,
"step": 7000
},
{
"epoch": 1.5204356734605846,
"grad_norm": 0.24728557467460632,
"learning_rate": 2.5678037992195714e-05,
"loss": 0.8811,
"step": 7050
},
{
"epoch": 1.5312196700097056,
"grad_norm": 0.24409767985343933,
"learning_rate": 2.5387066560918906e-05,
"loss": 0.904,
"step": 7100
},
{
"epoch": 1.5420036665588266,
"grad_norm": 0.2483212798833847,
"learning_rate": 2.5096042672640596e-05,
"loss": 0.8945,
"step": 7150
},
{
"epoch": 1.5527876631079478,
"grad_norm": 0.23452620208263397,
"learning_rate": 2.4805005768227252e-05,
"loss": 0.9063,
"step": 7200
},
{
"epoch": 1.563571659657069,
"grad_norm": 0.22194162011146545,
"learning_rate": 2.4513995290309358e-05,
"loss": 0.8834,
"step": 7250
},
{
"epoch": 1.57435565620619,
"grad_norm": 0.25706538558006287,
"learning_rate": 2.4223050677935947e-05,
"loss": 0.9149,
"step": 7300
},
{
"epoch": 1.585139652755311,
"grad_norm": 0.2703045606613159,
"learning_rate": 2.3932211361229683e-05,
"loss": 0.9059,
"step": 7350
},
{
"epoch": 1.5959236493044322,
"grad_norm": 0.26212379336357117,
"learning_rate": 2.3641516756043053e-05,
"loss": 0.8996,
"step": 7400
},
{
"epoch": 1.6067076458535534,
"grad_norm": 0.241121307015419,
"learning_rate": 2.3351006258616618e-05,
"loss": 0.8934,
"step": 7450
},
{
"epoch": 1.6174916424026744,
"grad_norm": 0.2937757968902588,
"learning_rate": 2.3060719240239807e-05,
"loss": 0.8907,
"step": 7500
},
{
"epoch": 1.6282756389517954,
"grad_norm": 0.2826499938964844,
"learning_rate": 2.2770695041915187e-05,
"loss": 0.8963,
"step": 7550
},
{
"epoch": 1.6390596355009166,
"grad_norm": 0.2622433602809906,
"learning_rate": 2.248097296902672e-05,
"loss": 0.8797,
"step": 7600
},
{
"epoch": 1.6498436320500378,
"grad_norm": 0.26400211453437805,
"learning_rate": 2.2191592286013042e-05,
"loss": 0.9084,
"step": 7650
},
{
"epoch": 1.6606276285991588,
"grad_norm": 0.25721365213394165,
"learning_rate": 2.1902592211046032e-05,
"loss": 0.882,
"step": 7700
},
{
"epoch": 1.6714116251482798,
"grad_norm": 0.25235188007354736,
"learning_rate": 2.1614011910715896e-05,
"loss": 0.9306,
"step": 7750
},
{
"epoch": 1.6821956216974012,
"grad_norm": 0.2521611154079437,
"learning_rate": 2.1325890494723065e-05,
"loss": 0.8911,
"step": 7800
},
{
"epoch": 1.6929796182465222,
"grad_norm": 0.2881399691104889,
"learning_rate": 2.103826701057793e-05,
"loss": 0.8837,
"step": 7850
},
{
"epoch": 1.7037636147956432,
"grad_norm": 0.2743209898471832,
"learning_rate": 2.075118043830888e-05,
"loss": 0.9072,
"step": 7900
},
{
"epoch": 1.7145476113447644,
"grad_norm": 0.2475823312997818,
"learning_rate": 2.046466968517963e-05,
"loss": 0.9109,
"step": 7950
},
{
"epoch": 1.7253316078938856,
"grad_norm": 0.28786906599998474,
"learning_rate": 2.0178773580416263e-05,
"loss": 0.9085,
"step": 8000
},
{
"epoch": 1.7361156044430066,
"grad_norm": 0.2793081998825073,
"learning_rate": 1.9893530869944986e-05,
"loss": 0.8721,
"step": 8050
},
{
"epoch": 1.7468996009921276,
"grad_norm": 0.26357826590538025,
"learning_rate": 1.9608980211141028e-05,
"loss": 0.9014,
"step": 8100
},
{
"epoch": 1.7576835975412488,
"grad_norm": 0.26504483819007874,
"learning_rate": 1.93251601675897e-05,
"loss": 0.9091,
"step": 8150
},
{
"epoch": 1.76846759409037,
"grad_norm": 0.26386550068855286,
"learning_rate": 1.9042109203860027e-05,
"loss": 0.8985,
"step": 8200
},
{
"epoch": 1.779251590639491,
"grad_norm": 0.2590016722679138,
"learning_rate": 1.87598656802919e-05,
"loss": 0.8865,
"step": 8250
},
{
"epoch": 1.790035587188612,
"grad_norm": 0.2528024911880493,
"learning_rate": 1.8478467847797238e-05,
"loss": 0.9046,
"step": 8300
},
{
"epoch": 1.8008195837377332,
"grad_norm": 0.27202916145324707,
"learning_rate": 1.8197953842676168e-05,
"loss": 0.9021,
"step": 8350
},
{
"epoch": 1.8116035802868544,
"grad_norm": 0.240274578332901,
"learning_rate": 1.7918361681448504e-05,
"loss": 0.8921,
"step": 8400
},
{
"epoch": 1.8223875768359754,
"grad_norm": 0.29021942615509033,
"learning_rate": 1.7639729255701655e-05,
"loss": 0.9074,
"step": 8450
},
{
"epoch": 1.8331715733850964,
"grad_norm": 0.28871750831604004,
"learning_rate": 1.7362094326955336e-05,
"loss": 0.8962,
"step": 8500
},
{
"epoch": 1.8439555699342176,
"grad_norm": 0.2800693213939667,
"learning_rate": 1.7085494521544025e-05,
"loss": 0.9222,
"step": 8550
},
{
"epoch": 1.8547395664833388,
"grad_norm": 0.2543833255767822,
"learning_rate": 1.6809967325517573e-05,
"loss": 0.8925,
"step": 8600
},
{
"epoch": 1.8655235630324598,
"grad_norm": 0.255051851272583,
"learning_rate": 1.6535550079561027e-05,
"loss": 0.8818,
"step": 8650
},
{
"epoch": 1.8763075595815808,
"grad_norm": 0.289727121591568,
"learning_rate": 1.6262279973933984e-05,
"loss": 0.8878,
"step": 8700
},
{
"epoch": 1.887091556130702,
"grad_norm": 0.2506343424320221,
"learning_rate": 1.5990194043430444e-05,
"loss": 0.8961,
"step": 8750
},
{
"epoch": 1.8978755526798232,
"grad_norm": 0.3042599558830261,
"learning_rate": 1.5719329162359638e-05,
"loss": 0.9082,
"step": 8800
},
{
"epoch": 1.9086595492289442,
"grad_norm": 0.2791798710823059,
"learning_rate": 1.5449722039548706e-05,
"loss": 0.9023,
"step": 8850
},
{
"epoch": 1.9194435457780652,
"grad_norm": 0.2678021788597107,
"learning_rate": 1.5181409213367726e-05,
"loss": 0.8826,
"step": 8900
},
{
"epoch": 1.9302275423271864,
"grad_norm": 0.2640957832336426,
"learning_rate": 1.4914427046777879e-05,
"loss": 0.887,
"step": 8950
},
{
"epoch": 1.9410115388763076,
"grad_norm": 0.2847963869571686,
"learning_rate": 1.4648811722403358e-05,
"loss": 0.8906,
"step": 9000
},
{
"epoch": 1.9517955354254286,
"grad_norm": 0.2558712661266327,
"learning_rate": 1.4384599237627777e-05,
"loss": 0.9006,
"step": 9050
},
{
"epoch": 1.9625795319745498,
"grad_norm": 0.26001158356666565,
"learning_rate": 1.4121825399715577e-05,
"loss": 0.902,
"step": 9100
},
{
"epoch": 1.973363528523671,
"grad_norm": 0.250234991312027,
"learning_rate": 1.3860525820959358e-05,
"loss": 0.8966,
"step": 9150
},
{
"epoch": 1.984147525072792,
"grad_norm": 0.2639175355434418,
"learning_rate": 1.360073591385342e-05,
"loss": 0.9063,
"step": 9200
},
{
"epoch": 1.994931521621913,
"grad_norm": 0.2366214245557785,
"learning_rate": 1.334249088629464e-05,
"loss": 0.8907,
"step": 9250
},
{
"epoch": 2.0056076782055428,
"grad_norm": 0.291415274143219,
"learning_rate": 1.3085825736810828e-05,
"loss": 0.8729,
"step": 9300
},
{
"epoch": 2.016391674754664,
"grad_norm": 0.2706186771392822,
"learning_rate": 1.2830775249817595e-05,
"loss": 0.8663,
"step": 9350
},
{
"epoch": 2.027175671303785,
"grad_norm": 0.2555548846721649,
"learning_rate": 1.2577373990904279e-05,
"loss": 0.8663,
"step": 9400
},
{
"epoch": 2.037959667852906,
"grad_norm": 0.254191517829895,
"learning_rate": 1.2325656302149374e-05,
"loss": 0.8592,
"step": 9450
},
{
"epoch": 2.0487436644020276,
"grad_norm": 0.30470383167266846,
"learning_rate": 1.2075656297466382e-05,
"loss": 0.8938,
"step": 9500
},
{
"epoch": 2.0595276609511486,
"grad_norm": 0.2882542908191681,
"learning_rate": 1.1827407857980522e-05,
"loss": 0.8754,
"step": 9550
},
{
"epoch": 2.0703116575002696,
"grad_norm": 0.33889445662498474,
"learning_rate": 1.1580944627437052e-05,
"loss": 0.8645,
"step": 9600
},
{
"epoch": 2.0810956540493906,
"grad_norm": 0.29919326305389404,
"learning_rate": 1.1336300007641628e-05,
"loss": 0.8685,
"step": 9650
},
{
"epoch": 2.091879650598512,
"grad_norm": 0.2923993468284607,
"learning_rate": 1.1098344650456325e-05,
"loss": 0.8577,
"step": 9700
},
{
"epoch": 2.102663647147633,
"grad_norm": 0.2865777611732483,
"learning_rate": 1.0857398452987955e-05,
"loss": 0.8968,
"step": 9750
},
{
"epoch": 2.113447643696754,
"grad_norm": 0.28677886724472046,
"learning_rate": 1.0618368924500005e-05,
"loss": 0.8678,
"step": 9800
},
{
"epoch": 2.124231640245875,
"grad_norm": 0.2737389802932739,
"learning_rate": 1.0381288459349405e-05,
"loss": 0.8865,
"step": 9850
},
{
"epoch": 2.1350156367949964,
"grad_norm": 0.27073368430137634,
"learning_rate": 1.0146189187747276e-05,
"loss": 0.8733,
"step": 9900
},
{
"epoch": 2.1457996333441174,
"grad_norm": 0.280775785446167,
"learning_rate": 9.913102971404456e-06,
"loss": 0.8408,
"step": 9950
},
{
"epoch": 2.1565836298932384,
"grad_norm": 0.2671400308609009,
"learning_rate": 9.682061399213525e-06,
"loss": 0.8792,
"step": 10000
},
{
"epoch": 2.1673676264423594,
"grad_norm": 0.3240983188152313,
"learning_rate": 9.45309578296762e-06,
"loss": 0.8739,
"step": 10050
},
{
"epoch": 2.178151622991481,
"grad_norm": 0.30578577518463135,
"learning_rate": 9.226237153117056e-06,
"loss": 0.8731,
"step": 10100
},
{
"epoch": 2.188935619540602,
"grad_norm": 0.2961669862270355,
"learning_rate": 9.001516254563835e-06,
"loss": 0.8861,
"step": 10150
},
{
"epoch": 2.1997196160897228,
"grad_norm": 0.31330254673957825,
"learning_rate": 8.778963542495015e-06,
"loss": 0.8327,
"step": 10200
},
{
"epoch": 2.2105036126388438,
"grad_norm": 0.3293406665325165,
"learning_rate": 8.558609178255252e-06,
"loss": 0.8567,
"step": 10250
},
{
"epoch": 2.221287609187965,
"grad_norm": 0.3065802752971649,
"learning_rate": 8.340483025259233e-06,
"loss": 0.8515,
"step": 10300
},
{
"epoch": 2.232071605737086,
"grad_norm": 0.2637750208377838,
"learning_rate": 8.124614644944412e-06,
"loss": 0.874,
"step": 10350
},
{
"epoch": 2.242855602286207,
"grad_norm": 0.26482629776000977,
"learning_rate": 7.911033292764774e-06,
"loss": 0.8373,
"step": 10400
},
{
"epoch": 2.2536395988353286,
"grad_norm": 0.27340102195739746,
"learning_rate": 7.699767914225903e-06,
"loss": 0.9063,
"step": 10450
},
{
"epoch": 2.2644235953844496,
"grad_norm": 0.25882843136787415,
"learning_rate": 7.490847140962273e-06,
"loss": 0.8377,
"step": 10500
},
{
"epoch": 2.2752075919335706,
"grad_norm": 0.3063746690750122,
"learning_rate": 7.284299286856877e-06,
"loss": 0.8767,
"step": 10550
},
{
"epoch": 2.2859915884826916,
"grad_norm": 0.27114883065223694,
"learning_rate": 7.080152344204028e-06,
"loss": 0.8517,
"step": 10600
},
{
"epoch": 2.2967755850318126,
"grad_norm": 0.26992297172546387,
"learning_rate": 6.878433979915719e-06,
"loss": 0.873,
"step": 10650
},
{
"epoch": 2.307559581580934,
"grad_norm": 0.30842849612236023,
"learning_rate": 6.6791715317721075e-06,
"loss": 0.8645,
"step": 10700
},
{
"epoch": 2.318343578130055,
"grad_norm": 0.2740515172481537,
"learning_rate": 6.482392004716492e-06,
"loss": 0.8772,
"step": 10750
},
{
"epoch": 2.329127574679176,
"grad_norm": 0.28314441442489624,
"learning_rate": 6.288122067195592e-06,
"loss": 0.87,
"step": 10800
},
{
"epoch": 2.3399115712282974,
"grad_norm": 0.2951704263687134,
"learning_rate": 6.096388047545232e-06,
"loss": 0.8801,
"step": 10850
},
{
"epoch": 2.3506955677774184,
"grad_norm": 0.3134472966194153,
"learning_rate": 5.907215930422244e-06,
"loss": 0.8598,
"step": 10900
},
{
"epoch": 2.3614795643265394,
"grad_norm": 0.3114987313747406,
"learning_rate": 5.7206313532829095e-06,
"loss": 0.8578,
"step": 10950
},
{
"epoch": 2.3722635608756604,
"grad_norm": 0.3185006380081177,
"learning_rate": 5.5366596029084535e-06,
"loss": 0.8713,
"step": 11000
}
],
"logging_steps": 50,
"max_steps": 13911,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.923221364727559e+19,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}
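
The JSON above is the raw Trainer state saved with this checkpoint; the "log_history" array records the training loss, gradient norm, and learning rate every 50 optimizer steps ("logging_steps": 50) up to global step 11000 of 13911. A minimal sketch of how one might inspect it is given below, assuming the file has been downloaded to the same relative path as shown in the header; the path and the plotting choices are illustrative, not part of the checkpoint.

# Minimal sketch: read this trainer_state.json and plot the logged training
# loss and learning rate against the global step. Field names ("log_history",
# "loss", "step", "learning_rate") follow the structure shown above; the
# local path is an assumption and should point at your downloaded copy.
import json

import matplotlib.pyplot as plt

STATE_PATH = "evolve_sft/sft/checkpoint-11000/trainer_state.json"  # hypothetical local path

with open(STATE_PATH, "r", encoding="utf-8") as f:
    state = json.load(f)

# Keep only entries that carry a training loss (eval entries, if any,
# would use "eval_loss" instead and are skipped here).
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
fig.tight_layout()
plt.show()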