VidEmo-7B / trainer_state.json
nku-zhichengzhang's picture
Upload folder using huggingface_hub
13e1619 verified
{
"best_metric": 0.46374601,
"best_model_checkpoint": "/home/zhangzhicheng03/code/face-llm/ms-swift/Emo-CFG_bs-1040_data-ATTR_OPEN_EMO_500k_CAP_78k_lr-4e-5/v0-20250512-052808/checkpoint-1050",
"epoch": 2.913557779799818,
"eval_steps": 50,
"global_step": 1600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018198362147406734,
"grad_norm": 25.030844046592083,
"learning_rate": 3.9999963615834764e-05,
"loss": 2.025822877883911,
"memory(GiB)": 43.02,
"step": 1,
"token_acc": 0.609375,
"train_speed(iter/s)": 0.005908
},
{
"epoch": 0.009099181073703366,
"grad_norm": 10.802963796008704,
"learning_rate": 3.9999090402488034e-05,
"loss": 2.101142644882202,
"memory(GiB)": 71.21,
"step": 5,
"token_acc": 0.5037481259370314,
"train_speed(iter/s)": 0.013918
},
{
"epoch": 0.018198362147406732,
"grad_norm": 4.512540795817656,
"learning_rate": 3.99963616926889e-05,
"loss": 2.7770095825195313,
"memory(GiB)": 71.21,
"step": 10,
"token_acc": 0.4725118483412322,
"train_speed(iter/s)": 0.016736
},
{
"epoch": 0.0272975432211101,
"grad_norm": 3.2273598665998664,
"learning_rate": 3.999181411880536e-05,
"loss": 1.1679546356201171,
"memory(GiB)": 71.21,
"step": 15,
"token_acc": 0.6352619233776388,
"train_speed(iter/s)": 0.018014
},
{
"epoch": 0.036396724294813464,
"grad_norm": 2.7344684028279493,
"learning_rate": 3.99854480944836e-05,
"loss": 1.0935646057128907,
"memory(GiB)": 76.02,
"step": 20,
"token_acc": 0.6871584699453552,
"train_speed(iter/s)": 0.018758
},
{
"epoch": 0.04549590536851683,
"grad_norm": 2.6368838955923546,
"learning_rate": 3.9977264198775616e-05,
"loss": 1.0634303092956543,
"memory(GiB)": 76.02,
"step": 25,
"token_acc": 0.6443461781427668,
"train_speed(iter/s)": 0.019279
},
{
"epoch": 0.0545950864422202,
"grad_norm": 2.0635877115380987,
"learning_rate": 3.996726317608652e-05,
"loss": 1.0315238952636718,
"memory(GiB)": 76.02,
"step": 30,
"token_acc": 0.6216628527841342,
"train_speed(iter/s)": 0.019641
},
{
"epoch": 0.06369426751592357,
"grad_norm": 1.769746764765505,
"learning_rate": 3.995544593610685e-05,
"loss": 1.0012907981872559,
"memory(GiB)": 76.02,
"step": 35,
"token_acc": 0.6820960698689956,
"train_speed(iter/s)": 0.019879
},
{
"epoch": 0.07279344858962693,
"grad_norm": 1.7921148583961297,
"learning_rate": 3.994181355372981e-05,
"loss": 1.0219003677368164,
"memory(GiB)": 76.02,
"step": 40,
"token_acc": 0.6666666666666666,
"train_speed(iter/s)": 0.020113
},
{
"epoch": 0.0818926296633303,
"grad_norm": 2.3622358395090868,
"learning_rate": 3.9926367268953514e-05,
"loss": 0.9893597602844239,
"memory(GiB)": 76.02,
"step": 45,
"token_acc": 0.6443586443586443,
"train_speed(iter/s)": 0.02023
},
{
"epoch": 0.09099181073703366,
"grad_norm": 1.8847444781295586,
"learning_rate": 3.990910848676819e-05,
"loss": 1.0064857482910157,
"memory(GiB)": 76.02,
"step": 50,
"token_acc": 0.6833550065019506,
"train_speed(iter/s)": 0.020396
},
{
"epoch": 0.09099181073703366,
"eval_loss": 0.6069548726081848,
"eval_runtime": 124.0182,
"eval_samples_per_second": 46.542,
"eval_steps_per_second": 0.452,
"eval_token_acc": 0.6658791259916742,
"step": 50
},
{
"epoch": 0.10009099181073704,
"grad_norm": 2.2611595119155297,
"learning_rate": 3.989003877702835e-05,
"loss": 1.0090344429016114,
"memory(GiB)": 76.02,
"step": 55,
"token_acc": 0.6743224621038126,
"train_speed(iter/s)": 0.019334
},
{
"epoch": 0.1091901728844404,
"grad_norm": 2.155765617865288,
"learning_rate": 3.986915987431006e-05,
"loss": 0.9812187194824219,
"memory(GiB)": 76.02,
"step": 60,
"token_acc": 0.6862615587846763,
"train_speed(iter/s)": 0.019541
},
{
"epoch": 0.11828935395814377,
"grad_norm": 1.9675392436886496,
"learning_rate": 3.984647367775312e-05,
"loss": 0.967503547668457,
"memory(GiB)": 76.02,
"step": 65,
"token_acc": 0.6425840978593272,
"train_speed(iter/s)": 0.019703
},
{
"epoch": 0.12738853503184713,
"grad_norm": 1.6136749581314442,
"learning_rate": 3.9821982250888316e-05,
"loss": 0.9946205139160156,
"memory(GiB)": 76.02,
"step": 70,
"token_acc": 0.6822200392927309,
"train_speed(iter/s)": 0.01985
},
{
"epoch": 0.1364877161055505,
"grad_norm": 2.1416143299162544,
"learning_rate": 3.9795687821449754e-05,
"loss": 0.9689006805419922,
"memory(GiB)": 76.02,
"step": 75,
"token_acc": 0.6582365003417635,
"train_speed(iter/s)": 0.019982
},
{
"epoch": 0.14558689717925385,
"grad_norm": 2.2094541193048074,
"learning_rate": 3.9767592781172185e-05,
"loss": 0.9927925109863281,
"memory(GiB)": 76.02,
"step": 80,
"token_acc": 0.6676557863501483,
"train_speed(iter/s)": 0.020086
},
{
"epoch": 0.15468607825295724,
"grad_norm": 1.6788879867996525,
"learning_rate": 3.973769968557348e-05,
"loss": 0.9653422355651855,
"memory(GiB)": 76.02,
"step": 85,
"token_acc": 0.6833890746934225,
"train_speed(iter/s)": 0.020194
},
{
"epoch": 0.1637852593266606,
"grad_norm": 1.6608567558622684,
"learning_rate": 3.970601125372218e-05,
"loss": 0.9711417198181153,
"memory(GiB)": 76.02,
"step": 90,
"token_acc": 0.6648721399730821,
"train_speed(iter/s)": 0.020273
},
{
"epoch": 0.17288444040036396,
"grad_norm": 1.8971338914920044,
"learning_rate": 3.967253036799017e-05,
"loss": 0.9714339256286622,
"memory(GiB)": 76.02,
"step": 95,
"token_acc": 0.6907407407407408,
"train_speed(iter/s)": 0.020364
},
{
"epoch": 0.18198362147406733,
"grad_norm": 2.2276291949458913,
"learning_rate": 3.963726007379047e-05,
"loss": 0.9623370170593262,
"memory(GiB)": 76.02,
"step": 100,
"token_acc": 0.6705935659265972,
"train_speed(iter/s)": 0.02043
},
{
"epoch": 0.18198362147406733,
"eval_loss": 0.5809512138366699,
"eval_runtime": 123.9915,
"eval_samples_per_second": 46.552,
"eval_steps_per_second": 0.452,
"eval_token_acc": 0.6707621478823382,
"step": 100
},
{
"epoch": 0.1910828025477707,
"grad_norm": 1.7032248137533175,
"learning_rate": 3.960020357930028e-05,
"loss": 0.9466117858886719,
"memory(GiB)": 76.02,
"step": 105,
"token_acc": 0.6678478620363808,
"train_speed(iter/s)": 0.019882
},
{
"epoch": 0.20018198362147407,
"grad_norm": 1.855445035035624,
"learning_rate": 3.9561364255169114e-05,
"loss": 0.9585418701171875,
"memory(GiB)": 76.02,
"step": 110,
"token_acc": 0.666546633057256,
"train_speed(iter/s)": 0.019966
},
{
"epoch": 0.20928116469517744,
"grad_norm": 2.271456601509792,
"learning_rate": 3.9520745634212225e-05,
"loss": 0.9546641349792481,
"memory(GiB)": 76.02,
"step": 115,
"token_acc": 0.6983430799220273,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 0.2183803457688808,
"grad_norm": 1.727865009111447,
"learning_rate": 3.947835141108928e-05,
"loss": 0.9411544799804688,
"memory(GiB)": 76.02,
"step": 120,
"token_acc": 0.6998714652956298,
"train_speed(iter/s)": 0.020118
},
{
"epoch": 0.22747952684258416,
"grad_norm": 1.5407295558813352,
"learning_rate": 3.943418544196826e-05,
"loss": 0.9641068458557129,
"memory(GiB)": 76.02,
"step": 125,
"token_acc": 0.6722915963550455,
"train_speed(iter/s)": 0.020179
},
{
"epoch": 0.23657870791628755,
"grad_norm": 1.6770942231997907,
"learning_rate": 3.938825174417473e-05,
"loss": 0.956147575378418,
"memory(GiB)": 76.02,
"step": 130,
"token_acc": 0.7067484662576687,
"train_speed(iter/s)": 0.020251
},
{
"epoch": 0.2456778889899909,
"grad_norm": 1.799020682507979,
"learning_rate": 3.934055449582641e-05,
"loss": 0.9465121269226074,
"memory(GiB)": 76.02,
"step": 135,
"token_acc": 0.6822670674109059,
"train_speed(iter/s)": 0.020307
},
{
"epoch": 0.25477707006369427,
"grad_norm": 1.6975378766800486,
"learning_rate": 3.929109803545315e-05,
"loss": 0.9593283653259277,
"memory(GiB)": 76.02,
"step": 140,
"token_acc": 0.6935749588138386,
"train_speed(iter/s)": 0.020367
},
{
"epoch": 0.26387625113739765,
"grad_norm": 1.6873696015578077,
"learning_rate": 3.9239886861602265e-05,
"loss": 0.9509831428527832,
"memory(GiB)": 76.02,
"step": 145,
"token_acc": 0.6785370548604427,
"train_speed(iter/s)": 0.020417
},
{
"epoch": 0.272975432211101,
"grad_norm": 1.605433238469568,
"learning_rate": 3.9186925632429396e-05,
"loss": 0.9489663124084473,
"memory(GiB)": 76.02,
"step": 150,
"token_acc": 0.6493083807973963,
"train_speed(iter/s)": 0.020465
},
{
"epoch": 0.272975432211101,
"eval_loss": 0.556602954864502,
"eval_runtime": 119.5036,
"eval_samples_per_second": 48.3,
"eval_steps_per_second": 0.469,
"eval_token_acc": 0.6771622643952052,
"step": 150
},
{
"epoch": 0.2820746132848044,
"grad_norm": 1.665760265285853,
"learning_rate": 3.9132219165274786e-05,
"loss": 0.9691334724426269,
"memory(GiB)": 76.02,
"step": 155,
"token_acc": 0.6817427385892116,
"train_speed(iter/s)": 0.020097
},
{
"epoch": 0.2911737943585077,
"grad_norm": 1.4504880716204094,
"learning_rate": 3.907577243622505e-05,
"loss": 0.9517691612243653,
"memory(GiB)": 76.02,
"step": 160,
"token_acc": 0.6508152173913043,
"train_speed(iter/s)": 0.020145
},
{
"epoch": 0.3002729754322111,
"grad_norm": 1.4909379207696947,
"learning_rate": 3.901759057966064e-05,
"loss": 0.9396313667297364,
"memory(GiB)": 76.02,
"step": 165,
"token_acc": 0.6924564796905223,
"train_speed(iter/s)": 0.0202
},
{
"epoch": 0.3093721565059145,
"grad_norm": 1.6755025509294692,
"learning_rate": 3.895767888778874e-05,
"loss": 0.958685302734375,
"memory(GiB)": 76.02,
"step": 170,
"token_acc": 0.6812801402893468,
"train_speed(iter/s)": 0.020241
},
{
"epoch": 0.3184713375796178,
"grad_norm": 1.39424961728271,
"learning_rate": 3.889604281016194e-05,
"loss": 0.9179913520812988,
"memory(GiB)": 76.02,
"step": 175,
"token_acc": 0.6434395848776872,
"train_speed(iter/s)": 0.020291
},
{
"epoch": 0.3275705186533212,
"grad_norm": 1.810023496149751,
"learning_rate": 3.883268795318252e-05,
"loss": 0.95927734375,
"memory(GiB)": 76.02,
"step": 180,
"token_acc": 0.6510866329264662,
"train_speed(iter/s)": 0.020334
},
{
"epoch": 0.33666969972702454,
"grad_norm": 2.080560793664787,
"learning_rate": 3.876762007959253e-05,
"loss": 0.9460148811340332,
"memory(GiB)": 76.02,
"step": 185,
"token_acc": 0.6614173228346457,
"train_speed(iter/s)": 0.020378
},
{
"epoch": 0.34576888080072793,
"grad_norm": 1.6314724313426552,
"learning_rate": 3.870084510794953e-05,
"loss": 0.9372352600097656,
"memory(GiB)": 76.02,
"step": 190,
"token_acc": 0.7167736021998167,
"train_speed(iter/s)": 0.020418
},
{
"epoch": 0.3548680618744313,
"grad_norm": 1.5214499000610326,
"learning_rate": 3.863236911208835e-05,
"loss": 0.9120028495788575,
"memory(GiB)": 76.02,
"step": 195,
"token_acc": 0.6961974110032363,
"train_speed(iter/s)": 0.020453
},
{
"epoch": 0.36396724294813465,
"grad_norm": 1.403385243202059,
"learning_rate": 3.856219832056853e-05,
"loss": 0.9274997711181641,
"memory(GiB)": 76.02,
"step": 200,
"token_acc": 0.6597971867844292,
"train_speed(iter/s)": 0.020494
},
{
"epoch": 0.36396724294813465,
"eval_loss": 0.5442519783973694,
"eval_runtime": 121.7991,
"eval_samples_per_second": 47.389,
"eval_steps_per_second": 0.46,
"eval_token_acc": 0.6795491599341379,
"step": 200
},
{
"epoch": 0.37306642402183804,
"grad_norm": 1.7288195921368568,
"learning_rate": 3.8490339116107814e-05,
"loss": 0.9254457473754882,
"memory(GiB)": 76.02,
"step": 205,
"token_acc": 0.6976498547663058,
"train_speed(iter/s)": 0.020208
},
{
"epoch": 0.3821656050955414,
"grad_norm": 1.7934469116880778,
"learning_rate": 3.8416798035001545e-05,
"loss": 0.9426854133605957,
"memory(GiB)": 76.02,
"step": 210,
"token_acc": 0.6734362307067425,
"train_speed(iter/s)": 0.020248
},
{
"epoch": 0.39126478616924476,
"grad_norm": 1.3762724847783987,
"learning_rate": 3.8341581766528185e-05,
"loss": 0.949736499786377,
"memory(GiB)": 76.02,
"step": 215,
"token_acc": 0.6799800299550673,
"train_speed(iter/s)": 0.020279
},
{
"epoch": 0.40036396724294815,
"grad_norm": 1.8318501236469258,
"learning_rate": 3.826469715234078e-05,
"loss": 0.9189864158630371,
"memory(GiB)": 76.02,
"step": 220,
"token_acc": 0.6768424298489053,
"train_speed(iter/s)": 0.020316
},
{
"epoch": 0.4094631483166515,
"grad_norm": 1.734985910099827,
"learning_rate": 3.818615118584472e-05,
"loss": 0.9207481384277344,
"memory(GiB)": 76.02,
"step": 225,
"token_acc": 0.6853369763205829,
"train_speed(iter/s)": 0.020349
},
{
"epoch": 0.41856232939035487,
"grad_norm": 1.46610540352475,
"learning_rate": 3.810595101156157e-05,
"loss": 0.949979305267334,
"memory(GiB)": 76.02,
"step": 230,
"token_acc": 0.7674418604651163,
"train_speed(iter/s)": 0.020378
},
{
"epoch": 0.42766151046405826,
"grad_norm": 1.4640218284405278,
"learning_rate": 3.8024103924479225e-05,
"loss": 0.9503008842468261,
"memory(GiB)": 76.02,
"step": 235,
"token_acc": 0.6691435275713727,
"train_speed(iter/s)": 0.020412
},
{
"epoch": 0.4367606915377616,
"grad_norm": 1.3582380492653447,
"learning_rate": 3.794061736938837e-05,
"loss": 0.9213446617126465,
"memory(GiB)": 76.02,
"step": 240,
"token_acc": 0.6814469078179697,
"train_speed(iter/s)": 0.020441
},
{
"epoch": 0.445859872611465,
"grad_norm": 1.24168837408377,
"learning_rate": 3.785549894020529e-05,
"loss": 0.927124309539795,
"memory(GiB)": 76.02,
"step": 245,
"token_acc": 0.7300613496932515,
"train_speed(iter/s)": 0.020473
},
{
"epoch": 0.4549590536851683,
"grad_norm": 1.4540581012012834,
"learning_rate": 3.77687563792811e-05,
"loss": 0.9168607711791992,
"memory(GiB)": 76.02,
"step": 250,
"token_acc": 0.6800291545189504,
"train_speed(iter/s)": 0.020497
},
{
"epoch": 0.4549590536851683,
"eval_loss": 0.5409244894981384,
"eval_runtime": 120.7512,
"eval_samples_per_second": 47.801,
"eval_steps_per_second": 0.464,
"eval_token_acc": 0.6797271657031431,
"step": 250
},
{
"epoch": 0.4640582347588717,
"grad_norm": 1.7178666143628036,
"learning_rate": 3.768039757669759e-05,
"loss": 0.9190607070922852,
"memory(GiB)": 76.02,
"step": 255,
"token_acc": 0.6971046770601337,
"train_speed(iter/s)": 0.020269
},
{
"epoch": 0.4731574158325751,
"grad_norm": 1.4533539479949111,
"learning_rate": 3.759043056954943e-05,
"loss": 0.9371905326843262,
"memory(GiB)": 76.02,
"step": 260,
"token_acc": 0.6667847025495751,
"train_speed(iter/s)": 0.020296
},
{
"epoch": 0.4822565969062784,
"grad_norm": 1.8242714144160546,
"learning_rate": 3.749886354121324e-05,
"loss": 0.9172127723693848,
"memory(GiB)": 76.02,
"step": 265,
"token_acc": 0.7086137281292059,
"train_speed(iter/s)": 0.020325
},
{
"epoch": 0.4913557779799818,
"grad_norm": 1.3386774946853799,
"learning_rate": 3.740570482060311e-05,
"loss": 0.9408517837524414,
"memory(GiB)": 76.02,
"step": 270,
"token_acc": 0.7290575916230366,
"train_speed(iter/s)": 0.020353
},
{
"epoch": 0.5004549590536852,
"grad_norm": 1.6524604438416564,
"learning_rate": 3.731096288141309e-05,
"loss": 0.9067551612854003,
"memory(GiB)": 76.02,
"step": 275,
"token_acc": 0.678743961352657,
"train_speed(iter/s)": 0.020379
},
{
"epoch": 0.5095541401273885,
"grad_norm": 1.7068717522460979,
"learning_rate": 3.721464634134641e-05,
"loss": 0.9261470794677734,
"memory(GiB)": 76.02,
"step": 280,
"token_acc": 0.7159965782720273,
"train_speed(iter/s)": 0.020408
},
{
"epoch": 0.5186533212010919,
"grad_norm": 1.5886442512862196,
"learning_rate": 3.711676396133158e-05,
"loss": 0.9242866516113282,
"memory(GiB)": 76.02,
"step": 285,
"token_acc": 0.6532932129722501,
"train_speed(iter/s)": 0.020431
},
{
"epoch": 0.5277525022747953,
"grad_norm": 1.3930674320536802,
"learning_rate": 3.701732464472553e-05,
"loss": 0.9128170967102051,
"memory(GiB)": 76.02,
"step": 290,
"token_acc": 0.6779987171263631,
"train_speed(iter/s)": 0.020457
},
{
"epoch": 0.5368516833484986,
"grad_norm": 1.4564537325119185,
"learning_rate": 3.691633743650377e-05,
"loss": 0.9042372703552246,
"memory(GiB)": 76.02,
"step": 295,
"token_acc": 0.6832191780821918,
"train_speed(iter/s)": 0.020478
},
{
"epoch": 0.545950864422202,
"grad_norm": 1.4788538883263567,
"learning_rate": 3.681381152243763e-05,
"loss": 0.9223553657531738,
"memory(GiB)": 76.02,
"step": 300,
"token_acc": 0.6808054841473865,
"train_speed(iter/s)": 0.020502
},
{
"epoch": 0.545950864422202,
"eval_loss": 0.5335711240768433,
"eval_runtime": 119.2512,
"eval_samples_per_second": 48.402,
"eval_steps_per_second": 0.47,
"eval_token_acc": 0.682199018540919,
"step": 300
},
{
"epoch": 0.5550500454959054,
"grad_norm": 1.6525258776892648,
"learning_rate": 3.6709756228258735e-05,
"loss": 0.9161547660827637,
"memory(GiB)": 76.02,
"step": 305,
"token_acc": 0.6724870221802737,
"train_speed(iter/s)": 0.02031
},
{
"epoch": 0.5641492265696088,
"grad_norm": 1.298480729022936,
"learning_rate": 3.6604181018810764e-05,
"loss": 0.8824697494506836,
"memory(GiB)": 76.02,
"step": 310,
"token_acc": 0.6935075885328836,
"train_speed(iter/s)": 0.020334
},
{
"epoch": 0.5732484076433121,
"grad_norm": 1.3254867339008374,
"learning_rate": 3.649709549718849e-05,
"loss": 0.8925297737121582,
"memory(GiB)": 76.02,
"step": 315,
"token_acc": 0.6668953687821613,
"train_speed(iter/s)": 0.020357
},
{
"epoch": 0.5823475887170154,
"grad_norm": 1.4003301586141983,
"learning_rate": 3.638850940386433e-05,
"loss": 0.9219451904296875,
"memory(GiB)": 76.02,
"step": 320,
"token_acc": 0.6934164394234515,
"train_speed(iter/s)": 0.020381
},
{
"epoch": 0.5914467697907189,
"grad_norm": 1.2198877131221963,
"learning_rate": 3.627843261580231e-05,
"loss": 0.9142662048339844,
"memory(GiB)": 76.02,
"step": 325,
"token_acc": 0.6796973518284993,
"train_speed(iter/s)": 0.020407
},
{
"epoch": 0.6005459508644222,
"grad_norm": 1.2491149251440654,
"learning_rate": 3.6166875145559684e-05,
"loss": 0.9013506889343261,
"memory(GiB)": 76.02,
"step": 330,
"token_acc": 0.7270875763747454,
"train_speed(iter/s)": 0.020426
},
{
"epoch": 0.6096451319381255,
"grad_norm": 1.3464860154655747,
"learning_rate": 3.6053847140376194e-05,
"loss": 0.9187211990356445,
"memory(GiB)": 76.02,
"step": 335,
"token_acc": 0.6677791262135923,
"train_speed(iter/s)": 0.020449
},
{
"epoch": 0.618744313011829,
"grad_norm": 1.3081495464213557,
"learning_rate": 3.593935888125107e-05,
"loss": 0.9130012512207031,
"memory(GiB)": 76.02,
"step": 340,
"token_acc": 0.6820603907637656,
"train_speed(iter/s)": 0.020469
},
{
"epoch": 0.6278434940855323,
"grad_norm": 1.3056329501412263,
"learning_rate": 3.582342078200786e-05,
"loss": 0.903553581237793,
"memory(GiB)": 76.02,
"step": 345,
"token_acc": 0.7179723502304147,
"train_speed(iter/s)": 0.020488
},
{
"epoch": 0.6369426751592356,
"grad_norm": 1.2033803940833903,
"learning_rate": 3.570604338834725e-05,
"loss": 0.9074154853820801,
"memory(GiB)": 76.02,
"step": 350,
"token_acc": 0.7170805116629044,
"train_speed(iter/s)": 0.020509
},
{
"epoch": 0.6369426751592356,
"eval_loss": 0.5156524777412415,
"eval_runtime": 121.7142,
"eval_samples_per_second": 47.423,
"eval_steps_per_second": 0.46,
"eval_token_acc": 0.6832346884696763,
"step": 350
},
{
"epoch": 0.6460418562329391,
"grad_norm": 1.3706427880274294,
"learning_rate": 3.558723737688775e-05,
"loss": 0.9084077835083008,
"memory(GiB)": 76.02,
"step": 355,
"token_acc": 0.7012306886619534,
"train_speed(iter/s)": 0.020344
},
{
"epoch": 0.6551410373066424,
"grad_norm": 1.4525504674274499,
"learning_rate": 3.54670135541946e-05,
"loss": 0.9108301162719726,
"memory(GiB)": 76.02,
"step": 360,
"token_acc": 0.6819548872180451,
"train_speed(iter/s)": 0.020365
},
{
"epoch": 0.6642402183803457,
"grad_norm": 1.371067326824918,
"learning_rate": 3.534538285579681e-05,
"loss": 0.9166597366333008,
"memory(GiB)": 76.02,
"step": 365,
"token_acc": 0.68828125,
"train_speed(iter/s)": 0.020383
},
{
"epoch": 0.6733393994540491,
"grad_norm": 1.404728462002113,
"learning_rate": 3.522235634519244e-05,
"loss": 0.8995059967041016,
"memory(GiB)": 76.02,
"step": 370,
"token_acc": 0.6734115742614326,
"train_speed(iter/s)": 0.020405
},
{
"epoch": 0.6824385805277525,
"grad_norm": 1.4153346949849819,
"learning_rate": 3.509794521284228e-05,
"loss": 0.8986475944519043,
"memory(GiB)": 76.02,
"step": 375,
"token_acc": 0.6696600384862091,
"train_speed(iter/s)": 0.020423
},
{
"epoch": 0.6915377616014559,
"grad_norm": 1.357991579405462,
"learning_rate": 3.497216077515198e-05,
"loss": 0.914306354522705,
"memory(GiB)": 76.02,
"step": 380,
"token_acc": 0.668999300209937,
"train_speed(iter/s)": 0.020442
},
{
"epoch": 0.7006369426751592,
"grad_norm": 1.421643318524058,
"learning_rate": 3.48450144734427e-05,
"loss": 0.9151236534118652,
"memory(GiB)": 76.02,
"step": 385,
"token_acc": 0.6687898089171974,
"train_speed(iter/s)": 0.02046
},
{
"epoch": 0.7097361237488626,
"grad_norm": 1.1089727601654944,
"learning_rate": 3.4716517872910405e-05,
"loss": 0.8921234130859375,
"memory(GiB)": 76.02,
"step": 390,
"token_acc": 0.6953678474114442,
"train_speed(iter/s)": 0.020478
},
{
"epoch": 0.718835304822566,
"grad_norm": 1.3211880131463927,
"learning_rate": 3.45866826615739e-05,
"loss": 0.9150146484375,
"memory(GiB)": 76.02,
"step": 395,
"token_acc": 0.6571167327034441,
"train_speed(iter/s)": 0.020496
},
{
"epoch": 0.7279344858962693,
"grad_norm": 1.4350745291439944,
"learning_rate": 3.445552064921172e-05,
"loss": 0.9022627830505371,
"memory(GiB)": 76.02,
"step": 400,
"token_acc": 0.6755852842809364,
"train_speed(iter/s)": 0.020512
},
{
"epoch": 0.7279344858962693,
"eval_loss": 0.5100554823875427,
"eval_runtime": 119.6911,
"eval_samples_per_second": 48.224,
"eval_steps_per_second": 0.468,
"eval_token_acc": 0.6859209573473904,
"step": 400
},
{
"epoch": 0.7370336669699727,
"grad_norm": 1.1612524813118632,
"learning_rate": 3.432304376628787e-05,
"loss": 0.9135440826416016,
"memory(GiB)": 76.02,
"step": 405,
"token_acc": 0.7024793388429752,
"train_speed(iter/s)": 0.020366
},
{
"epoch": 0.7461328480436761,
"grad_norm": 1.3506987538568946,
"learning_rate": 3.418926406286666e-05,
"loss": 0.9180900573730468,
"memory(GiB)": 76.02,
"step": 410,
"token_acc": 0.715203426124197,
"train_speed(iter/s)": 0.020382
},
{
"epoch": 0.7552320291173794,
"grad_norm": 1.3682849356535443,
"learning_rate": 3.405419370751663e-05,
"loss": 0.9025050163269043,
"memory(GiB)": 76.02,
"step": 415,
"token_acc": 0.7220916568742656,
"train_speed(iter/s)": 0.020402
},
{
"epoch": 0.7643312101910829,
"grad_norm": 1.4354924987431779,
"learning_rate": 3.391784498620369e-05,
"loss": 0.9032191276550293,
"memory(GiB)": 76.02,
"step": 420,
"token_acc": 0.6772521062864549,
"train_speed(iter/s)": 0.020419
},
{
"epoch": 0.7734303912647862,
"grad_norm": 1.3319624335350189,
"learning_rate": 3.378023030117361e-05,
"loss": 0.9076663970947265,
"memory(GiB)": 76.02,
"step": 425,
"token_acc": 0.6790314270994333,
"train_speed(iter/s)": 0.020436
},
{
"epoch": 0.7825295723384895,
"grad_norm": 1.2560401393743486,
"learning_rate": 3.364136216982391e-05,
"loss": 0.9036032676696777,
"memory(GiB)": 76.02,
"step": 430,
"token_acc": 0.6832980972515856,
"train_speed(iter/s)": 0.020453
},
{
"epoch": 0.7916287534121929,
"grad_norm": 1.331582467821213,
"learning_rate": 3.350125322356525e-05,
"loss": 0.9180031776428222,
"memory(GiB)": 76.02,
"step": 435,
"token_acc": 0.6918290043290043,
"train_speed(iter/s)": 0.020468
},
{
"epoch": 0.8007279344858963,
"grad_norm": 1.3101601945182637,
"learning_rate": 3.335991620667254e-05,
"loss": 0.9090401649475097,
"memory(GiB)": 76.02,
"step": 440,
"token_acc": 0.6886586695747001,
"train_speed(iter/s)": 0.020484
},
{
"epoch": 0.8098271155595996,
"grad_norm": 1.490959565832233,
"learning_rate": 3.321736397512566e-05,
"loss": 0.8914430618286133,
"memory(GiB)": 76.02,
"step": 445,
"token_acc": 0.7289220917822838,
"train_speed(iter/s)": 0.020498
},
{
"epoch": 0.818926296633303,
"grad_norm": 1.6826531523568926,
"learning_rate": 3.307360949544012e-05,
"loss": 0.8871423721313476,
"memory(GiB)": 76.02,
"step": 450,
"token_acc": 0.6811023622047244,
"train_speed(iter/s)": 0.020515
},
{
"epoch": 0.818926296633303,
"eval_loss": 0.5105797648429871,
"eval_runtime": 119.2169,
"eval_samples_per_second": 48.416,
"eval_steps_per_second": 0.47,
"eval_token_acc": 0.6859007294190943,
"step": 450
},
{
"epoch": 0.8280254777070064,
"grad_norm": 1.5351362870698657,
"learning_rate": 3.2928665843487646e-05,
"loss": 0.9084842681884766,
"memory(GiB)": 76.02,
"step": 455,
"token_acc": 0.6964930376482723,
"train_speed(iter/s)": 0.020387
},
{
"epoch": 0.8371246587807097,
"grad_norm": 1.76414300067586,
"learning_rate": 3.278254620330673e-05,
"loss": 0.8832217216491699,
"memory(GiB)": 76.02,
"step": 460,
"token_acc": 0.6910656620021528,
"train_speed(iter/s)": 0.020403
},
{
"epoch": 0.8462238398544131,
"grad_norm": 1.26108516359597,
"learning_rate": 3.263526386590351e-05,
"loss": 0.9098955154418945,
"memory(GiB)": 76.02,
"step": 465,
"token_acc": 0.6647430612805716,
"train_speed(iter/s)": 0.020418
},
{
"epoch": 0.8553230209281165,
"grad_norm": 1.4539630443562455,
"learning_rate": 3.248683222804274e-05,
"loss": 0.8848261833190918,
"memory(GiB)": 76.02,
"step": 470,
"token_acc": 0.7338235294117647,
"train_speed(iter/s)": 0.020432
},
{
"epoch": 0.8644222020018199,
"grad_norm": 1.6326834981575191,
"learning_rate": 3.233726479102927e-05,
"loss": 0.9008934020996093,
"memory(GiB)": 76.02,
"step": 475,
"token_acc": 0.7064676616915423,
"train_speed(iter/s)": 0.020448
},
{
"epoch": 0.8735213830755232,
"grad_norm": 1.2054817005259488,
"learning_rate": 3.2186575159479966e-05,
"loss": 0.8803308486938477,
"memory(GiB)": 76.02,
"step": 480,
"token_acc": 0.7033673855467272,
"train_speed(iter/s)": 0.020462
},
{
"epoch": 0.8826205641492265,
"grad_norm": 1.1783711102902867,
"learning_rate": 3.203477704008622e-05,
"loss": 0.9082450866699219,
"memory(GiB)": 76.02,
"step": 485,
"token_acc": 0.7070333157059757,
"train_speed(iter/s)": 0.020477
},
{
"epoch": 0.89171974522293,
"grad_norm": 1.241716165408502,
"learning_rate": 3.188188424036719e-05,
"loss": 0.9072214126586914,
"memory(GiB)": 76.02,
"step": 490,
"token_acc": 0.6927956502038967,
"train_speed(iter/s)": 0.02049
},
{
"epoch": 0.9008189262966333,
"grad_norm": 1.1673048249036013,
"learning_rate": 3.172791066741392e-05,
"loss": 0.886620044708252,
"memory(GiB)": 76.02,
"step": 495,
"token_acc": 0.7046548956661316,
"train_speed(iter/s)": 0.020505
},
{
"epoch": 0.9099181073703366,
"grad_norm": 1.5005936662764863,
"learning_rate": 3.157287032662428e-05,
"loss": 0.8825222015380859,
"memory(GiB)": 76.02,
"step": 500,
"token_acc": 0.6940532081377152,
"train_speed(iter/s)": 0.020518
},
{
"epoch": 0.9099181073703366,
"eval_loss": 0.49878114461898804,
"eval_runtime": 121.4101,
"eval_samples_per_second": 47.541,
"eval_steps_per_second": 0.461,
"eval_token_acc": 0.6875917842246433,
"step": 500
},
{
"epoch": 0.9190172884440401,
"grad_norm": 1.2321769009736276,
"learning_rate": 3.14167773204291e-05,
"loss": 0.8877192497253418,
"memory(GiB)": 76.02,
"step": 505,
"token_acc": 0.7100805331852263,
"train_speed(iter/s)": 0.020401
},
{
"epoch": 0.9281164695177434,
"grad_norm": 1.2301460920284364,
"learning_rate": 3.1259645847009384e-05,
"loss": 0.9063457489013672,
"memory(GiB)": 76.02,
"step": 510,
"token_acc": 0.6885245901639344,
"train_speed(iter/s)": 0.020414
},
{
"epoch": 0.9372156505914467,
"grad_norm": 1.4857123341096659,
"learning_rate": 3.110149019900486e-05,
"loss": 0.8702260971069335,
"memory(GiB)": 76.02,
"step": 515,
"token_acc": 0.6863874345549739,
"train_speed(iter/s)": 0.020427
},
{
"epoch": 0.9463148316651502,
"grad_norm": 1.189993476966276,
"learning_rate": 3.094232476221392e-05,
"loss": 0.9034518241882324,
"memory(GiB)": 76.02,
"step": 520,
"token_acc": 0.7082294264339152,
"train_speed(iter/s)": 0.020441
},
{
"epoch": 0.9554140127388535,
"grad_norm": 1.3161268491995117,
"learning_rate": 3.07821640142851e-05,
"loss": 0.87875394821167,
"memory(GiB)": 76.02,
"step": 525,
"token_acc": 0.683948569058482,
"train_speed(iter/s)": 0.020453
},
{
"epoch": 0.9645131938125568,
"grad_norm": 1.1112974134834392,
"learning_rate": 3.062102252340019e-05,
"loss": 0.8922388076782226,
"memory(GiB)": 76.02,
"step": 530,
"token_acc": 0.6777905638665133,
"train_speed(iter/s)": 0.020468
},
{
"epoch": 0.9736123748862603,
"grad_norm": 1.292894697629211,
"learning_rate": 3.045891494694908e-05,
"loss": 0.908051872253418,
"memory(GiB)": 76.02,
"step": 535,
"token_acc": 0.6983343615052436,
"train_speed(iter/s)": 0.020479
},
{
"epoch": 0.9827115559599636,
"grad_norm": 1.166045668885059,
"learning_rate": 3.0295856030196618e-05,
"loss": 0.9091971397399903,
"memory(GiB)": 76.02,
"step": 540,
"token_acc": 0.7089144936325046,
"train_speed(iter/s)": 0.020492
},
{
"epoch": 0.991810737033667,
"grad_norm": 1.3674012690083148,
"learning_rate": 3.0131860604941287e-05,
"loss": 0.8997166633605957,
"memory(GiB)": 76.02,
"step": 545,
"token_acc": 0.6767097082735534,
"train_speed(iter/s)": 0.020504
},
{
"epoch": 1.0,
"grad_norm": 1.4019349528909308,
"learning_rate": 2.996694358816618e-05,
"loss": 0.8638315200805664,
"memory(GiB)": 76.02,
"step": 550,
"token_acc": 0.7002042900919305,
"train_speed(iter/s)": 0.020533
},
{
"epoch": 1.0,
"eval_loss": 0.4928109347820282,
"eval_runtime": 119.0212,
"eval_samples_per_second": 48.496,
"eval_steps_per_second": 0.471,
"eval_token_acc": 0.6892747478588738,
"step": 550
},
{
"epoch": 1.0090991810737033,
"grad_norm": 1.4220386258897009,
"learning_rate": 2.9801119980682095e-05,
"loss": 0.8142873764038085,
"memory(GiB)": 76.02,
"step": 555,
"token_acc": 0.7055921052631579,
"train_speed(iter/s)": 0.020412
},
{
"epoch": 1.0181983621474067,
"grad_norm": 1.187181373009717,
"learning_rate": 2.9634404865763122e-05,
"loss": 0.7935843467712402,
"memory(GiB)": 76.02,
"step": 560,
"token_acc": 0.7032755298651252,
"train_speed(iter/s)": 0.02042
},
{
"epoch": 1.02729754322111,
"grad_norm": 1.0185191433966165,
"learning_rate": 2.9466813407774627e-05,
"loss": 0.7965437889099121,
"memory(GiB)": 76.02,
"step": 565,
"token_acc": 0.6973250274825944,
"train_speed(iter/s)": 0.020432
},
{
"epoch": 1.0363967242948136,
"grad_norm": 1.2024810924675036,
"learning_rate": 2.9298360850793944e-05,
"loss": 0.7800662517547607,
"memory(GiB)": 76.02,
"step": 570,
"token_acc": 0.7089552238805971,
"train_speed(iter/s)": 0.020443
},
{
"epoch": 1.0454959053685169,
"grad_norm": 0.9855874534546613,
"learning_rate": 2.912906251722373e-05,
"loss": 0.8090152740478516,
"memory(GiB)": 76.02,
"step": 575,
"token_acc": 0.7137375287797391,
"train_speed(iter/s)": 0.020455
},
{
"epoch": 1.0545950864422202,
"grad_norm": 1.183729346768703,
"learning_rate": 2.895893380639829e-05,
"loss": 0.8083430290222168,
"memory(GiB)": 76.02,
"step": 580,
"token_acc": 0.7071651090342679,
"train_speed(iter/s)": 0.020466
},
{
"epoch": 1.0636942675159236,
"grad_norm": 1.527448245905063,
"learning_rate": 2.878799019318283e-05,
"loss": 0.787087345123291,
"memory(GiB)": 76.02,
"step": 585,
"token_acc": 0.7470379146919431,
"train_speed(iter/s)": 0.020477
},
{
"epoch": 1.0727934485896269,
"grad_norm": 1.2570337520295112,
"learning_rate": 2.8616247226565888e-05,
"loss": 0.8103050231933594,
"memory(GiB)": 76.02,
"step": 590,
"token_acc": 0.7105431309904153,
"train_speed(iter/s)": 0.020489
},
{
"epoch": 1.0818926296633302,
"grad_norm": 1.1805179088694353,
"learning_rate": 2.8443720528244964e-05,
"loss": 0.8091272354125977,
"memory(GiB)": 76.02,
"step": 595,
"token_acc": 0.7236403995560489,
"train_speed(iter/s)": 0.0205
},
{
"epoch": 1.0909918107370338,
"grad_norm": 1.3005835459012032,
"learning_rate": 2.827042579120562e-05,
"loss": 0.7841366767883301,
"memory(GiB)": 76.02,
"step": 600,
"token_acc": 0.7160133444537115,
"train_speed(iter/s)": 0.020511
},
{
"epoch": 1.0909918107370338,
"eval_loss": 0.4980168640613556,
"eval_runtime": 122.0994,
"eval_samples_per_second": 47.273,
"eval_steps_per_second": 0.459,
"eval_token_acc": 0.68817030297391,
"step": 600
},
{
"epoch": 1.100090991810737,
"grad_norm": 1.0825655683949489,
"learning_rate": 2.809637877829401e-05,
"loss": 0.8102677345275879,
"memory(GiB)": 76.02,
"step": 605,
"token_acc": 0.7054728756601056,
"train_speed(iter/s)": 0.020407
},
{
"epoch": 1.1091901728844404,
"grad_norm": 1.269997926727983,
"learning_rate": 2.792159532078314e-05,
"loss": 0.8190704345703125,
"memory(GiB)": 76.02,
"step": 610,
"token_acc": 0.7151929653150952,
"train_speed(iter/s)": 0.020418
},
{
"epoch": 1.1182893539581438,
"grad_norm": 1.3197280690186768,
"learning_rate": 2.7746091316932807e-05,
"loss": 0.7909206867218017,
"memory(GiB)": 76.02,
"step": 615,
"token_acc": 0.8111888111888111,
"train_speed(iter/s)": 0.020428
},
{
"epoch": 1.127388535031847,
"grad_norm": 1.3074486932691716,
"learning_rate": 2.756988273054354e-05,
"loss": 0.7989336967468261,
"memory(GiB)": 76.02,
"step": 620,
"token_acc": 0.6923334449280214,
"train_speed(iter/s)": 0.020439
},
{
"epoch": 1.1364877161055504,
"grad_norm": 1.09154376619437,
"learning_rate": 2.7392985589504512e-05,
"loss": 0.7985887050628662,
"memory(GiB)": 76.02,
"step": 625,
"token_acc": 0.6959603118355776,
"train_speed(iter/s)": 0.02045
},
{
"epoch": 1.1455868971792538,
"grad_norm": 1.105083946015695,
"learning_rate": 2.721541598433567e-05,
"loss": 0.7879680156707763,
"memory(GiB)": 76.02,
"step": 630,
"token_acc": 0.7151389249545572,
"train_speed(iter/s)": 0.020461
},
{
"epoch": 1.1546860782529573,
"grad_norm": 1.1369632866951163,
"learning_rate": 2.7037190066724108e-05,
"loss": 0.8013208389282227,
"memory(GiB)": 76.02,
"step": 635,
"token_acc": 0.6987542468856173,
"train_speed(iter/s)": 0.020471
},
{
"epoch": 1.1637852593266607,
"grad_norm": 1.084161120288602,
"learning_rate": 2.6858324048054956e-05,
"loss": 0.8041671752929688,
"memory(GiB)": 76.02,
"step": 640,
"token_acc": 0.6834153197470133,
"train_speed(iter/s)": 0.020482
},
{
"epoch": 1.172884440400364,
"grad_norm": 1.154991176116474,
"learning_rate": 2.667883419793676e-05,
"loss": 0.8061488151550293,
"memory(GiB)": 76.02,
"step": 645,
"token_acc": 0.7004991680532446,
"train_speed(iter/s)": 0.020492
},
{
"epoch": 1.1819836214740673,
"grad_norm": 1.1196634253017694,
"learning_rate": 2.649873684272164e-05,
"loss": 0.8086748123168945,
"memory(GiB)": 76.02,
"step": 650,
"token_acc": 0.6978937441056272,
"train_speed(iter/s)": 0.020502
},
{
"epoch": 1.1819836214740673,
"eval_loss": 0.5025342702865601,
"eval_runtime": 120.6757,
"eval_samples_per_second": 47.831,
"eval_steps_per_second": 0.464,
"eval_token_acc": 0.6888256878507018,
"step": 650
},
{
"epoch": 1.1910828025477707,
"grad_norm": 1.1155649313448126,
"learning_rate": 2.6318048364020214e-05,
"loss": 0.7836286544799804,
"memory(GiB)": 76.02,
"step": 655,
"token_acc": 0.7220535467844328,
"train_speed(iter/s)": 0.020409
},
{
"epoch": 1.200181983621474,
"grad_norm": 1.1072757367187032,
"learning_rate": 2.613678519721155e-05,
"loss": 0.7940217018127441,
"memory(GiB)": 76.02,
"step": 660,
"token_acc": 0.7217682020802377,
"train_speed(iter/s)": 0.02042
},
{
"epoch": 1.2092811646951773,
"grad_norm": 1.0457391204034119,
"learning_rate": 2.5954963829948195e-05,
"loss": 0.7881236553192139,
"memory(GiB)": 76.02,
"step": 665,
"token_acc": 0.7111846946284033,
"train_speed(iter/s)": 0.020429
},
{
"epoch": 1.2183803457688809,
"grad_norm": 1.2226481761675059,
"learning_rate": 2.577260080065649e-05,
"loss": 0.8019227981567383,
"memory(GiB)": 76.02,
"step": 670,
"token_acc": 0.7422535211267606,
"train_speed(iter/s)": 0.020438
},
{
"epoch": 1.2274795268425842,
"grad_norm": 1.27401099270194,
"learning_rate": 2.558971269703219e-05,
"loss": 0.7942542552947998,
"memory(GiB)": 76.02,
"step": 675,
"token_acc": 0.7235213204951857,
"train_speed(iter/s)": 0.020449
},
{
"epoch": 1.2365787079162875,
"grad_norm": 1.3601936101076058,
"learning_rate": 2.5406316154531717e-05,
"loss": 0.8046051025390625,
"memory(GiB)": 76.02,
"step": 680,
"token_acc": 0.7112280701754385,
"train_speed(iter/s)": 0.020459
},
{
"epoch": 1.2456778889899909,
"grad_norm": 1.1617605645583995,
"learning_rate": 2.522242785485893e-05,
"loss": 0.8000314712524415,
"memory(GiB)": 76.02,
"step": 685,
"token_acc": 0.6886890349360083,
"train_speed(iter/s)": 0.020469
},
{
"epoch": 1.2547770700636942,
"grad_norm": 1.3512273187713244,
"learning_rate": 2.5038064524447827e-05,
"loss": 0.8067909240722656,
"memory(GiB)": 76.02,
"step": 690,
"token_acc": 0.7467532467532467,
"train_speed(iter/s)": 0.020479
},
{
"epoch": 1.2638762511373978,
"grad_norm": 1.3157719287072271,
"learning_rate": 2.4853242932941064e-05,
"loss": 0.7853587150573731,
"memory(GiB)": 76.02,
"step": 695,
"token_acc": 0.7197480881691408,
"train_speed(iter/s)": 0.020488
},
{
"epoch": 1.2729754322111009,
"grad_norm": 1.1947998857326674,
"learning_rate": 2.4667979891664625e-05,
"loss": 0.7679170131683349,
"memory(GiB)": 76.02,
"step": 700,
"token_acc": 0.7413360120542442,
"train_speed(iter/s)": 0.020498
},
{
"epoch": 1.2729754322111009,
"eval_loss": 0.4833757281303406,
"eval_runtime": 119.9805,
"eval_samples_per_second": 48.108,
"eval_steps_per_second": 0.467,
"eval_token_acc": 0.6897318990383643,
"step": 700
},
{
"epoch": 1.2820746132848044,
"grad_norm": 1.3268470864740665,
"learning_rate": 2.448229225209865e-05,
"loss": 0.788662052154541,
"memory(GiB)": 76.02,
"step": 705,
"token_acc": 0.716280170373876,
"train_speed(iter/s)": 0.020416
},
{
"epoch": 1.2911737943585078,
"grad_norm": 1.2466125304642335,
"learning_rate": 2.429619690434464e-05,
"loss": 0.7932944297790527,
"memory(GiB)": 76.02,
"step": 710,
"token_acc": 0.7371388301620859,
"train_speed(iter/s)": 0.020426
},
{
"epoch": 1.300272975432211,
"grad_norm": 1.3582445751553864,
"learning_rate": 2.4109710775589104e-05,
"loss": 0.8029943466186523,
"memory(GiB)": 76.02,
"step": 715,
"token_acc": 0.7082366589327146,
"train_speed(iter/s)": 0.020435
},
{
"epoch": 1.3093721565059144,
"grad_norm": 1.098320752598586,
"learning_rate": 2.392285082856394e-05,
"loss": 0.8051022529602051,
"memory(GiB)": 76.02,
"step": 720,
"token_acc": 0.6993071593533488,
"train_speed(iter/s)": 0.020444
},
{
"epoch": 1.3184713375796178,
"grad_norm": 1.1993515162762007,
"learning_rate": 2.3735634060003428e-05,
"loss": 0.7886831760406494,
"memory(GiB)": 76.02,
"step": 725,
"token_acc": 0.7265460664703408,
"train_speed(iter/s)": 0.020453
},
{
"epoch": 1.3275705186533213,
"grad_norm": 1.4913459363975115,
"learning_rate": 2.3548077499098256e-05,
"loss": 0.7917290687561035,
"memory(GiB)": 76.02,
"step": 730,
"token_acc": 0.7044052863436123,
"train_speed(iter/s)": 0.020462
},
{
"epoch": 1.3366696997270244,
"grad_norm": 1.3995123406507142,
"learning_rate": 2.3360198205946542e-05,
"loss": 0.788825798034668,
"memory(GiB)": 76.02,
"step": 735,
"token_acc": 0.7135922330097088,
"train_speed(iter/s)": 0.020471
},
{
"epoch": 1.345768880800728,
"grad_norm": 1.3354117848213083,
"learning_rate": 2.3172013270002038e-05,
"loss": 0.7835997581481934,
"memory(GiB)": 76.02,
"step": 740,
"token_acc": 0.7201051248357424,
"train_speed(iter/s)": 0.02048
},
{
"epoch": 1.3548680618744313,
"grad_norm": 1.0749964264738503,
"learning_rate": 2.2983539808519702e-05,
"loss": 0.7911547660827637,
"memory(GiB)": 76.02,
"step": 745,
"token_acc": 0.7271609995903319,
"train_speed(iter/s)": 0.020488
},
{
"epoch": 1.3639672429481347,
"grad_norm": 0.9437159555687519,
"learning_rate": 2.2794794964998705e-05,
"loss": 0.7891970634460449,
"memory(GiB)": 76.02,
"step": 750,
"token_acc": 0.7132644956314536,
"train_speed(iter/s)": 0.020497
},
{
"epoch": 1.3639672429481347,
"eval_loss": 0.48184001445770264,
"eval_runtime": 120.3801,
"eval_samples_per_second": 47.948,
"eval_steps_per_second": 0.465,
"eval_token_acc": 0.6908322983376689,
"step": 750
},
{
"epoch": 1.373066424021838,
"grad_norm": 1.3416671636490984,
"learning_rate": 2.260579590762304e-05,
"loss": 0.8072065353393555,
"memory(GiB)": 76.02,
"step": 755,
"token_acc": 0.7023445463812437,
"train_speed(iter/s)": 0.020418
},
{
"epoch": 1.3821656050955413,
"grad_norm": 1.1639847848783198,
"learning_rate": 2.2416559827699945e-05,
"loss": 0.8082324028015136,
"memory(GiB)": 76.02,
"step": 760,
"token_acc": 0.7145284621920136,
"train_speed(iter/s)": 0.020427
},
{
"epoch": 1.3912647861692449,
"grad_norm": 1.132127107571287,
"learning_rate": 2.2227103938096176e-05,
"loss": 0.7869006156921386,
"memory(GiB)": 76.02,
"step": 765,
"token_acc": 0.7099471830985915,
"train_speed(iter/s)": 0.020436
},
{
"epoch": 1.4003639672429482,
"grad_norm": 1.0194297655037412,
"learning_rate": 2.2037445471672312e-05,
"loss": 0.8034600257873535,
"memory(GiB)": 76.02,
"step": 770,
"token_acc": 0.7037037037037037,
"train_speed(iter/s)": 0.020445
},
{
"epoch": 1.4094631483166515,
"grad_norm": 1.3328252272724603,
"learning_rate": 2.1847601679715263e-05,
"loss": 0.8002717971801758,
"memory(GiB)": 76.02,
"step": 775,
"token_acc": 0.7140373750543242,
"train_speed(iter/s)": 0.020454
},
{
"epoch": 1.4185623293903549,
"grad_norm": 1.265718534410907,
"learning_rate": 2.1657589830369113e-05,
"loss": 0.8017659187316895,
"memory(GiB)": 76.02,
"step": 780,
"token_acc": 0.7063737623762376,
"train_speed(iter/s)": 0.020462
},
{
"epoch": 1.4276615104640582,
"grad_norm": 0.9977051429918016,
"learning_rate": 2.146742720706441e-05,
"loss": 0.7789717674255371,
"memory(GiB)": 76.02,
"step": 785,
"token_acc": 0.710708782742681,
"train_speed(iter/s)": 0.02047
},
{
"epoch": 1.4367606915377615,
"grad_norm": 1.0283878536421338,
"learning_rate": 2.127713110694606e-05,
"loss": 0.8202502250671386,
"memory(GiB)": 76.02,
"step": 790,
"token_acc": 0.707347972972973,
"train_speed(iter/s)": 0.020478
},
{
"epoch": 1.4458598726114649,
"grad_norm": 1.0457464903588745,
"learning_rate": 2.1086718839299972e-05,
"loss": 0.7791718482971192,
"memory(GiB)": 76.02,
"step": 795,
"token_acc": 0.7183828610919143,
"train_speed(iter/s)": 0.020486
},
{
"epoch": 1.4549590536851684,
"grad_norm": 1.1827863278388744,
"learning_rate": 2.0896207723978637e-05,
"loss": 0.8088536262512207,
"memory(GiB)": 76.02,
"step": 800,
"token_acc": 0.7157598499061913,
"train_speed(iter/s)": 0.020494
},
{
"epoch": 1.4549590536851684,
"eval_loss": 0.4799867272377014,
"eval_runtime": 120.658,
"eval_samples_per_second": 47.838,
"eval_steps_per_second": 0.464,
"eval_token_acc": 0.6916009596129183,
"step": 800
},
{
"epoch": 1.4640582347588718,
"grad_norm": 1.1034251914058373,
"learning_rate": 2.070561508982571e-05,
"loss": 0.7959201335906982,
"memory(GiB)": 76.02,
"step": 805,
"token_acc": 0.7082542694497154,
"train_speed(iter/s)": 0.020414
},
{
"epoch": 1.473157415832575,
"grad_norm": 1.1403649470949677,
"learning_rate": 2.0514958273099778e-05,
"loss": 0.8099080085754394,
"memory(GiB)": 76.02,
"step": 810,
"token_acc": 0.6938775510204082,
"train_speed(iter/s)": 0.020423
},
{
"epoch": 1.4822565969062784,
"grad_norm": 1.242956861788932,
"learning_rate": 2.0324254615897438e-05,
"loss": 0.7870995044708252,
"memory(GiB)": 76.02,
"step": 815,
"token_acc": 0.6989182692307693,
"train_speed(iter/s)": 0.020431
},
{
"epoch": 1.4913557779799818,
"grad_norm": 1.2480879646871645,
"learning_rate": 2.0133521464575915e-05,
"loss": 0.8157112121582031,
"memory(GiB)": 76.02,
"step": 820,
"token_acc": 0.6917945296864576,
"train_speed(iter/s)": 0.020438
},
{
"epoch": 1.5004549590536853,
"grad_norm": 1.4455782166201527,
"learning_rate": 1.99427761681752e-05,
"loss": 0.7882473945617676,
"memory(GiB)": 76.02,
"step": 825,
"token_acc": 0.7195308516063234,
"train_speed(iter/s)": 0.020446
},
{
"epoch": 1.5095541401273884,
"grad_norm": 1.129414363377021,
"learning_rate": 1.9752036076839988e-05,
"loss": 0.7893435955047607,
"memory(GiB)": 76.02,
"step": 830,
"token_acc": 0.7249863313285949,
"train_speed(iter/s)": 0.020454
},
{
"epoch": 1.518653321201092,
"grad_norm": 1.1611426190154455,
"learning_rate": 1.9561318540241528e-05,
"loss": 0.7893610000610352,
"memory(GiB)": 76.02,
"step": 835,
"token_acc": 0.7279521674140508,
"train_speed(iter/s)": 0.020463
},
{
"epoch": 1.5277525022747953,
"grad_norm": 1.387275557971045,
"learning_rate": 1.93706409059995e-05,
"loss": 0.7986185073852539,
"memory(GiB)": 76.02,
"step": 840,
"token_acc": 0.7054386661373561,
"train_speed(iter/s)": 0.02047
},
{
"epoch": 1.5368516833484986,
"grad_norm": 1.1029714828712447,
"learning_rate": 1.9180020518104088e-05,
"loss": 0.7868841171264649,
"memory(GiB)": 76.02,
"step": 845,
"token_acc": 0.7180851063829787,
"train_speed(iter/s)": 0.020478
},
{
"epoch": 1.545950864422202,
"grad_norm": 1.055709561997052,
"learning_rate": 1.898947471533833e-05,
"loss": 0.7913725852966309,
"memory(GiB)": 76.02,
"step": 850,
"token_acc": 0.6924932167621345,
"train_speed(iter/s)": 0.020486
},
{
"epoch": 1.545950864422202,
"eval_loss": 0.4763409495353699,
"eval_runtime": 119.4883,
"eval_samples_per_second": 48.306,
"eval_steps_per_second": 0.469,
"eval_token_acc": 0.6927134956692006,
"step": 850
},
{
"epoch": 1.5550500454959053,
"grad_norm": 1.1281157034877283,
"learning_rate": 1.8799020829701036e-05,
"loss": 0.8020171165466309,
"memory(GiB)": 76.02,
"step": 855,
"token_acc": 0.7118734923612973,
"train_speed(iter/s)": 0.020415
},
{
"epoch": 1.5641492265696089,
"grad_norm": 1.0786368581164274,
"learning_rate": 1.860867618483027e-05,
"loss": 0.7822349071502686,
"memory(GiB)": 76.02,
"step": 860,
"token_acc": 0.6926726410121244,
"train_speed(iter/s)": 0.020423
},
{
"epoch": 1.573248407643312,
"grad_norm": 1.2124940318046376,
"learning_rate": 1.8418458094427567e-05,
"loss": 0.7907929420471191,
"memory(GiB)": 76.02,
"step": 865,
"token_acc": 0.7004744958481613,
"train_speed(iter/s)": 0.02043
},
{
"epoch": 1.5823475887170155,
"grad_norm": 1.087815247895776,
"learning_rate": 1.82283838606831e-05,
"loss": 0.78410964012146,
"memory(GiB)": 76.02,
"step": 870,
"token_acc": 0.7159194876486734,
"train_speed(iter/s)": 0.020438
},
{
"epoch": 1.5914467697907189,
"grad_norm": 1.033926015572944,
"learning_rate": 1.803847077270188e-05,
"loss": 0.786978006362915,
"memory(GiB)": 76.02,
"step": 875,
"token_acc": 0.7101845522898155,
"train_speed(iter/s)": 0.020445
},
{
"epoch": 1.6005459508644222,
"grad_norm": 1.162364059290432,
"learning_rate": 1.7848736104931142e-05,
"loss": 0.7876530647277832,
"memory(GiB)": 76.02,
"step": 880,
"token_acc": 0.7407407407407407,
"train_speed(iter/s)": 0.020452
},
{
"epoch": 1.6096451319381255,
"grad_norm": 1.0965939407284515,
"learning_rate": 1.765919711558906e-05,
"loss": 0.7792027473449707,
"memory(GiB)": 76.02,
"step": 885,
"token_acc": 0.7125279642058165,
"train_speed(iter/s)": 0.020459
},
{
"epoch": 1.6187443130118289,
"grad_norm": 1.1822482702836845,
"learning_rate": 1.746987104509494e-05,
"loss": 0.7893452644348145,
"memory(GiB)": 76.02,
"step": 890,
"token_acc": 0.6998714652956298,
"train_speed(iter/s)": 0.020466
},
{
"epoch": 1.6278434940855324,
"grad_norm": 1.0733217293598245,
"learning_rate": 1.7280775114501057e-05,
"loss": 0.7864848613739014,
"memory(GiB)": 76.02,
"step": 895,
"token_acc": 0.7469492614001284,
"train_speed(iter/s)": 0.020474
},
{
"epoch": 1.6369426751592355,
"grad_norm": 0.9761043125519061,
"learning_rate": 1.7091926523926205e-05,
"loss": 0.7935813426971435,
"memory(GiB)": 76.02,
"step": 900,
"token_acc": 0.7378048780487805,
"train_speed(iter/s)": 0.020481
},
{
"epoch": 1.6369426751592355,
"eval_loss": 0.4734553098678589,
"eval_runtime": 120.2875,
"eval_samples_per_second": 47.985,
"eval_steps_per_second": 0.466,
"eval_token_acc": 0.6929117293665017,
"step": 900
},
{
"epoch": 1.646041856232939,
"grad_norm": 0.9775890422129749,
"learning_rate": 1.6903342450991203e-05,
"loss": 0.7867559909820556,
"memory(GiB)": 76.02,
"step": 905,
"token_acc": 0.7061556329849012,
"train_speed(iter/s)": 0.020416
},
{
"epoch": 1.6551410373066424,
"grad_norm": 1.032340730518062,
"learning_rate": 1.6715040049256393e-05,
"loss": 0.7743623733520508,
"memory(GiB)": 76.02,
"step": 910,
"token_acc": 0.7131681877444589,
"train_speed(iter/s)": 0.020423
},
{
"epoch": 1.6642402183803457,
"grad_norm": 1.0919952776609756,
"learning_rate": 1.6527036446661396e-05,
"loss": 0.7813485145568848,
"memory(GiB)": 76.02,
"step": 915,
"token_acc": 0.7281947261663286,
"train_speed(iter/s)": 0.02043
},
{
"epoch": 1.673339399454049,
"grad_norm": 1.2303788872377346,
"learning_rate": 1.6339348743967126e-05,
"loss": 0.7993118762969971,
"memory(GiB)": 76.02,
"step": 920,
"token_acc": 0.7152953054013125,
"train_speed(iter/s)": 0.020438
},
{
"epoch": 1.6824385805277524,
"grad_norm": 1.118393217178591,
"learning_rate": 1.6151994013200325e-05,
"loss": 0.7818034648895263,
"memory(GiB)": 76.02,
"step": 925,
"token_acc": 0.7246165084002922,
"train_speed(iter/s)": 0.020445
},
{
"epoch": 1.691537761601456,
"grad_norm": 1.2781086578084908,
"learning_rate": 1.5964989296100682e-05,
"loss": 0.7822434902191162,
"memory(GiB)": 76.02,
"step": 930,
"token_acc": 0.7342391304347826,
"train_speed(iter/s)": 0.020452
},
{
"epoch": 1.700636942675159,
"grad_norm": 1.0706561030394075,
"learning_rate": 1.5778351602570742e-05,
"loss": 0.7954679965972901,
"memory(GiB)": 76.02,
"step": 935,
"token_acc": 0.7032355915065723,
"train_speed(iter/s)": 0.020459
},
{
"epoch": 1.7097361237488626,
"grad_norm": 1.2217572797748102,
"learning_rate": 1.5592097909128673e-05,
"loss": 0.7845365524291992,
"memory(GiB)": 76.02,
"step": 940,
"token_acc": 0.7320365224295355,
"train_speed(iter/s)": 0.020466
},
{
"epoch": 1.718835304822566,
"grad_norm": 1.2477451151406387,
"learning_rate": 1.5406245157364093e-05,
"loss": 0.7835155010223389,
"memory(GiB)": 76.02,
"step": 945,
"token_acc": 0.7151702786377709,
"train_speed(iter/s)": 0.020473
},
{
"epoch": 1.7279344858962693,
"grad_norm": 1.1968781249693217,
"learning_rate": 1.5220810252397054e-05,
"loss": 0.7988658905029297,
"memory(GiB)": 76.02,
"step": 950,
"token_acc": 0.7049180327868853,
"train_speed(iter/s)": 0.020479
},
{
"epoch": 1.7279344858962693,
"eval_loss": 0.4713653028011322,
"eval_runtime": 120.1658,
"eval_samples_per_second": 48.034,
"eval_steps_per_second": 0.466,
"eval_token_acc": 0.6942872284906324,
"step": 950
},
{
"epoch": 1.7370336669699729,
"grad_norm": 0.9590399831837186,
"learning_rate": 1.5035810061340376e-05,
"loss": 0.7818658828735352,
"memory(GiB)": 76.02,
"step": 955,
"token_acc": 0.7186684073107049,
"train_speed(iter/s)": 0.020417
},
{
"epoch": 1.746132848043676,
"grad_norm": 1.311812274039409,
"learning_rate": 1.4851261411765414e-05,
"loss": 0.7812034130096436,
"memory(GiB)": 76.02,
"step": 960,
"token_acc": 0.7130058696323757,
"train_speed(iter/s)": 0.020424
},
{
"epoch": 1.7552320291173795,
"grad_norm": 1.2066428640501157,
"learning_rate": 1.4667181090171418e-05,
"loss": 0.7740418910980225,
"memory(GiB)": 76.02,
"step": 965,
"token_acc": 0.7142857142857143,
"train_speed(iter/s)": 0.02043
},
{
"epoch": 1.7643312101910829,
"grad_norm": 1.1309046997472656,
"learning_rate": 1.4483585840458632e-05,
"loss": 0.7716457843780518,
"memory(GiB)": 76.02,
"step": 970,
"token_acc": 0.7535986452159187,
"train_speed(iter/s)": 0.020437
},
{
"epoch": 1.7734303912647862,
"grad_norm": 1.0597243121965947,
"learning_rate": 1.4300492362405296e-05,
"loss": 0.7900642871856689,
"memory(GiB)": 76.02,
"step": 975,
"token_acc": 0.7184942716857611,
"train_speed(iter/s)": 0.020444
},
{
"epoch": 1.7825295723384895,
"grad_norm": 0.9136761859628779,
"learning_rate": 1.4117917310148624e-05,
"loss": 0.7912971019744873,
"memory(GiB)": 76.02,
"step": 980,
"token_acc": 0.7580794090489381,
"train_speed(iter/s)": 0.02045
},
{
"epoch": 1.7916287534121929,
"grad_norm": 1.123085792919359,
"learning_rate": 1.3935877290669932e-05,
"loss": 0.7823569774627686,
"memory(GiB)": 76.02,
"step": 985,
"token_acc": 0.7234323432343235,
"train_speed(iter/s)": 0.020457
},
{
"epoch": 1.8007279344858964,
"grad_norm": 1.1608781306244833,
"learning_rate": 1.375438886228411e-05,
"loss": 0.7732644081115723,
"memory(GiB)": 76.02,
"step": 990,
"token_acc": 0.6950644451430368,
"train_speed(iter/s)": 0.020464
},
{
"epoch": 1.8098271155595995,
"grad_norm": 1.1283275236864316,
"learning_rate": 1.3573468533133442e-05,
"loss": 0.7756358623504639,
"memory(GiB)": 76.02,
"step": 995,
"token_acc": 0.7115031238515251,
"train_speed(iter/s)": 0.02047
},
{
"epoch": 1.818926296633303,
"grad_norm": 1.0540865657542784,
"learning_rate": 1.3393132759686064e-05,
"loss": 0.7759748935699463,
"memory(GiB)": 76.02,
"step": 1000,
"token_acc": 0.6963375057950858,
"train_speed(iter/s)": 0.020477
},
{
"epoch": 1.818926296633303,
"eval_loss": 0.4693294167518616,
"eval_runtime": 119.7422,
"eval_samples_per_second": 48.204,
"eval_steps_per_second": 0.468,
"eval_token_acc": 0.6942063167774483,
"step": 1000
},
{
"epoch": 1.8280254777070064,
"grad_norm": 1.2097721619516764,
"learning_rate": 1.3213397945239053e-05,
"loss": 0.7718574047088623,
"memory(GiB)": 76.02,
"step": 1005,
"token_acc": 0.7104117843990626,
"train_speed(iter/s)": 0.020419
},
{
"epoch": 1.8371246587807097,
"grad_norm": 1.3429375958388912,
"learning_rate": 1.303428043842641e-05,
"loss": 0.7779555320739746,
"memory(GiB)": 76.02,
"step": 1010,
"token_acc": 0.7344594594594595,
"train_speed(iter/s)": 0.020425
},
{
"epoch": 1.846223839854413,
"grad_norm": 1.1502202864135298,
"learning_rate": 1.2855796531731994e-05,
"loss": 0.784113597869873,
"memory(GiB)": 76.02,
"step": 1015,
"token_acc": 0.7116066903193107,
"train_speed(iter/s)": 0.020432
},
{
"epoch": 1.8553230209281164,
"grad_norm": 0.9764736580354538,
"learning_rate": 1.2677962460007555e-05,
"loss": 0.769007682800293,
"memory(GiB)": 76.02,
"step": 1020,
"token_acc": 0.7275985663082437,
"train_speed(iter/s)": 0.020439
},
{
"epoch": 1.86442220200182,
"grad_norm": 1.0395064733034296,
"learning_rate": 1.2500794398996004e-05,
"loss": 0.7842848300933838,
"memory(GiB)": 76.02,
"step": 1025,
"token_acc": 0.7331868131868132,
"train_speed(iter/s)": 0.020445
},
{
"epoch": 1.873521383075523,
"grad_norm": 1.1556386067848643,
"learning_rate": 1.2324308463860089e-05,
"loss": 0.7766573905944825,
"memory(GiB)": 76.02,
"step": 1030,
"token_acc": 0.729426433915212,
"train_speed(iter/s)": 0.020451
},
{
"epoch": 1.8826205641492266,
"grad_norm": 1.261343214410371,
"learning_rate": 1.2148520707716567e-05,
"loss": 0.7785522937774658,
"memory(GiB)": 76.02,
"step": 1035,
"token_acc": 0.7095070422535211,
"train_speed(iter/s)": 0.020458
},
{
"epoch": 1.89171974522293,
"grad_norm": 1.3077190411896333,
"learning_rate": 1.1973447120175998e-05,
"loss": 0.7712287425994873,
"memory(GiB)": 76.02,
"step": 1040,
"token_acc": 0.6994839221913458,
"train_speed(iter/s)": 0.020464
},
{
"epoch": 1.9008189262966333,
"grad_norm": 1.0009654605437637,
"learning_rate": 1.1799103625888342e-05,
"loss": 0.7672115802764893,
"memory(GiB)": 76.02,
"step": 1045,
"token_acc": 0.7111845210004719,
"train_speed(iter/s)": 0.020471
},
{
"epoch": 1.9099181073703366,
"grad_norm": 1.1500066718260178,
"learning_rate": 1.162550608309446e-05,
"loss": 0.7593209743499756,
"memory(GiB)": 76.02,
"step": 1050,
"token_acc": 0.7720478325859492,
"train_speed(iter/s)": 0.020477
},
{
"epoch": 1.9099181073703366,
"eval_loss": 0.46374601125717163,
"eval_runtime": 119.6783,
"eval_samples_per_second": 48.229,
"eval_steps_per_second": 0.468,
"eval_token_acc": 0.6953795366186186,
"step": 1050
},
{
"epoch": 1.91901728844404,
"grad_norm": 1.0354960902542707,
"learning_rate": 1.1452670282183664e-05,
"loss": 0.7757611274719238,
"memory(GiB)": 76.02,
"step": 1055,
"token_acc": 0.7227655986509275,
"train_speed(iter/s)": 0.02042
},
{
"epoch": 1.9281164695177435,
"grad_norm": 1.1181099943024946,
"learning_rate": 1.12806119442574e-05,
"loss": 0.7624452590942383,
"memory(GiB)": 76.02,
"step": 1060,
"token_acc": 0.7370562130177515,
"train_speed(iter/s)": 0.020426
},
{
"epoch": 1.9372156505914466,
"grad_norm": 1.020900947874345,
"learning_rate": 1.1109346719699263e-05,
"loss": 0.7685122489929199,
"memory(GiB)": 76.02,
"step": 1065,
"token_acc": 0.7123585726718886,
"train_speed(iter/s)": 0.020432
},
{
"epoch": 1.9463148316651502,
"grad_norm": 1.0619107995533037,
"learning_rate": 1.0938890186751487e-05,
"loss": 0.7687143325805664,
"memory(GiB)": 76.02,
"step": 1070,
"token_acc": 0.7249620637329287,
"train_speed(iter/s)": 0.020439
},
{
"epoch": 1.9554140127388535,
"grad_norm": 1.0950602334931028,
"learning_rate": 1.0769257850097881e-05,
"loss": 0.7737876415252686,
"memory(GiB)": 76.02,
"step": 1075,
"token_acc": 0.6985485671752885,
"train_speed(iter/s)": 0.020445
},
{
"epoch": 1.9645131938125568,
"grad_norm": 1.307250719010874,
"learning_rate": 1.060046513945361e-05,
"loss": 0.7766946792602539,
"memory(GiB)": 76.02,
"step": 1080,
"token_acc": 0.7377892030848329,
"train_speed(iter/s)": 0.020451
},
{
"epoch": 1.9736123748862604,
"grad_norm": 1.1430361120086814,
"learning_rate": 1.0432527408161597e-05,
"loss": 0.7805325031280518,
"memory(GiB)": 76.02,
"step": 1085,
"token_acc": 0.7078861409239384,
"train_speed(iter/s)": 0.020457
},
{
"epoch": 1.9827115559599635,
"grad_norm": 1.002916433279442,
"learning_rate": 1.026545993179612e-05,
"loss": 0.7858685493469239,
"memory(GiB)": 76.02,
"step": 1090,
"token_acc": 0.7466666666666667,
"train_speed(iter/s)": 0.020463
},
{
"epoch": 1.991810737033667,
"grad_norm": 1.0871219922265896,
"learning_rate": 1.009927790677327e-05,
"loss": 0.7784292697906494,
"memory(GiB)": 76.02,
"step": 1095,
"token_acc": 0.7174170616113744,
"train_speed(iter/s)": 0.020469
},
{
"epoch": 2.0,
"grad_norm": 1.7655187909978691,
"learning_rate": 9.933996448968688e-06,
"loss": 0.7408246994018555,
"memory(GiB)": 76.02,
"step": 1100,
"token_acc": 0.7477064220183486,
"train_speed(iter/s)": 0.020483
},
{
"epoch": 2.0,
"eval_loss": 0.4639655649662018,
"eval_runtime": 118.882,
"eval_samples_per_second": 48.552,
"eval_steps_per_second": 0.471,
"eval_token_acc": 0.6956344085151487,
"step": 1100
},
{
"epoch": 2.0090991810737036,
"grad_norm": 1.1083572508394148,
"learning_rate": 9.769630592342643e-06,
"loss": 0.6631475925445557,
"memory(GiB)": 76.02,
"step": 1105,
"token_acc": 0.732795337368303,
"train_speed(iter/s)": 0.020423
},
{
"epoch": 2.0181983621474067,
"grad_norm": 1.1068844322629663,
"learning_rate": 9.606195287572577e-06,
"loss": 0.6467893600463868,
"memory(GiB)": 76.02,
"step": 1110,
"token_acc": 0.7836676217765043,
"train_speed(iter/s)": 0.020427
},
{
"epoch": 2.02729754322111,
"grad_norm": 1.1238716711584054,
"learning_rate": 9.443705400693133e-06,
"loss": 0.6334795475006103,
"memory(GiB)": 76.02,
"step": 1115,
"token_acc": 0.746772864597638,
"train_speed(iter/s)": 0.020432
},
{
"epoch": 2.0363967242948133,
"grad_norm": 0.9545754331665411,
"learning_rate": 9.282175711744012e-06,
"loss": 0.643845796585083,
"memory(GiB)": 76.02,
"step": 1120,
"token_acc": 0.783322390019698,
"train_speed(iter/s)": 0.020438
},
{
"epoch": 2.045495905368517,
"grad_norm": 1.112189160795635,
"learning_rate": 9.121620913425508e-06,
"loss": 0.6376824378967285,
"memory(GiB)": 76.02,
"step": 1125,
"token_acc": 0.7677035076108537,
"train_speed(iter/s)": 0.020444
},
{
"epoch": 2.05459508644222,
"grad_norm": 1.069654016986732,
"learning_rate": 8.962055609762143e-06,
"loss": 0.6328807353973389,
"memory(GiB)": 76.02,
"step": 1130,
"token_acc": 0.7605409705648369,
"train_speed(iter/s)": 0.020449
},
{
"epoch": 2.0636942675159236,
"grad_norm": 1.134992866714782,
"learning_rate": 8.803494314774241e-06,
"loss": 0.6297794342041015,
"memory(GiB)": 76.02,
"step": 1135,
"token_acc": 0.7869767441860465,
"train_speed(iter/s)": 0.020456
},
{
"epoch": 2.072793448589627,
"grad_norm": 1.1668054237375585,
"learning_rate": 8.645951451157741e-06,
"loss": 0.6355114459991456,
"memory(GiB)": 76.02,
"step": 1140,
"token_acc": 0.7761146496815287,
"train_speed(iter/s)": 0.020462
},
{
"epoch": 2.08189262966333,
"grad_norm": 1.1864938776830725,
"learning_rate": 8.489441348972312e-06,
"loss": 0.6331965923309326,
"memory(GiB)": 76.02,
"step": 1145,
"token_acc": 0.7740963855421686,
"train_speed(iter/s)": 0.020468
},
{
"epoch": 2.0909918107370338,
"grad_norm": 1.0454450783179292,
"learning_rate": 8.333978244337921e-06,
"loss": 0.6294968605041504,
"memory(GiB)": 76.02,
"step": 1150,
"token_acc": 0.77819937909624,
"train_speed(iter/s)": 0.020473
},
{
"epoch": 2.0909918107370338,
"eval_loss": 0.47781530022621155,
"eval_runtime": 120.0715,
"eval_samples_per_second": 48.071,
"eval_steps_per_second": 0.466,
"eval_token_acc": 0.6904358309430665,
"step": 1150
},
{
"epoch": 2.100090991810737,
"grad_norm": 0.9951435698165627,
"learning_rate": 8.179576278139872e-06,
"loss": 0.6304058074951172,
"memory(GiB)": 76.02,
"step": 1155,
"token_acc": 0.7404277792447848,
"train_speed(iter/s)": 0.020422
},
{
"epoch": 2.1091901728844404,
"grad_norm": 1.067908969484696,
"learning_rate": 8.026249494742617e-06,
"loss": 0.6222400665283203,
"memory(GiB)": 76.02,
"step": 1160,
"token_acc": 0.7715277777777778,
"train_speed(iter/s)": 0.020428
},
{
"epoch": 2.1182893539581436,
"grad_norm": 1.057238882123902,
"learning_rate": 7.874011840712197e-06,
"loss": 0.6318105697631836,
"memory(GiB)": 76.02,
"step": 1165,
"token_acc": 0.7550738007380073,
"train_speed(iter/s)": 0.020433
},
{
"epoch": 2.127388535031847,
"grad_norm": 1.0798825041809057,
"learning_rate": 7.72287716354776e-06,
"loss": 0.6285967350006103,
"memory(GiB)": 76.02,
"step": 1170,
"token_acc": 0.7547770700636943,
"train_speed(iter/s)": 0.020439
},
{
"epoch": 2.1364877161055507,
"grad_norm": 1.0478822425834018,
"learning_rate": 7.572859210421945e-06,
"loss": 0.6234595775604248,
"memory(GiB)": 76.02,
"step": 1175,
"token_acc": 0.7690631808278867,
"train_speed(iter/s)": 0.020444
},
{
"epoch": 2.1455868971792538,
"grad_norm": 0.9867274025718497,
"learning_rate": 7.423971626930435e-06,
"loss": 0.6359669685363769,
"memory(GiB)": 76.02,
"step": 1180,
"token_acc": 0.7695961995249406,
"train_speed(iter/s)": 0.02045
},
{
"epoch": 2.1546860782529573,
"grad_norm": 1.0045378569587455,
"learning_rate": 7.276227955850774e-06,
"loss": 0.6464476585388184,
"memory(GiB)": 76.02,
"step": 1185,
"token_acc": 0.7841451766953199,
"train_speed(iter/s)": 0.020455
},
{
"epoch": 2.1637852593266604,
"grad_norm": 1.022012980465645,
"learning_rate": 7.12964163591054e-06,
"loss": 0.6201572895050049,
"memory(GiB)": 76.02,
"step": 1190,
"token_acc": 0.74373795761079,
"train_speed(iter/s)": 0.020461
},
{
"epoch": 2.172884440400364,
"grad_norm": 1.2093399237034956,
"learning_rate": 6.984226000564907e-06,
"loss": 0.6306787490844726,
"memory(GiB)": 76.02,
"step": 1195,
"token_acc": 0.7755102040816326,
"train_speed(iter/s)": 0.020467
},
{
"epoch": 2.1819836214740675,
"grad_norm": 0.966059090473921,
"learning_rate": 6.8399942767839075e-06,
"loss": 0.6421105861663818,
"memory(GiB)": 76.02,
"step": 1200,
"token_acc": 0.7779262426509888,
"train_speed(iter/s)": 0.020473
},
{
"epoch": 2.1819836214740675,
"eval_loss": 0.47876349091529846,
"eval_runtime": 119.7281,
"eval_samples_per_second": 48.209,
"eval_steps_per_second": 0.468,
"eval_token_acc": 0.6895983947116104,
"step": 1200
},
{
"epoch": 2.1910828025477707,
"grad_norm": 1.069591399453908,
"learning_rate": 6.696959583849228e-06,
"loss": 0.6228060245513916,
"memory(GiB)": 76.02,
"step": 1205,
"token_acc": 0.725686591276252,
"train_speed(iter/s)": 0.020424
},
{
"epoch": 2.200181983621474,
"grad_norm": 1.0705675997492539,
"learning_rate": 6.5551349321609585e-06,
"loss": 0.6346144676208496,
"memory(GiB)": 76.02,
"step": 1210,
"token_acc": 0.7361563517915309,
"train_speed(iter/s)": 0.020429
},
{
"epoch": 2.2092811646951773,
"grad_norm": 0.99473395335189,
"learning_rate": 6.414533222054138e-06,
"loss": 0.6288974761962891,
"memory(GiB)": 76.02,
"step": 1215,
"token_acc": 0.7661224489795918,
"train_speed(iter/s)": 0.020435
},
{
"epoch": 2.218380345768881,
"grad_norm": 1.0273110672808459,
"learning_rate": 6.275167242625331e-06,
"loss": 0.6033660411834717,
"memory(GiB)": 76.02,
"step": 1220,
"token_acc": 0.7424931756141947,
"train_speed(iter/s)": 0.02044
},
{
"epoch": 2.227479526842584,
"grad_norm": 1.1134175189046431,
"learning_rate": 6.137049670569344e-06,
"loss": 0.6237975120544433,
"memory(GiB)": 76.02,
"step": 1225,
"token_acc": 0.7610619469026548,
"train_speed(iter/s)": 0.020445
},
{
"epoch": 2.2365787079162875,
"grad_norm": 1.0391880977302441,
"learning_rate": 6.000193069026181e-06,
"loss": 0.633206558227539,
"memory(GiB)": 76.02,
"step": 1230,
"token_acc": 0.7656550134460238,
"train_speed(iter/s)": 0.020451
},
{
"epoch": 2.245677888989991,
"grad_norm": 1.1575554243921846,
"learning_rate": 5.8646098864382525e-06,
"loss": 0.6448534488677978,
"memory(GiB)": 76.02,
"step": 1235,
"token_acc": 0.7768777614138439,
"train_speed(iter/s)": 0.020456
},
{
"epoch": 2.254777070063694,
"grad_norm": 1.0130550727371117,
"learning_rate": 5.730312455418134e-06,
"loss": 0.6195736408233643,
"memory(GiB)": 76.02,
"step": 1240,
"token_acc": 0.7690447400241838,
"train_speed(iter/s)": 0.020461
},
{
"epoch": 2.2638762511373978,
"grad_norm": 1.0895008794001835,
"learning_rate": 5.597312991626713e-06,
"loss": 0.6155508041381836,
"memory(GiB)": 76.02,
"step": 1245,
"token_acc": 0.7842149454240135,
"train_speed(iter/s)": 0.020466
},
{
"epoch": 2.272975432211101,
"grad_norm": 1.0868616738166854,
"learning_rate": 5.465623592662137e-06,
"loss": 0.6290598392486573,
"memory(GiB)": 76.02,
"step": 1250,
"token_acc": 0.7843260188087774,
"train_speed(iter/s)": 0.020471
},
{
"epoch": 2.272975432211101,
"eval_loss": 0.47770801186561584,
"eval_runtime": 119.4212,
"eval_samples_per_second": 48.333,
"eval_steps_per_second": 0.469,
"eval_token_acc": 0.6896631240821578,
"step": 1250
},
{
"epoch": 2.2820746132848044,
"grad_norm": 1.0252310733499297,
"learning_rate": 5.335256236959379e-06,
"loss": 0.6228739261627197,
"memory(GiB)": 76.02,
"step": 1255,
"token_acc": 0.7295555555555555,
"train_speed(iter/s)": 0.020423
},
{
"epoch": 2.2911737943585075,
"grad_norm": 1.1274971851401754,
"learning_rate": 5.206222782700667e-06,
"loss": 0.6328925609588623,
"memory(GiB)": 76.02,
"step": 1260,
"token_acc": 0.772992700729927,
"train_speed(iter/s)": 0.020428
},
{
"epoch": 2.300272975432211,
"grad_norm": 0.9968940954527525,
"learning_rate": 5.078534966736895e-06,
"loss": 0.6318979740142823,
"memory(GiB)": 76.02,
"step": 1265,
"token_acc": 0.766875691626706,
"train_speed(iter/s)": 0.020433
},
{
"epoch": 2.3093721565059147,
"grad_norm": 1.0466074457299364,
"learning_rate": 4.952204403520042e-06,
"loss": 0.6296024799346924,
"memory(GiB)": 76.02,
"step": 1270,
"token_acc": 0.7647696476964769,
"train_speed(iter/s)": 0.020438
},
{
"epoch": 2.3184713375796178,
"grad_norm": 1.059039551077919,
"learning_rate": 4.827242584046698e-06,
"loss": 0.6291126251220703,
"memory(GiB)": 76.02,
"step": 1275,
"token_acc": 0.7655979202772963,
"train_speed(iter/s)": 0.020443
},
{
"epoch": 2.3275705186533213,
"grad_norm": 1.1223580815679548,
"learning_rate": 4.70366087481289e-06,
"loss": 0.620822811126709,
"memory(GiB)": 76.02,
"step": 1280,
"token_acc": 0.7782307378719935,
"train_speed(iter/s)": 0.020448
},
{
"epoch": 2.3366696997270244,
"grad_norm": 1.0233004088935174,
"learning_rate": 4.581470516780115e-06,
"loss": 0.6297062873840332,
"memory(GiB)": 76.02,
"step": 1285,
"token_acc": 0.7572519083969466,
"train_speed(iter/s)": 0.020453
},
{
"epoch": 2.345768880800728,
"grad_norm": 1.0470029791397224,
"learning_rate": 4.460682624352952e-06,
"loss": 0.625699806213379,
"memory(GiB)": 76.02,
"step": 1290,
"token_acc": 0.7591605596269154,
"train_speed(iter/s)": 0.020458
},
{
"epoch": 2.3548680618744315,
"grad_norm": 0.915808456859335,
"learning_rate": 4.34130818436805e-06,
"loss": 0.6242890357971191,
"memory(GiB)": 76.02,
"step": 1295,
"token_acc": 0.7637987012987013,
"train_speed(iter/s)": 0.020462
},
{
"epoch": 2.3639672429481347,
"grad_norm": 0.9679022008759249,
"learning_rate": 4.223358055094762e-06,
"loss": 0.6215915203094482,
"memory(GiB)": 76.02,
"step": 1300,
"token_acc": 0.7939560439560439,
"train_speed(iter/s)": 0.020467
},
{
"epoch": 2.3639672429481347,
"eval_loss": 0.4746646285057068,
"eval_runtime": 120.7999,
"eval_samples_per_second": 47.782,
"eval_steps_per_second": 0.464,
"eval_token_acc": 0.6904034662577928,
"step": 1300
},
{
"epoch": 2.3767060964513194,
"grad_norm": 1.0864039116899251,
"learning_rate": 4.106842965247497e-06,
"loss": 0.607478666305542,
"memory(GiB)": 53.99,
"step": 1305,
"token_acc": 0.777601899485556,
"train_speed(iter/s)": 4.038665
},
{
"epoch": 2.385805277525023,
"grad_norm": 0.955554735442322,
"learning_rate": 3.991773513009849e-06,
"loss": 0.6158496856689453,
"memory(GiB)": 53.99,
"step": 1310,
"token_acc": 0.7964731814842028,
"train_speed(iter/s)": 2.330085
},
{
"epoch": 2.394904458598726,
"grad_norm": 1.0615963891170637,
"learning_rate": 3.87816016507055e-06,
"loss": 0.6333821296691895,
"memory(GiB)": 53.99,
"step": 1315,
"token_acc": 0.7811782708492732,
"train_speed(iter/s)": 1.665234
},
{
"epoch": 2.4040036396724296,
"grad_norm": 1.148829953509744,
"learning_rate": 3.766013255671479e-06,
"loss": 0.6272965908050537,
"memory(GiB)": 53.99,
"step": 1320,
"token_acc": 0.7688679245283019,
"train_speed(iter/s)": 1.297177
},
{
"epoch": 2.4131028207461327,
"grad_norm": 1.0891236462035252,
"learning_rate": 3.6553429856675915e-06,
"loss": 0.6266043663024903,
"memory(GiB)": 77.52,
"step": 1325,
"token_acc": 0.7914959016393442,
"train_speed(iter/s)": 1.06612
},
{
"epoch": 2.4222020018198362,
"grad_norm": 1.1117445945203506,
"learning_rate": 3.5461594215991247e-06,
"loss": 0.6159255981445313,
"memory(GiB)": 77.52,
"step": 1330,
"token_acc": 0.7893491124260354,
"train_speed(iter/s)": 0.90399
},
{
"epoch": 2.43130118289354,
"grad_norm": 0.9824968556280764,
"learning_rate": 3.438472494775902e-06,
"loss": 0.6225139141082764,
"memory(GiB)": 77.52,
"step": 1335,
"token_acc": 0.7502756339581036,
"train_speed(iter/s)": 0.785373
},
{
"epoch": 2.440400363967243,
"grad_norm": 0.9912665739642537,
"learning_rate": 3.3322920003739913e-06,
"loss": 0.6153748989105224,
"memory(GiB)": 77.52,
"step": 1340,
"token_acc": 0.790268456375839,
"train_speed(iter/s)": 0.696672
},
{
"epoch": 2.4494995450409465,
"grad_norm": 1.064566119713343,
"learning_rate": 3.227627596544738e-06,
"loss": 0.6232125759124756,
"memory(GiB)": 77.52,
"step": 1345,
"token_acc": 0.7880870561282932,
"train_speed(iter/s)": 0.625452
},
{
"epoch": 2.4585987261146496,
"grad_norm": 1.0308754966071667,
"learning_rate": 3.1244888035362875e-06,
"loss": 0.6144218444824219,
"memory(GiB)": 77.52,
"step": 1350,
"token_acc": 0.7680478428022213,
"train_speed(iter/s)": 0.569157
},
{
"epoch": 2.4585987261146496,
"eval_loss": 0.4741266369819641,
"eval_runtime": 123.1251,
"eval_samples_per_second": 46.879,
"eval_steps_per_second": 0.455,
"eval_token_acc": 0.6901121840903298,
"step": 1350
},
{
"epoch": 2.467697907188353,
"grad_norm": 1.1124385874562812,
"learning_rate": 3.0228850028275803e-06,
"loss": 0.6197083950042724,
"memory(GiB)": 77.52,
"step": 1355,
"token_acc": 0.7441558441558441,
"train_speed(iter/s)": 0.491445
},
{
"epoch": 2.4767970882620562,
"grad_norm": 1.0177391538655736,
"learning_rate": 2.922825436275061e-06,
"loss": 0.6326658248901367,
"memory(GiB)": 77.52,
"step": 1360,
"token_acc": 0.774859287054409,
"train_speed(iter/s)": 0.456689
},
{
"epoch": 2.48589626933576,
"grad_norm": 0.9939709571788379,
"learning_rate": 2.8243192052719902e-06,
"loss": 0.6353094577789307,
"memory(GiB)": 77.52,
"step": 1365,
"token_acc": 0.7515923566878981,
"train_speed(iter/s)": 0.426316
},
{
"epoch": 2.494995450409463,
"grad_norm": 1.0864856971626622,
"learning_rate": 2.72737526992064e-06,
"loss": 0.6143672466278076,
"memory(GiB)": 77.52,
"step": 1370,
"token_acc": 0.800497203231821,
"train_speed(iter/s)": 0.399977
},
{
"epoch": 2.5040946314831665,
"grad_norm": 0.9778765243753255,
"learning_rate": 2.6320024482172592e-06,
"loss": 0.6241840362548828,
"memory(GiB)": 77.52,
"step": 1375,
"token_acc": 0.7901711761457758,
"train_speed(iter/s)": 0.376966
},
{
"epoch": 2.51319381255687,
"grad_norm": 0.963647645236081,
"learning_rate": 2.5382094152499705e-06,
"loss": 0.635280704498291,
"memory(GiB)": 77.52,
"step": 1380,
"token_acc": 0.7607636068237206,
"train_speed(iter/s)": 0.356417
},
{
"epoch": 2.522292993630573,
"grad_norm": 0.9666636858906085,
"learning_rate": 2.4460047024097144e-06,
"loss": 0.6261641502380371,
"memory(GiB)": 77.52,
"step": 1385,
"token_acc": 0.7655134541460736,
"train_speed(iter/s)": 0.338341
},
{
"epoch": 2.5313921747042767,
"grad_norm": 0.9689736671771748,
"learning_rate": 2.3553966966142384e-06,
"loss": 0.6166990280151368,
"memory(GiB)": 77.52,
"step": 1390,
"token_acc": 0.7619183556951185,
"train_speed(iter/s)": 0.321781
},
{
"epoch": 2.54049135577798,
"grad_norm": 1.0530841209630801,
"learning_rate": 2.266393639545197e-06,
"loss": 0.6244637966156006,
"memory(GiB)": 77.52,
"step": 1395,
"token_acc": 0.7679372197309418,
"train_speed(iter/s)": 0.307132
},
{
"epoch": 2.5495905368516834,
"grad_norm": 0.9878733985818398,
"learning_rate": 2.1790036268985284e-06,
"loss": 0.6239931106567382,
"memory(GiB)": 77.52,
"step": 1400,
"token_acc": 0.7469059405940595,
"train_speed(iter/s)": 0.293674
},
{
"epoch": 2.5495905368516834,
"eval_loss": 0.47406768798828125,
"eval_runtime": 121.0349,
"eval_samples_per_second": 47.689,
"eval_steps_per_second": 0.463,
"eval_token_acc": 0.6900029532775313,
"step": 1400
},
{
"epoch": 2.558689717925387,
"grad_norm": 1.0090114818567588,
"learning_rate": 2.0932346076480314e-06,
"loss": 0.6187572956085206,
"memory(GiB)": 77.52,
"step": 1405,
"token_acc": 0.7450779851700332,
"train_speed(iter/s)": 0.272986
},
{
"epoch": 2.56778889899909,
"grad_norm": 0.9588816991739316,
"learning_rate": 2.009094383322356e-06,
"loss": 0.6277956485748291,
"memory(GiB)": 77.52,
"step": 1410,
"token_acc": 0.7810402684563759,
"train_speed(iter/s)": 0.262478
},
{
"epoch": 2.5768880800727936,
"grad_norm": 0.9909418694472445,
"learning_rate": 1.9265906072953822e-06,
"loss": 0.6175178050994873,
"memory(GiB)": 77.52,
"step": 1415,
"token_acc": 0.7652439024390244,
"train_speed(iter/s)": 0.252862
},
{
"epoch": 2.5859872611464967,
"grad_norm": 1.1182023779440498,
"learning_rate": 1.8457307840900428e-06,
"loss": 0.6154948711395264,
"memory(GiB)": 77.52,
"step": 1420,
"token_acc": 0.7852161785216178,
"train_speed(iter/s)": 0.244119
},
{
"epoch": 2.5950864422202002,
"grad_norm": 1.0404157493617592,
"learning_rate": 1.7665222686957362e-06,
"loss": 0.6219567775726318,
"memory(GiB)": 77.52,
"step": 1425,
"token_acc": 0.7628019323671498,
"train_speed(iter/s)": 0.235826
},
{
"epoch": 2.604185623293904,
"grad_norm": 1.0786639447035942,
"learning_rate": 1.6889722658993223e-06,
"loss": 0.6350451946258545,
"memory(GiB)": 77.52,
"step": 1430,
"token_acc": 0.7704379562043796,
"train_speed(iter/s)": 0.228331
},
{
"epoch": 2.613284804367607,
"grad_norm": 1.0095118897080797,
"learning_rate": 1.6130878296297536e-06,
"loss": 0.6284623622894288,
"memory(GiB)": 77.52,
"step": 1435,
"token_acc": 0.7636180228648285,
"train_speed(iter/s)": 0.221176
},
{
"epoch": 2.62238398544131,
"grad_norm": 0.94070647379727,
"learning_rate": 1.5388758623164802e-06,
"loss": 0.6281323432922363,
"memory(GiB)": 77.52,
"step": 1440,
"token_acc": 0.7643463497453311,
"train_speed(iter/s)": 0.214634
},
{
"epoch": 2.6314831665150136,
"grad_norm": 1.0651613672971816,
"learning_rate": 1.4663431142615792e-06,
"loss": 0.6090371608734131,
"memory(GiB)": 77.52,
"step": 1445,
"token_acc": 0.8246628131021194,
"train_speed(iter/s)": 0.208466
},
{
"epoch": 2.640582347588717,
"grad_norm": 1.0004848001888615,
"learning_rate": 1.3954961830257685e-06,
"loss": 0.624143123626709,
"memory(GiB)": 77.52,
"step": 1450,
"token_acc": 0.7779850746268657,
"train_speed(iter/s)": 0.202625
},
{
"epoch": 2.640582347588717,
"eval_loss": 0.47285741567611694,
"eval_runtime": 117.6959,
"eval_samples_per_second": 49.042,
"eval_steps_per_second": 0.476,
"eval_token_acc": 0.6904803323853178,
"step": 1450
},
{
"epoch": 2.6496815286624202,
"grad_norm": 1.0494169722074018,
"learning_rate": 1.3263415128282908e-06,
"loss": 0.6255748271942139,
"memory(GiB)": 77.52,
"step": 1455,
"token_acc": 0.732059542323928,
"train_speed(iter/s)": 0.193115
},
{
"epoch": 2.658780709736124,
"grad_norm": 1.0337513224693766,
"learning_rate": 1.2588853939607338e-06,
"loss": 0.6212813377380371,
"memory(GiB)": 77.52,
"step": 1460,
"token_acc": 0.7488385598141696,
"train_speed(iter/s)": 0.188151
},
{
"epoch": 2.667879890809827,
"grad_norm": 0.9339498438090048,
"learning_rate": 1.1931339622148897e-06,
"loss": 0.6209768295288086,
"memory(GiB)": 77.52,
"step": 1465,
"token_acc": 0.7604208822339134,
"train_speed(iter/s)": 0.183569
},
{
"epoch": 2.6769790718835305,
"grad_norm": 1.009290828686064,
"learning_rate": 1.1290931983246334e-06,
"loss": 0.619508934020996,
"memory(GiB)": 77.52,
"step": 1470,
"token_acc": 0.7703793381759484,
"train_speed(iter/s)": 0.179159
},
{
"epoch": 2.686078252957234,
"grad_norm": 0.9092366819727269,
"learning_rate": 1.0667689274219128e-06,
"loss": 0.6159298419952393,
"memory(GiB)": 77.52,
"step": 1475,
"token_acc": 0.7770177838577291,
"train_speed(iter/s)": 0.175056
},
{
"epoch": 2.695177434030937,
"grad_norm": 0.9840242855378942,
"learning_rate": 1.0061668185068996e-06,
"loss": 0.6134575843811035,
"memory(GiB)": 77.52,
"step": 1480,
"token_acc": 0.7733843537414966,
"train_speed(iter/s)": 0.171104
},
{
"epoch": 2.7042766151046407,
"grad_norm": 1.0092116973578455,
"learning_rate": 9.4729238393235e-07,
"loss": 0.6143134593963623,
"memory(GiB)": 77.52,
"step": 1485,
"token_acc": 0.7900072411296162,
"train_speed(iter/s)": 0.167358
},
{
"epoch": 2.713375796178344,
"grad_norm": 1.0868815484832741,
"learning_rate": 8.901509789021779e-07,
"loss": 0.600148344039917,
"memory(GiB)": 77.52,
"step": 1490,
"token_acc": 0.7679245283018868,
"train_speed(iter/s)": 0.163825
},
{
"epoch": 2.7224749772520473,
"grad_norm": 1.0410901018430865,
"learning_rate": 8.347478009843746e-07,
"loss": 0.6201463222503663,
"memory(GiB)": 77.52,
"step": 1495,
"token_acc": 0.738926899531869,
"train_speed(iter/s)": 0.160424
},
{
"epoch": 2.731574158325751,
"grad_norm": 0.9884777012197261,
"learning_rate": 7.810878896382101e-07,
"loss": 0.6072117805480957,
"memory(GiB)": 77.52,
"step": 1500,
"token_acc": 0.7709691438504997,
"train_speed(iter/s)": 0.157229
},
{
"epoch": 2.731574158325751,
"eval_loss": 0.4724496603012085,
"eval_runtime": 119.0959,
"eval_samples_per_second": 48.465,
"eval_steps_per_second": 0.47,
"eval_token_acc": 0.6903711015725191,
"step": 1500
},
{
"epoch": 2.740673339399454,
"grad_norm": 0.9523503494388392,
"learning_rate": 7.291761257558749e-07,
"loss": 0.6324088096618652,
"memory(GiB)": 77.52,
"step": 1505,
"token_acc": 0.7417567924030599,
"train_speed(iter/s)": 0.151702
},
{
"epoch": 2.7497725204731576,
"grad_norm": 0.9815948952007479,
"learning_rate": 6.790172312184972e-07,
"loss": 0.6338190078735352,
"memory(GiB)": 77.52,
"step": 1510,
"token_acc": 0.7562122229684352,
"train_speed(iter/s)": 0.148878
},
{
"epoch": 2.7588717015468607,
"grad_norm": 1.0475192921698937,
"learning_rate": 6.306157684666425e-07,
"loss": 0.6202810764312744,
"memory(GiB)": 77.52,
"step": 1515,
"token_acc": 0.7550281576830249,
"train_speed(iter/s)": 0.146148
},
{
"epoch": 2.7679708826205642,
"grad_norm": 1.0206535695296246,
"learning_rate": 5.839761400853183e-07,
"loss": 0.6317409992218017,
"memory(GiB)": 77.52,
"step": 1520,
"token_acc": 0.7529880478087649,
"train_speed(iter/s)": 0.143534
},
{
"epoch": 2.777070063694268,
"grad_norm": 0.9666971373448247,
"learning_rate": 5.391025884035239e-07,
"loss": 0.6138282775878906,
"memory(GiB)": 77.52,
"step": 1525,
"token_acc": 0.767303609341826,
"train_speed(iter/s)": 0.141033
},
{
"epoch": 2.786169244767971,
"grad_norm": 1.002591354360291,
"learning_rate": 4.959991951083498e-07,
"loss": 0.617135763168335,
"memory(GiB)": 77.52,
"step": 1530,
"token_acc": 0.8161559888579387,
"train_speed(iter/s)": 0.13864
},
{
"epoch": 2.795268425841674,
"grad_norm": 1.006202469505235,
"learning_rate": 4.5466988087373044e-07,
"loss": 0.6056863784790039,
"memory(GiB)": 77.52,
"step": 1535,
"token_acc": 0.760498687664042,
"train_speed(iter/s)": 0.136344
},
{
"epoch": 2.8043676069153776,
"grad_norm": 0.9830438526460707,
"learning_rate": 4.151184050038004e-07,
"loss": 0.6215356349945068,
"memory(GiB)": 77.52,
"step": 1540,
"token_acc": 0.7701478302336672,
"train_speed(iter/s)": 0.134118
},
{
"epoch": 2.813466787989081,
"grad_norm": 1.0560207375711046,
"learning_rate": 3.7734836509096596e-07,
"loss": 0.6116134643554687,
"memory(GiB)": 77.52,
"step": 1545,
"token_acc": 0.7759115116755428,
"train_speed(iter/s)": 0.132005
},
{
"epoch": 2.8225659690627842,
"grad_norm": 1.0225913174286714,
"learning_rate": 3.4136319668866434e-07,
"loss": 0.625472116470337,
"memory(GiB)": 77.52,
"step": 1550,
"token_acc": 0.7980769230769231,
"train_speed(iter/s)": 0.129952
},
{
"epoch": 2.8225659690627842,
"eval_loss": 0.4721684753894806,
"eval_runtime": 118.0382,
"eval_samples_per_second": 48.899,
"eval_steps_per_second": 0.474,
"eval_token_acc": 0.6906583381543229,
"step": 1550
},
{
"epoch": 2.831665150136488,
"grad_norm": 1.0572860374958588,
"learning_rate": 3.071661729988584e-07,
"loss": 0.6085397720336914,
"memory(GiB)": 77.52,
"step": 1555,
"token_acc": 0.7432788613600422,
"train_speed(iter/s)": 0.126385
},
{
"epoch": 2.840764331210191,
"grad_norm": 0.977034680594488,
"learning_rate": 2.747604045743102e-07,
"loss": 0.6142263889312745,
"memory(GiB)": 77.52,
"step": 1560,
"token_acc": 0.7400581959262852,
"train_speed(iter/s)": 0.124515
},
{
"epoch": 2.8498635122838945,
"grad_norm": 0.9902282597829868,
"learning_rate": 2.4414883903565834e-07,
"loss": 0.6152991771697998,
"memory(GiB)": 77.52,
"step": 1565,
"token_acc": 0.8156277436347673,
"train_speed(iter/s)": 0.122739
},
{
"epoch": 2.858962693357598,
"grad_norm": 1.112609715887069,
"learning_rate": 2.15334260803286e-07,
"loss": 0.6211013793945312,
"memory(GiB)": 77.52,
"step": 1570,
"token_acc": 0.7968069666182874,
"train_speed(iter/s)": 0.121013
},
{
"epoch": 2.868061874431301,
"grad_norm": 1.007653504358626,
"learning_rate": 1.8831929084406119e-07,
"loss": 0.6160074234008789,
"memory(GiB)": 77.52,
"step": 1575,
"token_acc": 0.7956026058631922,
"train_speed(iter/s)": 0.119343
},
{
"epoch": 2.8771610555050047,
"grad_norm": 1.0328729828726175,
"learning_rate": 1.631063864329274e-07,
"loss": 0.6106714725494384,
"memory(GiB)": 77.52,
"step": 1580,
"token_acc": 0.8102600140548137,
"train_speed(iter/s)": 0.11774
},
{
"epoch": 2.886260236578708,
"grad_norm": 0.9727986501836436,
"learning_rate": 1.3969784092939588e-07,
"loss": 0.6038858890533447,
"memory(GiB)": 77.52,
"step": 1585,
"token_acc": 0.7294275491949911,
"train_speed(iter/s)": 0.116161
},
{
"epoch": 2.8953594176524113,
"grad_norm": 1.0580993770834335,
"learning_rate": 1.180957835689478e-07,
"loss": 0.6102193832397461,
"memory(GiB)": 77.52,
"step": 1590,
"token_acc": 0.7574578469520103,
"train_speed(iter/s)": 0.114662
},
{
"epoch": 2.904458598726115,
"grad_norm": 0.9841890221890635,
"learning_rate": 9.83021792693406e-08,
"loss": 0.6162684917449951,
"memory(GiB)": 77.52,
"step": 1595,
"token_acc": 0.7871674491392802,
"train_speed(iter/s)": 0.113191
},
{
"epoch": 2.913557779799818,
"grad_norm": 1.0209356603903383,
"learning_rate": 8.031882845189743e-08,
"loss": 0.6077028751373291,
"memory(GiB)": 77.52,
"step": 1600,
"token_acc": 0.7544715447154472,
"train_speed(iter/s)": 0.111782
},
{
"epoch": 2.913557779799818,
"eval_loss": 0.4720407724380493,
"eval_runtime": 116.4832,
"eval_samples_per_second": 49.552,
"eval_steps_per_second": 0.481,
"eval_token_acc": 0.6904317853574072,
"step": 1600
}
],
"logging_steps": 5,
"max_steps": 1647,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.5861721952354304e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}