| { | |
| "best_metric": 0.46374601, | |
| "best_model_checkpoint": "/home/zhangzhicheng03/code/face-llm/ms-swift/Emo-CFG_bs-1040_data-ATTR_OPEN_EMO_500k_CAP_78k_lr-4e-5/v0-20250512-052808/checkpoint-1050", | |
| "epoch": 2.913557779799818, | |
| "eval_steps": 50, | |
| "global_step": 1600, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0018198362147406734, | |
| "grad_norm": 25.030844046592083, | |
| "learning_rate": 3.9999963615834764e-05, | |
| "loss": 2.025822877883911, | |
| "memory(GiB)": 43.02, | |
| "step": 1, | |
| "token_acc": 0.609375, | |
| "train_speed(iter/s)": 0.005908 | |
| }, | |
| { | |
| "epoch": 0.009099181073703366, | |
| "grad_norm": 10.802963796008704, | |
| "learning_rate": 3.9999090402488034e-05, | |
| "loss": 2.101142644882202, | |
| "memory(GiB)": 71.21, | |
| "step": 5, | |
| "token_acc": 0.5037481259370314, | |
| "train_speed(iter/s)": 0.013918 | |
| }, | |
| { | |
| "epoch": 0.018198362147406732, | |
| "grad_norm": 4.512540795817656, | |
| "learning_rate": 3.99963616926889e-05, | |
| "loss": 2.7770095825195313, | |
| "memory(GiB)": 71.21, | |
| "step": 10, | |
| "token_acc": 0.4725118483412322, | |
| "train_speed(iter/s)": 0.016736 | |
| }, | |
| { | |
| "epoch": 0.0272975432211101, | |
| "grad_norm": 3.2273598665998664, | |
| "learning_rate": 3.999181411880536e-05, | |
| "loss": 1.1679546356201171, | |
| "memory(GiB)": 71.21, | |
| "step": 15, | |
| "token_acc": 0.6352619233776388, | |
| "train_speed(iter/s)": 0.018014 | |
| }, | |
| { | |
| "epoch": 0.036396724294813464, | |
| "grad_norm": 2.7344684028279493, | |
| "learning_rate": 3.99854480944836e-05, | |
| "loss": 1.0935646057128907, | |
| "memory(GiB)": 76.02, | |
| "step": 20, | |
| "token_acc": 0.6871584699453552, | |
| "train_speed(iter/s)": 0.018758 | |
| }, | |
| { | |
| "epoch": 0.04549590536851683, | |
| "grad_norm": 2.6368838955923546, | |
| "learning_rate": 3.9977264198775616e-05, | |
| "loss": 1.0634303092956543, | |
| "memory(GiB)": 76.02, | |
| "step": 25, | |
| "token_acc": 0.6443461781427668, | |
| "train_speed(iter/s)": 0.019279 | |
| }, | |
| { | |
| "epoch": 0.0545950864422202, | |
| "grad_norm": 2.0635877115380987, | |
| "learning_rate": 3.996726317608652e-05, | |
| "loss": 1.0315238952636718, | |
| "memory(GiB)": 76.02, | |
| "step": 30, | |
| "token_acc": 0.6216628527841342, | |
| "train_speed(iter/s)": 0.019641 | |
| }, | |
| { | |
| "epoch": 0.06369426751592357, | |
| "grad_norm": 1.769746764765505, | |
| "learning_rate": 3.995544593610685e-05, | |
| "loss": 1.0012907981872559, | |
| "memory(GiB)": 76.02, | |
| "step": 35, | |
| "token_acc": 0.6820960698689956, | |
| "train_speed(iter/s)": 0.019879 | |
| }, | |
| { | |
| "epoch": 0.07279344858962693, | |
| "grad_norm": 1.7921148583961297, | |
| "learning_rate": 3.994181355372981e-05, | |
| "loss": 1.0219003677368164, | |
| "memory(GiB)": 76.02, | |
| "step": 40, | |
| "token_acc": 0.6666666666666666, | |
| "train_speed(iter/s)": 0.020113 | |
| }, | |
| { | |
| "epoch": 0.0818926296633303, | |
| "grad_norm": 2.3622358395090868, | |
| "learning_rate": 3.9926367268953514e-05, | |
| "loss": 0.9893597602844239, | |
| "memory(GiB)": 76.02, | |
| "step": 45, | |
| "token_acc": 0.6443586443586443, | |
| "train_speed(iter/s)": 0.02023 | |
| }, | |
| { | |
| "epoch": 0.09099181073703366, | |
| "grad_norm": 1.8847444781295586, | |
| "learning_rate": 3.990910848676819e-05, | |
| "loss": 1.0064857482910157, | |
| "memory(GiB)": 76.02, | |
| "step": 50, | |
| "token_acc": 0.6833550065019506, | |
| "train_speed(iter/s)": 0.020396 | |
| }, | |
| { | |
| "epoch": 0.09099181073703366, | |
| "eval_loss": 0.6069548726081848, | |
| "eval_runtime": 124.0182, | |
| "eval_samples_per_second": 46.542, | |
| "eval_steps_per_second": 0.452, | |
| "eval_token_acc": 0.6658791259916742, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10009099181073704, | |
| "grad_norm": 2.2611595119155297, | |
| "learning_rate": 3.989003877702835e-05, | |
| "loss": 1.0090344429016114, | |
| "memory(GiB)": 76.02, | |
| "step": 55, | |
| "token_acc": 0.6743224621038126, | |
| "train_speed(iter/s)": 0.019334 | |
| }, | |
| { | |
| "epoch": 0.1091901728844404, | |
| "grad_norm": 2.155765617865288, | |
| "learning_rate": 3.986915987431006e-05, | |
| "loss": 0.9812187194824219, | |
| "memory(GiB)": 76.02, | |
| "step": 60, | |
| "token_acc": 0.6862615587846763, | |
| "train_speed(iter/s)": 0.019541 | |
| }, | |
| { | |
| "epoch": 0.11828935395814377, | |
| "grad_norm": 1.9675392436886496, | |
| "learning_rate": 3.984647367775312e-05, | |
| "loss": 0.967503547668457, | |
| "memory(GiB)": 76.02, | |
| "step": 65, | |
| "token_acc": 0.6425840978593272, | |
| "train_speed(iter/s)": 0.019703 | |
| }, | |
| { | |
| "epoch": 0.12738853503184713, | |
| "grad_norm": 1.6136749581314442, | |
| "learning_rate": 3.9821982250888316e-05, | |
| "loss": 0.9946205139160156, | |
| "memory(GiB)": 76.02, | |
| "step": 70, | |
| "token_acc": 0.6822200392927309, | |
| "train_speed(iter/s)": 0.01985 | |
| }, | |
| { | |
| "epoch": 0.1364877161055505, | |
| "grad_norm": 2.1416143299162544, | |
| "learning_rate": 3.9795687821449754e-05, | |
| "loss": 0.9689006805419922, | |
| "memory(GiB)": 76.02, | |
| "step": 75, | |
| "token_acc": 0.6582365003417635, | |
| "train_speed(iter/s)": 0.019982 | |
| }, | |
| { | |
| "epoch": 0.14558689717925385, | |
| "grad_norm": 2.2094541193048074, | |
| "learning_rate": 3.9767592781172185e-05, | |
| "loss": 0.9927925109863281, | |
| "memory(GiB)": 76.02, | |
| "step": 80, | |
| "token_acc": 0.6676557863501483, | |
| "train_speed(iter/s)": 0.020086 | |
| }, | |
| { | |
| "epoch": 0.15468607825295724, | |
| "grad_norm": 1.6788879867996525, | |
| "learning_rate": 3.973769968557348e-05, | |
| "loss": 0.9653422355651855, | |
| "memory(GiB)": 76.02, | |
| "step": 85, | |
| "token_acc": 0.6833890746934225, | |
| "train_speed(iter/s)": 0.020194 | |
| }, | |
| { | |
| "epoch": 0.1637852593266606, | |
| "grad_norm": 1.6608567558622684, | |
| "learning_rate": 3.970601125372218e-05, | |
| "loss": 0.9711417198181153, | |
| "memory(GiB)": 76.02, | |
| "step": 90, | |
| "token_acc": 0.6648721399730821, | |
| "train_speed(iter/s)": 0.020273 | |
| }, | |
| { | |
| "epoch": 0.17288444040036396, | |
| "grad_norm": 1.8971338914920044, | |
| "learning_rate": 3.967253036799017e-05, | |
| "loss": 0.9714339256286622, | |
| "memory(GiB)": 76.02, | |
| "step": 95, | |
| "token_acc": 0.6907407407407408, | |
| "train_speed(iter/s)": 0.020364 | |
| }, | |
| { | |
| "epoch": 0.18198362147406733, | |
| "grad_norm": 2.2276291949458913, | |
| "learning_rate": 3.963726007379047e-05, | |
| "loss": 0.9623370170593262, | |
| "memory(GiB)": 76.02, | |
| "step": 100, | |
| "token_acc": 0.6705935659265972, | |
| "train_speed(iter/s)": 0.02043 | |
| }, | |
| { | |
| "epoch": 0.18198362147406733, | |
| "eval_loss": 0.5809512138366699, | |
| "eval_runtime": 123.9915, | |
| "eval_samples_per_second": 46.552, | |
| "eval_steps_per_second": 0.452, | |
| "eval_token_acc": 0.6707621478823382, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1910828025477707, | |
| "grad_norm": 1.7032248137533175, | |
| "learning_rate": 3.960020357930028e-05, | |
| "loss": 0.9466117858886719, | |
| "memory(GiB)": 76.02, | |
| "step": 105, | |
| "token_acc": 0.6678478620363808, | |
| "train_speed(iter/s)": 0.019882 | |
| }, | |
| { | |
| "epoch": 0.20018198362147407, | |
| "grad_norm": 1.855445035035624, | |
| "learning_rate": 3.9561364255169114e-05, | |
| "loss": 0.9585418701171875, | |
| "memory(GiB)": 76.02, | |
| "step": 110, | |
| "token_acc": 0.666546633057256, | |
| "train_speed(iter/s)": 0.019966 | |
| }, | |
| { | |
| "epoch": 0.20928116469517744, | |
| "grad_norm": 2.271456601509792, | |
| "learning_rate": 3.9520745634212225e-05, | |
| "loss": 0.9546641349792481, | |
| "memory(GiB)": 76.02, | |
| "step": 115, | |
| "token_acc": 0.6983430799220273, | |
| "train_speed(iter/s)": 0.02004 | |
| }, | |
| { | |
| "epoch": 0.2183803457688808, | |
| "grad_norm": 1.727865009111447, | |
| "learning_rate": 3.947835141108928e-05, | |
| "loss": 0.9411544799804688, | |
| "memory(GiB)": 76.02, | |
| "step": 120, | |
| "token_acc": 0.6998714652956298, | |
| "train_speed(iter/s)": 0.020118 | |
| }, | |
| { | |
| "epoch": 0.22747952684258416, | |
| "grad_norm": 1.5407295558813352, | |
| "learning_rate": 3.943418544196826e-05, | |
| "loss": 0.9641068458557129, | |
| "memory(GiB)": 76.02, | |
| "step": 125, | |
| "token_acc": 0.6722915963550455, | |
| "train_speed(iter/s)": 0.020179 | |
| }, | |
| { | |
| "epoch": 0.23657870791628755, | |
| "grad_norm": 1.6770942231997907, | |
| "learning_rate": 3.938825174417473e-05, | |
| "loss": 0.956147575378418, | |
| "memory(GiB)": 76.02, | |
| "step": 130, | |
| "token_acc": 0.7067484662576687, | |
| "train_speed(iter/s)": 0.020251 | |
| }, | |
| { | |
| "epoch": 0.2456778889899909, | |
| "grad_norm": 1.799020682507979, | |
| "learning_rate": 3.934055449582641e-05, | |
| "loss": 0.9465121269226074, | |
| "memory(GiB)": 76.02, | |
| "step": 135, | |
| "token_acc": 0.6822670674109059, | |
| "train_speed(iter/s)": 0.020307 | |
| }, | |
| { | |
| "epoch": 0.25477707006369427, | |
| "grad_norm": 1.6975378766800486, | |
| "learning_rate": 3.929109803545315e-05, | |
| "loss": 0.9593283653259277, | |
| "memory(GiB)": 76.02, | |
| "step": 140, | |
| "token_acc": 0.6935749588138386, | |
| "train_speed(iter/s)": 0.020367 | |
| }, | |
| { | |
| "epoch": 0.26387625113739765, | |
| "grad_norm": 1.6873696015578077, | |
| "learning_rate": 3.9239886861602265e-05, | |
| "loss": 0.9509831428527832, | |
| "memory(GiB)": 76.02, | |
| "step": 145, | |
| "token_acc": 0.6785370548604427, | |
| "train_speed(iter/s)": 0.020417 | |
| }, | |
| { | |
| "epoch": 0.272975432211101, | |
| "grad_norm": 1.605433238469568, | |
| "learning_rate": 3.9186925632429396e-05, | |
| "loss": 0.9489663124084473, | |
| "memory(GiB)": 76.02, | |
| "step": 150, | |
| "token_acc": 0.6493083807973963, | |
| "train_speed(iter/s)": 0.020465 | |
| }, | |
| { | |
| "epoch": 0.272975432211101, | |
| "eval_loss": 0.556602954864502, | |
| "eval_runtime": 119.5036, | |
| "eval_samples_per_second": 48.3, | |
| "eval_steps_per_second": 0.469, | |
| "eval_token_acc": 0.6771622643952052, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2820746132848044, | |
| "grad_norm": 1.665760265285853, | |
| "learning_rate": 3.9132219165274786e-05, | |
| "loss": 0.9691334724426269, | |
| "memory(GiB)": 76.02, | |
| "step": 155, | |
| "token_acc": 0.6817427385892116, | |
| "train_speed(iter/s)": 0.020097 | |
| }, | |
| { | |
| "epoch": 0.2911737943585077, | |
| "grad_norm": 1.4504880716204094, | |
| "learning_rate": 3.907577243622505e-05, | |
| "loss": 0.9517691612243653, | |
| "memory(GiB)": 76.02, | |
| "step": 160, | |
| "token_acc": 0.6508152173913043, | |
| "train_speed(iter/s)": 0.020145 | |
| }, | |
| { | |
| "epoch": 0.3002729754322111, | |
| "grad_norm": 1.4909379207696947, | |
| "learning_rate": 3.901759057966064e-05, | |
| "loss": 0.9396313667297364, | |
| "memory(GiB)": 76.02, | |
| "step": 165, | |
| "token_acc": 0.6924564796905223, | |
| "train_speed(iter/s)": 0.0202 | |
| }, | |
| { | |
| "epoch": 0.3093721565059145, | |
| "grad_norm": 1.6755025509294692, | |
| "learning_rate": 3.895767888778874e-05, | |
| "loss": 0.958685302734375, | |
| "memory(GiB)": 76.02, | |
| "step": 170, | |
| "token_acc": 0.6812801402893468, | |
| "train_speed(iter/s)": 0.020241 | |
| }, | |
| { | |
| "epoch": 0.3184713375796178, | |
| "grad_norm": 1.39424961728271, | |
| "learning_rate": 3.889604281016194e-05, | |
| "loss": 0.9179913520812988, | |
| "memory(GiB)": 76.02, | |
| "step": 175, | |
| "token_acc": 0.6434395848776872, | |
| "train_speed(iter/s)": 0.020291 | |
| }, | |
| { | |
| "epoch": 0.3275705186533212, | |
| "grad_norm": 1.810023496149751, | |
| "learning_rate": 3.883268795318252e-05, | |
| "loss": 0.95927734375, | |
| "memory(GiB)": 76.02, | |
| "step": 180, | |
| "token_acc": 0.6510866329264662, | |
| "train_speed(iter/s)": 0.020334 | |
| }, | |
| { | |
| "epoch": 0.33666969972702454, | |
| "grad_norm": 2.080560793664787, | |
| "learning_rate": 3.876762007959253e-05, | |
| "loss": 0.9460148811340332, | |
| "memory(GiB)": 76.02, | |
| "step": 185, | |
| "token_acc": 0.6614173228346457, | |
| "train_speed(iter/s)": 0.020378 | |
| }, | |
| { | |
| "epoch": 0.34576888080072793, | |
| "grad_norm": 1.6314724313426552, | |
| "learning_rate": 3.870084510794953e-05, | |
| "loss": 0.9372352600097656, | |
| "memory(GiB)": 76.02, | |
| "step": 190, | |
| "token_acc": 0.7167736021998167, | |
| "train_speed(iter/s)": 0.020418 | |
| }, | |
| { | |
| "epoch": 0.3548680618744313, | |
| "grad_norm": 1.5214499000610326, | |
| "learning_rate": 3.863236911208835e-05, | |
| "loss": 0.9120028495788575, | |
| "memory(GiB)": 76.02, | |
| "step": 195, | |
| "token_acc": 0.6961974110032363, | |
| "train_speed(iter/s)": 0.020453 | |
| }, | |
| { | |
| "epoch": 0.36396724294813465, | |
| "grad_norm": 1.403385243202059, | |
| "learning_rate": 3.856219832056853e-05, | |
| "loss": 0.9274997711181641, | |
| "memory(GiB)": 76.02, | |
| "step": 200, | |
| "token_acc": 0.6597971867844292, | |
| "train_speed(iter/s)": 0.020494 | |
| }, | |
| { | |
| "epoch": 0.36396724294813465, | |
| "eval_loss": 0.5442519783973694, | |
| "eval_runtime": 121.7991, | |
| "eval_samples_per_second": 47.389, | |
| "eval_steps_per_second": 0.46, | |
| "eval_token_acc": 0.6795491599341379, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.37306642402183804, | |
| "grad_norm": 1.7288195921368568, | |
| "learning_rate": 3.8490339116107814e-05, | |
| "loss": 0.9254457473754882, | |
| "memory(GiB)": 76.02, | |
| "step": 205, | |
| "token_acc": 0.6976498547663058, | |
| "train_speed(iter/s)": 0.020208 | |
| }, | |
| { | |
| "epoch": 0.3821656050955414, | |
| "grad_norm": 1.7934469116880778, | |
| "learning_rate": 3.8416798035001545e-05, | |
| "loss": 0.9426854133605957, | |
| "memory(GiB)": 76.02, | |
| "step": 210, | |
| "token_acc": 0.6734362307067425, | |
| "train_speed(iter/s)": 0.020248 | |
| }, | |
| { | |
| "epoch": 0.39126478616924476, | |
| "grad_norm": 1.3762724847783987, | |
| "learning_rate": 3.8341581766528185e-05, | |
| "loss": 0.949736499786377, | |
| "memory(GiB)": 76.02, | |
| "step": 215, | |
| "token_acc": 0.6799800299550673, | |
| "train_speed(iter/s)": 0.020279 | |
| }, | |
| { | |
| "epoch": 0.40036396724294815, | |
| "grad_norm": 1.8318501236469258, | |
| "learning_rate": 3.826469715234078e-05, | |
| "loss": 0.9189864158630371, | |
| "memory(GiB)": 76.02, | |
| "step": 220, | |
| "token_acc": 0.6768424298489053, | |
| "train_speed(iter/s)": 0.020316 | |
| }, | |
| { | |
| "epoch": 0.4094631483166515, | |
| "grad_norm": 1.734985910099827, | |
| "learning_rate": 3.818615118584472e-05, | |
| "loss": 0.9207481384277344, | |
| "memory(GiB)": 76.02, | |
| "step": 225, | |
| "token_acc": 0.6853369763205829, | |
| "train_speed(iter/s)": 0.020349 | |
| }, | |
| { | |
| "epoch": 0.41856232939035487, | |
| "grad_norm": 1.46610540352475, | |
| "learning_rate": 3.810595101156157e-05, | |
| "loss": 0.949979305267334, | |
| "memory(GiB)": 76.02, | |
| "step": 230, | |
| "token_acc": 0.7674418604651163, | |
| "train_speed(iter/s)": 0.020378 | |
| }, | |
| { | |
| "epoch": 0.42766151046405826, | |
| "grad_norm": 1.4640218284405278, | |
| "learning_rate": 3.8024103924479225e-05, | |
| "loss": 0.9503008842468261, | |
| "memory(GiB)": 76.02, | |
| "step": 235, | |
| "token_acc": 0.6691435275713727, | |
| "train_speed(iter/s)": 0.020412 | |
| }, | |
| { | |
| "epoch": 0.4367606915377616, | |
| "grad_norm": 1.3582380492653447, | |
| "learning_rate": 3.794061736938837e-05, | |
| "loss": 0.9213446617126465, | |
| "memory(GiB)": 76.02, | |
| "step": 240, | |
| "token_acc": 0.6814469078179697, | |
| "train_speed(iter/s)": 0.020441 | |
| }, | |
| { | |
| "epoch": 0.445859872611465, | |
| "grad_norm": 1.24168837408377, | |
| "learning_rate": 3.785549894020529e-05, | |
| "loss": 0.927124309539795, | |
| "memory(GiB)": 76.02, | |
| "step": 245, | |
| "token_acc": 0.7300613496932515, | |
| "train_speed(iter/s)": 0.020473 | |
| }, | |
| { | |
| "epoch": 0.4549590536851683, | |
| "grad_norm": 1.4540581012012834, | |
| "learning_rate": 3.77687563792811e-05, | |
| "loss": 0.9168607711791992, | |
| "memory(GiB)": 76.02, | |
| "step": 250, | |
| "token_acc": 0.6800291545189504, | |
| "train_speed(iter/s)": 0.020497 | |
| }, | |
| { | |
| "epoch": 0.4549590536851683, | |
| "eval_loss": 0.5409244894981384, | |
| "eval_runtime": 120.7512, | |
| "eval_samples_per_second": 47.801, | |
| "eval_steps_per_second": 0.464, | |
| "eval_token_acc": 0.6797271657031431, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.4640582347588717, | |
| "grad_norm": 1.7178666143628036, | |
| "learning_rate": 3.768039757669759e-05, | |
| "loss": 0.9190607070922852, | |
| "memory(GiB)": 76.02, | |
| "step": 255, | |
| "token_acc": 0.6971046770601337, | |
| "train_speed(iter/s)": 0.020269 | |
| }, | |
| { | |
| "epoch": 0.4731574158325751, | |
| "grad_norm": 1.4533539479949111, | |
| "learning_rate": 3.759043056954943e-05, | |
| "loss": 0.9371905326843262, | |
| "memory(GiB)": 76.02, | |
| "step": 260, | |
| "token_acc": 0.6667847025495751, | |
| "train_speed(iter/s)": 0.020296 | |
| }, | |
| { | |
| "epoch": 0.4822565969062784, | |
| "grad_norm": 1.8242714144160546, | |
| "learning_rate": 3.749886354121324e-05, | |
| "loss": 0.9172127723693848, | |
| "memory(GiB)": 76.02, | |
| "step": 265, | |
| "token_acc": 0.7086137281292059, | |
| "train_speed(iter/s)": 0.020325 | |
| }, | |
| { | |
| "epoch": 0.4913557779799818, | |
| "grad_norm": 1.3386774946853799, | |
| "learning_rate": 3.740570482060311e-05, | |
| "loss": 0.9408517837524414, | |
| "memory(GiB)": 76.02, | |
| "step": 270, | |
| "token_acc": 0.7290575916230366, | |
| "train_speed(iter/s)": 0.020353 | |
| }, | |
| { | |
| "epoch": 0.5004549590536852, | |
| "grad_norm": 1.6524604438416564, | |
| "learning_rate": 3.731096288141309e-05, | |
| "loss": 0.9067551612854003, | |
| "memory(GiB)": 76.02, | |
| "step": 275, | |
| "token_acc": 0.678743961352657, | |
| "train_speed(iter/s)": 0.020379 | |
| }, | |
| { | |
| "epoch": 0.5095541401273885, | |
| "grad_norm": 1.7068717522460979, | |
| "learning_rate": 3.721464634134641e-05, | |
| "loss": 0.9261470794677734, | |
| "memory(GiB)": 76.02, | |
| "step": 280, | |
| "token_acc": 0.7159965782720273, | |
| "train_speed(iter/s)": 0.020408 | |
| }, | |
| { | |
| "epoch": 0.5186533212010919, | |
| "grad_norm": 1.5886442512862196, | |
| "learning_rate": 3.711676396133158e-05, | |
| "loss": 0.9242866516113282, | |
| "memory(GiB)": 76.02, | |
| "step": 285, | |
| "token_acc": 0.6532932129722501, | |
| "train_speed(iter/s)": 0.020431 | |
| }, | |
| { | |
| "epoch": 0.5277525022747953, | |
| "grad_norm": 1.3930674320536802, | |
| "learning_rate": 3.701732464472553e-05, | |
| "loss": 0.9128170967102051, | |
| "memory(GiB)": 76.02, | |
| "step": 290, | |
| "token_acc": 0.6779987171263631, | |
| "train_speed(iter/s)": 0.020457 | |
| }, | |
| { | |
| "epoch": 0.5368516833484986, | |
| "grad_norm": 1.4564537325119185, | |
| "learning_rate": 3.691633743650377e-05, | |
| "loss": 0.9042372703552246, | |
| "memory(GiB)": 76.02, | |
| "step": 295, | |
| "token_acc": 0.6832191780821918, | |
| "train_speed(iter/s)": 0.020478 | |
| }, | |
| { | |
| "epoch": 0.545950864422202, | |
| "grad_norm": 1.4788538883263567, | |
| "learning_rate": 3.681381152243763e-05, | |
| "loss": 0.9223553657531738, | |
| "memory(GiB)": 76.02, | |
| "step": 300, | |
| "token_acc": 0.6808054841473865, | |
| "train_speed(iter/s)": 0.020502 | |
| }, | |
| { | |
| "epoch": 0.545950864422202, | |
| "eval_loss": 0.5335711240768433, | |
| "eval_runtime": 119.2512, | |
| "eval_samples_per_second": 48.402, | |
| "eval_steps_per_second": 0.47, | |
| "eval_token_acc": 0.682199018540919, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5550500454959054, | |
| "grad_norm": 1.6525258776892648, | |
| "learning_rate": 3.6709756228258735e-05, | |
| "loss": 0.9161547660827637, | |
| "memory(GiB)": 76.02, | |
| "step": 305, | |
| "token_acc": 0.6724870221802737, | |
| "train_speed(iter/s)": 0.02031 | |
| }, | |
| { | |
| "epoch": 0.5641492265696088, | |
| "grad_norm": 1.298480729022936, | |
| "learning_rate": 3.6604181018810764e-05, | |
| "loss": 0.8824697494506836, | |
| "memory(GiB)": 76.02, | |
| "step": 310, | |
| "token_acc": 0.6935075885328836, | |
| "train_speed(iter/s)": 0.020334 | |
| }, | |
| { | |
| "epoch": 0.5732484076433121, | |
| "grad_norm": 1.3254867339008374, | |
| "learning_rate": 3.649709549718849e-05, | |
| "loss": 0.8925297737121582, | |
| "memory(GiB)": 76.02, | |
| "step": 315, | |
| "token_acc": 0.6668953687821613, | |
| "train_speed(iter/s)": 0.020357 | |
| }, | |
| { | |
| "epoch": 0.5823475887170154, | |
| "grad_norm": 1.4003301586141983, | |
| "learning_rate": 3.638850940386433e-05, | |
| "loss": 0.9219451904296875, | |
| "memory(GiB)": 76.02, | |
| "step": 320, | |
| "token_acc": 0.6934164394234515, | |
| "train_speed(iter/s)": 0.020381 | |
| }, | |
| { | |
| "epoch": 0.5914467697907189, | |
| "grad_norm": 1.2198877131221963, | |
| "learning_rate": 3.627843261580231e-05, | |
| "loss": 0.9142662048339844, | |
| "memory(GiB)": 76.02, | |
| "step": 325, | |
| "token_acc": 0.6796973518284993, | |
| "train_speed(iter/s)": 0.020407 | |
| }, | |
| { | |
| "epoch": 0.6005459508644222, | |
| "grad_norm": 1.2491149251440654, | |
| "learning_rate": 3.6166875145559684e-05, | |
| "loss": 0.9013506889343261, | |
| "memory(GiB)": 76.02, | |
| "step": 330, | |
| "token_acc": 0.7270875763747454, | |
| "train_speed(iter/s)": 0.020426 | |
| }, | |
| { | |
| "epoch": 0.6096451319381255, | |
| "grad_norm": 1.3464860154655747, | |
| "learning_rate": 3.6053847140376194e-05, | |
| "loss": 0.9187211990356445, | |
| "memory(GiB)": 76.02, | |
| "step": 335, | |
| "token_acc": 0.6677791262135923, | |
| "train_speed(iter/s)": 0.020449 | |
| }, | |
| { | |
| "epoch": 0.618744313011829, | |
| "grad_norm": 1.3081495464213557, | |
| "learning_rate": 3.593935888125107e-05, | |
| "loss": 0.9130012512207031, | |
| "memory(GiB)": 76.02, | |
| "step": 340, | |
| "token_acc": 0.6820603907637656, | |
| "train_speed(iter/s)": 0.020469 | |
| }, | |
| { | |
| "epoch": 0.6278434940855323, | |
| "grad_norm": 1.3056329501412263, | |
| "learning_rate": 3.582342078200786e-05, | |
| "loss": 0.903553581237793, | |
| "memory(GiB)": 76.02, | |
| "step": 345, | |
| "token_acc": 0.7179723502304147, | |
| "train_speed(iter/s)": 0.020488 | |
| }, | |
| { | |
| "epoch": 0.6369426751592356, | |
| "grad_norm": 1.2033803940833903, | |
| "learning_rate": 3.570604338834725e-05, | |
| "loss": 0.9074154853820801, | |
| "memory(GiB)": 76.02, | |
| "step": 350, | |
| "token_acc": 0.7170805116629044, | |
| "train_speed(iter/s)": 0.020509 | |
| }, | |
| { | |
| "epoch": 0.6369426751592356, | |
| "eval_loss": 0.5156524777412415, | |
| "eval_runtime": 121.7142, | |
| "eval_samples_per_second": 47.423, | |
| "eval_steps_per_second": 0.46, | |
| "eval_token_acc": 0.6832346884696763, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6460418562329391, | |
| "grad_norm": 1.3706427880274294, | |
| "learning_rate": 3.558723737688775e-05, | |
| "loss": 0.9084077835083008, | |
| "memory(GiB)": 76.02, | |
| "step": 355, | |
| "token_acc": 0.7012306886619534, | |
| "train_speed(iter/s)": 0.020344 | |
| }, | |
| { | |
| "epoch": 0.6551410373066424, | |
| "grad_norm": 1.4525504674274499, | |
| "learning_rate": 3.54670135541946e-05, | |
| "loss": 0.9108301162719726, | |
| "memory(GiB)": 76.02, | |
| "step": 360, | |
| "token_acc": 0.6819548872180451, | |
| "train_speed(iter/s)": 0.020365 | |
| }, | |
| { | |
| "epoch": 0.6642402183803457, | |
| "grad_norm": 1.371067326824918, | |
| "learning_rate": 3.534538285579681e-05, | |
| "loss": 0.9166597366333008, | |
| "memory(GiB)": 76.02, | |
| "step": 365, | |
| "token_acc": 0.68828125, | |
| "train_speed(iter/s)": 0.020383 | |
| }, | |
| { | |
| "epoch": 0.6733393994540491, | |
| "grad_norm": 1.404728462002113, | |
| "learning_rate": 3.522235634519244e-05, | |
| "loss": 0.8995059967041016, | |
| "memory(GiB)": 76.02, | |
| "step": 370, | |
| "token_acc": 0.6734115742614326, | |
| "train_speed(iter/s)": 0.020405 | |
| }, | |
| { | |
| "epoch": 0.6824385805277525, | |
| "grad_norm": 1.4153346949849819, | |
| "learning_rate": 3.509794521284228e-05, | |
| "loss": 0.8986475944519043, | |
| "memory(GiB)": 76.02, | |
| "step": 375, | |
| "token_acc": 0.6696600384862091, | |
| "train_speed(iter/s)": 0.020423 | |
| }, | |
| { | |
| "epoch": 0.6915377616014559, | |
| "grad_norm": 1.357991579405462, | |
| "learning_rate": 3.497216077515198e-05, | |
| "loss": 0.914306354522705, | |
| "memory(GiB)": 76.02, | |
| "step": 380, | |
| "token_acc": 0.668999300209937, | |
| "train_speed(iter/s)": 0.020442 | |
| }, | |
| { | |
| "epoch": 0.7006369426751592, | |
| "grad_norm": 1.421643318524058, | |
| "learning_rate": 3.48450144734427e-05, | |
| "loss": 0.9151236534118652, | |
| "memory(GiB)": 76.02, | |
| "step": 385, | |
| "token_acc": 0.6687898089171974, | |
| "train_speed(iter/s)": 0.02046 | |
| }, | |
| { | |
| "epoch": 0.7097361237488626, | |
| "grad_norm": 1.1089727601654944, | |
| "learning_rate": 3.4716517872910405e-05, | |
| "loss": 0.8921234130859375, | |
| "memory(GiB)": 76.02, | |
| "step": 390, | |
| "token_acc": 0.6953678474114442, | |
| "train_speed(iter/s)": 0.020478 | |
| }, | |
| { | |
| "epoch": 0.718835304822566, | |
| "grad_norm": 1.3211880131463927, | |
| "learning_rate": 3.45866826615739e-05, | |
| "loss": 0.9150146484375, | |
| "memory(GiB)": 76.02, | |
| "step": 395, | |
| "token_acc": 0.6571167327034441, | |
| "train_speed(iter/s)": 0.020496 | |
| }, | |
| { | |
| "epoch": 0.7279344858962693, | |
| "grad_norm": 1.4350745291439944, | |
| "learning_rate": 3.445552064921172e-05, | |
| "loss": 0.9022627830505371, | |
| "memory(GiB)": 76.02, | |
| "step": 400, | |
| "token_acc": 0.6755852842809364, | |
| "train_speed(iter/s)": 0.020512 | |
| }, | |
| { | |
| "epoch": 0.7279344858962693, | |
| "eval_loss": 0.5100554823875427, | |
| "eval_runtime": 119.6911, | |
| "eval_samples_per_second": 48.224, | |
| "eval_steps_per_second": 0.468, | |
| "eval_token_acc": 0.6859209573473904, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7370336669699727, | |
| "grad_norm": 1.1612524813118632, | |
| "learning_rate": 3.432304376628787e-05, | |
| "loss": 0.9135440826416016, | |
| "memory(GiB)": 76.02, | |
| "step": 405, | |
| "token_acc": 0.7024793388429752, | |
| "train_speed(iter/s)": 0.020366 | |
| }, | |
| { | |
| "epoch": 0.7461328480436761, | |
| "grad_norm": 1.3506987538568946, | |
| "learning_rate": 3.418926406286666e-05, | |
| "loss": 0.9180900573730468, | |
| "memory(GiB)": 76.02, | |
| "step": 410, | |
| "token_acc": 0.715203426124197, | |
| "train_speed(iter/s)": 0.020382 | |
| }, | |
| { | |
| "epoch": 0.7552320291173794, | |
| "grad_norm": 1.3682849356535443, | |
| "learning_rate": 3.405419370751663e-05, | |
| "loss": 0.9025050163269043, | |
| "memory(GiB)": 76.02, | |
| "step": 415, | |
| "token_acc": 0.7220916568742656, | |
| "train_speed(iter/s)": 0.020402 | |
| }, | |
| { | |
| "epoch": 0.7643312101910829, | |
| "grad_norm": 1.4354924987431779, | |
| "learning_rate": 3.391784498620369e-05, | |
| "loss": 0.9032191276550293, | |
| "memory(GiB)": 76.02, | |
| "step": 420, | |
| "token_acc": 0.6772521062864549, | |
| "train_speed(iter/s)": 0.020419 | |
| }, | |
| { | |
| "epoch": 0.7734303912647862, | |
| "grad_norm": 1.3319624335350189, | |
| "learning_rate": 3.378023030117361e-05, | |
| "loss": 0.9076663970947265, | |
| "memory(GiB)": 76.02, | |
| "step": 425, | |
| "token_acc": 0.6790314270994333, | |
| "train_speed(iter/s)": 0.020436 | |
| }, | |
| { | |
| "epoch": 0.7825295723384895, | |
| "grad_norm": 1.2560401393743486, | |
| "learning_rate": 3.364136216982391e-05, | |
| "loss": 0.9036032676696777, | |
| "memory(GiB)": 76.02, | |
| "step": 430, | |
| "token_acc": 0.6832980972515856, | |
| "train_speed(iter/s)": 0.020453 | |
| }, | |
| { | |
| "epoch": 0.7916287534121929, | |
| "grad_norm": 1.331582467821213, | |
| "learning_rate": 3.350125322356525e-05, | |
| "loss": 0.9180031776428222, | |
| "memory(GiB)": 76.02, | |
| "step": 435, | |
| "token_acc": 0.6918290043290043, | |
| "train_speed(iter/s)": 0.020468 | |
| }, | |
| { | |
| "epoch": 0.8007279344858963, | |
| "grad_norm": 1.3101601945182637, | |
| "learning_rate": 3.335991620667254e-05, | |
| "loss": 0.9090401649475097, | |
| "memory(GiB)": 76.02, | |
| "step": 440, | |
| "token_acc": 0.6886586695747001, | |
| "train_speed(iter/s)": 0.020484 | |
| }, | |
| { | |
| "epoch": 0.8098271155595996, | |
| "grad_norm": 1.490959565832233, | |
| "learning_rate": 3.321736397512566e-05, | |
| "loss": 0.8914430618286133, | |
| "memory(GiB)": 76.02, | |
| "step": 445, | |
| "token_acc": 0.7289220917822838, | |
| "train_speed(iter/s)": 0.020498 | |
| }, | |
| { | |
| "epoch": 0.818926296633303, | |
| "grad_norm": 1.6826531523568926, | |
| "learning_rate": 3.307360949544012e-05, | |
| "loss": 0.8871423721313476, | |
| "memory(GiB)": 76.02, | |
| "step": 450, | |
| "token_acc": 0.6811023622047244, | |
| "train_speed(iter/s)": 0.020515 | |
| }, | |
| { | |
| "epoch": 0.818926296633303, | |
| "eval_loss": 0.5105797648429871, | |
| "eval_runtime": 119.2169, | |
| "eval_samples_per_second": 48.416, | |
| "eval_steps_per_second": 0.47, | |
| "eval_token_acc": 0.6859007294190943, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8280254777070064, | |
| "grad_norm": 1.5351362870698657, | |
| "learning_rate": 3.2928665843487646e-05, | |
| "loss": 0.9084842681884766, | |
| "memory(GiB)": 76.02, | |
| "step": 455, | |
| "token_acc": 0.6964930376482723, | |
| "train_speed(iter/s)": 0.020387 | |
| }, | |
| { | |
| "epoch": 0.8371246587807097, | |
| "grad_norm": 1.76414300067586, | |
| "learning_rate": 3.278254620330673e-05, | |
| "loss": 0.8832217216491699, | |
| "memory(GiB)": 76.02, | |
| "step": 460, | |
| "token_acc": 0.6910656620021528, | |
| "train_speed(iter/s)": 0.020403 | |
| }, | |
| { | |
| "epoch": 0.8462238398544131, | |
| "grad_norm": 1.26108516359597, | |
| "learning_rate": 3.263526386590351e-05, | |
| "loss": 0.9098955154418945, | |
| "memory(GiB)": 76.02, | |
| "step": 465, | |
| "token_acc": 0.6647430612805716, | |
| "train_speed(iter/s)": 0.020418 | |
| }, | |
| { | |
| "epoch": 0.8553230209281165, | |
| "grad_norm": 1.4539630443562455, | |
| "learning_rate": 3.248683222804274e-05, | |
| "loss": 0.8848261833190918, | |
| "memory(GiB)": 76.02, | |
| "step": 470, | |
| "token_acc": 0.7338235294117647, | |
| "train_speed(iter/s)": 0.020432 | |
| }, | |
| { | |
| "epoch": 0.8644222020018199, | |
| "grad_norm": 1.6326834981575191, | |
| "learning_rate": 3.233726479102927e-05, | |
| "loss": 0.9008934020996093, | |
| "memory(GiB)": 76.02, | |
| "step": 475, | |
| "token_acc": 0.7064676616915423, | |
| "train_speed(iter/s)": 0.020448 | |
| }, | |
| { | |
| "epoch": 0.8735213830755232, | |
| "grad_norm": 1.2054817005259488, | |
| "learning_rate": 3.2186575159479966e-05, | |
| "loss": 0.8803308486938477, | |
| "memory(GiB)": 76.02, | |
| "step": 480, | |
| "token_acc": 0.7033673855467272, | |
| "train_speed(iter/s)": 0.020462 | |
| }, | |
| { | |
| "epoch": 0.8826205641492265, | |
| "grad_norm": 1.1783711102902867, | |
| "learning_rate": 3.203477704008622e-05, | |
| "loss": 0.9082450866699219, | |
| "memory(GiB)": 76.02, | |
| "step": 485, | |
| "token_acc": 0.7070333157059757, | |
| "train_speed(iter/s)": 0.020477 | |
| }, | |
| { | |
| "epoch": 0.89171974522293, | |
| "grad_norm": 1.241716165408502, | |
| "learning_rate": 3.188188424036719e-05, | |
| "loss": 0.9072214126586914, | |
| "memory(GiB)": 76.02, | |
| "step": 490, | |
| "token_acc": 0.6927956502038967, | |
| "train_speed(iter/s)": 0.02049 | |
| }, | |
| { | |
| "epoch": 0.9008189262966333, | |
| "grad_norm": 1.1673048249036013, | |
| "learning_rate": 3.172791066741392e-05, | |
| "loss": 0.886620044708252, | |
| "memory(GiB)": 76.02, | |
| "step": 495, | |
| "token_acc": 0.7046548956661316, | |
| "train_speed(iter/s)": 0.020505 | |
| }, | |
| { | |
| "epoch": 0.9099181073703366, | |
| "grad_norm": 1.5005936662764863, | |
| "learning_rate": 3.157287032662428e-05, | |
| "loss": 0.8825222015380859, | |
| "memory(GiB)": 76.02, | |
| "step": 500, | |
| "token_acc": 0.6940532081377152, | |
| "train_speed(iter/s)": 0.020518 | |
| }, | |
| { | |
| "epoch": 0.9099181073703366, | |
| "eval_loss": 0.49878114461898804, | |
| "eval_runtime": 121.4101, | |
| "eval_samples_per_second": 47.541, | |
| "eval_steps_per_second": 0.461, | |
| "eval_token_acc": 0.6875917842246433, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9190172884440401, | |
| "grad_norm": 1.2321769009736276, | |
| "learning_rate": 3.14167773204291e-05, | |
| "loss": 0.8877192497253418, | |
| "memory(GiB)": 76.02, | |
| "step": 505, | |
| "token_acc": 0.7100805331852263, | |
| "train_speed(iter/s)": 0.020401 | |
| }, | |
| { | |
| "epoch": 0.9281164695177434, | |
| "grad_norm": 1.2301460920284364, | |
| "learning_rate": 3.1259645847009384e-05, | |
| "loss": 0.9063457489013672, | |
| "memory(GiB)": 76.02, | |
| "step": 510, | |
| "token_acc": 0.6885245901639344, | |
| "train_speed(iter/s)": 0.020414 | |
| }, | |
| { | |
| "epoch": 0.9372156505914467, | |
| "grad_norm": 1.4857123341096659, | |
| "learning_rate": 3.110149019900486e-05, | |
| "loss": 0.8702260971069335, | |
| "memory(GiB)": 76.02, | |
| "step": 515, | |
| "token_acc": 0.6863874345549739, | |
| "train_speed(iter/s)": 0.020427 | |
| }, | |
| { | |
| "epoch": 0.9463148316651502, | |
| "grad_norm": 1.189993476966276, | |
| "learning_rate": 3.094232476221392e-05, | |
| "loss": 0.9034518241882324, | |
| "memory(GiB)": 76.02, | |
| "step": 520, | |
| "token_acc": 0.7082294264339152, | |
| "train_speed(iter/s)": 0.020441 | |
| }, | |
| { | |
| "epoch": 0.9554140127388535, | |
| "grad_norm": 1.3161268491995117, | |
| "learning_rate": 3.07821640142851e-05, | |
| "loss": 0.87875394821167, | |
| "memory(GiB)": 76.02, | |
| "step": 525, | |
| "token_acc": 0.683948569058482, | |
| "train_speed(iter/s)": 0.020453 | |
| }, | |
| { | |
| "epoch": 0.9645131938125568, | |
| "grad_norm": 1.1112974134834392, | |
| "learning_rate": 3.062102252340019e-05, | |
| "loss": 0.8922388076782226, | |
| "memory(GiB)": 76.02, | |
| "step": 530, | |
| "token_acc": 0.6777905638665133, | |
| "train_speed(iter/s)": 0.020468 | |
| }, | |
| { | |
| "epoch": 0.9736123748862603, | |
| "grad_norm": 1.292894697629211, | |
| "learning_rate": 3.045891494694908e-05, | |
| "loss": 0.908051872253418, | |
| "memory(GiB)": 76.02, | |
| "step": 535, | |
| "token_acc": 0.6983343615052436, | |
| "train_speed(iter/s)": 0.020479 | |
| }, | |
| { | |
| "epoch": 0.9827115559599636, | |
| "grad_norm": 1.166045668885059, | |
| "learning_rate": 3.0295856030196618e-05, | |
| "loss": 0.9091971397399903, | |
| "memory(GiB)": 76.02, | |
| "step": 540, | |
| "token_acc": 0.7089144936325046, | |
| "train_speed(iter/s)": 0.020492 | |
| }, | |
| { | |
| "epoch": 0.991810737033667, | |
| "grad_norm": 1.3674012690083148, | |
| "learning_rate": 3.0131860604941287e-05, | |
| "loss": 0.8997166633605957, | |
| "memory(GiB)": 76.02, | |
| "step": 545, | |
| "token_acc": 0.6767097082735534, | |
| "train_speed(iter/s)": 0.020504 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.4019349528909308, | |
| "learning_rate": 2.996694358816618e-05, | |
| "loss": 0.8638315200805664, | |
| "memory(GiB)": 76.02, | |
| "step": 550, | |
| "token_acc": 0.7002042900919305, | |
| "train_speed(iter/s)": 0.020533 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.4928109347820282, | |
| "eval_runtime": 119.0212, | |
| "eval_samples_per_second": 48.496, | |
| "eval_steps_per_second": 0.471, | |
| "eval_token_acc": 0.6892747478588738, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.0090991810737033, | |
| "grad_norm": 1.4220386258897009, | |
| "learning_rate": 2.9801119980682095e-05, | |
| "loss": 0.8142873764038085, | |
| "memory(GiB)": 76.02, | |
| "step": 555, | |
| "token_acc": 0.7055921052631579, | |
| "train_speed(iter/s)": 0.020412 | |
| }, | |
| { | |
| "epoch": 1.0181983621474067, | |
| "grad_norm": 1.187181373009717, | |
| "learning_rate": 2.9634404865763122e-05, | |
| "loss": 0.7935843467712402, | |
| "memory(GiB)": 76.02, | |
| "step": 560, | |
| "token_acc": 0.7032755298651252, | |
| "train_speed(iter/s)": 0.02042 | |
| }, | |
| { | |
| "epoch": 1.02729754322111, | |
| "grad_norm": 1.0185191433966165, | |
| "learning_rate": 2.9466813407774627e-05, | |
| "loss": 0.7965437889099121, | |
| "memory(GiB)": 76.02, | |
| "step": 565, | |
| "token_acc": 0.6973250274825944, | |
| "train_speed(iter/s)": 0.020432 | |
| }, | |
| { | |
| "epoch": 1.0363967242948136, | |
| "grad_norm": 1.2024810924675036, | |
| "learning_rate": 2.9298360850793944e-05, | |
| "loss": 0.7800662517547607, | |
| "memory(GiB)": 76.02, | |
| "step": 570, | |
| "token_acc": 0.7089552238805971, | |
| "train_speed(iter/s)": 0.020443 | |
| }, | |
| { | |
| "epoch": 1.0454959053685169, | |
| "grad_norm": 0.9855874534546613, | |
| "learning_rate": 2.912906251722373e-05, | |
| "loss": 0.8090152740478516, | |
| "memory(GiB)": 76.02, | |
| "step": 575, | |
| "token_acc": 0.7137375287797391, | |
| "train_speed(iter/s)": 0.020455 | |
| }, | |
| { | |
| "epoch": 1.0545950864422202, | |
| "grad_norm": 1.183729346768703, | |
| "learning_rate": 2.895893380639829e-05, | |
| "loss": 0.8083430290222168, | |
| "memory(GiB)": 76.02, | |
| "step": 580, | |
| "token_acc": 0.7071651090342679, | |
| "train_speed(iter/s)": 0.020466 | |
| }, | |
| { | |
| "epoch": 1.0636942675159236, | |
| "grad_norm": 1.527448245905063, | |
| "learning_rate": 2.878799019318283e-05, | |
| "loss": 0.787087345123291, | |
| "memory(GiB)": 76.02, | |
| "step": 585, | |
| "token_acc": 0.7470379146919431, | |
| "train_speed(iter/s)": 0.020477 | |
| }, | |
| { | |
| "epoch": 1.0727934485896269, | |
| "grad_norm": 1.2570337520295112, | |
| "learning_rate": 2.8616247226565888e-05, | |
| "loss": 0.8103050231933594, | |
| "memory(GiB)": 76.02, | |
| "step": 590, | |
| "token_acc": 0.7105431309904153, | |
| "train_speed(iter/s)": 0.020489 | |
| }, | |
| { | |
| "epoch": 1.0818926296633302, | |
| "grad_norm": 1.1805179088694353, | |
| "learning_rate": 2.8443720528244964e-05, | |
| "loss": 0.8091272354125977, | |
| "memory(GiB)": 76.02, | |
| "step": 595, | |
| "token_acc": 0.7236403995560489, | |
| "train_speed(iter/s)": 0.0205 | |
| }, | |
| { | |
| "epoch": 1.0909918107370338, | |
| "grad_norm": 1.3005835459012032, | |
| "learning_rate": 2.827042579120562e-05, | |
| "loss": 0.7841366767883301, | |
| "memory(GiB)": 76.02, | |
| "step": 600, | |
| "token_acc": 0.7160133444537115, | |
| "train_speed(iter/s)": 0.020511 | |
| }, | |
| { | |
| "epoch": 1.0909918107370338, | |
| "eval_loss": 0.4980168640613556, | |
| "eval_runtime": 122.0994, | |
| "eval_samples_per_second": 47.273, | |
| "eval_steps_per_second": 0.459, | |
| "eval_token_acc": 0.68817030297391, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.100090991810737, | |
| "grad_norm": 1.0825655683949489, | |
| "learning_rate": 2.809637877829401e-05, | |
| "loss": 0.8102677345275879, | |
| "memory(GiB)": 76.02, | |
| "step": 605, | |
| "token_acc": 0.7054728756601056, | |
| "train_speed(iter/s)": 0.020407 | |
| }, | |
| { | |
| "epoch": 1.1091901728844404, | |
| "grad_norm": 1.269997926727983, | |
| "learning_rate": 2.792159532078314e-05, | |
| "loss": 0.8190704345703125, | |
| "memory(GiB)": 76.02, | |
| "step": 610, | |
| "token_acc": 0.7151929653150952, | |
| "train_speed(iter/s)": 0.020418 | |
| }, | |
| { | |
| "epoch": 1.1182893539581438, | |
| "grad_norm": 1.3197280690186768, | |
| "learning_rate": 2.7746091316932807e-05, | |
| "loss": 0.7909206867218017, | |
| "memory(GiB)": 76.02, | |
| "step": 615, | |
| "token_acc": 0.8111888111888111, | |
| "train_speed(iter/s)": 0.020428 | |
| }, | |
| { | |
| "epoch": 1.127388535031847, | |
| "grad_norm": 1.3074486932691716, | |
| "learning_rate": 2.756988273054354e-05, | |
| "loss": 0.7989336967468261, | |
| "memory(GiB)": 76.02, | |
| "step": 620, | |
| "token_acc": 0.6923334449280214, | |
| "train_speed(iter/s)": 0.020439 | |
| }, | |
| { | |
| "epoch": 1.1364877161055504, | |
| "grad_norm": 1.09154376619437, | |
| "learning_rate": 2.7392985589504512e-05, | |
| "loss": 0.7985887050628662, | |
| "memory(GiB)": 76.02, | |
| "step": 625, | |
| "token_acc": 0.6959603118355776, | |
| "train_speed(iter/s)": 0.02045 | |
| }, | |
| { | |
| "epoch": 1.1455868971792538, | |
| "grad_norm": 1.105083946015695, | |
| "learning_rate": 2.721541598433567e-05, | |
| "loss": 0.7879680156707763, | |
| "memory(GiB)": 76.02, | |
| "step": 630, | |
| "token_acc": 0.7151389249545572, | |
| "train_speed(iter/s)": 0.020461 | |
| }, | |
| { | |
| "epoch": 1.1546860782529573, | |
| "grad_norm": 1.1369632866951163, | |
| "learning_rate": 2.7037190066724108e-05, | |
| "loss": 0.8013208389282227, | |
| "memory(GiB)": 76.02, | |
| "step": 635, | |
| "token_acc": 0.6987542468856173, | |
| "train_speed(iter/s)": 0.020471 | |
| }, | |
| { | |
| "epoch": 1.1637852593266607, | |
| "grad_norm": 1.084161120288602, | |
| "learning_rate": 2.6858324048054956e-05, | |
| "loss": 0.8041671752929688, | |
| "memory(GiB)": 76.02, | |
| "step": 640, | |
| "token_acc": 0.6834153197470133, | |
| "train_speed(iter/s)": 0.020482 | |
| }, | |
| { | |
| "epoch": 1.172884440400364, | |
| "grad_norm": 1.154991176116474, | |
| "learning_rate": 2.667883419793676e-05, | |
| "loss": 0.8061488151550293, | |
| "memory(GiB)": 76.02, | |
| "step": 645, | |
| "token_acc": 0.7004991680532446, | |
| "train_speed(iter/s)": 0.020492 | |
| }, | |
| { | |
| "epoch": 1.1819836214740673, | |
| "grad_norm": 1.1196634253017694, | |
| "learning_rate": 2.649873684272164e-05, | |
| "loss": 0.8086748123168945, | |
| "memory(GiB)": 76.02, | |
| "step": 650, | |
| "token_acc": 0.6978937441056272, | |
| "train_speed(iter/s)": 0.020502 | |
| }, | |
| { | |
| "epoch": 1.1819836214740673, | |
| "eval_loss": 0.5025342702865601, | |
| "eval_runtime": 120.6757, | |
| "eval_samples_per_second": 47.831, | |
| "eval_steps_per_second": 0.464, | |
| "eval_token_acc": 0.6888256878507018, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.1910828025477707, | |
| "grad_norm": 1.1155649313448126, | |
| "learning_rate": 2.6318048364020214e-05, | |
| "loss": 0.7836286544799804, | |
| "memory(GiB)": 76.02, | |
| "step": 655, | |
| "token_acc": 0.7220535467844328, | |
| "train_speed(iter/s)": 0.020409 | |
| }, | |
| { | |
| "epoch": 1.200181983621474, | |
| "grad_norm": 1.1072757367187032, | |
| "learning_rate": 2.613678519721155e-05, | |
| "loss": 0.7940217018127441, | |
| "memory(GiB)": 76.02, | |
| "step": 660, | |
| "token_acc": 0.7217682020802377, | |
| "train_speed(iter/s)": 0.02042 | |
| }, | |
| { | |
| "epoch": 1.2092811646951773, | |
| "grad_norm": 1.0457391204034119, | |
| "learning_rate": 2.5954963829948195e-05, | |
| "loss": 0.7881236553192139, | |
| "memory(GiB)": 76.02, | |
| "step": 665, | |
| "token_acc": 0.7111846946284033, | |
| "train_speed(iter/s)": 0.020429 | |
| }, | |
| { | |
| "epoch": 1.2183803457688809, | |
| "grad_norm": 1.2226481761675059, | |
| "learning_rate": 2.577260080065649e-05, | |
| "loss": 0.8019227981567383, | |
| "memory(GiB)": 76.02, | |
| "step": 670, | |
| "token_acc": 0.7422535211267606, | |
| "train_speed(iter/s)": 0.020438 | |
| }, | |
| { | |
| "epoch": 1.2274795268425842, | |
| "grad_norm": 1.27401099270194, | |
| "learning_rate": 2.558971269703219e-05, | |
| "loss": 0.7942542552947998, | |
| "memory(GiB)": 76.02, | |
| "step": 675, | |
| "token_acc": 0.7235213204951857, | |
| "train_speed(iter/s)": 0.020449 | |
| }, | |
| { | |
| "epoch": 1.2365787079162875, | |
| "grad_norm": 1.3601936101076058, | |
| "learning_rate": 2.5406316154531717e-05, | |
| "loss": 0.8046051025390625, | |
| "memory(GiB)": 76.02, | |
| "step": 680, | |
| "token_acc": 0.7112280701754385, | |
| "train_speed(iter/s)": 0.020459 | |
| }, | |
| { | |
| "epoch": 1.2456778889899909, | |
| "grad_norm": 1.1617605645583995, | |
| "learning_rate": 2.522242785485893e-05, | |
| "loss": 0.8000314712524415, | |
| "memory(GiB)": 76.02, | |
| "step": 685, | |
| "token_acc": 0.6886890349360083, | |
| "train_speed(iter/s)": 0.020469 | |
| }, | |
| { | |
| "epoch": 1.2547770700636942, | |
| "grad_norm": 1.3512273187713244, | |
| "learning_rate": 2.5038064524447827e-05, | |
| "loss": 0.8067909240722656, | |
| "memory(GiB)": 76.02, | |
| "step": 690, | |
| "token_acc": 0.7467532467532467, | |
| "train_speed(iter/s)": 0.020479 | |
| }, | |
| { | |
| "epoch": 1.2638762511373978, | |
| "grad_norm": 1.3157719287072271, | |
| "learning_rate": 2.4853242932941064e-05, | |
| "loss": 0.7853587150573731, | |
| "memory(GiB)": 76.02, | |
| "step": 695, | |
| "token_acc": 0.7197480881691408, | |
| "train_speed(iter/s)": 0.020488 | |
| }, | |
| { | |
| "epoch": 1.2729754322111009, | |
| "grad_norm": 1.1947998857326674, | |
| "learning_rate": 2.4667979891664625e-05, | |
| "loss": 0.7679170131683349, | |
| "memory(GiB)": 76.02, | |
| "step": 700, | |
| "token_acc": 0.7413360120542442, | |
| "train_speed(iter/s)": 0.020498 | |
| }, | |
| { | |
| "epoch": 1.2729754322111009, | |
| "eval_loss": 0.4833757281303406, | |
| "eval_runtime": 119.9805, | |
| "eval_samples_per_second": 48.108, | |
| "eval_steps_per_second": 0.467, | |
| "eval_token_acc": 0.6897318990383643, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.2820746132848044, | |
| "grad_norm": 1.3268470864740665, | |
| "learning_rate": 2.448229225209865e-05, | |
| "loss": 0.788662052154541, | |
| "memory(GiB)": 76.02, | |
| "step": 705, | |
| "token_acc": 0.716280170373876, | |
| "train_speed(iter/s)": 0.020416 | |
| }, | |
| { | |
| "epoch": 1.2911737943585078, | |
| "grad_norm": 1.2466125304642335, | |
| "learning_rate": 2.429619690434464e-05, | |
| "loss": 0.7932944297790527, | |
| "memory(GiB)": 76.02, | |
| "step": 710, | |
| "token_acc": 0.7371388301620859, | |
| "train_speed(iter/s)": 0.020426 | |
| }, | |
| { | |
| "epoch": 1.300272975432211, | |
| "grad_norm": 1.3582445751553864, | |
| "learning_rate": 2.4109710775589104e-05, | |
| "loss": 0.8029943466186523, | |
| "memory(GiB)": 76.02, | |
| "step": 715, | |
| "token_acc": 0.7082366589327146, | |
| "train_speed(iter/s)": 0.020435 | |
| }, | |
| { | |
| "epoch": 1.3093721565059144, | |
| "grad_norm": 1.098320752598586, | |
| "learning_rate": 2.392285082856394e-05, | |
| "loss": 0.8051022529602051, | |
| "memory(GiB)": 76.02, | |
| "step": 720, | |
| "token_acc": 0.6993071593533488, | |
| "train_speed(iter/s)": 0.020444 | |
| }, | |
| { | |
| "epoch": 1.3184713375796178, | |
| "grad_norm": 1.1993515162762007, | |
| "learning_rate": 2.3735634060003428e-05, | |
| "loss": 0.7886831760406494, | |
| "memory(GiB)": 76.02, | |
| "step": 725, | |
| "token_acc": 0.7265460664703408, | |
| "train_speed(iter/s)": 0.020453 | |
| }, | |
| { | |
| "epoch": 1.3275705186533213, | |
| "grad_norm": 1.4913459363975115, | |
| "learning_rate": 2.3548077499098256e-05, | |
| "loss": 0.7917290687561035, | |
| "memory(GiB)": 76.02, | |
| "step": 730, | |
| "token_acc": 0.7044052863436123, | |
| "train_speed(iter/s)": 0.020462 | |
| }, | |
| { | |
| "epoch": 1.3366696997270244, | |
| "grad_norm": 1.3995123406507142, | |
| "learning_rate": 2.3360198205946542e-05, | |
| "loss": 0.788825798034668, | |
| "memory(GiB)": 76.02, | |
| "step": 735, | |
| "token_acc": 0.7135922330097088, | |
| "train_speed(iter/s)": 0.020471 | |
| }, | |
| { | |
| "epoch": 1.345768880800728, | |
| "grad_norm": 1.3354117848213083, | |
| "learning_rate": 2.3172013270002038e-05, | |
| "loss": 0.7835997581481934, | |
| "memory(GiB)": 76.02, | |
| "step": 740, | |
| "token_acc": 0.7201051248357424, | |
| "train_speed(iter/s)": 0.02048 | |
| }, | |
| { | |
| "epoch": 1.3548680618744313, | |
| "grad_norm": 1.0749964264738503, | |
| "learning_rate": 2.2983539808519702e-05, | |
| "loss": 0.7911547660827637, | |
| "memory(GiB)": 76.02, | |
| "step": 745, | |
| "token_acc": 0.7271609995903319, | |
| "train_speed(iter/s)": 0.020488 | |
| }, | |
| { | |
| "epoch": 1.3639672429481347, | |
| "grad_norm": 0.9437159555687519, | |
| "learning_rate": 2.2794794964998705e-05, | |
| "loss": 0.7891970634460449, | |
| "memory(GiB)": 76.02, | |
| "step": 750, | |
| "token_acc": 0.7132644956314536, | |
| "train_speed(iter/s)": 0.020497 | |
| }, | |
| { | |
| "epoch": 1.3639672429481347, | |
| "eval_loss": 0.48184001445770264, | |
| "eval_runtime": 120.3801, | |
| "eval_samples_per_second": 47.948, | |
| "eval_steps_per_second": 0.465, | |
| "eval_token_acc": 0.6908322983376689, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.373066424021838, | |
| "grad_norm": 1.3416671636490984, | |
| "learning_rate": 2.260579590762304e-05, | |
| "loss": 0.8072065353393555, | |
| "memory(GiB)": 76.02, | |
| "step": 755, | |
| "token_acc": 0.7023445463812437, | |
| "train_speed(iter/s)": 0.020418 | |
| }, | |
| { | |
| "epoch": 1.3821656050955413, | |
| "grad_norm": 1.1639847848783198, | |
| "learning_rate": 2.2416559827699945e-05, | |
| "loss": 0.8082324028015136, | |
| "memory(GiB)": 76.02, | |
| "step": 760, | |
| "token_acc": 0.7145284621920136, | |
| "train_speed(iter/s)": 0.020427 | |
| }, | |
| { | |
| "epoch": 1.3912647861692449, | |
| "grad_norm": 1.132127107571287, | |
| "learning_rate": 2.2227103938096176e-05, | |
| "loss": 0.7869006156921386, | |
| "memory(GiB)": 76.02, | |
| "step": 765, | |
| "token_acc": 0.7099471830985915, | |
| "train_speed(iter/s)": 0.020436 | |
| }, | |
| { | |
| "epoch": 1.4003639672429482, | |
| "grad_norm": 1.0194297655037412, | |
| "learning_rate": 2.2037445471672312e-05, | |
| "loss": 0.8034600257873535, | |
| "memory(GiB)": 76.02, | |
| "step": 770, | |
| "token_acc": 0.7037037037037037, | |
| "train_speed(iter/s)": 0.020445 | |
| }, | |
| { | |
| "epoch": 1.4094631483166515, | |
| "grad_norm": 1.3328252272724603, | |
| "learning_rate": 2.1847601679715263e-05, | |
| "loss": 0.8002717971801758, | |
| "memory(GiB)": 76.02, | |
| "step": 775, | |
| "token_acc": 0.7140373750543242, | |
| "train_speed(iter/s)": 0.020454 | |
| }, | |
| { | |
| "epoch": 1.4185623293903549, | |
| "grad_norm": 1.265718534410907, | |
| "learning_rate": 2.1657589830369113e-05, | |
| "loss": 0.8017659187316895, | |
| "memory(GiB)": 76.02, | |
| "step": 780, | |
| "token_acc": 0.7063737623762376, | |
| "train_speed(iter/s)": 0.020462 | |
| }, | |
| { | |
| "epoch": 1.4276615104640582, | |
| "grad_norm": 0.9977051429918016, | |
| "learning_rate": 2.146742720706441e-05, | |
| "loss": 0.7789717674255371, | |
| "memory(GiB)": 76.02, | |
| "step": 785, | |
| "token_acc": 0.710708782742681, | |
| "train_speed(iter/s)": 0.02047 | |
| }, | |
| { | |
| "epoch": 1.4367606915377615, | |
| "grad_norm": 1.0283878536421338, | |
| "learning_rate": 2.127713110694606e-05, | |
| "loss": 0.8202502250671386, | |
| "memory(GiB)": 76.02, | |
| "step": 790, | |
| "token_acc": 0.707347972972973, | |
| "train_speed(iter/s)": 0.020478 | |
| }, | |
| { | |
| "epoch": 1.4458598726114649, | |
| "grad_norm": 1.0457464903588745, | |
| "learning_rate": 2.1086718839299972e-05, | |
| "loss": 0.7791718482971192, | |
| "memory(GiB)": 76.02, | |
| "step": 795, | |
| "token_acc": 0.7183828610919143, | |
| "train_speed(iter/s)": 0.020486 | |
| }, | |
| { | |
| "epoch": 1.4549590536851684, | |
| "grad_norm": 1.1827863278388744, | |
| "learning_rate": 2.0896207723978637e-05, | |
| "loss": 0.8088536262512207, | |
| "memory(GiB)": 76.02, | |
| "step": 800, | |
| "token_acc": 0.7157598499061913, | |
| "train_speed(iter/s)": 0.020494 | |
| }, | |
| { | |
| "epoch": 1.4549590536851684, | |
| "eval_loss": 0.4799867272377014, | |
| "eval_runtime": 120.658, | |
| "eval_samples_per_second": 47.838, | |
| "eval_steps_per_second": 0.464, | |
| "eval_token_acc": 0.6916009596129183, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.4640582347588718, | |
| "grad_norm": 1.1034251914058373, | |
| "learning_rate": 2.070561508982571e-05, | |
| "loss": 0.7959201335906982, | |
| "memory(GiB)": 76.02, | |
| "step": 805, | |
| "token_acc": 0.7082542694497154, | |
| "train_speed(iter/s)": 0.020414 | |
| }, | |
| { | |
| "epoch": 1.473157415832575, | |
| "grad_norm": 1.1403649470949677, | |
| "learning_rate": 2.0514958273099778e-05, | |
| "loss": 0.8099080085754394, | |
| "memory(GiB)": 76.02, | |
| "step": 810, | |
| "token_acc": 0.6938775510204082, | |
| "train_speed(iter/s)": 0.020423 | |
| }, | |
| { | |
| "epoch": 1.4822565969062784, | |
| "grad_norm": 1.242956861788932, | |
| "learning_rate": 2.0324254615897438e-05, | |
| "loss": 0.7870995044708252, | |
| "memory(GiB)": 76.02, | |
| "step": 815, | |
| "token_acc": 0.6989182692307693, | |
| "train_speed(iter/s)": 0.020431 | |
| }, | |
| { | |
| "epoch": 1.4913557779799818, | |
| "grad_norm": 1.2480879646871645, | |
| "learning_rate": 2.0133521464575915e-05, | |
| "loss": 0.8157112121582031, | |
| "memory(GiB)": 76.02, | |
| "step": 820, | |
| "token_acc": 0.6917945296864576, | |
| "train_speed(iter/s)": 0.020438 | |
| }, | |
| { | |
| "epoch": 1.5004549590536853, | |
| "grad_norm": 1.4455782166201527, | |
| "learning_rate": 1.99427761681752e-05, | |
| "loss": 0.7882473945617676, | |
| "memory(GiB)": 76.02, | |
| "step": 825, | |
| "token_acc": 0.7195308516063234, | |
| "train_speed(iter/s)": 0.020446 | |
| }, | |
| { | |
| "epoch": 1.5095541401273884, | |
| "grad_norm": 1.129414363377021, | |
| "learning_rate": 1.9752036076839988e-05, | |
| "loss": 0.7893435955047607, | |
| "memory(GiB)": 76.02, | |
| "step": 830, | |
| "token_acc": 0.7249863313285949, | |
| "train_speed(iter/s)": 0.020454 | |
| }, | |
| { | |
| "epoch": 1.518653321201092, | |
| "grad_norm": 1.1611426190154455, | |
| "learning_rate": 1.9561318540241528e-05, | |
| "loss": 0.7893610000610352, | |
| "memory(GiB)": 76.02, | |
| "step": 835, | |
| "token_acc": 0.7279521674140508, | |
| "train_speed(iter/s)": 0.020463 | |
| }, | |
| { | |
| "epoch": 1.5277525022747953, | |
| "grad_norm": 1.387275557971045, | |
| "learning_rate": 1.93706409059995e-05, | |
| "loss": 0.7986185073852539, | |
| "memory(GiB)": 76.02, | |
| "step": 840, | |
| "token_acc": 0.7054386661373561, | |
| "train_speed(iter/s)": 0.02047 | |
| }, | |
| { | |
| "epoch": 1.5368516833484986, | |
| "grad_norm": 1.1029714828712447, | |
| "learning_rate": 1.9180020518104088e-05, | |
| "loss": 0.7868841171264649, | |
| "memory(GiB)": 76.02, | |
| "step": 845, | |
| "token_acc": 0.7180851063829787, | |
| "train_speed(iter/s)": 0.020478 | |
| }, | |
| { | |
| "epoch": 1.545950864422202, | |
| "grad_norm": 1.055709561997052, | |
| "learning_rate": 1.898947471533833e-05, | |
| "loss": 0.7913725852966309, | |
| "memory(GiB)": 76.02, | |
| "step": 850, | |
| "token_acc": 0.6924932167621345, | |
| "train_speed(iter/s)": 0.020486 | |
| }, | |
| { | |
| "epoch": 1.545950864422202, | |
| "eval_loss": 0.4763409495353699, | |
| "eval_runtime": 119.4883, | |
| "eval_samples_per_second": 48.306, | |
| "eval_steps_per_second": 0.469, | |
| "eval_token_acc": 0.6927134956692006, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.5550500454959053, | |
| "grad_norm": 1.1281157034877283, | |
| "learning_rate": 1.8799020829701036e-05, | |
| "loss": 0.8020171165466309, | |
| "memory(GiB)": 76.02, | |
| "step": 855, | |
| "token_acc": 0.7118734923612973, | |
| "train_speed(iter/s)": 0.020415 | |
| }, | |
| { | |
| "epoch": 1.5641492265696089, | |
| "grad_norm": 1.0786368581164274, | |
| "learning_rate": 1.860867618483027e-05, | |
| "loss": 0.7822349071502686, | |
| "memory(GiB)": 76.02, | |
| "step": 860, | |
| "token_acc": 0.6926726410121244, | |
| "train_speed(iter/s)": 0.020423 | |
| }, | |
| { | |
| "epoch": 1.573248407643312, | |
| "grad_norm": 1.2124940318046376, | |
| "learning_rate": 1.8418458094427567e-05, | |
| "loss": 0.7907929420471191, | |
| "memory(GiB)": 76.02, | |
| "step": 865, | |
| "token_acc": 0.7004744958481613, | |
| "train_speed(iter/s)": 0.02043 | |
| }, | |
| { | |
| "epoch": 1.5823475887170155, | |
| "grad_norm": 1.087815247895776, | |
| "learning_rate": 1.82283838606831e-05, | |
| "loss": 0.78410964012146, | |
| "memory(GiB)": 76.02, | |
| "step": 870, | |
| "token_acc": 0.7159194876486734, | |
| "train_speed(iter/s)": 0.020438 | |
| }, | |
| { | |
| "epoch": 1.5914467697907189, | |
| "grad_norm": 1.033926015572944, | |
| "learning_rate": 1.803847077270188e-05, | |
| "loss": 0.786978006362915, | |
| "memory(GiB)": 76.02, | |
| "step": 875, | |
| "token_acc": 0.7101845522898155, | |
| "train_speed(iter/s)": 0.020445 | |
| }, | |
| { | |
| "epoch": 1.6005459508644222, | |
| "grad_norm": 1.162364059290432, | |
| "learning_rate": 1.7848736104931142e-05, | |
| "loss": 0.7876530647277832, | |
| "memory(GiB)": 76.02, | |
| "step": 880, | |
| "token_acc": 0.7407407407407407, | |
| "train_speed(iter/s)": 0.020452 | |
| }, | |
| { | |
| "epoch": 1.6096451319381255, | |
| "grad_norm": 1.0965939407284515, | |
| "learning_rate": 1.765919711558906e-05, | |
| "loss": 0.7792027473449707, | |
| "memory(GiB)": 76.02, | |
| "step": 885, | |
| "token_acc": 0.7125279642058165, | |
| "train_speed(iter/s)": 0.020459 | |
| }, | |
| { | |
| "epoch": 1.6187443130118289, | |
| "grad_norm": 1.1822482702836845, | |
| "learning_rate": 1.746987104509494e-05, | |
| "loss": 0.7893452644348145, | |
| "memory(GiB)": 76.02, | |
| "step": 890, | |
| "token_acc": 0.6998714652956298, | |
| "train_speed(iter/s)": 0.020466 | |
| }, | |
| { | |
| "epoch": 1.6278434940855324, | |
| "grad_norm": 1.0733217293598245, | |
| "learning_rate": 1.7280775114501057e-05, | |
| "loss": 0.7864848613739014, | |
| "memory(GiB)": 76.02, | |
| "step": 895, | |
| "token_acc": 0.7469492614001284, | |
| "train_speed(iter/s)": 0.020474 | |
| }, | |
| { | |
| "epoch": 1.6369426751592355, | |
| "grad_norm": 0.9761043125519061, | |
| "learning_rate": 1.7091926523926205e-05, | |
| "loss": 0.7935813426971435, | |
| "memory(GiB)": 76.02, | |
| "step": 900, | |
| "token_acc": 0.7378048780487805, | |
| "train_speed(iter/s)": 0.020481 | |
| }, | |
| { | |
| "epoch": 1.6369426751592355, | |
| "eval_loss": 0.4734553098678589, | |
| "eval_runtime": 120.2875, | |
| "eval_samples_per_second": 47.985, | |
| "eval_steps_per_second": 0.466, | |
| "eval_token_acc": 0.6929117293665017, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.646041856232939, | |
| "grad_norm": 0.9775890422129749, | |
| "learning_rate": 1.6903342450991203e-05, | |
| "loss": 0.7867559909820556, | |
| "memory(GiB)": 76.02, | |
| "step": 905, | |
| "token_acc": 0.7061556329849012, | |
| "train_speed(iter/s)": 0.020416 | |
| }, | |
| { | |
| "epoch": 1.6551410373066424, | |
| "grad_norm": 1.032340730518062, | |
| "learning_rate": 1.6715040049256393e-05, | |
| "loss": 0.7743623733520508, | |
| "memory(GiB)": 76.02, | |
| "step": 910, | |
| "token_acc": 0.7131681877444589, | |
| "train_speed(iter/s)": 0.020423 | |
| }, | |
| { | |
| "epoch": 1.6642402183803457, | |
| "grad_norm": 1.0919952776609756, | |
| "learning_rate": 1.6527036446661396e-05, | |
| "loss": 0.7813485145568848, | |
| "memory(GiB)": 76.02, | |
| "step": 915, | |
| "token_acc": 0.7281947261663286, | |
| "train_speed(iter/s)": 0.02043 | |
| }, | |
| { | |
| "epoch": 1.673339399454049, | |
| "grad_norm": 1.2303788872377346, | |
| "learning_rate": 1.6339348743967126e-05, | |
| "loss": 0.7993118762969971, | |
| "memory(GiB)": 76.02, | |
| "step": 920, | |
| "token_acc": 0.7152953054013125, | |
| "train_speed(iter/s)": 0.020438 | |
| }, | |
| { | |
| "epoch": 1.6824385805277524, | |
| "grad_norm": 1.118393217178591, | |
| "learning_rate": 1.6151994013200325e-05, | |
| "loss": 0.7818034648895263, | |
| "memory(GiB)": 76.02, | |
| "step": 925, | |
| "token_acc": 0.7246165084002922, | |
| "train_speed(iter/s)": 0.020445 | |
| }, | |
| { | |
| "epoch": 1.691537761601456, | |
| "grad_norm": 1.2781086578084908, | |
| "learning_rate": 1.5964989296100682e-05, | |
| "loss": 0.7822434902191162, | |
| "memory(GiB)": 76.02, | |
| "step": 930, | |
| "token_acc": 0.7342391304347826, | |
| "train_speed(iter/s)": 0.020452 | |
| }, | |
| { | |
| "epoch": 1.700636942675159, | |
| "grad_norm": 1.0706561030394075, | |
| "learning_rate": 1.5778351602570742e-05, | |
| "loss": 0.7954679965972901, | |
| "memory(GiB)": 76.02, | |
| "step": 935, | |
| "token_acc": 0.7032355915065723, | |
| "train_speed(iter/s)": 0.020459 | |
| }, | |
| { | |
| "epoch": 1.7097361237488626, | |
| "grad_norm": 1.2217572797748102, | |
| "learning_rate": 1.5592097909128673e-05, | |
| "loss": 0.7845365524291992, | |
| "memory(GiB)": 76.02, | |
| "step": 940, | |
| "token_acc": 0.7320365224295355, | |
| "train_speed(iter/s)": 0.020466 | |
| }, | |
| { | |
| "epoch": 1.718835304822566, | |
| "grad_norm": 1.2477451151406387, | |
| "learning_rate": 1.5406245157364093e-05, | |
| "loss": 0.7835155010223389, | |
| "memory(GiB)": 76.02, | |
| "step": 945, | |
| "token_acc": 0.7151702786377709, | |
| "train_speed(iter/s)": 0.020473 | |
| }, | |
| { | |
| "epoch": 1.7279344858962693, | |
| "grad_norm": 1.1968781249693217, | |
| "learning_rate": 1.5220810252397054e-05, | |
| "loss": 0.7988658905029297, | |
| "memory(GiB)": 76.02, | |
| "step": 950, | |
| "token_acc": 0.7049180327868853, | |
| "train_speed(iter/s)": 0.020479 | |
| }, | |
| { | |
| "epoch": 1.7279344858962693, | |
| "eval_loss": 0.4713653028011322, | |
| "eval_runtime": 120.1658, | |
| "eval_samples_per_second": 48.034, | |
| "eval_steps_per_second": 0.466, | |
| "eval_token_acc": 0.6942872284906324, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.7370336669699729, | |
| "grad_norm": 0.9590399831837186, | |
| "learning_rate": 1.5035810061340376e-05, | |
| "loss": 0.7818658828735352, | |
| "memory(GiB)": 76.02, | |
| "step": 955, | |
| "token_acc": 0.7186684073107049, | |
| "train_speed(iter/s)": 0.020417 | |
| }, | |
| { | |
| "epoch": 1.746132848043676, | |
| "grad_norm": 1.311812274039409, | |
| "learning_rate": 1.4851261411765414e-05, | |
| "loss": 0.7812034130096436, | |
| "memory(GiB)": 76.02, | |
| "step": 960, | |
| "token_acc": 0.7130058696323757, | |
| "train_speed(iter/s)": 0.020424 | |
| }, | |
| { | |
| "epoch": 1.7552320291173795, | |
| "grad_norm": 1.2066428640501157, | |
| "learning_rate": 1.4667181090171418e-05, | |
| "loss": 0.7740418910980225, | |
| "memory(GiB)": 76.02, | |
| "step": 965, | |
| "token_acc": 0.7142857142857143, | |
| "train_speed(iter/s)": 0.02043 | |
| }, | |
| { | |
| "epoch": 1.7643312101910829, | |
| "grad_norm": 1.1309046997472656, | |
| "learning_rate": 1.4483585840458632e-05, | |
| "loss": 0.7716457843780518, | |
| "memory(GiB)": 76.02, | |
| "step": 970, | |
| "token_acc": 0.7535986452159187, | |
| "train_speed(iter/s)": 0.020437 | |
| }, | |
| { | |
| "epoch": 1.7734303912647862, | |
| "grad_norm": 1.0597243121965947, | |
| "learning_rate": 1.4300492362405296e-05, | |
| "loss": 0.7900642871856689, | |
| "memory(GiB)": 76.02, | |
| "step": 975, | |
| "token_acc": 0.7184942716857611, | |
| "train_speed(iter/s)": 0.020444 | |
| }, | |
| { | |
| "epoch": 1.7825295723384895, | |
| "grad_norm": 0.9136761859628779, | |
| "learning_rate": 1.4117917310148624e-05, | |
| "loss": 0.7912971019744873, | |
| "memory(GiB)": 76.02, | |
| "step": 980, | |
| "token_acc": 0.7580794090489381, | |
| "train_speed(iter/s)": 0.02045 | |
| }, | |
| { | |
| "epoch": 1.7916287534121929, | |
| "grad_norm": 1.123085792919359, | |
| "learning_rate": 1.3935877290669932e-05, | |
| "loss": 0.7823569774627686, | |
| "memory(GiB)": 76.02, | |
| "step": 985, | |
| "token_acc": 0.7234323432343235, | |
| "train_speed(iter/s)": 0.020457 | |
| }, | |
| { | |
| "epoch": 1.8007279344858964, | |
| "grad_norm": 1.1608781306244833, | |
| "learning_rate": 1.375438886228411e-05, | |
| "loss": 0.7732644081115723, | |
| "memory(GiB)": 76.02, | |
| "step": 990, | |
| "token_acc": 0.6950644451430368, | |
| "train_speed(iter/s)": 0.020464 | |
| }, | |
| { | |
| "epoch": 1.8098271155595995, | |
| "grad_norm": 1.1283275236864316, | |
| "learning_rate": 1.3573468533133442e-05, | |
| "loss": 0.7756358623504639, | |
| "memory(GiB)": 76.02, | |
| "step": 995, | |
| "token_acc": 0.7115031238515251, | |
| "train_speed(iter/s)": 0.02047 | |
| }, | |
| { | |
| "epoch": 1.818926296633303, | |
| "grad_norm": 1.0540865657542784, | |
| "learning_rate": 1.3393132759686064e-05, | |
| "loss": 0.7759748935699463, | |
| "memory(GiB)": 76.02, | |
| "step": 1000, | |
| "token_acc": 0.6963375057950858, | |
| "train_speed(iter/s)": 0.020477 | |
| }, | |
| { | |
| "epoch": 1.818926296633303, | |
| "eval_loss": 0.4693294167518616, | |
| "eval_runtime": 119.7422, | |
| "eval_samples_per_second": 48.204, | |
| "eval_steps_per_second": 0.468, | |
| "eval_token_acc": 0.6942063167774483, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.8280254777070064, | |
| "grad_norm": 1.2097721619516764, | |
| "learning_rate": 1.3213397945239053e-05, | |
| "loss": 0.7718574047088623, | |
| "memory(GiB)": 76.02, | |
| "step": 1005, | |
| "token_acc": 0.7104117843990626, | |
| "train_speed(iter/s)": 0.020419 | |
| }, | |
| { | |
| "epoch": 1.8371246587807097, | |
| "grad_norm": 1.3429375958388912, | |
| "learning_rate": 1.303428043842641e-05, | |
| "loss": 0.7779555320739746, | |
| "memory(GiB)": 76.02, | |
| "step": 1010, | |
| "token_acc": 0.7344594594594595, | |
| "train_speed(iter/s)": 0.020425 | |
| }, | |
| { | |
| "epoch": 1.846223839854413, | |
| "grad_norm": 1.1502202864135298, | |
| "learning_rate": 1.2855796531731994e-05, | |
| "loss": 0.784113597869873, | |
| "memory(GiB)": 76.02, | |
| "step": 1015, | |
| "token_acc": 0.7116066903193107, | |
| "train_speed(iter/s)": 0.020432 | |
| }, | |
| { | |
| "epoch": 1.8553230209281164, | |
| "grad_norm": 0.9764736580354538, | |
| "learning_rate": 1.2677962460007555e-05, | |
| "loss": 0.769007682800293, | |
| "memory(GiB)": 76.02, | |
| "step": 1020, | |
| "token_acc": 0.7275985663082437, | |
| "train_speed(iter/s)": 0.020439 | |
| }, | |
| { | |
| "epoch": 1.86442220200182, | |
| "grad_norm": 1.0395064733034296, | |
| "learning_rate": 1.2500794398996004e-05, | |
| "loss": 0.7842848300933838, | |
| "memory(GiB)": 76.02, | |
| "step": 1025, | |
| "token_acc": 0.7331868131868132, | |
| "train_speed(iter/s)": 0.020445 | |
| }, | |
| { | |
| "epoch": 1.873521383075523, | |
| "grad_norm": 1.1556386067848643, | |
| "learning_rate": 1.2324308463860089e-05, | |
| "loss": 0.7766573905944825, | |
| "memory(GiB)": 76.02, | |
| "step": 1030, | |
| "token_acc": 0.729426433915212, | |
| "train_speed(iter/s)": 0.020451 | |
| }, | |
| { | |
| "epoch": 1.8826205641492266, | |
| "grad_norm": 1.261343214410371, | |
| "learning_rate": 1.2148520707716567e-05, | |
| "loss": 0.7785522937774658, | |
| "memory(GiB)": 76.02, | |
| "step": 1035, | |
| "token_acc": 0.7095070422535211, | |
| "train_speed(iter/s)": 0.020458 | |
| }, | |
| { | |
| "epoch": 1.89171974522293, | |
| "grad_norm": 1.3077190411896333, | |
| "learning_rate": 1.1973447120175998e-05, | |
| "loss": 0.7712287425994873, | |
| "memory(GiB)": 76.02, | |
| "step": 1040, | |
| "token_acc": 0.6994839221913458, | |
| "train_speed(iter/s)": 0.020464 | |
| }, | |
| { | |
| "epoch": 1.9008189262966333, | |
| "grad_norm": 1.0009654605437637, | |
| "learning_rate": 1.1799103625888342e-05, | |
| "loss": 0.7672115802764893, | |
| "memory(GiB)": 76.02, | |
| "step": 1045, | |
| "token_acc": 0.7111845210004719, | |
| "train_speed(iter/s)": 0.020471 | |
| }, | |
| { | |
| "epoch": 1.9099181073703366, | |
| "grad_norm": 1.1500066718260178, | |
| "learning_rate": 1.162550608309446e-05, | |
| "loss": 0.7593209743499756, | |
| "memory(GiB)": 76.02, | |
| "step": 1050, | |
| "token_acc": 0.7720478325859492, | |
| "train_speed(iter/s)": 0.020477 | |
| }, | |
| { | |
| "epoch": 1.9099181073703366, | |
| "eval_loss": 0.46374601125717163, | |
| "eval_runtime": 119.6783, | |
| "eval_samples_per_second": 48.229, | |
| "eval_steps_per_second": 0.468, | |
| "eval_token_acc": 0.6953795366186186, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.91901728844404, | |
| "grad_norm": 1.0354960902542707, | |
| "learning_rate": 1.1452670282183664e-05, | |
| "loss": 0.7757611274719238, | |
| "memory(GiB)": 76.02, | |
| "step": 1055, | |
| "token_acc": 0.7227655986509275, | |
| "train_speed(iter/s)": 0.02042 | |
| }, | |
| { | |
| "epoch": 1.9281164695177435, | |
| "grad_norm": 1.1181099943024946, | |
| "learning_rate": 1.12806119442574e-05, | |
| "loss": 0.7624452590942383, | |
| "memory(GiB)": 76.02, | |
| "step": 1060, | |
| "token_acc": 0.7370562130177515, | |
| "train_speed(iter/s)": 0.020426 | |
| }, | |
| { | |
| "epoch": 1.9372156505914466, | |
| "grad_norm": 1.020900947874345, | |
| "learning_rate": 1.1109346719699263e-05, | |
| "loss": 0.7685122489929199, | |
| "memory(GiB)": 76.02, | |
| "step": 1065, | |
| "token_acc": 0.7123585726718886, | |
| "train_speed(iter/s)": 0.020432 | |
| }, | |
| { | |
| "epoch": 1.9463148316651502, | |
| "grad_norm": 1.0619107995533037, | |
| "learning_rate": 1.0938890186751487e-05, | |
| "loss": 0.7687143325805664, | |
| "memory(GiB)": 76.02, | |
| "step": 1070, | |
| "token_acc": 0.7249620637329287, | |
| "train_speed(iter/s)": 0.020439 | |
| }, | |
| { | |
| "epoch": 1.9554140127388535, | |
| "grad_norm": 1.0950602334931028, | |
| "learning_rate": 1.0769257850097881e-05, | |
| "loss": 0.7737876415252686, | |
| "memory(GiB)": 76.02, | |
| "step": 1075, | |
| "token_acc": 0.6985485671752885, | |
| "train_speed(iter/s)": 0.020445 | |
| }, | |
| { | |
| "epoch": 1.9645131938125568, | |
| "grad_norm": 1.307250719010874, | |
| "learning_rate": 1.060046513945361e-05, | |
| "loss": 0.7766946792602539, | |
| "memory(GiB)": 76.02, | |
| "step": 1080, | |
| "token_acc": 0.7377892030848329, | |
| "train_speed(iter/s)": 0.020451 | |
| }, | |
| { | |
| "epoch": 1.9736123748862604, | |
| "grad_norm": 1.1430361120086814, | |
| "learning_rate": 1.0432527408161597e-05, | |
| "loss": 0.7805325031280518, | |
| "memory(GiB)": 76.02, | |
| "step": 1085, | |
| "token_acc": 0.7078861409239384, | |
| "train_speed(iter/s)": 0.020457 | |
| }, | |
| { | |
| "epoch": 1.9827115559599635, | |
| "grad_norm": 1.002916433279442, | |
| "learning_rate": 1.026545993179612e-05, | |
| "loss": 0.7858685493469239, | |
| "memory(GiB)": 76.02, | |
| "step": 1090, | |
| "token_acc": 0.7466666666666667, | |
| "train_speed(iter/s)": 0.020463 | |
| }, | |
| { | |
| "epoch": 1.991810737033667, | |
| "grad_norm": 1.0871219922265896, | |
| "learning_rate": 1.009927790677327e-05, | |
| "loss": 0.7784292697906494, | |
| "memory(GiB)": 76.02, | |
| "step": 1095, | |
| "token_acc": 0.7174170616113744, | |
| "train_speed(iter/s)": 0.020469 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.7655187909978691, | |
| "learning_rate": 9.933996448968688e-06, | |
| "loss": 0.7408246994018555, | |
| "memory(GiB)": 76.02, | |
| "step": 1100, | |
| "token_acc": 0.7477064220183486, | |
| "train_speed(iter/s)": 0.020483 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.4639655649662018, | |
| "eval_runtime": 118.882, | |
| "eval_samples_per_second": 48.552, | |
| "eval_steps_per_second": 0.471, | |
| "eval_token_acc": 0.6956344085151487, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.0090991810737036, | |
| "grad_norm": 1.1083572508394148, | |
| "learning_rate": 9.769630592342643e-06, | |
| "loss": 0.6631475925445557, | |
| "memory(GiB)": 76.02, | |
| "step": 1105, | |
| "token_acc": 0.732795337368303, | |
| "train_speed(iter/s)": 0.020423 | |
| }, | |
| { | |
| "epoch": 2.0181983621474067, | |
| "grad_norm": 1.1068844322629663, | |
| "learning_rate": 9.606195287572577e-06, | |
| "loss": 0.6467893600463868, | |
| "memory(GiB)": 76.02, | |
| "step": 1110, | |
| "token_acc": 0.7836676217765043, | |
| "train_speed(iter/s)": 0.020427 | |
| }, | |
| { | |
| "epoch": 2.02729754322111, | |
| "grad_norm": 1.1238716711584054, | |
| "learning_rate": 9.443705400693133e-06, | |
| "loss": 0.6334795475006103, | |
| "memory(GiB)": 76.02, | |
| "step": 1115, | |
| "token_acc": 0.746772864597638, | |
| "train_speed(iter/s)": 0.020432 | |
| }, | |
| { | |
| "epoch": 2.0363967242948133, | |
| "grad_norm": 0.9545754331665411, | |
| "learning_rate": 9.282175711744012e-06, | |
| "loss": 0.643845796585083, | |
| "memory(GiB)": 76.02, | |
| "step": 1120, | |
| "token_acc": 0.783322390019698, | |
| "train_speed(iter/s)": 0.020438 | |
| }, | |
| { | |
| "epoch": 2.045495905368517, | |
| "grad_norm": 1.112189160795635, | |
| "learning_rate": 9.121620913425508e-06, | |
| "loss": 0.6376824378967285, | |
| "memory(GiB)": 76.02, | |
| "step": 1125, | |
| "token_acc": 0.7677035076108537, | |
| "train_speed(iter/s)": 0.020444 | |
| }, | |
| { | |
| "epoch": 2.05459508644222, | |
| "grad_norm": 1.069654016986732, | |
| "learning_rate": 8.962055609762143e-06, | |
| "loss": 0.6328807353973389, | |
| "memory(GiB)": 76.02, | |
| "step": 1130, | |
| "token_acc": 0.7605409705648369, | |
| "train_speed(iter/s)": 0.020449 | |
| }, | |
| { | |
| "epoch": 2.0636942675159236, | |
| "grad_norm": 1.134992866714782, | |
| "learning_rate": 8.803494314774241e-06, | |
| "loss": 0.6297794342041015, | |
| "memory(GiB)": 76.02, | |
| "step": 1135, | |
| "token_acc": 0.7869767441860465, | |
| "train_speed(iter/s)": 0.020456 | |
| }, | |
| { | |
| "epoch": 2.072793448589627, | |
| "grad_norm": 1.1668054237375585, | |
| "learning_rate": 8.645951451157741e-06, | |
| "loss": 0.6355114459991456, | |
| "memory(GiB)": 76.02, | |
| "step": 1140, | |
| "token_acc": 0.7761146496815287, | |
| "train_speed(iter/s)": 0.020462 | |
| }, | |
| { | |
| "epoch": 2.08189262966333, | |
| "grad_norm": 1.1864938776830725, | |
| "learning_rate": 8.489441348972312e-06, | |
| "loss": 0.6331965923309326, | |
| "memory(GiB)": 76.02, | |
| "step": 1145, | |
| "token_acc": 0.7740963855421686, | |
| "train_speed(iter/s)": 0.020468 | |
| }, | |
| { | |
| "epoch": 2.0909918107370338, | |
| "grad_norm": 1.0454450783179292, | |
| "learning_rate": 8.333978244337921e-06, | |
| "loss": 0.6294968605041504, | |
| "memory(GiB)": 76.02, | |
| "step": 1150, | |
| "token_acc": 0.77819937909624, | |
| "train_speed(iter/s)": 0.020473 | |
| }, | |
| { | |
| "epoch": 2.0909918107370338, | |
| "eval_loss": 0.47781530022621155, | |
| "eval_runtime": 120.0715, | |
| "eval_samples_per_second": 48.071, | |
| "eval_steps_per_second": 0.466, | |
| "eval_token_acc": 0.6904358309430665, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.100090991810737, | |
| "grad_norm": 0.9951435698165627, | |
| "learning_rate": 8.179576278139872e-06, | |
| "loss": 0.6304058074951172, | |
| "memory(GiB)": 76.02, | |
| "step": 1155, | |
| "token_acc": 0.7404277792447848, | |
| "train_speed(iter/s)": 0.020422 | |
| }, | |
| { | |
| "epoch": 2.1091901728844404, | |
| "grad_norm": 1.067908969484696, | |
| "learning_rate": 8.026249494742617e-06, | |
| "loss": 0.6222400665283203, | |
| "memory(GiB)": 76.02, | |
| "step": 1160, | |
| "token_acc": 0.7715277777777778, | |
| "train_speed(iter/s)": 0.020428 | |
| }, | |
| { | |
| "epoch": 2.1182893539581436, | |
| "grad_norm": 1.057238882123902, | |
| "learning_rate": 7.874011840712197e-06, | |
| "loss": 0.6318105697631836, | |
| "memory(GiB)": 76.02, | |
| "step": 1165, | |
| "token_acc": 0.7550738007380073, | |
| "train_speed(iter/s)": 0.020433 | |
| }, | |
| { | |
| "epoch": 2.127388535031847, | |
| "grad_norm": 1.0798825041809057, | |
| "learning_rate": 7.72287716354776e-06, | |
| "loss": 0.6285967350006103, | |
| "memory(GiB)": 76.02, | |
| "step": 1170, | |
| "token_acc": 0.7547770700636943, | |
| "train_speed(iter/s)": 0.020439 | |
| }, | |
| { | |
| "epoch": 2.1364877161055507, | |
| "grad_norm": 1.0478822425834018, | |
| "learning_rate": 7.572859210421945e-06, | |
| "loss": 0.6234595775604248, | |
| "memory(GiB)": 76.02, | |
| "step": 1175, | |
| "token_acc": 0.7690631808278867, | |
| "train_speed(iter/s)": 0.020444 | |
| }, | |
| { | |
| "epoch": 2.1455868971792538, | |
| "grad_norm": 0.9867274025718497, | |
| "learning_rate": 7.423971626930435e-06, | |
| "loss": 0.6359669685363769, | |
| "memory(GiB)": 76.02, | |
| "step": 1180, | |
| "token_acc": 0.7695961995249406, | |
| "train_speed(iter/s)": 0.02045 | |
| }, | |
| { | |
| "epoch": 2.1546860782529573, | |
| "grad_norm": 1.0045378569587455, | |
| "learning_rate": 7.276227955850774e-06, | |
| "loss": 0.6464476585388184, | |
| "memory(GiB)": 76.02, | |
| "step": 1185, | |
| "token_acc": 0.7841451766953199, | |
| "train_speed(iter/s)": 0.020455 | |
| }, | |
| { | |
| "epoch": 2.1637852593266604, | |
| "grad_norm": 1.022012980465645, | |
| "learning_rate": 7.12964163591054e-06, | |
| "loss": 0.6201572895050049, | |
| "memory(GiB)": 76.02, | |
| "step": 1190, | |
| "token_acc": 0.74373795761079, | |
| "train_speed(iter/s)": 0.020461 | |
| }, | |
| { | |
| "epoch": 2.172884440400364, | |
| "grad_norm": 1.2093399237034956, | |
| "learning_rate": 6.984226000564907e-06, | |
| "loss": 0.6306787490844726, | |
| "memory(GiB)": 76.02, | |
| "step": 1195, | |
| "token_acc": 0.7755102040816326, | |
| "train_speed(iter/s)": 0.020467 | |
| }, | |
| { | |
| "epoch": 2.1819836214740675, | |
| "grad_norm": 0.966059090473921, | |
| "learning_rate": 6.8399942767839075e-06, | |
| "loss": 0.6421105861663818, | |
| "memory(GiB)": 76.02, | |
| "step": 1200, | |
| "token_acc": 0.7779262426509888, | |
| "train_speed(iter/s)": 0.020473 | |
| }, | |
| { | |
| "epoch": 2.1819836214740675, | |
| "eval_loss": 0.47876349091529846, | |
| "eval_runtime": 119.7281, | |
| "eval_samples_per_second": 48.209, | |
| "eval_steps_per_second": 0.468, | |
| "eval_token_acc": 0.6895983947116104, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.1910828025477707, | |
| "grad_norm": 1.069591399453908, | |
| "learning_rate": 6.696959583849228e-06, | |
| "loss": 0.6228060245513916, | |
| "memory(GiB)": 76.02, | |
| "step": 1205, | |
| "token_acc": 0.725686591276252, | |
| "train_speed(iter/s)": 0.020424 | |
| }, | |
| { | |
| "epoch": 2.200181983621474, | |
| "grad_norm": 1.0705675997492539, | |
| "learning_rate": 6.5551349321609585e-06, | |
| "loss": 0.6346144676208496, | |
| "memory(GiB)": 76.02, | |
| "step": 1210, | |
| "token_acc": 0.7361563517915309, | |
| "train_speed(iter/s)": 0.020429 | |
| }, | |
| { | |
| "epoch": 2.2092811646951773, | |
| "grad_norm": 0.99473395335189, | |
| "learning_rate": 6.414533222054138e-06, | |
| "loss": 0.6288974761962891, | |
| "memory(GiB)": 76.02, | |
| "step": 1215, | |
| "token_acc": 0.7661224489795918, | |
| "train_speed(iter/s)": 0.020435 | |
| }, | |
| { | |
| "epoch": 2.218380345768881, | |
| "grad_norm": 1.0273110672808459, | |
| "learning_rate": 6.275167242625331e-06, | |
| "loss": 0.6033660411834717, | |
| "memory(GiB)": 76.02, | |
| "step": 1220, | |
| "token_acc": 0.7424931756141947, | |
| "train_speed(iter/s)": 0.02044 | |
| }, | |
| { | |
| "epoch": 2.227479526842584, | |
| "grad_norm": 1.1134175189046431, | |
| "learning_rate": 6.137049670569344e-06, | |
| "loss": 0.6237975120544433, | |
| "memory(GiB)": 76.02, | |
| "step": 1225, | |
| "token_acc": 0.7610619469026548, | |
| "train_speed(iter/s)": 0.020445 | |
| }, | |
| { | |
| "epoch": 2.2365787079162875, | |
| "grad_norm": 1.0391880977302441, | |
| "learning_rate": 6.000193069026181e-06, | |
| "loss": 0.633206558227539, | |
| "memory(GiB)": 76.02, | |
| "step": 1230, | |
| "token_acc": 0.7656550134460238, | |
| "train_speed(iter/s)": 0.020451 | |
| }, | |
| { | |
| "epoch": 2.245677888989991, | |
| "grad_norm": 1.1575554243921846, | |
| "learning_rate": 5.8646098864382525e-06, | |
| "loss": 0.6448534488677978, | |
| "memory(GiB)": 76.02, | |
| "step": 1235, | |
| "token_acc": 0.7768777614138439, | |
| "train_speed(iter/s)": 0.020456 | |
| }, | |
| { | |
| "epoch": 2.254777070063694, | |
| "grad_norm": 1.0130550727371117, | |
| "learning_rate": 5.730312455418134e-06, | |
| "loss": 0.6195736408233643, | |
| "memory(GiB)": 76.02, | |
| "step": 1240, | |
| "token_acc": 0.7690447400241838, | |
| "train_speed(iter/s)": 0.020461 | |
| }, | |
| { | |
| "epoch": 2.2638762511373978, | |
| "grad_norm": 1.0895008794001835, | |
| "learning_rate": 5.597312991626713e-06, | |
| "loss": 0.6155508041381836, | |
| "memory(GiB)": 76.02, | |
| "step": 1245, | |
| "token_acc": 0.7842149454240135, | |
| "train_speed(iter/s)": 0.020466 | |
| }, | |
| { | |
| "epoch": 2.272975432211101, | |
| "grad_norm": 1.0868616738166854, | |
| "learning_rate": 5.465623592662137e-06, | |
| "loss": 0.6290598392486573, | |
| "memory(GiB)": 76.02, | |
| "step": 1250, | |
| "token_acc": 0.7843260188087774, | |
| "train_speed(iter/s)": 0.020471 | |
| }, | |
| { | |
| "epoch": 2.272975432211101, | |
| "eval_loss": 0.47770801186561584, | |
| "eval_runtime": 119.4212, | |
| "eval_samples_per_second": 48.333, | |
| "eval_steps_per_second": 0.469, | |
| "eval_token_acc": 0.6896631240821578, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.2820746132848044, | |
| "grad_norm": 1.0252310733499297, | |
| "learning_rate": 5.335256236959379e-06, | |
| "loss": 0.6228739261627197, | |
| "memory(GiB)": 76.02, | |
| "step": 1255, | |
| "token_acc": 0.7295555555555555, | |
| "train_speed(iter/s)": 0.020423 | |
| }, | |
| { | |
| "epoch": 2.2911737943585075, | |
| "grad_norm": 1.1274971851401754, | |
| "learning_rate": 5.206222782700667e-06, | |
| "loss": 0.6328925609588623, | |
| "memory(GiB)": 76.02, | |
| "step": 1260, | |
| "token_acc": 0.772992700729927, | |
| "train_speed(iter/s)": 0.020428 | |
| }, | |
| { | |
| "epoch": 2.300272975432211, | |
| "grad_norm": 0.9968940954527525, | |
| "learning_rate": 5.078534966736895e-06, | |
| "loss": 0.6318979740142823, | |
| "memory(GiB)": 76.02, | |
| "step": 1265, | |
| "token_acc": 0.766875691626706, | |
| "train_speed(iter/s)": 0.020433 | |
| }, | |
| { | |
| "epoch": 2.3093721565059147, | |
| "grad_norm": 1.0466074457299364, | |
| "learning_rate": 4.952204403520042e-06, | |
| "loss": 0.6296024799346924, | |
| "memory(GiB)": 76.02, | |
| "step": 1270, | |
| "token_acc": 0.7647696476964769, | |
| "train_speed(iter/s)": 0.020438 | |
| }, | |
| { | |
| "epoch": 2.3184713375796178, | |
| "grad_norm": 1.059039551077919, | |
| "learning_rate": 4.827242584046698e-06, | |
| "loss": 0.6291126251220703, | |
| "memory(GiB)": 76.02, | |
| "step": 1275, | |
| "token_acc": 0.7655979202772963, | |
| "train_speed(iter/s)": 0.020443 | |
| }, | |
| { | |
| "epoch": 2.3275705186533213, | |
| "grad_norm": 1.1223580815679548, | |
| "learning_rate": 4.70366087481289e-06, | |
| "loss": 0.620822811126709, | |
| "memory(GiB)": 76.02, | |
| "step": 1280, | |
| "token_acc": 0.7782307378719935, | |
| "train_speed(iter/s)": 0.020448 | |
| }, | |
| { | |
| "epoch": 2.3366696997270244, | |
| "grad_norm": 1.0233004088935174, | |
| "learning_rate": 4.581470516780115e-06, | |
| "loss": 0.6297062873840332, | |
| "memory(GiB)": 76.02, | |
| "step": 1285, | |
| "token_acc": 0.7572519083969466, | |
| "train_speed(iter/s)": 0.020453 | |
| }, | |
| { | |
| "epoch": 2.345768880800728, | |
| "grad_norm": 1.0470029791397224, | |
| "learning_rate": 4.460682624352952e-06, | |
| "loss": 0.625699806213379, | |
| "memory(GiB)": 76.02, | |
| "step": 1290, | |
| "token_acc": 0.7591605596269154, | |
| "train_speed(iter/s)": 0.020458 | |
| }, | |
| { | |
| "epoch": 2.3548680618744315, | |
| "grad_norm": 0.915808456859335, | |
| "learning_rate": 4.34130818436805e-06, | |
| "loss": 0.6242890357971191, | |
| "memory(GiB)": 76.02, | |
| "step": 1295, | |
| "token_acc": 0.7637987012987013, | |
| "train_speed(iter/s)": 0.020462 | |
| }, | |
| { | |
| "epoch": 2.3639672429481347, | |
| "grad_norm": 0.9679022008759249, | |
| "learning_rate": 4.223358055094762e-06, | |
| "loss": 0.6215915203094482, | |
| "memory(GiB)": 76.02, | |
| "step": 1300, | |
| "token_acc": 0.7939560439560439, | |
| "train_speed(iter/s)": 0.020467 | |
| }, | |
| { | |
| "epoch": 2.3639672429481347, | |
| "eval_loss": 0.4746646285057068, | |
| "eval_runtime": 120.7999, | |
| "eval_samples_per_second": 47.782, | |
| "eval_steps_per_second": 0.464, | |
| "eval_token_acc": 0.6904034662577928, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.3767060964513194, | |
| "grad_norm": 1.0864039116899251, | |
| "learning_rate": 4.106842965247497e-06, | |
| "loss": 0.607478666305542, | |
| "memory(GiB)": 53.99, | |
| "step": 1305, | |
| "token_acc": 0.777601899485556, | |
| "train_speed(iter/s)": 4.038665 | |
| }, | |
| { | |
| "epoch": 2.385805277525023, | |
| "grad_norm": 0.955554735442322, | |
| "learning_rate": 3.991773513009849e-06, | |
| "loss": 0.6158496856689453, | |
| "memory(GiB)": 53.99, | |
| "step": 1310, | |
| "token_acc": 0.7964731814842028, | |
| "train_speed(iter/s)": 2.330085 | |
| }, | |
| { | |
| "epoch": 2.394904458598726, | |
| "grad_norm": 1.0615963891170637, | |
| "learning_rate": 3.87816016507055e-06, | |
| "loss": 0.6333821296691895, | |
| "memory(GiB)": 53.99, | |
| "step": 1315, | |
| "token_acc": 0.7811782708492732, | |
| "train_speed(iter/s)": 1.665234 | |
| }, | |
| { | |
| "epoch": 2.4040036396724296, | |
| "grad_norm": 1.148829953509744, | |
| "learning_rate": 3.766013255671479e-06, | |
| "loss": 0.6272965908050537, | |
| "memory(GiB)": 53.99, | |
| "step": 1320, | |
| "token_acc": 0.7688679245283019, | |
| "train_speed(iter/s)": 1.297177 | |
| }, | |
| { | |
| "epoch": 2.4131028207461327, | |
| "grad_norm": 1.0891236462035252, | |
| "learning_rate": 3.6553429856675915e-06, | |
| "loss": 0.6266043663024903, | |
| "memory(GiB)": 77.52, | |
| "step": 1325, | |
| "token_acc": 0.7914959016393442, | |
| "train_speed(iter/s)": 1.06612 | |
| }, | |
| { | |
| "epoch": 2.4222020018198362, | |
| "grad_norm": 1.1117445945203506, | |
| "learning_rate": 3.5461594215991247e-06, | |
| "loss": 0.6159255981445313, | |
| "memory(GiB)": 77.52, | |
| "step": 1330, | |
| "token_acc": 0.7893491124260354, | |
| "train_speed(iter/s)": 0.90399 | |
| }, | |
| { | |
| "epoch": 2.43130118289354, | |
| "grad_norm": 0.9824968556280764, | |
| "learning_rate": 3.438472494775902e-06, | |
| "loss": 0.6225139141082764, | |
| "memory(GiB)": 77.52, | |
| "step": 1335, | |
| "token_acc": 0.7502756339581036, | |
| "train_speed(iter/s)": 0.785373 | |
| }, | |
| { | |
| "epoch": 2.440400363967243, | |
| "grad_norm": 0.9912665739642537, | |
| "learning_rate": 3.3322920003739913e-06, | |
| "loss": 0.6153748989105224, | |
| "memory(GiB)": 77.52, | |
| "step": 1340, | |
| "token_acc": 0.790268456375839, | |
| "train_speed(iter/s)": 0.696672 | |
| }, | |
| { | |
| "epoch": 2.4494995450409465, | |
| "grad_norm": 1.064566119713343, | |
| "learning_rate": 3.227627596544738e-06, | |
| "loss": 0.6232125759124756, | |
| "memory(GiB)": 77.52, | |
| "step": 1345, | |
| "token_acc": 0.7880870561282932, | |
| "train_speed(iter/s)": 0.625452 | |
| }, | |
| { | |
| "epoch": 2.4585987261146496, | |
| "grad_norm": 1.0308754966071667, | |
| "learning_rate": 3.1244888035362875e-06, | |
| "loss": 0.6144218444824219, | |
| "memory(GiB)": 77.52, | |
| "step": 1350, | |
| "token_acc": 0.7680478428022213, | |
| "train_speed(iter/s)": 0.569157 | |
| }, | |
| { | |
| "epoch": 2.4585987261146496, | |
| "eval_loss": 0.4741266369819641, | |
| "eval_runtime": 123.1251, | |
| "eval_samples_per_second": 46.879, | |
| "eval_steps_per_second": 0.455, | |
| "eval_token_acc": 0.6901121840903298, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.467697907188353, | |
| "grad_norm": 1.1124385874562812, | |
| "learning_rate": 3.0228850028275803e-06, | |
| "loss": 0.6197083950042724, | |
| "memory(GiB)": 77.52, | |
| "step": 1355, | |
| "token_acc": 0.7441558441558441, | |
| "train_speed(iter/s)": 0.491445 | |
| }, | |
| { | |
| "epoch": 2.4767970882620562, | |
| "grad_norm": 1.0177391538655736, | |
| "learning_rate": 2.922825436275061e-06, | |
| "loss": 0.6326658248901367, | |
| "memory(GiB)": 77.52, | |
| "step": 1360, | |
| "token_acc": 0.774859287054409, | |
| "train_speed(iter/s)": 0.456689 | |
| }, | |
| { | |
| "epoch": 2.48589626933576, | |
| "grad_norm": 0.9939709571788379, | |
| "learning_rate": 2.8243192052719902e-06, | |
| "loss": 0.6353094577789307, | |
| "memory(GiB)": 77.52, | |
| "step": 1365, | |
| "token_acc": 0.7515923566878981, | |
| "train_speed(iter/s)": 0.426316 | |
| }, | |
| { | |
| "epoch": 2.494995450409463, | |
| "grad_norm": 1.0864856971626622, | |
| "learning_rate": 2.72737526992064e-06, | |
| "loss": 0.6143672466278076, | |
| "memory(GiB)": 77.52, | |
| "step": 1370, | |
| "token_acc": 0.800497203231821, | |
| "train_speed(iter/s)": 0.399977 | |
| }, | |
| { | |
| "epoch": 2.5040946314831665, | |
| "grad_norm": 0.9778765243753255, | |
| "learning_rate": 2.6320024482172592e-06, | |
| "loss": 0.6241840362548828, | |
| "memory(GiB)": 77.52, | |
| "step": 1375, | |
| "token_acc": 0.7901711761457758, | |
| "train_speed(iter/s)": 0.376966 | |
| }, | |
| { | |
| "epoch": 2.51319381255687, | |
| "grad_norm": 0.963647645236081, | |
| "learning_rate": 2.5382094152499705e-06, | |
| "loss": 0.635280704498291, | |
| "memory(GiB)": 77.52, | |
| "step": 1380, | |
| "token_acc": 0.7607636068237206, | |
| "train_speed(iter/s)": 0.356417 | |
| }, | |
| { | |
| "epoch": 2.522292993630573, | |
| "grad_norm": 0.9666636858906085, | |
| "learning_rate": 2.4460047024097144e-06, | |
| "loss": 0.6261641502380371, | |
| "memory(GiB)": 77.52, | |
| "step": 1385, | |
| "token_acc": 0.7655134541460736, | |
| "train_speed(iter/s)": 0.338341 | |
| }, | |
| { | |
| "epoch": 2.5313921747042767, | |
| "grad_norm": 0.9689736671771748, | |
| "learning_rate": 2.3553966966142384e-06, | |
| "loss": 0.6166990280151368, | |
| "memory(GiB)": 77.52, | |
| "step": 1390, | |
| "token_acc": 0.7619183556951185, | |
| "train_speed(iter/s)": 0.321781 | |
| }, | |
| { | |
| "epoch": 2.54049135577798, | |
| "grad_norm": 1.0530841209630801, | |
| "learning_rate": 2.266393639545197e-06, | |
| "loss": 0.6244637966156006, | |
| "memory(GiB)": 77.52, | |
| "step": 1395, | |
| "token_acc": 0.7679372197309418, | |
| "train_speed(iter/s)": 0.307132 | |
| }, | |
| { | |
| "epoch": 2.5495905368516834, | |
| "grad_norm": 0.9878733985818398, | |
| "learning_rate": 2.1790036268985284e-06, | |
| "loss": 0.6239931106567382, | |
| "memory(GiB)": 77.52, | |
| "step": 1400, | |
| "token_acc": 0.7469059405940595, | |
| "train_speed(iter/s)": 0.293674 | |
| }, | |
| { | |
| "epoch": 2.5495905368516834, | |
| "eval_loss": 0.47406768798828125, | |
| "eval_runtime": 121.0349, | |
| "eval_samples_per_second": 47.689, | |
| "eval_steps_per_second": 0.463, | |
| "eval_token_acc": 0.6900029532775313, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.558689717925387, | |
| "grad_norm": 1.0090114818567588, | |
| "learning_rate": 2.0932346076480314e-06, | |
| "loss": 0.6187572956085206, | |
| "memory(GiB)": 77.52, | |
| "step": 1405, | |
| "token_acc": 0.7450779851700332, | |
| "train_speed(iter/s)": 0.272986 | |
| }, | |
| { | |
| "epoch": 2.56778889899909, | |
| "grad_norm": 0.9588816991739316, | |
| "learning_rate": 2.009094383322356e-06, | |
| "loss": 0.6277956485748291, | |
| "memory(GiB)": 77.52, | |
| "step": 1410, | |
| "token_acc": 0.7810402684563759, | |
| "train_speed(iter/s)": 0.262478 | |
| }, | |
| { | |
| "epoch": 2.5768880800727936, | |
| "grad_norm": 0.9909418694472445, | |
| "learning_rate": 1.9265906072953822e-06, | |
| "loss": 0.6175178050994873, | |
| "memory(GiB)": 77.52, | |
| "step": 1415, | |
| "token_acc": 0.7652439024390244, | |
| "train_speed(iter/s)": 0.252862 | |
| }, | |
| { | |
| "epoch": 2.5859872611464967, | |
| "grad_norm": 1.1182023779440498, | |
| "learning_rate": 1.8457307840900428e-06, | |
| "loss": 0.6154948711395264, | |
| "memory(GiB)": 77.52, | |
| "step": 1420, | |
| "token_acc": 0.7852161785216178, | |
| "train_speed(iter/s)": 0.244119 | |
| }, | |
| { | |
| "epoch": 2.5950864422202002, | |
| "grad_norm": 1.0404157493617592, | |
| "learning_rate": 1.7665222686957362e-06, | |
| "loss": 0.6219567775726318, | |
| "memory(GiB)": 77.52, | |
| "step": 1425, | |
| "token_acc": 0.7628019323671498, | |
| "train_speed(iter/s)": 0.235826 | |
| }, | |
| { | |
| "epoch": 2.604185623293904, | |
| "grad_norm": 1.0786639447035942, | |
| "learning_rate": 1.6889722658993223e-06, | |
| "loss": 0.6350451946258545, | |
| "memory(GiB)": 77.52, | |
| "step": 1430, | |
| "token_acc": 0.7704379562043796, | |
| "train_speed(iter/s)": 0.228331 | |
| }, | |
| { | |
| "epoch": 2.613284804367607, | |
| "grad_norm": 1.0095118897080797, | |
| "learning_rate": 1.6130878296297536e-06, | |
| "loss": 0.6284623622894288, | |
| "memory(GiB)": 77.52, | |
| "step": 1435, | |
| "token_acc": 0.7636180228648285, | |
| "train_speed(iter/s)": 0.221176 | |
| }, | |
| { | |
| "epoch": 2.62238398544131, | |
| "grad_norm": 0.94070647379727, | |
| "learning_rate": 1.5388758623164802e-06, | |
| "loss": 0.6281323432922363, | |
| "memory(GiB)": 77.52, | |
| "step": 1440, | |
| "token_acc": 0.7643463497453311, | |
| "train_speed(iter/s)": 0.214634 | |
| }, | |
| { | |
| "epoch": 2.6314831665150136, | |
| "grad_norm": 1.0651613672971816, | |
| "learning_rate": 1.4663431142615792e-06, | |
| "loss": 0.6090371608734131, | |
| "memory(GiB)": 77.52, | |
| "step": 1445, | |
| "token_acc": 0.8246628131021194, | |
| "train_speed(iter/s)": 0.208466 | |
| }, | |
| { | |
| "epoch": 2.640582347588717, | |
| "grad_norm": 1.0004848001888615, | |
| "learning_rate": 1.3954961830257685e-06, | |
| "loss": 0.624143123626709, | |
| "memory(GiB)": 77.52, | |
| "step": 1450, | |
| "token_acc": 0.7779850746268657, | |
| "train_speed(iter/s)": 0.202625 | |
| }, | |
| { | |
| "epoch": 2.640582347588717, | |
| "eval_loss": 0.47285741567611694, | |
| "eval_runtime": 117.6959, | |
| "eval_samples_per_second": 49.042, | |
| "eval_steps_per_second": 0.476, | |
| "eval_token_acc": 0.6904803323853178, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.6496815286624202, | |
| "grad_norm": 1.0494169722074018, | |
| "learning_rate": 1.3263415128282908e-06, | |
| "loss": 0.6255748271942139, | |
| "memory(GiB)": 77.52, | |
| "step": 1455, | |
| "token_acc": 0.732059542323928, | |
| "train_speed(iter/s)": 0.193115 | |
| }, | |
| { | |
| "epoch": 2.658780709736124, | |
| "grad_norm": 1.0337513224693766, | |
| "learning_rate": 1.2588853939607338e-06, | |
| "loss": 0.6212813377380371, | |
| "memory(GiB)": 77.52, | |
| "step": 1460, | |
| "token_acc": 0.7488385598141696, | |
| "train_speed(iter/s)": 0.188151 | |
| }, | |
| { | |
| "epoch": 2.667879890809827, | |
| "grad_norm": 0.9339498438090048, | |
| "learning_rate": 1.1931339622148897e-06, | |
| "loss": 0.6209768295288086, | |
| "memory(GiB)": 77.52, | |
| "step": 1465, | |
| "token_acc": 0.7604208822339134, | |
| "train_speed(iter/s)": 0.183569 | |
| }, | |
| { | |
| "epoch": 2.6769790718835305, | |
| "grad_norm": 1.009290828686064, | |
| "learning_rate": 1.1290931983246334e-06, | |
| "loss": 0.619508934020996, | |
| "memory(GiB)": 77.52, | |
| "step": 1470, | |
| "token_acc": 0.7703793381759484, | |
| "train_speed(iter/s)": 0.179159 | |
| }, | |
| { | |
| "epoch": 2.686078252957234, | |
| "grad_norm": 0.9092366819727269, | |
| "learning_rate": 1.0667689274219128e-06, | |
| "loss": 0.6159298419952393, | |
| "memory(GiB)": 77.52, | |
| "step": 1475, | |
| "token_acc": 0.7770177838577291, | |
| "train_speed(iter/s)": 0.175056 | |
| }, | |
| { | |
| "epoch": 2.695177434030937, | |
| "grad_norm": 0.9840242855378942, | |
| "learning_rate": 1.0061668185068996e-06, | |
| "loss": 0.6134575843811035, | |
| "memory(GiB)": 77.52, | |
| "step": 1480, | |
| "token_acc": 0.7733843537414966, | |
| "train_speed(iter/s)": 0.171104 | |
| }, | |
| { | |
| "epoch": 2.7042766151046407, | |
| "grad_norm": 1.0092116973578455, | |
| "learning_rate": 9.4729238393235e-07, | |
| "loss": 0.6143134593963623, | |
| "memory(GiB)": 77.52, | |
| "step": 1485, | |
| "token_acc": 0.7900072411296162, | |
| "train_speed(iter/s)": 0.167358 | |
| }, | |
| { | |
| "epoch": 2.713375796178344, | |
| "grad_norm": 1.0868815484832741, | |
| "learning_rate": 8.901509789021779e-07, | |
| "loss": 0.600148344039917, | |
| "memory(GiB)": 77.52, | |
| "step": 1490, | |
| "token_acc": 0.7679245283018868, | |
| "train_speed(iter/s)": 0.163825 | |
| }, | |
| { | |
| "epoch": 2.7224749772520473, | |
| "grad_norm": 1.0410901018430865, | |
| "learning_rate": 8.347478009843746e-07, | |
| "loss": 0.6201463222503663, | |
| "memory(GiB)": 77.52, | |
| "step": 1495, | |
| "token_acc": 0.738926899531869, | |
| "train_speed(iter/s)": 0.160424 | |
| }, | |
| { | |
| "epoch": 2.731574158325751, | |
| "grad_norm": 0.9884777012197261, | |
| "learning_rate": 7.810878896382101e-07, | |
| "loss": 0.6072117805480957, | |
| "memory(GiB)": 77.52, | |
| "step": 1500, | |
| "token_acc": 0.7709691438504997, | |
| "train_speed(iter/s)": 0.157229 | |
| }, | |
| { | |
| "epoch": 2.731574158325751, | |
| "eval_loss": 0.4724496603012085, | |
| "eval_runtime": 119.0959, | |
| "eval_samples_per_second": 48.465, | |
| "eval_steps_per_second": 0.47, | |
| "eval_token_acc": 0.6903711015725191, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.740673339399454, | |
| "grad_norm": 0.9523503494388392, | |
| "learning_rate": 7.291761257558749e-07, | |
| "loss": 0.6324088096618652, | |
| "memory(GiB)": 77.52, | |
| "step": 1505, | |
| "token_acc": 0.7417567924030599, | |
| "train_speed(iter/s)": 0.151702 | |
| }, | |
| { | |
| "epoch": 2.7497725204731576, | |
| "grad_norm": 0.9815948952007479, | |
| "learning_rate": 6.790172312184972e-07, | |
| "loss": 0.6338190078735352, | |
| "memory(GiB)": 77.52, | |
| "step": 1510, | |
| "token_acc": 0.7562122229684352, | |
| "train_speed(iter/s)": 0.148878 | |
| }, | |
| { | |
| "epoch": 2.7588717015468607, | |
| "grad_norm": 1.0475192921698937, | |
| "learning_rate": 6.306157684666425e-07, | |
| "loss": 0.6202810764312744, | |
| "memory(GiB)": 77.52, | |
| "step": 1515, | |
| "token_acc": 0.7550281576830249, | |
| "train_speed(iter/s)": 0.146148 | |
| }, | |
| { | |
| "epoch": 2.7679708826205642, | |
| "grad_norm": 1.0206535695296246, | |
| "learning_rate": 5.839761400853183e-07, | |
| "loss": 0.6317409992218017, | |
| "memory(GiB)": 77.52, | |
| "step": 1520, | |
| "token_acc": 0.7529880478087649, | |
| "train_speed(iter/s)": 0.143534 | |
| }, | |
| { | |
| "epoch": 2.777070063694268, | |
| "grad_norm": 0.9666971373448247, | |
| "learning_rate": 5.391025884035239e-07, | |
| "loss": 0.6138282775878906, | |
| "memory(GiB)": 77.52, | |
| "step": 1525, | |
| "token_acc": 0.767303609341826, | |
| "train_speed(iter/s)": 0.141033 | |
| }, | |
| { | |
| "epoch": 2.786169244767971, | |
| "grad_norm": 1.002591354360291, | |
| "learning_rate": 4.959991951083498e-07, | |
| "loss": 0.617135763168335, | |
| "memory(GiB)": 77.52, | |
| "step": 1530, | |
| "token_acc": 0.8161559888579387, | |
| "train_speed(iter/s)": 0.13864 | |
| }, | |
| { | |
| "epoch": 2.795268425841674, | |
| "grad_norm": 1.006202469505235, | |
| "learning_rate": 4.5466988087373044e-07, | |
| "loss": 0.6056863784790039, | |
| "memory(GiB)": 77.52, | |
| "step": 1535, | |
| "token_acc": 0.760498687664042, | |
| "train_speed(iter/s)": 0.136344 | |
| }, | |
| { | |
| "epoch": 2.8043676069153776, | |
| "grad_norm": 0.9830438526460707, | |
| "learning_rate": 4.151184050038004e-07, | |
| "loss": 0.6215356349945068, | |
| "memory(GiB)": 77.52, | |
| "step": 1540, | |
| "token_acc": 0.7701478302336672, | |
| "train_speed(iter/s)": 0.134118 | |
| }, | |
| { | |
| "epoch": 2.813466787989081, | |
| "grad_norm": 1.0560207375711046, | |
| "learning_rate": 3.7734836509096596e-07, | |
| "loss": 0.6116134643554687, | |
| "memory(GiB)": 77.52, | |
| "step": 1545, | |
| "token_acc": 0.7759115116755428, | |
| "train_speed(iter/s)": 0.132005 | |
| }, | |
| { | |
| "epoch": 2.8225659690627842, | |
| "grad_norm": 1.0225913174286714, | |
| "learning_rate": 3.4136319668866434e-07, | |
| "loss": 0.625472116470337, | |
| "memory(GiB)": 77.52, | |
| "step": 1550, | |
| "token_acc": 0.7980769230769231, | |
| "train_speed(iter/s)": 0.129952 | |
| }, | |
| { | |
| "epoch": 2.8225659690627842, | |
| "eval_loss": 0.4721684753894806, | |
| "eval_runtime": 118.0382, | |
| "eval_samples_per_second": 48.899, | |
| "eval_steps_per_second": 0.474, | |
| "eval_token_acc": 0.6906583381543229, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.831665150136488, | |
| "grad_norm": 1.0572860374958588, | |
| "learning_rate": 3.071661729988584e-07, | |
| "loss": 0.6085397720336914, | |
| "memory(GiB)": 77.52, | |
| "step": 1555, | |
| "token_acc": 0.7432788613600422, | |
| "train_speed(iter/s)": 0.126385 | |
| }, | |
| { | |
| "epoch": 2.840764331210191, | |
| "grad_norm": 0.977034680594488, | |
| "learning_rate": 2.747604045743102e-07, | |
| "loss": 0.6142263889312745, | |
| "memory(GiB)": 77.52, | |
| "step": 1560, | |
| "token_acc": 0.7400581959262852, | |
| "train_speed(iter/s)": 0.124515 | |
| }, | |
| { | |
| "epoch": 2.8498635122838945, | |
| "grad_norm": 0.9902282597829868, | |
| "learning_rate": 2.4414883903565834e-07, | |
| "loss": 0.6152991771697998, | |
| "memory(GiB)": 77.52, | |
| "step": 1565, | |
| "token_acc": 0.8156277436347673, | |
| "train_speed(iter/s)": 0.122739 | |
| }, | |
| { | |
| "epoch": 2.858962693357598, | |
| "grad_norm": 1.112609715887069, | |
| "learning_rate": 2.15334260803286e-07, | |
| "loss": 0.6211013793945312, | |
| "memory(GiB)": 77.52, | |
| "step": 1570, | |
| "token_acc": 0.7968069666182874, | |
| "train_speed(iter/s)": 0.121013 | |
| }, | |
| { | |
| "epoch": 2.868061874431301, | |
| "grad_norm": 1.007653504358626, | |
| "learning_rate": 1.8831929084406119e-07, | |
| "loss": 0.6160074234008789, | |
| "memory(GiB)": 77.52, | |
| "step": 1575, | |
| "token_acc": 0.7956026058631922, | |
| "train_speed(iter/s)": 0.119343 | |
| }, | |
| { | |
| "epoch": 2.8771610555050047, | |
| "grad_norm": 1.0328729828726175, | |
| "learning_rate": 1.631063864329274e-07, | |
| "loss": 0.6106714725494384, | |
| "memory(GiB)": 77.52, | |
| "step": 1580, | |
| "token_acc": 0.8102600140548137, | |
| "train_speed(iter/s)": 0.11774 | |
| }, | |
| { | |
| "epoch": 2.886260236578708, | |
| "grad_norm": 0.9727986501836436, | |
| "learning_rate": 1.3969784092939588e-07, | |
| "loss": 0.6038858890533447, | |
| "memory(GiB)": 77.52, | |
| "step": 1585, | |
| "token_acc": 0.7294275491949911, | |
| "train_speed(iter/s)": 0.116161 | |
| }, | |
| { | |
| "epoch": 2.8953594176524113, | |
| "grad_norm": 1.0580993770834335, | |
| "learning_rate": 1.180957835689478e-07, | |
| "loss": 0.6102193832397461, | |
| "memory(GiB)": 77.52, | |
| "step": 1590, | |
| "token_acc": 0.7574578469520103, | |
| "train_speed(iter/s)": 0.114662 | |
| }, | |
| { | |
| "epoch": 2.904458598726115, | |
| "grad_norm": 0.9841890221890635, | |
| "learning_rate": 9.83021792693406e-08, | |
| "loss": 0.6162684917449951, | |
| "memory(GiB)": 77.52, | |
| "step": 1595, | |
| "token_acc": 0.7871674491392802, | |
| "train_speed(iter/s)": 0.113191 | |
| }, | |
| { | |
| "epoch": 2.913557779799818, | |
| "grad_norm": 1.0209356603903383, | |
| "learning_rate": 8.031882845189743e-08, | |
| "loss": 0.6077028751373291, | |
| "memory(GiB)": 77.52, | |
| "step": 1600, | |
| "token_acc": 0.7544715447154472, | |
| "train_speed(iter/s)": 0.111782 | |
| }, | |
| { | |
| "epoch": 2.913557779799818, | |
| "eval_loss": 0.4720407724380493, | |
| "eval_runtime": 116.4832, | |
| "eval_samples_per_second": 49.552, | |
| "eval_steps_per_second": 0.481, | |
| "eval_token_acc": 0.6904317853574072, | |
| "step": 1600 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1647, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.5861721952354304e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |