T5-sMBR-PP-EN / trainer_state.json
lyu-boxuan's picture
Upload folder using huggingface_hub
e618d2e verified
{
"best_metric": 0.6407684683799744,
"best_model_checkpoint": "/gs/bs/tga-t2glrlab-1/lyu/t5/run4-flan-t5-large-base/checkpoint-672",
"epoch": 2.7008289374529015,
"eval_steps": 16,
"global_step": 672,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06,
"grad_norm": 0.15875867009162903,
"learning_rate": 0.0009870967741935483,
"loss": 0.9746,
"step": 16
},
{
"epoch": 0.06,
"eval_loss": 0.7603664994239807,
"eval_runtime": 4.5631,
"eval_samples_per_second": 420.983,
"eval_steps_per_second": 26.517,
"step": 16
},
{
"epoch": 0.13,
"grad_norm": 0.10888980329036713,
"learning_rate": 0.0009741935483870968,
"loss": 0.7966,
"step": 32
},
{
"epoch": 0.13,
"eval_loss": 0.7332449555397034,
"eval_runtime": 4.6012,
"eval_samples_per_second": 417.502,
"eval_steps_per_second": 26.298,
"step": 32
},
{
"epoch": 0.19,
"grad_norm": 0.17990568280220032,
"learning_rate": 0.0009612903225806452,
"loss": 0.7655,
"step": 48
},
{
"epoch": 0.19,
"eval_loss": 0.7393877506256104,
"eval_runtime": 4.6014,
"eval_samples_per_second": 417.486,
"eval_steps_per_second": 26.297,
"step": 48
},
{
"epoch": 0.26,
"grad_norm": 0.11761835217475891,
"learning_rate": 0.0009483870967741936,
"loss": 0.8534,
"step": 64
},
{
"epoch": 0.26,
"eval_loss": 0.7214094400405884,
"eval_runtime": 4.6156,
"eval_samples_per_second": 416.194,
"eval_steps_per_second": 26.215,
"step": 64
},
{
"epoch": 0.32,
"grad_norm": 0.112620510160923,
"learning_rate": 0.0009354838709677419,
"loss": 0.7623,
"step": 80
},
{
"epoch": 0.32,
"eval_loss": 0.7035090327262878,
"eval_runtime": 4.602,
"eval_samples_per_second": 417.423,
"eval_steps_per_second": 26.293,
"step": 80
},
{
"epoch": 0.39,
"grad_norm": 0.14100195467472076,
"learning_rate": 0.0009225806451612904,
"loss": 0.7305,
"step": 96
},
{
"epoch": 0.39,
"eval_loss": 0.6993053555488586,
"eval_runtime": 4.5895,
"eval_samples_per_second": 418.561,
"eval_steps_per_second": 26.364,
"step": 96
},
{
"epoch": 0.45,
"grad_norm": 0.16868335008621216,
"learning_rate": 0.0009096774193548387,
"loss": 0.807,
"step": 112
},
{
"epoch": 0.45,
"eval_loss": 0.7026967406272888,
"eval_runtime": 4.7327,
"eval_samples_per_second": 405.898,
"eval_steps_per_second": 25.567,
"step": 112
},
{
"epoch": 0.51,
"grad_norm": 0.10501790046691895,
"learning_rate": 0.0008967741935483871,
"loss": 0.7418,
"step": 128
},
{
"epoch": 0.51,
"eval_loss": 0.6837323904037476,
"eval_runtime": 4.5936,
"eval_samples_per_second": 418.187,
"eval_steps_per_second": 26.341,
"step": 128
},
{
"epoch": 0.58,
"grad_norm": 0.12754693627357483,
"learning_rate": 0.0008838709677419356,
"loss": 0.7125,
"step": 144
},
{
"epoch": 0.58,
"eval_loss": 0.6846245527267456,
"eval_runtime": 4.6234,
"eval_samples_per_second": 415.493,
"eval_steps_per_second": 26.171,
"step": 144
},
{
"epoch": 0.64,
"grad_norm": 0.1712610423564911,
"learning_rate": 0.0008709677419354839,
"loss": 0.7813,
"step": 160
},
{
"epoch": 0.64,
"eval_loss": 0.687301516532898,
"eval_runtime": 4.576,
"eval_samples_per_second": 419.802,
"eval_steps_per_second": 26.443,
"step": 160
},
{
"epoch": 0.71,
"grad_norm": 0.10070759803056717,
"learning_rate": 0.0008580645161290323,
"loss": 0.7321,
"step": 176
},
{
"epoch": 0.71,
"eval_loss": 0.6738956570625305,
"eval_runtime": 4.5756,
"eval_samples_per_second": 419.835,
"eval_steps_per_second": 26.445,
"step": 176
},
{
"epoch": 0.77,
"grad_norm": 0.09799516946077347,
"learning_rate": 0.0008451612903225807,
"loss": 0.6972,
"step": 192
},
{
"epoch": 0.77,
"eval_loss": 0.6699069738388062,
"eval_runtime": 4.602,
"eval_samples_per_second": 417.424,
"eval_steps_per_second": 26.293,
"step": 192
},
{
"epoch": 0.84,
"grad_norm": 0.15926051139831543,
"learning_rate": 0.0008322580645161291,
"loss": 0.7642,
"step": 208
},
{
"epoch": 0.84,
"eval_loss": 0.6707108616828918,
"eval_runtime": 4.6005,
"eval_samples_per_second": 417.567,
"eval_steps_per_second": 26.302,
"step": 208
},
{
"epoch": 0.9,
"grad_norm": 0.1038859635591507,
"learning_rate": 0.0008193548387096774,
"loss": 0.7234,
"step": 224
},
{
"epoch": 0.9,
"eval_loss": 0.6664640307426453,
"eval_runtime": 4.5657,
"eval_samples_per_second": 420.746,
"eval_steps_per_second": 26.502,
"step": 224
},
{
"epoch": 0.96,
"grad_norm": 0.09894105046987534,
"learning_rate": 0.0008064516129032258,
"loss": 0.6891,
"step": 240
},
{
"epoch": 0.96,
"eval_loss": 0.6674955487251282,
"eval_runtime": 4.6128,
"eval_samples_per_second": 416.447,
"eval_steps_per_second": 26.231,
"step": 240
},
{
"epoch": 1.03,
"grad_norm": 0.108543761074543,
"learning_rate": 0.0007935483870967743,
"loss": 0.7219,
"step": 256
},
{
"epoch": 1.03,
"eval_loss": 0.6673880815505981,
"eval_runtime": 4.6113,
"eval_samples_per_second": 416.588,
"eval_steps_per_second": 26.24,
"step": 256
},
{
"epoch": 1.09,
"grad_norm": 0.08566311001777649,
"learning_rate": 0.0007806451612903226,
"loss": 0.6558,
"step": 272
},
{
"epoch": 1.09,
"eval_loss": 0.6653081774711609,
"eval_runtime": 4.6897,
"eval_samples_per_second": 409.621,
"eval_steps_per_second": 25.801,
"step": 272
},
{
"epoch": 1.16,
"grad_norm": 0.10951746255159378,
"learning_rate": 0.000767741935483871,
"loss": 0.5958,
"step": 288
},
{
"epoch": 1.16,
"eval_loss": 0.6638582348823547,
"eval_runtime": 4.7789,
"eval_samples_per_second": 401.974,
"eval_steps_per_second": 25.32,
"step": 288
},
{
"epoch": 1.22,
"grad_norm": 0.1710771769285202,
"learning_rate": 0.0007548387096774194,
"loss": 0.6209,
"step": 304
},
{
"epoch": 1.22,
"eval_loss": 0.6717860102653503,
"eval_runtime": 4.6605,
"eval_samples_per_second": 412.186,
"eval_steps_per_second": 25.963,
"step": 304
},
{
"epoch": 1.29,
"grad_norm": 0.09286555647850037,
"learning_rate": 0.0007419354838709678,
"loss": 0.664,
"step": 320
},
{
"epoch": 1.29,
"eval_loss": 0.6565249562263489,
"eval_runtime": 4.6747,
"eval_samples_per_second": 410.939,
"eval_steps_per_second": 25.884,
"step": 320
},
{
"epoch": 1.35,
"grad_norm": 0.09515868872404099,
"learning_rate": 0.0007290322580645162,
"loss": 0.6123,
"step": 336
},
{
"epoch": 1.35,
"eval_loss": 0.6616882085800171,
"eval_runtime": 4.6796,
"eval_samples_per_second": 410.502,
"eval_steps_per_second": 25.857,
"step": 336
},
{
"epoch": 1.41,
"grad_norm": 0.10440368950366974,
"learning_rate": 0.0007161290322580646,
"loss": 0.6113,
"step": 352
},
{
"epoch": 1.41,
"eval_loss": 0.6656071543693542,
"eval_runtime": 4.6341,
"eval_samples_per_second": 414.539,
"eval_steps_per_second": 26.111,
"step": 352
},
{
"epoch": 1.48,
"grad_norm": 0.08295060694217682,
"learning_rate": 0.000703225806451613,
"loss": 0.6742,
"step": 368
},
{
"epoch": 1.48,
"eval_loss": 0.6533424854278564,
"eval_runtime": 4.6967,
"eval_samples_per_second": 409.008,
"eval_steps_per_second": 25.763,
"step": 368
},
{
"epoch": 1.54,
"grad_norm": 0.10447151958942413,
"learning_rate": 0.0006903225806451613,
"loss": 0.6121,
"step": 384
},
{
"epoch": 1.54,
"eval_loss": 0.6559557914733887,
"eval_runtime": 4.6736,
"eval_samples_per_second": 411.033,
"eval_steps_per_second": 25.89,
"step": 384
},
{
"epoch": 1.61,
"grad_norm": 0.19215475022792816,
"learning_rate": 0.0006774193548387097,
"loss": 0.5878,
"step": 400
},
{
"epoch": 1.61,
"eval_loss": 0.6661805510520935,
"eval_runtime": 4.656,
"eval_samples_per_second": 412.586,
"eval_steps_per_second": 25.988,
"step": 400
},
{
"epoch": 1.67,
"grad_norm": 0.08589986711740494,
"learning_rate": 0.0006645161290322582,
"loss": 0.6835,
"step": 416
},
{
"epoch": 1.67,
"eval_loss": 0.6515570878982544,
"eval_runtime": 5.5084,
"eval_samples_per_second": 348.738,
"eval_steps_per_second": 21.966,
"step": 416
},
{
"epoch": 1.74,
"grad_norm": 0.09526026993989944,
"learning_rate": 0.0006516129032258064,
"loss": 0.6178,
"step": 432
},
{
"epoch": 1.74,
"eval_loss": 0.6476355791091919,
"eval_runtime": 4.6704,
"eval_samples_per_second": 411.312,
"eval_steps_per_second": 25.908,
"step": 432
},
{
"epoch": 1.8,
"grad_norm": 0.12676902115345,
"learning_rate": 0.0006387096774193548,
"loss": 0.5738,
"step": 448
},
{
"epoch": 1.8,
"eval_loss": 0.648709774017334,
"eval_runtime": 4.6719,
"eval_samples_per_second": 411.185,
"eval_steps_per_second": 25.9,
"step": 448
},
{
"epoch": 1.86,
"grad_norm": 0.08004370331764221,
"learning_rate": 0.0006258064516129032,
"loss": 0.6869,
"step": 464
},
{
"epoch": 1.86,
"eval_loss": 0.6497883200645447,
"eval_runtime": 4.6811,
"eval_samples_per_second": 410.376,
"eval_steps_per_second": 25.849,
"step": 464
},
{
"epoch": 1.93,
"grad_norm": 0.07917796820402145,
"learning_rate": 0.0006129032258064516,
"loss": 0.6231,
"step": 480
},
{
"epoch": 1.93,
"eval_loss": 0.6452683806419373,
"eval_runtime": 4.7528,
"eval_samples_per_second": 404.179,
"eval_steps_per_second": 25.458,
"step": 480
},
{
"epoch": 1.99,
"grad_norm": 0.11247972398996353,
"learning_rate": 0.0006,
"loss": 0.5753,
"step": 496
},
{
"epoch": 1.99,
"eval_loss": 0.6489792466163635,
"eval_runtime": 4.6875,
"eval_samples_per_second": 409.813,
"eval_steps_per_second": 25.813,
"step": 496
},
{
"epoch": 2.06,
"grad_norm": 0.08393968641757965,
"learning_rate": 0.0005870967741935483,
"loss": 0.6299,
"step": 512
},
{
"epoch": 2.06,
"eval_loss": 0.6549689173698425,
"eval_runtime": 4.6812,
"eval_samples_per_second": 410.367,
"eval_steps_per_second": 25.848,
"step": 512
},
{
"epoch": 2.12,
"grad_norm": 0.09901689738035202,
"learning_rate": 0.0005741935483870968,
"loss": 0.553,
"step": 528
},
{
"epoch": 2.12,
"eval_loss": 0.6531901955604553,
"eval_runtime": 4.6345,
"eval_samples_per_second": 414.5,
"eval_steps_per_second": 26.109,
"step": 528
},
{
"epoch": 2.19,
"grad_norm": 0.14240662753582,
"learning_rate": 0.0005612903225806451,
"loss": 0.4872,
"step": 544
},
{
"epoch": 2.19,
"eval_loss": 0.6619027256965637,
"eval_runtime": 4.6673,
"eval_samples_per_second": 411.588,
"eval_steps_per_second": 25.925,
"step": 544
},
{
"epoch": 2.25,
"grad_norm": 0.12037284672260284,
"learning_rate": 0.0005483870967741935,
"loss": 0.5957,
"step": 560
},
{
"epoch": 2.25,
"eval_loss": 0.6506627202033997,
"eval_runtime": 4.6368,
"eval_samples_per_second": 414.293,
"eval_steps_per_second": 26.096,
"step": 560
},
{
"epoch": 2.31,
"grad_norm": 0.08450206369161606,
"learning_rate": 0.000535483870967742,
"loss": 0.5647,
"step": 576
},
{
"epoch": 2.31,
"eval_loss": 0.6499984264373779,
"eval_runtime": 4.64,
"eval_samples_per_second": 414.005,
"eval_steps_per_second": 26.077,
"step": 576
},
{
"epoch": 2.38,
"grad_norm": 0.09682720899581909,
"learning_rate": 0.0005225806451612903,
"loss": 0.5017,
"step": 592
},
{
"epoch": 2.38,
"eval_loss": 0.6509167551994324,
"eval_runtime": 4.7996,
"eval_samples_per_second": 400.243,
"eval_steps_per_second": 25.21,
"step": 592
},
{
"epoch": 2.44,
"grad_norm": 0.1342058777809143,
"learning_rate": 0.0005096774193548387,
"loss": 0.5774,
"step": 608
},
{
"epoch": 2.44,
"eval_loss": 0.6469881534576416,
"eval_runtime": 4.6664,
"eval_samples_per_second": 411.664,
"eval_steps_per_second": 25.93,
"step": 608
},
{
"epoch": 2.51,
"grad_norm": 0.09072865545749664,
"learning_rate": 0.0004967741935483871,
"loss": 0.5773,
"step": 624
},
{
"epoch": 2.51,
"eval_loss": 0.6494817733764648,
"eval_runtime": 4.6407,
"eval_samples_per_second": 413.945,
"eval_steps_per_second": 26.074,
"step": 624
},
{
"epoch": 2.57,
"grad_norm": 0.10313040763139725,
"learning_rate": 0.0004838709677419355,
"loss": 0.5158,
"step": 640
},
{
"epoch": 2.57,
"eval_loss": 0.6480950117111206,
"eval_runtime": 4.6554,
"eval_samples_per_second": 412.638,
"eval_steps_per_second": 25.991,
"step": 640
},
{
"epoch": 2.64,
"grad_norm": 0.12878872454166412,
"learning_rate": 0.00047096774193548384,
"loss": 0.5638,
"step": 656
},
{
"epoch": 2.64,
"eval_loss": 0.645260214805603,
"eval_runtime": 4.654,
"eval_samples_per_second": 412.765,
"eval_steps_per_second": 25.999,
"step": 656
},
{
"epoch": 2.7,
"grad_norm": 0.1101013645529747,
"learning_rate": 0.00045806451612903225,
"loss": 0.5839,
"step": 672
},
{
"epoch": 2.7,
"eval_loss": 0.6407684683799744,
"eval_runtime": 4.6561,
"eval_samples_per_second": 412.576,
"eval_steps_per_second": 25.987,
"step": 672
}
],
"logging_steps": 16,
"max_steps": 1240,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 16,
"total_flos": 1.668443666998395e+17,
"train_batch_size": 96,
"trial_name": null,
"trial_params": null
}