{ "best_metric": 0.6407684683799744, "best_model_checkpoint": "/gs/bs/tga-t2glrlab-1/lyu/t5/run4-flan-t5-large-base/checkpoint-672", "epoch": 2.7008289374529015, "eval_steps": 16, "global_step": 672, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "grad_norm": 0.15875867009162903, "learning_rate": 0.0009870967741935483, "loss": 0.9746, "step": 16 }, { "epoch": 0.06, "eval_loss": 0.7603664994239807, "eval_runtime": 4.5631, "eval_samples_per_second": 420.983, "eval_steps_per_second": 26.517, "step": 16 }, { "epoch": 0.13, "grad_norm": 0.10888980329036713, "learning_rate": 0.0009741935483870968, "loss": 0.7966, "step": 32 }, { "epoch": 0.13, "eval_loss": 0.7332449555397034, "eval_runtime": 4.6012, "eval_samples_per_second": 417.502, "eval_steps_per_second": 26.298, "step": 32 }, { "epoch": 0.19, "grad_norm": 0.17990568280220032, "learning_rate": 0.0009612903225806452, "loss": 0.7655, "step": 48 }, { "epoch": 0.19, "eval_loss": 0.7393877506256104, "eval_runtime": 4.6014, "eval_samples_per_second": 417.486, "eval_steps_per_second": 26.297, "step": 48 }, { "epoch": 0.26, "grad_norm": 0.11761835217475891, "learning_rate": 0.0009483870967741936, "loss": 0.8534, "step": 64 }, { "epoch": 0.26, "eval_loss": 0.7214094400405884, "eval_runtime": 4.6156, "eval_samples_per_second": 416.194, "eval_steps_per_second": 26.215, "step": 64 }, { "epoch": 0.32, "grad_norm": 0.112620510160923, "learning_rate": 0.0009354838709677419, "loss": 0.7623, "step": 80 }, { "epoch": 0.32, "eval_loss": 0.7035090327262878, "eval_runtime": 4.602, "eval_samples_per_second": 417.423, "eval_steps_per_second": 26.293, "step": 80 }, { "epoch": 0.39, "grad_norm": 0.14100195467472076, "learning_rate": 0.0009225806451612904, "loss": 0.7305, "step": 96 }, { "epoch": 0.39, "eval_loss": 0.6993053555488586, "eval_runtime": 4.5895, "eval_samples_per_second": 418.561, "eval_steps_per_second": 26.364, "step": 96 }, { "epoch": 0.45, "grad_norm": 0.16868335008621216, "learning_rate": 0.0009096774193548387, "loss": 0.807, "step": 112 }, { "epoch": 0.45, "eval_loss": 0.7026967406272888, "eval_runtime": 4.7327, "eval_samples_per_second": 405.898, "eval_steps_per_second": 25.567, "step": 112 }, { "epoch": 0.51, "grad_norm": 0.10501790046691895, "learning_rate": 0.0008967741935483871, "loss": 0.7418, "step": 128 }, { "epoch": 0.51, "eval_loss": 0.6837323904037476, "eval_runtime": 4.5936, "eval_samples_per_second": 418.187, "eval_steps_per_second": 26.341, "step": 128 }, { "epoch": 0.58, "grad_norm": 0.12754693627357483, "learning_rate": 0.0008838709677419356, "loss": 0.7125, "step": 144 }, { "epoch": 0.58, "eval_loss": 0.6846245527267456, "eval_runtime": 4.6234, "eval_samples_per_second": 415.493, "eval_steps_per_second": 26.171, "step": 144 }, { "epoch": 0.64, "grad_norm": 0.1712610423564911, "learning_rate": 0.0008709677419354839, "loss": 0.7813, "step": 160 }, { "epoch": 0.64, "eval_loss": 0.687301516532898, "eval_runtime": 4.576, "eval_samples_per_second": 419.802, "eval_steps_per_second": 26.443, "step": 160 }, { "epoch": 0.71, "grad_norm": 0.10070759803056717, "learning_rate": 0.0008580645161290323, "loss": 0.7321, "step": 176 }, { "epoch": 0.71, "eval_loss": 0.6738956570625305, "eval_runtime": 4.5756, "eval_samples_per_second": 419.835, "eval_steps_per_second": 26.445, "step": 176 }, { "epoch": 0.77, "grad_norm": 0.09799516946077347, "learning_rate": 0.0008451612903225807, "loss": 0.6972, "step": 192 }, { "epoch": 0.77, "eval_loss": 0.6699069738388062, "eval_runtime": 4.602, "eval_samples_per_second": 417.424, "eval_steps_per_second": 26.293, "step": 192 }, { "epoch": 0.84, "grad_norm": 0.15926051139831543, "learning_rate": 0.0008322580645161291, "loss": 0.7642, "step": 208 }, { "epoch": 0.84, "eval_loss": 0.6707108616828918, "eval_runtime": 4.6005, "eval_samples_per_second": 417.567, "eval_steps_per_second": 26.302, "step": 208 }, { "epoch": 0.9, "grad_norm": 0.1038859635591507, "learning_rate": 0.0008193548387096774, "loss": 0.7234, "step": 224 }, { "epoch": 0.9, "eval_loss": 0.6664640307426453, "eval_runtime": 4.5657, "eval_samples_per_second": 420.746, "eval_steps_per_second": 26.502, "step": 224 }, { "epoch": 0.96, "grad_norm": 0.09894105046987534, "learning_rate": 0.0008064516129032258, "loss": 0.6891, "step": 240 }, { "epoch": 0.96, "eval_loss": 0.6674955487251282, "eval_runtime": 4.6128, "eval_samples_per_second": 416.447, "eval_steps_per_second": 26.231, "step": 240 }, { "epoch": 1.03, "grad_norm": 0.108543761074543, "learning_rate": 0.0007935483870967743, "loss": 0.7219, "step": 256 }, { "epoch": 1.03, "eval_loss": 0.6673880815505981, "eval_runtime": 4.6113, "eval_samples_per_second": 416.588, "eval_steps_per_second": 26.24, "step": 256 }, { "epoch": 1.09, "grad_norm": 0.08566311001777649, "learning_rate": 0.0007806451612903226, "loss": 0.6558, "step": 272 }, { "epoch": 1.09, "eval_loss": 0.6653081774711609, "eval_runtime": 4.6897, "eval_samples_per_second": 409.621, "eval_steps_per_second": 25.801, "step": 272 }, { "epoch": 1.16, "grad_norm": 0.10951746255159378, "learning_rate": 0.000767741935483871, "loss": 0.5958, "step": 288 }, { "epoch": 1.16, "eval_loss": 0.6638582348823547, "eval_runtime": 4.7789, "eval_samples_per_second": 401.974, "eval_steps_per_second": 25.32, "step": 288 }, { "epoch": 1.22, "grad_norm": 0.1710771769285202, "learning_rate": 0.0007548387096774194, "loss": 0.6209, "step": 304 }, { "epoch": 1.22, "eval_loss": 0.6717860102653503, "eval_runtime": 4.6605, "eval_samples_per_second": 412.186, "eval_steps_per_second": 25.963, "step": 304 }, { "epoch": 1.29, "grad_norm": 0.09286555647850037, "learning_rate": 0.0007419354838709678, "loss": 0.664, "step": 320 }, { "epoch": 1.29, "eval_loss": 0.6565249562263489, "eval_runtime": 4.6747, "eval_samples_per_second": 410.939, "eval_steps_per_second": 25.884, "step": 320 }, { "epoch": 1.35, "grad_norm": 0.09515868872404099, "learning_rate": 0.0007290322580645162, "loss": 0.6123, "step": 336 }, { "epoch": 1.35, "eval_loss": 0.6616882085800171, "eval_runtime": 4.6796, "eval_samples_per_second": 410.502, "eval_steps_per_second": 25.857, "step": 336 }, { "epoch": 1.41, "grad_norm": 0.10440368950366974, "learning_rate": 0.0007161290322580646, "loss": 0.6113, "step": 352 }, { "epoch": 1.41, "eval_loss": 0.6656071543693542, "eval_runtime": 4.6341, "eval_samples_per_second": 414.539, "eval_steps_per_second": 26.111, "step": 352 }, { "epoch": 1.48, "grad_norm": 0.08295060694217682, "learning_rate": 0.000703225806451613, "loss": 0.6742, "step": 368 }, { "epoch": 1.48, "eval_loss": 0.6533424854278564, "eval_runtime": 4.6967, "eval_samples_per_second": 409.008, "eval_steps_per_second": 25.763, "step": 368 }, { "epoch": 1.54, "grad_norm": 0.10447151958942413, "learning_rate": 0.0006903225806451613, "loss": 0.6121, "step": 384 }, { "epoch": 1.54, "eval_loss": 0.6559557914733887, "eval_runtime": 4.6736, "eval_samples_per_second": 411.033, "eval_steps_per_second": 25.89, "step": 384 }, { "epoch": 1.61, "grad_norm": 0.19215475022792816, "learning_rate": 0.0006774193548387097, "loss": 0.5878, "step": 400 }, { "epoch": 1.61, "eval_loss": 0.6661805510520935, "eval_runtime": 4.656, "eval_samples_per_second": 412.586, "eval_steps_per_second": 25.988, "step": 400 }, { "epoch": 1.67, "grad_norm": 0.08589986711740494, "learning_rate": 0.0006645161290322582, "loss": 0.6835, "step": 416 }, { "epoch": 1.67, "eval_loss": 0.6515570878982544, "eval_runtime": 5.5084, "eval_samples_per_second": 348.738, "eval_steps_per_second": 21.966, "step": 416 }, { "epoch": 1.74, "grad_norm": 0.09526026993989944, "learning_rate": 0.0006516129032258064, "loss": 0.6178, "step": 432 }, { "epoch": 1.74, "eval_loss": 0.6476355791091919, "eval_runtime": 4.6704, "eval_samples_per_second": 411.312, "eval_steps_per_second": 25.908, "step": 432 }, { "epoch": 1.8, "grad_norm": 0.12676902115345, "learning_rate": 0.0006387096774193548, "loss": 0.5738, "step": 448 }, { "epoch": 1.8, "eval_loss": 0.648709774017334, "eval_runtime": 4.6719, "eval_samples_per_second": 411.185, "eval_steps_per_second": 25.9, "step": 448 }, { "epoch": 1.86, "grad_norm": 0.08004370331764221, "learning_rate": 0.0006258064516129032, "loss": 0.6869, "step": 464 }, { "epoch": 1.86, "eval_loss": 0.6497883200645447, "eval_runtime": 4.6811, "eval_samples_per_second": 410.376, "eval_steps_per_second": 25.849, "step": 464 }, { "epoch": 1.93, "grad_norm": 0.07917796820402145, "learning_rate": 0.0006129032258064516, "loss": 0.6231, "step": 480 }, { "epoch": 1.93, "eval_loss": 0.6452683806419373, "eval_runtime": 4.7528, "eval_samples_per_second": 404.179, "eval_steps_per_second": 25.458, "step": 480 }, { "epoch": 1.99, "grad_norm": 0.11247972398996353, "learning_rate": 0.0006, "loss": 0.5753, "step": 496 }, { "epoch": 1.99, "eval_loss": 0.6489792466163635, "eval_runtime": 4.6875, "eval_samples_per_second": 409.813, "eval_steps_per_second": 25.813, "step": 496 }, { "epoch": 2.06, "grad_norm": 0.08393968641757965, "learning_rate": 0.0005870967741935483, "loss": 0.6299, "step": 512 }, { "epoch": 2.06, "eval_loss": 0.6549689173698425, "eval_runtime": 4.6812, "eval_samples_per_second": 410.367, "eval_steps_per_second": 25.848, "step": 512 }, { "epoch": 2.12, "grad_norm": 0.09901689738035202, "learning_rate": 0.0005741935483870968, "loss": 0.553, "step": 528 }, { "epoch": 2.12, "eval_loss": 0.6531901955604553, "eval_runtime": 4.6345, "eval_samples_per_second": 414.5, "eval_steps_per_second": 26.109, "step": 528 }, { "epoch": 2.19, "grad_norm": 0.14240662753582, "learning_rate": 0.0005612903225806451, "loss": 0.4872, "step": 544 }, { "epoch": 2.19, "eval_loss": 0.6619027256965637, "eval_runtime": 4.6673, "eval_samples_per_second": 411.588, "eval_steps_per_second": 25.925, "step": 544 }, { "epoch": 2.25, "grad_norm": 0.12037284672260284, "learning_rate": 0.0005483870967741935, "loss": 0.5957, "step": 560 }, { "epoch": 2.25, "eval_loss": 0.6506627202033997, "eval_runtime": 4.6368, "eval_samples_per_second": 414.293, "eval_steps_per_second": 26.096, "step": 560 }, { "epoch": 2.31, "grad_norm": 0.08450206369161606, "learning_rate": 0.000535483870967742, "loss": 0.5647, "step": 576 }, { "epoch": 2.31, "eval_loss": 0.6499984264373779, "eval_runtime": 4.64, "eval_samples_per_second": 414.005, "eval_steps_per_second": 26.077, "step": 576 }, { "epoch": 2.38, "grad_norm": 0.09682720899581909, "learning_rate": 0.0005225806451612903, "loss": 0.5017, "step": 592 }, { "epoch": 2.38, "eval_loss": 0.6509167551994324, "eval_runtime": 4.7996, "eval_samples_per_second": 400.243, "eval_steps_per_second": 25.21, "step": 592 }, { "epoch": 2.44, "grad_norm": 0.1342058777809143, "learning_rate": 0.0005096774193548387, "loss": 0.5774, "step": 608 }, { "epoch": 2.44, "eval_loss": 0.6469881534576416, "eval_runtime": 4.6664, "eval_samples_per_second": 411.664, "eval_steps_per_second": 25.93, "step": 608 }, { "epoch": 2.51, "grad_norm": 0.09072865545749664, "learning_rate": 0.0004967741935483871, "loss": 0.5773, "step": 624 }, { "epoch": 2.51, "eval_loss": 0.6494817733764648, "eval_runtime": 4.6407, "eval_samples_per_second": 413.945, "eval_steps_per_second": 26.074, "step": 624 }, { "epoch": 2.57, "grad_norm": 0.10313040763139725, "learning_rate": 0.0004838709677419355, "loss": 0.5158, "step": 640 }, { "epoch": 2.57, "eval_loss": 0.6480950117111206, "eval_runtime": 4.6554, "eval_samples_per_second": 412.638, "eval_steps_per_second": 25.991, "step": 640 }, { "epoch": 2.64, "grad_norm": 0.12878872454166412, "learning_rate": 0.00047096774193548384, "loss": 0.5638, "step": 656 }, { "epoch": 2.64, "eval_loss": 0.645260214805603, "eval_runtime": 4.654, "eval_samples_per_second": 412.765, "eval_steps_per_second": 25.999, "step": 656 }, { "epoch": 2.7, "grad_norm": 0.1101013645529747, "learning_rate": 0.00045806451612903225, "loss": 0.5839, "step": 672 }, { "epoch": 2.7, "eval_loss": 0.6407684683799744, "eval_runtime": 4.6561, "eval_samples_per_second": 412.576, "eval_steps_per_second": 25.987, "step": 672 } ], "logging_steps": 16, "max_steps": 1240, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 16, "total_flos": 1.668443666998395e+17, "train_batch_size": 96, "trial_name": null, "trial_params": null }