Training in progress, step 100, checkpoint
checkpoint-100/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:05b3724d9cb5ad7f3ad6b3145ea19ec715b612d3bf48ac25eb25f71ad3350332
 size 27313024
checkpoint-100/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d74c2a34d8bd1e3bb822829ae2c223dfde93fc183c47197ac577c0d0e98ad96b
 size 54668218
checkpoint-100/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:49a3c9b0005e2657e4343d430644e98c0322af865aa1a9053960adee68c999d5
 size 14244
checkpoint-100/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d8bf8a092e609ea1421206d102dbe42a4b8e939f00a4089ac1280c8ce0f99ed4
 size 1064
checkpoint-100/trainer_state.json
CHANGED
|
@@ -1,8 +1,8 @@
 {
-  "best_metric": 0.
-  "best_model_checkpoint": "../artifacts/LlaMa3-QLoRA-PatentMatch-v0.1/checkpoint-
   "epoch": 0.9433962264150944,
-  "eval_steps":
   "global_step": 100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
@@ -10,750 +10,790 @@
[previous log_history values in this hunk are not recoverable from the diff view; the updated trainer_state.json follows]

 {
+  "best_metric": 0.6825625896453857,
+  "best_model_checkpoint": "../artifacts/LlaMa3-QLoRA-PatentMatch-v0.1/checkpoint-100",
   "epoch": 0.9433962264150944,
+  "eval_steps": 10,
   "global_step": 100,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
| 10 |
"log_history": [
|
| 11 |
{
|
| 12 |
"epoch": 0.009433962264150943,
|
| 13 |
+
"grad_norm": 12.244806289672852,
|
| 14 |
+
"learning_rate": 2e-08,
|
| 15 |
+
"loss": 0.4715,
|
| 16 |
"step": 1
|
| 17 |
},
|
| 18 |
{
|
| 19 |
"epoch": 0.018867924528301886,
|
| 20 |
+
"grad_norm": 11.454357147216797,
|
| 21 |
+
"learning_rate": 4e-08,
|
| 22 |
+
"loss": 0.5527,
|
| 23 |
"step": 2
|
| 24 |
},
|
| 25 |
{
|
| 26 |
"epoch": 0.02830188679245283,
|
| 27 |
+
"grad_norm": 7.327939510345459,
|
| 28 |
+
"learning_rate": 6e-08,
|
| 29 |
+
"loss": 0.5359,
|
| 30 |
"step": 3
|
| 31 |
},
|
| 32 |
{
|
| 33 |
"epoch": 0.03773584905660377,
|
| 34 |
+
"grad_norm": 8.935256958007812,
|
| 35 |
+
"learning_rate": 8e-08,
|
| 36 |
+
"loss": 0.4292,
|
| 37 |
"step": 4
|
| 38 |
},
|
| 39 |
{
|
| 40 |
"epoch": 0.04716981132075472,
|
| 41 |
+
"grad_norm": 17.576908111572266,
|
| 42 |
+
"learning_rate": 1e-07,
|
| 43 |
+
"loss": 0.5657,
|
| 44 |
"step": 5
|
| 45 |
},
|
| 46 |
{
|
| 47 |
"epoch": 0.05660377358490566,
|
| 48 |
+
"grad_norm": 22.42218780517578,
|
| 49 |
+
"learning_rate": 1.2e-07,
|
| 50 |
+
"loss": 0.7024,
|
| 51 |
"step": 6
|
| 52 |
},
|
| 53 |
{
|
| 54 |
"epoch": 0.0660377358490566,
|
| 55 |
+
"grad_norm": 7.509771347045898,
|
| 56 |
+
"learning_rate": 1.4e-07,
|
| 57 |
+
"loss": 0.5426,
|
| 58 |
"step": 7
|
| 59 |
},
|
| 60 |
{
|
| 61 |
"epoch": 0.07547169811320754,
|
| 62 |
+
"grad_norm": 24.912858963012695,
|
| 63 |
+
"learning_rate": 1.6e-07,
|
| 64 |
+
"loss": 0.6312,
|
| 65 |
"step": 8
|
| 66 |
},
|
| 67 |
{
|
| 68 |
"epoch": 0.08490566037735849,
|
| 69 |
+
"grad_norm": 10.798696517944336,
|
| 70 |
+
"learning_rate": 1.8e-07,
|
| 71 |
+
"loss": 0.4632,
|
| 72 |
"step": 9
|
| 73 |
},
|
| 74 |
{
|
| 75 |
"epoch": 0.09433962264150944,
|
| 76 |
+
"grad_norm": 9.916950225830078,
|
| 77 |
+
"learning_rate": 2e-07,
|
| 78 |
+
"loss": 0.6934,
|
| 79 |
+
"step": 10
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"epoch": 0.09433962264150944,
|
| 83 |
+
"eval_loss": 0.6845090985298157,
|
| 84 |
+
"eval_runtime": 18.8138,
|
| 85 |
+
"eval_samples_per_second": 15.68,
|
| 86 |
+
"eval_steps_per_second": 3.136,
|
| 87 |
"step": 10
|
| 88 |
},
|
| 89 |
{
|
| 90 |
"epoch": 0.10377358490566038,
|
| 91 |
+
"grad_norm": 8.111969947814941,
|
| 92 |
+
"learning_rate": 2.1999999999999998e-07,
|
| 93 |
+
"loss": 0.7586,
|
| 94 |
"step": 11
|
| 95 |
},
|
| 96 |
{
|
| 97 |
"epoch": 0.11320754716981132,
|
| 98 |
+
"grad_norm": 25.175071716308594,
|
| 99 |
+
"learning_rate": 2.4e-07,
|
| 100 |
+
"loss": 0.6298,
|
| 101 |
"step": 12
|
| 102 |
},
|
| 103 |
{
|
| 104 |
"epoch": 0.12264150943396226,
|
| 105 |
+
"grad_norm": 5.813445568084717,
|
| 106 |
+
"learning_rate": 2.6e-07,
|
| 107 |
+
"loss": 0.5559,
|
| 108 |
"step": 13
|
| 109 |
},
|
| 110 |
{
|
| 111 |
"epoch": 0.1320754716981132,
|
| 112 |
+
"grad_norm": 7.799736022949219,
|
| 113 |
+
"learning_rate": 2.8e-07,
|
| 114 |
+
"loss": 0.5321,
|
| 115 |
"step": 14
|
| 116 |
},
|
| 117 |
{
|
| 118 |
"epoch": 0.14150943396226415,
|
| 119 |
+
"grad_norm": 10.612166404724121,
|
| 120 |
+
"learning_rate": 3e-07,
|
| 121 |
+
"loss": 0.5567,
|
| 122 |
"step": 15
|
| 123 |
},
|
| 124 |
{
|
| 125 |
"epoch": 0.1509433962264151,
|
| 126 |
+
"grad_norm": 25.862613677978516,
|
| 127 |
+
"learning_rate": 3.2e-07,
|
| 128 |
+
"loss": 0.7949,
|
| 129 |
"step": 16
|
| 130 |
},
|
| 131 |
{
|
| 132 |
"epoch": 0.16037735849056603,
|
| 133 |
+
"grad_norm": 5.672112941741943,
|
| 134 |
+
"learning_rate": 3.4000000000000003e-07,
|
| 135 |
+
"loss": 0.5568,
|
| 136 |
"step": 17
|
| 137 |
},
|
| 138 |
{
|
| 139 |
"epoch": 0.16981132075471697,
|
| 140 |
+
"grad_norm": 22.59090805053711,
|
| 141 |
+
"learning_rate": 3.6e-07,
|
| 142 |
+
"loss": 0.651,
|
| 143 |
"step": 18
|
| 144 |
},
|
| 145 |
{
|
| 146 |
"epoch": 0.1792452830188679,
|
| 147 |
+
"grad_norm": 6.6907548904418945,
|
| 148 |
+
"learning_rate": 3.7999999999999996e-07,
|
| 149 |
+
"loss": 0.5429,
|
| 150 |
"step": 19
|
| 151 |
},
|
| 152 |
{
|
| 153 |
"epoch": 0.18867924528301888,
|
| 154 |
+
"grad_norm": 7.563165187835693,
|
| 155 |
+
"learning_rate": 4e-07,
|
| 156 |
+
"loss": 0.5338,
|
| 157 |
"step": 20
|
| 158 |
},
|
| 159 |
{
|
| 160 |
"epoch": 0.18867924528301888,
|
| 161 |
+
"eval_loss": 0.6880346536636353,
|
| 162 |
+
"eval_runtime": 19.0282,
|
| 163 |
+
"eval_samples_per_second": 15.503,
|
| 164 |
+
"eval_steps_per_second": 3.101,
|
| 165 |
"step": 20
|
| 166 |
},
|
| 167 |
{
|
| 168 |
"epoch": 0.19811320754716982,
|
| 169 |
+
"grad_norm": 22.867984771728516,
|
| 170 |
+
"learning_rate": 4.1999999999999995e-07,
|
| 171 |
+
"loss": 0.6839,
|
| 172 |
"step": 21
|
| 173 |
},
|
| 174 |
{
|
| 175 |
"epoch": 0.20754716981132076,
|
| 176 |
+
"grad_norm": 12.407017707824707,
|
| 177 |
+
"learning_rate": 4.3999999999999997e-07,
|
| 178 |
+
"loss": 0.6577,
|
| 179 |
"step": 22
|
| 180 |
},
|
| 181 |
{
|
| 182 |
"epoch": 0.2169811320754717,
|
| 183 |
+
"grad_norm": 12.605359077453613,
|
| 184 |
+
"learning_rate": 4.6e-07,
|
| 185 |
+
"loss": 0.7215,
|
| 186 |
"step": 23
|
| 187 |
},
|
| 188 |
{
|
| 189 |
"epoch": 0.22641509433962265,
|
| 190 |
+
"grad_norm": 8.375327110290527,
|
| 191 |
+
"learning_rate": 4.8e-07,
|
| 192 |
+
"loss": 0.5053,
|
| 193 |
"step": 24
|
| 194 |
},
|
| 195 |
{
|
| 196 |
"epoch": 0.2358490566037736,
|
| 197 |
+
"grad_norm": 16.666528701782227,
|
| 198 |
+
"learning_rate": 5e-07,
|
| 199 |
+
"loss": 0.5431,
|
| 200 |
"step": 25
|
| 201 |
},
|
| 202 |
{
|
| 203 |
"epoch": 0.24528301886792453,
|
| 204 |
+
"grad_norm": 27.57564353942871,
|
| 205 |
+
"learning_rate": 5.2e-07,
|
| 206 |
+
"loss": 0.6242,
|
| 207 |
"step": 26
|
| 208 |
},
|
| 209 |
{
|
| 210 |
"epoch": 0.25471698113207547,
|
| 211 |
+
"grad_norm": 14.450230598449707,
|
| 212 |
+
"learning_rate": 5.4e-07,
|
| 213 |
+
"loss": 0.6718,
|
| 214 |
"step": 27
|
| 215 |
},
|
| 216 |
{
|
| 217 |
"epoch": 0.2641509433962264,
|
| 218 |
+
"grad_norm": 16.55278968811035,
|
| 219 |
+
"learning_rate": 5.6e-07,
|
| 220 |
+
"loss": 0.6649,
|
| 221 |
"step": 28
|
| 222 |
},
|
| 223 |
{
|
| 224 |
"epoch": 0.27358490566037735,
|
| 225 |
+
"grad_norm": 17.196575164794922,
|
| 226 |
+
"learning_rate": 5.8e-07,
|
| 227 |
+
"loss": 0.6084,
|
| 228 |
"step": 29
|
| 229 |
},
|
| 230 |
{
|
| 231 |
"epoch": 0.2830188679245283,
|
| 232 |
+
"grad_norm": 38.10641860961914,
|
| 233 |
+
"learning_rate": 6e-07,
|
| 234 |
+
"loss": 0.6538,
|
| 235 |
+
"step": 30
|
| 236 |
+
},
|
| 237 |
+
{
|
| 238 |
+
"epoch": 0.2830188679245283,
|
| 239 |
+
"eval_loss": 0.6865962743759155,
|
| 240 |
+
"eval_runtime": 19.122,
|
| 241 |
+
"eval_samples_per_second": 15.427,
|
| 242 |
+
"eval_steps_per_second": 3.085,
|
| 243 |
"step": 30
|
| 244 |
},
|
| 245 |
{
|
| 246 |
"epoch": 0.29245283018867924,
|
| 247 |
+
"grad_norm": 9.382880210876465,
|
| 248 |
+
"learning_rate": 6.2e-07,
|
| 249 |
+
"loss": 0.6686,
|
| 250 |
"step": 31
|
| 251 |
},
|
| 252 |
{
|
| 253 |
"epoch": 0.3018867924528302,
|
| 254 |
+
"grad_norm": 25.904178619384766,
|
| 255 |
+
"learning_rate": 6.4e-07,
|
| 256 |
+
"loss": 0.5842,
|
| 257 |
"step": 32
|
| 258 |
},
|
| 259 |
{
|
| 260 |
"epoch": 0.3113207547169811,
|
| 261 |
+
"grad_norm": 10.835689544677734,
|
| 262 |
+
"learning_rate": 6.6e-07,
|
| 263 |
+
"loss": 0.6862,
|
| 264 |
"step": 33
|
| 265 |
},
|
| 266 |
{
|
| 267 |
"epoch": 0.32075471698113206,
|
| 268 |
+
"grad_norm": 16.35777473449707,
|
| 269 |
+
"learning_rate": 6.800000000000001e-07,
|
| 270 |
+
"loss": 0.4901,
|
| 271 |
"step": 34
|
| 272 |
},
|
| 273 |
{
|
| 274 |
"epoch": 0.330188679245283,
|
| 275 |
+
"grad_norm": 11.801332473754883,
|
| 276 |
+
"learning_rate": 7e-07,
|
| 277 |
+
"loss": 0.7005,
|
| 278 |
"step": 35
|
| 279 |
},
|
| 280 |
{
|
| 281 |
"epoch": 0.33962264150943394,
|
| 282 |
+
"grad_norm": 28.929777145385742,
|
| 283 |
+
"learning_rate": 7.2e-07,
|
| 284 |
+
"loss": 0.7141,
|
| 285 |
"step": 36
|
| 286 |
},
|
| 287 |
{
|
| 288 |
"epoch": 0.3490566037735849,
|
| 289 |
+
"grad_norm": 33.3692512512207,
|
| 290 |
+
"learning_rate": 7.4e-07,
|
| 291 |
+
"loss": 0.7235,
|
| 292 |
"step": 37
|
| 293 |
},
|
| 294 |
{
|
| 295 |
"epoch": 0.3584905660377358,
|
| 296 |
+
"grad_norm": 14.086514472961426,
|
| 297 |
+
"learning_rate": 7.599999999999999e-07,
|
| 298 |
+
"loss": 0.5546,
|
| 299 |
"step": 38
|
| 300 |
},
|
| 301 |
{
|
| 302 |
"epoch": 0.36792452830188677,
|
| 303 |
+
"grad_norm": 8.276351928710938,
|
| 304 |
+
"learning_rate": 7.799999999999999e-07,
|
| 305 |
+
"loss": 0.5855,
|
| 306 |
"step": 39
|
| 307 |
},
|
| 308 |
{
|
| 309 |
"epoch": 0.37735849056603776,
|
| 310 |
+
"grad_norm": 8.203176498413086,
|
| 311 |
+
"learning_rate": 8e-07,
|
| 312 |
+
"loss": 0.6988,
|
| 313 |
"step": 40
|
| 314 |
},
|
| 315 |
{
|
| 316 |
"epoch": 0.37735849056603776,
|
| 317 |
+
"eval_loss": 0.6843137741088867,
|
| 318 |
+
"eval_runtime": 19.1524,
|
| 319 |
+
"eval_samples_per_second": 15.403,
|
| 320 |
+
"eval_steps_per_second": 3.081,
|
| 321 |
"step": 40
|
| 322 |
},
|
| 323 |
{
|
| 324 |
"epoch": 0.3867924528301887,
|
| 325 |
+
"grad_norm": 15.79111099243164,
|
| 326 |
+
"learning_rate": 8.199999999999999e-07,
|
| 327 |
+
"loss": 0.5881,
|
| 328 |
"step": 41
|
| 329 |
},
|
| 330 |
{
|
| 331 |
"epoch": 0.39622641509433965,
|
| 332 |
+
"grad_norm": 16.36391258239746,
|
| 333 |
+
"learning_rate": 8.399999999999999e-07,
|
| 334 |
+
"loss": 0.6394,
|
| 335 |
"step": 42
|
| 336 |
},
|
| 337 |
{
|
| 338 |
"epoch": 0.4056603773584906,
|
| 339 |
+
"grad_norm": 14.09928035736084,
|
| 340 |
+
"learning_rate": 8.599999999999999e-07,
|
| 341 |
+
"loss": 0.5188,
|
| 342 |
"step": 43
|
| 343 |
},
|
| 344 |
{
|
| 345 |
"epoch": 0.41509433962264153,
|
| 346 |
+
"grad_norm": 13.666457176208496,
|
| 347 |
+
"learning_rate": 8.799999999999999e-07,
|
| 348 |
+
"loss": 0.6493,
|
| 349 |
"step": 44
|
| 350 |
},
|
| 351 |
{
|
| 352 |
"epoch": 0.42452830188679247,
|
| 353 |
+
"grad_norm": 26.71883773803711,
|
| 354 |
+
"learning_rate": 9e-07,
|
| 355 |
+
"loss": 0.5879,
|
| 356 |
"step": 45
|
| 357 |
},
|
| 358 |
{
|
| 359 |
"epoch": 0.4339622641509434,
|
| 360 |
+
"grad_norm": 7.5422844886779785,
|
| 361 |
+
"learning_rate": 9.2e-07,
|
| 362 |
+
"loss": 0.5821,
|
| 363 |
"step": 46
|
| 364 |
},
|
| 365 |
{
|
| 366 |
"epoch": 0.44339622641509435,
|
| 367 |
+
"grad_norm": 23.531204223632812,
|
| 368 |
+
"learning_rate": 9.399999999999999e-07,
|
| 369 |
+
"loss": 0.6332,
|
| 370 |
"step": 47
|
| 371 |
},
|
| 372 |
{
|
| 373 |
"epoch": 0.4528301886792453,
|
| 374 |
+
"grad_norm": 30.758493423461914,
|
| 375 |
+
"learning_rate": 9.6e-07,
|
| 376 |
+
"loss": 0.7319,
|
| 377 |
"step": 48
|
| 378 |
},
|
| 379 |
{
|
| 380 |
"epoch": 0.46226415094339623,
|
| 381 |
+
"grad_norm": 12.101729393005371,
|
| 382 |
+
"learning_rate": 9.8e-07,
|
| 383 |
+
"loss": 0.5698,
|
| 384 |
"step": 49
|
| 385 |
},
|
| 386 |
{
|
| 387 |
"epoch": 0.4716981132075472,
|
| 388 |
+
"grad_norm": 8.760655403137207,
|
| 389 |
+
"learning_rate": 1e-06,
|
| 390 |
+
"loss": 0.5976,
|
| 391 |
+
"step": 50
|
| 392 |
+
},
|
| 393 |
+
{
|
| 394 |
+
"epoch": 0.4716981132075472,
|
| 395 |
+
"eval_loss": 0.6827160120010376,
|
| 396 |
+
"eval_runtime": 19.1865,
|
| 397 |
+
"eval_samples_per_second": 15.375,
|
| 398 |
+
"eval_steps_per_second": 3.075,
|
| 399 |
"step": 50
|
| 400 |
},
|
| 401 |
{
|
| 402 |
"epoch": 0.4811320754716981,
|
| 403 |
+
"grad_norm": 9.829323768615723,
|
| 404 |
+
"learning_rate": 1.02e-06,
|
| 405 |
+
"loss": 0.6239,
|
| 406 |
"step": 51
|
| 407 |
},
|
| 408 |
{
|
| 409 |
"epoch": 0.49056603773584906,
|
| 410 |
+
"grad_norm": 5.044214248657227,
|
| 411 |
+
"learning_rate": 1.04e-06,
|
| 412 |
+
"loss": 0.5077,
|
| 413 |
"step": 52
|
| 414 |
},
|
| 415 |
{
|
| 416 |
"epoch": 0.5,
|
| 417 |
+
"grad_norm": 12.68688678741455,
|
| 418 |
+
"learning_rate": 1.06e-06,
|
| 419 |
+
"loss": 0.5502,
|
| 420 |
"step": 53
|
| 421 |
},
|
| 422 |
{
|
| 423 |
"epoch": 0.5094339622641509,
|
| 424 |
+
"grad_norm": 12.936463356018066,
|
| 425 |
+
"learning_rate": 1.08e-06,
|
| 426 |
+
"loss": 0.7308,
|
| 427 |
"step": 54
|
| 428 |
},
|
| 429 |
{
|
| 430 |
"epoch": 0.5188679245283019,
|
| 431 |
+
"grad_norm": 21.73121452331543,
|
| 432 |
+
"learning_rate": 1.1e-06,
|
| 433 |
+
"loss": 0.7065,
|
| 434 |
"step": 55
|
| 435 |
},
|
| 436 |
{
|
| 437 |
"epoch": 0.5283018867924528,
|
| 438 |
+
"grad_norm": 16.07210922241211,
|
| 439 |
+
"learning_rate": 1.12e-06,
|
| 440 |
+
"loss": 0.6706,
|
| 441 |
"step": 56
|
| 442 |
},
|
| 443 |
{
|
| 444 |
"epoch": 0.5377358490566038,
|
| 445 |
+
"grad_norm": 20.025632858276367,
|
| 446 |
+
"learning_rate": 1.1399999999999999e-06,
|
| 447 |
+
"loss": 0.772,
|
| 448 |
"step": 57
|
| 449 |
},
|
| 450 |
{
|
| 451 |
"epoch": 0.5471698113207547,
|
| 452 |
+
"grad_norm": 19.071701049804688,
|
| 453 |
+
"learning_rate": 1.16e-06,
|
| 454 |
+
"loss": 0.5186,
|
| 455 |
"step": 58
|
| 456 |
},
|
| 457 |
{
|
| 458 |
"epoch": 0.5566037735849056,
|
| 459 |
+
"grad_norm": 32.642337799072266,
|
| 460 |
+
"learning_rate": 1.18e-06,
|
| 461 |
+
"loss": 0.5967,
|
| 462 |
"step": 59
|
| 463 |
},
|
| 464 |
{
|
| 465 |
"epoch": 0.5660377358490566,
|
| 466 |
+
"grad_norm": 8.402029037475586,
|
| 467 |
+
"learning_rate": 1.2e-06,
|
| 468 |
+
"loss": 0.5027,
|
| 469 |
"step": 60
|
| 470 |
},
|
| 471 |
{
|
| 472 |
"epoch": 0.5660377358490566,
|
| 473 |
+
"eval_loss": 0.6824557781219482,
|
| 474 |
+
"eval_runtime": 19.0886,
|
| 475 |
+
"eval_samples_per_second": 15.454,
|
| 476 |
+
"eval_steps_per_second": 3.091,
|
| 477 |
"step": 60
|
| 478 |
},
|
| 479 |
{
|
| 480 |
"epoch": 0.5754716981132075,
|
| 481 |
+
"grad_norm": 45.89920425415039,
|
| 482 |
+
"learning_rate": 1.22e-06,
|
| 483 |
+
"loss": 0.7172,
|
| 484 |
"step": 61
|
| 485 |
},
|
| 486 |
{
|
| 487 |
"epoch": 0.5849056603773585,
|
| 488 |
+
"grad_norm": 31.668310165405273,
|
| 489 |
+
"learning_rate": 1.24e-06,
|
| 490 |
+
"loss": 0.659,
|
| 491 |
"step": 62
|
| 492 |
},
|
| 493 |
{
|
| 494 |
"epoch": 0.5943396226415094,
|
| 495 |
+
"grad_norm": 49.8336181640625,
|
| 496 |
+
"learning_rate": 1.26e-06,
|
| 497 |
+
"loss": 0.7355,
|
| 498 |
"step": 63
|
| 499 |
},
|
| 500 |
{
|
| 501 |
"epoch": 0.6037735849056604,
|
| 502 |
+
"grad_norm": 6.202754497528076,
|
| 503 |
+
"learning_rate": 1.28e-06,
|
| 504 |
+
"loss": 0.5217,
|
| 505 |
"step": 64
|
| 506 |
},
|
| 507 |
{
|
| 508 |
"epoch": 0.6132075471698113,
|
| 509 |
+
"grad_norm": 27.1490535736084,
|
| 510 |
+
"learning_rate": 1.3e-06,
|
| 511 |
+
"loss": 0.6641,
|
| 512 |
"step": 65
|
| 513 |
},
|
| 514 |
{
|
| 515 |
"epoch": 0.6226415094339622,
|
| 516 |
+
"grad_norm": 21.903213500976562,
|
| 517 |
+
"learning_rate": 1.32e-06,
|
| 518 |
+
"loss": 0.7061,
|
| 519 |
"step": 66
|
| 520 |
},
|
| 521 |
{
|
| 522 |
"epoch": 0.6320754716981132,
|
| 523 |
+
"grad_norm": 14.298906326293945,
|
| 524 |
+
"learning_rate": 1.34e-06,
|
| 525 |
+
"loss": 0.5415,
|
| 526 |
"step": 67
|
| 527 |
},
|
| 528 |
{
|
| 529 |
"epoch": 0.6415094339622641,
|
| 530 |
+
"grad_norm": 8.386432647705078,
|
| 531 |
+
"learning_rate": 1.3600000000000001e-06,
|
| 532 |
+
"loss": 0.5722,
|
| 533 |
"step": 68
|
| 534 |
},
|
| 535 |
{
|
| 536 |
"epoch": 0.6509433962264151,
|
| 537 |
+
"grad_norm": 6.960066318511963,
|
| 538 |
+
"learning_rate": 1.38e-06,
|
| 539 |
+
"loss": 0.616,
|
| 540 |
"step": 69
|
| 541 |
},
|
| 542 |
{
|
| 543 |
"epoch": 0.660377358490566,
|
| 544 |
+
"grad_norm": 34.883724212646484,
|
| 545 |
+
"learning_rate": 1.4e-06,
|
| 546 |
+
"loss": 0.6072,
|
| 547 |
+
"step": 70
|
| 548 |
+
},
|
| 549 |
+
{
|
| 550 |
+
"epoch": 0.660377358490566,
|
| 551 |
+
"eval_loss": 0.6845781803131104,
|
| 552 |
+
"eval_runtime": 19.1342,
|
| 553 |
+
"eval_samples_per_second": 15.417,
|
| 554 |
+
"eval_steps_per_second": 3.083,
|
| 555 |
"step": 70
|
| 556 |
},
|
| 557 |
{
|
| 558 |
"epoch": 0.6698113207547169,
|
| 559 |
+
"grad_norm": 6.061687469482422,
|
| 560 |
+
"learning_rate": 1.42e-06,
|
| 561 |
+
"loss": 0.6442,
|
| 562 |
"step": 71
|
| 563 |
},
|
| 564 |
{
|
| 565 |
"epoch": 0.6792452830188679,
|
| 566 |
+
"grad_norm": 12.237196922302246,
|
| 567 |
+
"learning_rate": 1.44e-06,
|
| 568 |
+
"loss": 0.6129,
|
| 569 |
"step": 72
|
| 570 |
},
|
| 571 |
{
|
| 572 |
"epoch": 0.6886792452830188,
|
| 573 |
+
"grad_norm": 9.720622062683105,
|
| 574 |
+
"learning_rate": 1.46e-06,
|
| 575 |
+
"loss": 0.5943,
|
| 576 |
"step": 73
|
| 577 |
},
|
| 578 |
{
|
| 579 |
"epoch": 0.6981132075471698,
|
| 580 |
+
"grad_norm": 6.667535305023193,
|
| 581 |
+
"learning_rate": 1.48e-06,
|
| 582 |
+
"loss": 0.5455,
|
| 583 |
"step": 74
|
| 584 |
},
|
| 585 |
{
|
| 586 |
"epoch": 0.7075471698113207,
|
| 587 |
+
"grad_norm": 9.55260181427002,
|
| 588 |
+
"learning_rate": 1.5e-06,
|
| 589 |
+
"loss": 0.5702,
|
| 590 |
"step": 75
|
| 591 |
},
|
| 592 |
{
|
| 593 |
"epoch": 0.7169811320754716,
|
| 594 |
+
"grad_norm": 16.85431671142578,
|
| 595 |
+
"learning_rate": 1.5199999999999998e-06,
|
| 596 |
+
"loss": 0.5913,
|
| 597 |
"step": 76
|
| 598 |
},
|
| 599 |
{
|
| 600 |
"epoch": 0.7264150943396226,
|
| 601 |
+
"grad_norm": 14.90990924835205,
|
| 602 |
+
"learning_rate": 1.5399999999999999e-06,
|
| 603 |
+
"loss": 0.6405,
|
| 604 |
"step": 77
|
| 605 |
},
|
| 606 |
{
|
| 607 |
"epoch": 0.7358490566037735,
|
| 608 |
+
"grad_norm": 21.149423599243164,
|
| 609 |
+
"learning_rate": 1.5599999999999999e-06,
|
| 610 |
+
"loss": 0.6653,
|
| 611 |
"step": 78
|
| 612 |
},
|
| 613 |
{
|
| 614 |
"epoch": 0.7452830188679245,
|
| 615 |
+
"grad_norm": 11.917136192321777,
|
| 616 |
+
"learning_rate": 1.58e-06,
|
| 617 |
+
"loss": 0.6892,
|
| 618 |
"step": 79
|
| 619 |
},
|
| 620 |
{
|
| 621 |
"epoch": 0.7547169811320755,
|
| 622 |
+
"grad_norm": 18.385757446289062,
|
| 623 |
+
"learning_rate": 1.6e-06,
|
| 624 |
+
"loss": 0.5136,
|
| 625 |
"step": 80
|
| 626 |
},
|
| 627 |
{
|
| 628 |
"epoch": 0.7547169811320755,
|
| 629 |
+
"eval_loss": 0.6837261915206909,
|
| 630 |
+
"eval_runtime": 19.1072,
|
| 631 |
+
"eval_samples_per_second": 15.439,
|
| 632 |
+
"eval_steps_per_second": 3.088,
|
| 633 |
"step": 80
|
| 634 |
},
|
| 635 |
{
|
| 636 |
"epoch": 0.7641509433962265,
|
| 637 |
+
"grad_norm": 8.149250984191895,
|
| 638 |
+
"learning_rate": 1.62e-06,
|
| 639 |
+
"loss": 0.7424,
|
| 640 |
"step": 81
|
| 641 |
},
|
| 642 |
{
|
| 643 |
"epoch": 0.7735849056603774,
|
| 644 |
+
"grad_norm": 26.717226028442383,
|
| 645 |
+
"learning_rate": 1.6399999999999998e-06,
|
| 646 |
+
"loss": 0.768,
|
| 647 |
"step": 82
|
| 648 |
},
|
| 649 |
{
|
| 650 |
"epoch": 0.7830188679245284,
|
| 651 |
+
"grad_norm": 9.56679630279541,
|
| 652 |
+
"learning_rate": 1.6599999999999998e-06,
|
| 653 |
+
"loss": 0.5652,
|
| 654 |
"step": 83
|
| 655 |
},
|
| 656 |
{
|
| 657 |
"epoch": 0.7924528301886793,
|
| 658 |
+
"grad_norm": 40.63279342651367,
|
| 659 |
+
"learning_rate": 1.6799999999999998e-06,
|
| 660 |
+
"loss": 0.7145,
|
| 661 |
"step": 84
|
| 662 |
},
|
| 663 |
{
|
| 664 |
"epoch": 0.8018867924528302,
|
| 665 |
+
"grad_norm": 26.526386260986328,
|
| 666 |
+
"learning_rate": 1.6999999999999998e-06,
|
| 667 |
+
"loss": 0.7287,
|
| 668 |
"step": 85
|
| 669 |
},
|
| 670 |
{
|
| 671 |
"epoch": 0.8113207547169812,
|
| 672 |
+
"grad_norm": 20.656476974487305,
|
| 673 |
+
"learning_rate": 1.7199999999999998e-06,
|
| 674 |
+
"loss": 0.8114,
|
| 675 |
"step": 86
|
| 676 |
},
|
| 677 |
{
|
| 678 |
"epoch": 0.8207547169811321,
|
| 679 |
+
"grad_norm": 21.340261459350586,
|
| 680 |
+
"learning_rate": 1.7399999999999999e-06,
|
| 681 |
+
"loss": 0.7421,
|
| 682 |
"step": 87
|
| 683 |
},
|
| 684 |
{
|
| 685 |
"epoch": 0.8301886792452831,
|
| 686 |
+
"grad_norm": 43.33297348022461,
|
| 687 |
+
"learning_rate": 1.7599999999999999e-06,
|
| 688 |
+
"loss": 0.6437,
|
| 689 |
"step": 88
|
| 690 |
},
|
| 691 |
{
|
| 692 |
"epoch": 0.839622641509434,
|
| 693 |
+
"grad_norm": 28.463003158569336,
|
| 694 |
+
"learning_rate": 1.78e-06,
|
| 695 |
+
"loss": 0.6925,
|
| 696 |
"step": 89
|
| 697 |
},
|
| 698 |
{
|
| 699 |
"epoch": 0.8490566037735849,
|
| 700 |
+
"grad_norm": 23.972209930419922,
|
| 701 |
+
"learning_rate": 1.8e-06,
|
| 702 |
+
"loss": 0.798,
|
| 703 |
+
"step": 90
|
| 704 |
+
},
|
| 705 |
+
{
|
| 706 |
+
"epoch": 0.8490566037735849,
|
| 707 |
+
"eval_loss": 0.6815512180328369,
|
| 708 |
+
"eval_runtime": 19.1285,
|
| 709 |
+
"eval_samples_per_second": 15.422,
|
| 710 |
+
"eval_steps_per_second": 3.084,
|
| 711 |
"step": 90
|
| 712 |
},
|
| 713 |
{
|
| 714 |
"epoch": 0.8584905660377359,
|
| 715 |
+
"grad_norm": 7.108420372009277,
|
| 716 |
+
"learning_rate": 1.82e-06,
|
| 717 |
+
"loss": 0.6795,
|
| 718 |
"step": 91
|
| 719 |
},
|
| 720 |
{
|
| 721 |
"epoch": 0.8679245283018868,
|
| 722 |
+
"grad_norm": 7.378752708435059,
|
| 723 |
+
"learning_rate": 1.84e-06,
|
| 724 |
+
"loss": 0.5856,
|
| 725 |
"step": 92
|
| 726 |
},
|
| 727 |
{
|
| 728 |
"epoch": 0.8773584905660378,
|
| 729 |
+
"grad_norm": 9.755034446716309,
|
| 730 |
+
"learning_rate": 1.86e-06,
|
| 731 |
+
"loss": 0.6236,
|
| 732 |
"step": 93
|
| 733 |
},
|
| 734 |
{
|
| 735 |
"epoch": 0.8867924528301887,
|
| 736 |
+
"grad_norm": 14.284013748168945,
|
| 737 |
+
"learning_rate": 1.8799999999999998e-06,
|
| 738 |
+
"loss": 0.6273,
|
| 739 |
"step": 94
|
| 740 |
},
|
| 741 |
{
|
| 742 |
"epoch": 0.8962264150943396,
|
| 743 |
+
"grad_norm": 11.498614311218262,
|
| 744 |
+
"learning_rate": 1.8999999999999998e-06,
|
| 745 |
+
"loss": 0.7204,
|
| 746 |
"step": 95
|
| 747 |
},
|
| 748 |
{
|
| 749 |
"epoch": 0.9056603773584906,
|
| 750 |
+
"grad_norm": 21.997724533081055,
|
| 751 |
+
"learning_rate": 1.92e-06,
|
| 752 |
+
"loss": 0.6552,
|
| 753 |
"step": 96
|
| 754 |
},
|
| 755 |
{
|
| 756 |
"epoch": 0.9150943396226415,
|
| 757 |
+
"grad_norm": 15.58300495147705,
|
| 758 |
+
"learning_rate": 1.94e-06,
|
| 759 |
+
"loss": 0.751,
|
| 760 |
"step": 97
|
| 761 |
},
|
| 762 |
{
|
| 763 |
"epoch": 0.9245283018867925,
|
| 764 |
+
"grad_norm": 10.124507904052734,
|
| 765 |
+
"learning_rate": 1.96e-06,
|
| 766 |
+
"loss": 0.664,
|
| 767 |
"step": 98
|
| 768 |
},
|
| 769 |
{
|
| 770 |
"epoch": 0.9339622641509434,
|
| 771 |
+
"grad_norm": 14.453956604003906,
|
| 772 |
+
"learning_rate": 1.98e-06,
|
| 773 |
+
"loss": 0.6503,
|
| 774 |
"step": 99
|
| 775 |
},
|
| 776 |
{
|
| 777 |
"epoch": 0.9433962264150944,
|
| 778 |
+
"grad_norm": 16.880348205566406,
|
| 779 |
+
"learning_rate": 2e-06,
|
| 780 |
+
"loss": 0.5736,
|
| 781 |
"step": 100
|
| 782 |
},
|
| 783 |
{
|
| 784 |
"epoch": 0.9433962264150944,
|
| 785 |
+
"eval_loss": 0.6825625896453857,
|
| 786 |
+
"eval_runtime": 19.1145,
|
| 787 |
+
"eval_samples_per_second": 15.433,
|
| 788 |
"eval_steps_per_second": 3.087,
|
| 789 |
"step": 100
|
     }
   ],
   "logging_steps": 1,
+  "max_steps": 318,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 50,
   "stateful_callbacks": {
     "TrainerControl": {
       "args": {
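The updated trainer_state.json carries the full log_history for the first 100 steps: a training loss record every step (logging_steps=1) and an eval record every 10 steps (eval_steps=10). A minimal sketch of how it can be inspected once the checkpoint folder is downloaded; the relative path is an assumption about where the folder sits locally:

```python
import json

# Path assumption: the checkpoint-100 folder from this commit, downloaded locally.
with open("checkpoint-100/trainer_state.json") as f:
    state = json.load(f)

print(state["best_metric"])            # 0.6825625896453857
print(state["best_model_checkpoint"])  # ../artifacts/LlaMa3-QLoRA-PatentMatch-v0.1/checkpoint-100

# log_history mixes per-step training records ("loss") with periodic eval records ("eval_loss").
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(len(train_log), "training steps logged,", len(eval_log), "eval points")
print("last train loss:", train_log[-1]["loss"])     # 0.5736 at step 100
print("last eval loss:", eval_log[-1]["eval_loss"])  # 0.6825... at step 100
```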
checkpoint-100/training_args.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:3e439f378b390e89bbeefa59eafafdb1ecc84a940e029c0e74ae9a73bbc405b3
 size 5112
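Taken together, these files are the standard artifacts `transformers.Trainer` writes at a `save_steps` boundary (here save_steps=50, eval_steps=10, logging_steps=1, max_steps=318 per the trainer state) and reads back when training resumes. A hedged sketch of how such a checkpoint is typically consumed; the base model id and the 4-bit loading settings below are assumptions, since the commit itself does not record them:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Assumed base model; the actual base checkpoint is not recorded in this commit.
BASE_MODEL = "meta-llama/Meta-Llama-3-8B"
ADAPTER_DIR = "checkpoint-100"  # folder containing the ~27 MB adapter_model.safetensors

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    # QLoRA-style 4-bit load; the exact quantization config used in training is an assumption.
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)
model = PeftModel.from_pretrained(base, ADAPTER_DIR)  # attaches the LoRA adapter weights

# Resuming training instead goes through the original Trainer/SFT script, e.g.
#   trainer.train(resume_from_checkpoint="checkpoint-100")
# which restores optimizer.pt, scheduler.pt, rng_state.pth and trainer_state.json.
```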