{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04184100418410042, "grad_norm": 0.2925145155398498, "learning_rate": 3.7500000000000005e-06, "loss": 0.0243, "step": 10 }, { "epoch": 0.08368200836820083, "grad_norm": 0.09332366083077137, "learning_rate": 7.916666666666667e-06, "loss": 0.0066, "step": 20 }, { "epoch": 0.12552301255230125, "grad_norm": 0.037000329123698675, "learning_rate": 1.2083333333333333e-05, "loss": 0.0044, "step": 30 }, { "epoch": 0.16736401673640167, "grad_norm": 0.024975645672741548, "learning_rate": 1.6250000000000002e-05, "loss": 0.0028, "step": 40 }, { "epoch": 0.20920502092050208, "grad_norm": 0.033927305503936314, "learning_rate": 1.9999733110857237e-05, "loss": 0.0025, "step": 50 }, { "epoch": 0.2510460251046025, "grad_norm": 0.014612909987451435, "learning_rate": 1.9967723647752463e-05, "loss": 0.0018, "step": 60 }, { "epoch": 0.2928870292887029, "grad_norm": 0.017483510752084472, "learning_rate": 1.988253206622306e-05, "loss": 0.0019, "step": 70 }, { "epoch": 0.33472803347280333, "grad_norm": 0.014386718401807688, "learning_rate": 1.9744612900216588e-05, "loss": 0.0026, "step": 80 }, { "epoch": 0.37656903765690375, "grad_norm": 0.039483682416672744, "learning_rate": 1.9554702008157567e-05, "loss": 0.0017, "step": 90 }, { "epoch": 0.41841004184100417, "grad_norm": 0.017654984549017917, "learning_rate": 1.9313812646824432e-05, "loss": 0.0013, "step": 100 }, { "epoch": 0.4602510460251046, "grad_norm": 0.00968661778293667, "learning_rate": 1.9023230065186192e-05, "loss": 0.0031, "step": 110 }, { "epoch": 0.502092050209205, "grad_norm": 0.023973932216176105, "learning_rate": 1.8684504647043093e-05, "loss": 0.0022, "step": 120 }, { "epoch": 0.5439330543933054, "grad_norm": 0.021080771265352845, "learning_rate": 1.8299443639058238e-05, "loss": 0.0024, "step": 130 }, { "epoch": 0.5857740585774058, "grad_norm": 0.02233998245288774, "learning_rate": 1.7870101508314686e-05, "loss": 0.0008, "step": 140 }, { "epoch": 0.6276150627615062, "grad_norm": 0.015428869613923153, "learning_rate": 1.7398768980844664e-05, "loss": 0.002, "step": 150 }, { "epoch": 0.6694560669456067, "grad_norm": 0.011099263805834297, "learning_rate": 1.6887960819615025e-05, "loss": 0.0011, "step": 160 }, { "epoch": 0.7112970711297071, "grad_norm": 0.015474457950600984, "learning_rate": 1.634040240717878e-05, "loss": 0.0008, "step": 170 }, { "epoch": 0.7531380753138075, "grad_norm": 0.016896916614250686, "learning_rate": 1.5759015204579958e-05, "loss": 0.0012, "step": 180 }, { "epoch": 0.7949790794979079, "grad_norm": 0.010764154560653242, "learning_rate": 1.5146901164094914e-05, "loss": 0.0013, "step": 190 }, { "epoch": 0.8368200836820083, "grad_norm": 0.02624812740713589, "learning_rate": 1.4507326178974789e-05, "loss": 0.0021, "step": 200 }, { "epoch": 0.8786610878661087, "grad_norm": 0.02213838592470387, "learning_rate": 1.3843702658491961e-05, "loss": 0.0014, "step": 210 }, { "epoch": 0.9205020920502092, "grad_norm": 0.03256425649897184, "learning_rate": 1.3159571321260114e-05, "loss": 0.0007, "step": 220 }, { "epoch": 0.9623430962343096, "grad_norm": 0.008534054334232773, "learning_rate": 1.2458582303968466e-05, "loss": 0.0011, "step": 230 }, { "epoch": 1.00418410041841, "grad_norm": 0.015659822283413834, "learning_rate": 1.1744475686323225e-05, "loss": 0.0018, "step": 240 }, { "epoch": 1.0460251046025104, "grad_norm": 0.009318785906390915, "learning_rate": 1.1021061536104093e-05, "loss": 0.0007, "step": 250 }, { "epoch": 1.0878661087866108, "grad_norm": 0.02012715152590903, "learning_rate": 1.02921995808042e-05, "loss": 0.0008, "step": 260 }, { "epoch": 1.1297071129707112, "grad_norm": 0.01535009737960017, "learning_rate": 9.561778614313876e-06, "loss": 0.0005, "step": 270 }, { "epoch": 1.1715481171548117, "grad_norm": 0.011963284930504645, "learning_rate": 8.833695748522702e-06, "loss": 0.0011, "step": 280 }, { "epoch": 1.213389121338912, "grad_norm": 0.01174691491766897, "learning_rate": 8.111835620541397e-06, "loss": 0.0008, "step": 290 }, { "epoch": 1.2552301255230125, "grad_norm": 0.015140100128090778, "learning_rate": 7.400049666482061e-06, "loss": 0.0005, "step": 300 }, { "epoch": 1.297071129707113, "grad_norm": 0.0082059300361877, "learning_rate": 6.702135572380078e-06, "loss": 0.001, "step": 310 }, { "epoch": 1.3389121338912133, "grad_norm": 0.004079307018079572, "learning_rate": 6.021817011896004e-06, "loss": 0.0014, "step": 320 }, { "epoch": 1.3807531380753137, "grad_norm": 0.01109473064765619, "learning_rate": 5.362723778905427e-06, "loss": 0.001, "step": 330 }, { "epoch": 1.4225941422594142, "grad_norm": 0.0015855550385638771, "learning_rate": 4.728372420978119e-06, "loss": 0.0012, "step": 340 }, { "epoch": 1.4644351464435146, "grad_norm": 0.006116117187679222, "learning_rate": 4.12214747707527e-06, "loss": 0.0014, "step": 350 }, { "epoch": 1.506276150627615, "grad_norm": 0.020785630071871383, "learning_rate": 3.5472834195697017e-06, "loss": 0.0005, "step": 360 }, { "epoch": 1.5481171548117154, "grad_norm": 0.01356591109411228, "learning_rate": 3.0068473969362998e-06, "loss": 0.0016, "step": 370 }, { "epoch": 1.5899581589958158, "grad_norm": 0.0455348358926559, "learning_rate": 2.5037228691878424e-06, "loss": 0.0011, "step": 380 }, { "epoch": 1.6317991631799162, "grad_norm": 0.010270285275849788, "learning_rate": 2.0405942233682017e-06, "loss": 0.0011, "step": 390 }, { "epoch": 1.6736401673640167, "grad_norm": 0.009898474319911012, "learning_rate": 1.619932451186048e-06, "loss": 0.0019, "step": 400 }, { "epoch": 1.715481171548117, "grad_norm": 0.025182308499786255, "learning_rate": 1.2439819652049178e-06, "loss": 0.0008, "step": 410 }, { "epoch": 1.7573221757322175, "grad_norm": 0.01013827313449354, "learning_rate": 9.147486239311032e-07, "loss": 0.001, "step": 420 }, { "epoch": 1.799163179916318, "grad_norm": 0.03137196761877556, "learning_rate": 6.339890296906493e-07, "loss": 0.0013, "step": 430 }, { "epoch": 1.8410041841004183, "grad_norm": 0.016893363525461946, "learning_rate": 4.032011563958893e-07, "loss": 0.0014, "step": 440 }, { "epoch": 1.8828451882845187, "grad_norm": 0.015987721995170366, "learning_rate": 2.2361635720651199e-07, "loss": 0.0012, "step": 450 }, { "epoch": 1.9246861924686192, "grad_norm": 0.01083855584874376, "learning_rate": 9.619279472766863e-08, "loss": 0.0008, "step": 460 }, { "epoch": 1.9665271966527196, "grad_norm": 0.009804991431603429, "learning_rate": 2.1610328797904145e-08, "loss": 0.002, "step": 470 }, { "epoch": 2.0, "step": 478, "total_flos": 506334263902208.0, "train_loss": 0.002037838656673496, "train_runtime": 19073.3023, "train_samples_per_second": 6.413, "train_steps_per_second": 0.025 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 506334263902208.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }