| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9984464008285863, | |
| "eval_steps": 500, | |
| "global_step": 1446, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.020714655618850338, | |
| "grad_norm": 1.7445119619369507, | |
| "learning_rate": 1.3793103448275862e-06, | |
| "loss": 0.6291, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.041429311237700675, | |
| "grad_norm": 1.0750082731246948, | |
| "learning_rate": 2.7586206896551725e-06, | |
| "loss": 0.5465, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.06214396685655101, | |
| "grad_norm": 0.6934411525726318, | |
| "learning_rate": 4.137931034482759e-06, | |
| "loss": 0.3307, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.08285862247540135, | |
| "grad_norm": 0.7975348830223083, | |
| "learning_rate": 5.517241379310345e-06, | |
| "loss": 0.1762, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.10357327809425168, | |
| "grad_norm": 0.4914913773536682, | |
| "learning_rate": 6.896551724137932e-06, | |
| "loss": 0.1247, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.12428793371310203, | |
| "grad_norm": 0.14945407211780548, | |
| "learning_rate": 8.275862068965518e-06, | |
| "loss": 0.094, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.14500258933195237, | |
| "grad_norm": 0.1811889111995697, | |
| "learning_rate": 9.655172413793105e-06, | |
| "loss": 0.0814, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1657172449508027, | |
| "grad_norm": 0.7005437612533569, | |
| "learning_rate": 1.103448275862069e-05, | |
| "loss": 0.0732, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.18643190056965303, | |
| "grad_norm": 0.2819240689277649, | |
| "learning_rate": 1.2413793103448277e-05, | |
| "loss": 0.0695, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.20714655618850336, | |
| "grad_norm": 0.22254888713359833, | |
| "learning_rate": 1.3793103448275863e-05, | |
| "loss": 0.064, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2278612118073537, | |
| "grad_norm": 0.1880355030298233, | |
| "learning_rate": 1.5172413793103448e-05, | |
| "loss": 0.0562, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.24857586742620405, | |
| "grad_norm": 0.24531865119934082, | |
| "learning_rate": 1.6551724137931037e-05, | |
| "loss": 0.0539, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.26929052304505435, | |
| "grad_norm": 0.25905728340148926, | |
| "learning_rate": 1.7931034482758623e-05, | |
| "loss": 0.051, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.29000517866390474, | |
| "grad_norm": 0.21914663910865784, | |
| "learning_rate": 1.931034482758621e-05, | |
| "loss": 0.0492, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.31071983428275507, | |
| "grad_norm": 0.3093046545982361, | |
| "learning_rate": 1.9999271130311727e-05, | |
| "loss": 0.0428, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3314344899016054, | |
| "grad_norm": 0.17949309945106506, | |
| "learning_rate": 1.999344081029126e-05, | |
| "loss": 0.0433, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.35214914552045573, | |
| "grad_norm": 0.3568534255027771, | |
| "learning_rate": 1.9981783569761265e-05, | |
| "loss": 0.0409, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.37286380113930606, | |
| "grad_norm": 0.17900176346302032, | |
| "learning_rate": 1.9964306205761438e-05, | |
| "loss": 0.0395, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3935784567581564, | |
| "grad_norm": 0.2819104492664337, | |
| "learning_rate": 1.9941018908897066e-05, | |
| "loss": 0.0392, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.4142931123770067, | |
| "grad_norm": 0.8110898733139038, | |
| "learning_rate": 1.9911935257397142e-05, | |
| "loss": 0.0385, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.43500776799585705, | |
| "grad_norm": 0.1698705404996872, | |
| "learning_rate": 1.987707220919723e-05, | |
| "loss": 0.0388, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.4557224236147074, | |
| "grad_norm": 0.24668708443641663, | |
| "learning_rate": 1.9836450092051755e-05, | |
| "loss": 0.0355, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.47643707923355777, | |
| "grad_norm": 0.30196627974510193, | |
| "learning_rate": 1.979009259168138e-05, | |
| "loss": 0.0343, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.4971517348524081, | |
| "grad_norm": 0.4836544990539551, | |
| "learning_rate": 1.973802673796249e-05, | |
| "loss": 0.0342, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.5178663904712584, | |
| "grad_norm": 0.1502821296453476, | |
| "learning_rate": 1.9680282889166727e-05, | |
| "loss": 0.0327, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5385810460901087, | |
| "grad_norm": 0.30074331164360046, | |
| "learning_rate": 1.96168947142599e-05, | |
| "loss": 0.0326, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.559295701708959, | |
| "grad_norm": 0.2049945890903473, | |
| "learning_rate": 1.954789917327043e-05, | |
| "loss": 0.0308, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5800103573278095, | |
| "grad_norm": 0.4290943145751953, | |
| "learning_rate": 1.947333649573892e-05, | |
| "loss": 0.031, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.6007250129466598, | |
| "grad_norm": 0.19926780462265015, | |
| "learning_rate": 1.939325015726134e-05, | |
| "loss": 0.0312, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.6214396685655101, | |
| "grad_norm": 0.15679922699928284, | |
| "learning_rate": 1.9307686854139493e-05, | |
| "loss": 0.0295, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6421543241843605, | |
| "grad_norm": 0.18857069313526154, | |
| "learning_rate": 1.921669647615362e-05, | |
| "loss": 0.029, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6628689798032108, | |
| "grad_norm": 0.1605505794286728, | |
| "learning_rate": 1.912033207747292e-05, | |
| "loss": 0.0293, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6835836354220611, | |
| "grad_norm": 0.20999445021152496, | |
| "learning_rate": 1.9018649845721034e-05, | |
| "loss": 0.0276, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.7042982910409115, | |
| "grad_norm": 0.13596299290657043, | |
| "learning_rate": 1.891170906921445e-05, | |
| "loss": 0.0277, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.7250129466597618, | |
| "grad_norm": 0.16931457817554474, | |
| "learning_rate": 1.879957210239302e-05, | |
| "loss": 0.0276, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7457276022786121, | |
| "grad_norm": 0.16998381912708282, | |
| "learning_rate": 1.868230432946268e-05, | |
| "loss": 0.0282, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7664422578974625, | |
| "grad_norm": 0.1367356926202774, | |
| "learning_rate": 1.855997412627156e-05, | |
| "loss": 0.0263, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.7871569135163128, | |
| "grad_norm": 0.22779536247253418, | |
| "learning_rate": 1.843265282044179e-05, | |
| "loss": 0.0268, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.8078715691351631, | |
| "grad_norm": 0.2417256385087967, | |
| "learning_rate": 1.830041464978018e-05, | |
| "loss": 0.0253, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.8285862247540134, | |
| "grad_norm": 0.20256799459457397, | |
| "learning_rate": 1.816333671899205e-05, | |
| "loss": 0.0276, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8493008803728638, | |
| "grad_norm": 0.14771951735019684, | |
| "learning_rate": 1.802149895472343e-05, | |
| "loss": 0.0262, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.8700155359917141, | |
| "grad_norm": 0.09515649080276489, | |
| "learning_rate": 1.7874984058957885e-05, | |
| "loss": 0.025, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.8907301916105644, | |
| "grad_norm": 0.12947799265384674, | |
| "learning_rate": 1.7723877460795094e-05, | |
| "loss": 0.0243, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.9114448472294148, | |
| "grad_norm": 0.16538962721824646, | |
| "learning_rate": 1.7568267266639313e-05, | |
| "loss": 0.0255, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.9321595028482651, | |
| "grad_norm": 0.121688112616539, | |
| "learning_rate": 1.7408244208826782e-05, | |
| "loss": 0.0268, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9528741584671155, | |
| "grad_norm": 0.18749986588954926, | |
| "learning_rate": 1.724390159272202e-05, | |
| "loss": 0.0249, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.9735888140859659, | |
| "grad_norm": 0.14363788068294525, | |
| "learning_rate": 1.7075335242313834e-05, | |
| "loss": 0.0245, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.9943034697048162, | |
| "grad_norm": 0.138985276222229, | |
| "learning_rate": 1.690264344434281e-05, | |
| "loss": 0.025, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.0165717244950803, | |
| "grad_norm": 0.0923820212483406, | |
| "learning_rate": 1.6725926890992788e-05, | |
| "loss": 0.0255, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.0372863801139307, | |
| "grad_norm": 0.24760648608207703, | |
| "learning_rate": 1.6545288621179875e-05, | |
| "loss": 0.0227, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.058001035732781, | |
| "grad_norm": 0.11909480392932892, | |
| "learning_rate": 1.6360833960473015e-05, | |
| "loss": 0.0216, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.0787156913516314, | |
| "grad_norm": 0.131114661693573, | |
| "learning_rate": 1.6172670459681385e-05, | |
| "loss": 0.0227, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.0994303469704816, | |
| "grad_norm": 0.09913528710603714, | |
| "learning_rate": 1.5980907832144233e-05, | |
| "loss": 0.0225, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.120145002589332, | |
| "grad_norm": 0.22871269285678864, | |
| "learning_rate": 1.5785657889759843e-05, | |
| "loss": 0.0228, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.1408596582081822, | |
| "grad_norm": 0.29105865955352783, | |
| "learning_rate": 1.5587034477790856e-05, | |
| "loss": 0.0225, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.1615743138270327, | |
| "grad_norm": 0.10137622058391571, | |
| "learning_rate": 1.5385153408484015e-05, | |
| "loss": 0.0215, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.182288969445883, | |
| "grad_norm": 0.08646316826343536, | |
| "learning_rate": 1.5180132393542978e-05, | |
| "loss": 0.0207, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.2030036250647334, | |
| "grad_norm": 0.10893907397985458, | |
| "learning_rate": 1.4972090975493637e-05, | |
| "loss": 0.0194, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.2237182806835836, | |
| "grad_norm": 0.09016920626163483, | |
| "learning_rate": 1.4761150457981926e-05, | |
| "loss": 0.0207, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.244432936302434, | |
| "grad_norm": 0.20583857595920563, | |
| "learning_rate": 1.4547433835044744e-05, | |
| "loss": 0.021, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.2651475919212842, | |
| "grad_norm": 0.10427103191614151, | |
| "learning_rate": 1.4331065719395274e-05, | |
| "loss": 0.0211, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.2858622475401347, | |
| "grad_norm": 0.2579711377620697, | |
| "learning_rate": 1.4112172269764493e-05, | |
| "loss": 0.021, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.3065769031589851, | |
| "grad_norm": 0.1089586690068245, | |
| "learning_rate": 1.3890881117341222e-05, | |
| "loss": 0.0193, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.3272915587778353, | |
| "grad_norm": 0.1397068202495575, | |
| "learning_rate": 1.3667321291353641e-05, | |
| "loss": 0.0214, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.3480062143966856, | |
| "grad_norm": 0.10761136561632156, | |
| "learning_rate": 1.344162314383564e-05, | |
| "loss": 0.0212, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.368720870015536, | |
| "grad_norm": 0.12136030197143555, | |
| "learning_rate": 1.3213918273621869e-05, | |
| "loss": 0.0199, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.3894355256343864, | |
| "grad_norm": 0.14904293417930603, | |
| "learning_rate": 1.2984339449615797e-05, | |
| "loss": 0.0207, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.4101501812532367, | |
| "grad_norm": 0.11436508595943451, | |
| "learning_rate": 1.2753020533375584e-05, | |
| "loss": 0.0205, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.4308648368720869, | |
| "grad_norm": 0.11292333900928497, | |
| "learning_rate": 1.2520096401062787e-05, | |
| "loss": 0.0194, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.4515794924909373, | |
| "grad_norm": 0.10446956008672714, | |
| "learning_rate": 1.228570286479953e-05, | |
| "loss": 0.0187, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.4722941481097878, | |
| "grad_norm": 0.10512887686491013, | |
| "learning_rate": 1.2049976593479902e-05, | |
| "loss": 0.0202, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.493008803728638, | |
| "grad_norm": 0.2583148181438446, | |
| "learning_rate": 1.1813055033081836e-05, | |
| "loss": 0.0186, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.5137234593474882, | |
| "grad_norm": 0.29421666264533997, | |
| "learning_rate": 1.157507632652583e-05, | |
| "loss": 0.0217, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.5344381149663386, | |
| "grad_norm": 0.2522526681423187, | |
| "learning_rate": 1.1336179233127364e-05, | |
| "loss": 0.0187, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.555152770585189, | |
| "grad_norm": 0.10002532601356506, | |
| "learning_rate": 1.1096503047689855e-05, | |
| "loss": 0.0181, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.5758674262040393, | |
| "grad_norm": 0.1013544574379921, | |
| "learning_rate": 1.0856187519285433e-05, | |
| "loss": 0.0186, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.5965820818228897, | |
| "grad_norm": 0.16282466053962708, | |
| "learning_rate": 1.0615372769770798e-05, | |
| "loss": 0.0202, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.6172967374417402, | |
| "grad_norm": 0.11279471963644028, | |
| "learning_rate": 1.0374199212085761e-05, | |
| "loss": 0.0189, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.6380113930605904, | |
| "grad_norm": 0.08985172212123871, | |
| "learning_rate": 1.0132807468382016e-05, | |
| "loss": 0.0184, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.6587260486794406, | |
| "grad_norm": 0.0785251185297966, | |
| "learning_rate": 9.891338288029974e-06, | |
| "loss": 0.0208, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.679440704298291, | |
| "grad_norm": 0.09164576977491379, | |
| "learning_rate": 9.649932465551373e-06, | |
| "loss": 0.0182, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.7001553599171415, | |
| "grad_norm": 0.0713551864027977, | |
| "learning_rate": 9.408730758525588e-06, | |
| "loss": 0.0199, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.7208700155359917, | |
| "grad_norm": 0.08521983027458191, | |
| "learning_rate": 9.167873805517483e-06, | |
| "loss": 0.0194, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.741584671154842, | |
| "grad_norm": 0.09668238461017609, | |
| "learning_rate": 8.927502044074642e-06, | |
| "loss": 0.0193, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.7622993267736924, | |
| "grad_norm": 0.08275121450424194, | |
| "learning_rate": 8.68775562884183e-06, | |
| "loss": 0.0191, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.7830139823925428, | |
| "grad_norm": 0.09222189337015152, | |
| "learning_rate": 8.448774349840365e-06, | |
| "loss": 0.018, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.803728638011393, | |
| "grad_norm": 0.07218069583177567, | |
| "learning_rate": 8.210697550960157e-06, | |
| "loss": 0.0187, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.8244432936302433, | |
| "grad_norm": 0.09693553298711777, | |
| "learning_rate": 7.973664048711805e-06, | |
| "loss": 0.0179, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.8451579492490937, | |
| "grad_norm": 0.08260560035705566, | |
| "learning_rate": 7.73781205128626e-06, | |
| "loss": 0.0177, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.8658726048679442, | |
| "grad_norm": 0.08100268244743347, | |
| "learning_rate": 7.503279077969126e-06, | |
| "loss": 0.02, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.8865872604867944, | |
| "grad_norm": 0.16011330485343933, | |
| "learning_rate": 7.270201878956692e-06, | |
| "loss": 0.0176, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.9073019161056446, | |
| "grad_norm": 0.07956048846244812, | |
| "learning_rate": 7.038716355620364e-06, | |
| "loss": 0.0179, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.928016571724495, | |
| "grad_norm": 0.0816517025232315, | |
| "learning_rate": 6.8089574812660604e-06, | |
| "loss": 0.0169, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.9487312273433455, | |
| "grad_norm": 0.07489065825939178, | |
| "learning_rate": 6.581059222434696e-06, | |
| "loss": 0.0188, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.9694458829621957, | |
| "grad_norm": 0.10620886832475662, | |
| "learning_rate": 6.355154460789721e-06, | |
| "loss": 0.018, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.9901605385810461, | |
| "grad_norm": 0.10104135423898697, | |
| "learning_rate": 6.131374915637207e-06, | |
| "loss": 0.0178, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 2.0124287933713103, | |
| "grad_norm": 0.11048119515180588, | |
| "learning_rate": 5.90985106712369e-06, | |
| "loss": 0.0184, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 2.0331434489901605, | |
| "grad_norm": 0.07706999778747559, | |
| "learning_rate": 5.690712080156507e-06, | |
| "loss": 0.0148, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 2.0538581046090107, | |
| "grad_norm": 0.08715096116065979, | |
| "learning_rate": 5.474085729091039e-06, | |
| "loss": 0.0168, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 2.0745727602278614, | |
| "grad_norm": 0.07319490611553192, | |
| "learning_rate": 5.260098323228742e-06, | |
| "loss": 0.0153, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.0952874158467116, | |
| "grad_norm": 0.06975601613521576, | |
| "learning_rate": 5.048874633169427e-06, | |
| "loss": 0.0157, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 2.116002071465562, | |
| "grad_norm": 0.07877081632614136, | |
| "learning_rate": 4.840537818060685e-06, | |
| "loss": 0.0162, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 2.136716727084412, | |
| "grad_norm": 0.07609053701162338, | |
| "learning_rate": 4.635209353786935e-06, | |
| "loss": 0.0156, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 2.1574313827032627, | |
| "grad_norm": 0.08297361433506012, | |
| "learning_rate": 4.433008962139934e-06, | |
| "loss": 0.0166, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 2.178146038322113, | |
| "grad_norm": 0.13637958467006683, | |
| "learning_rate": 4.234054541012069e-06, | |
| "loss": 0.016, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.198860693940963, | |
| "grad_norm": 0.07267557829618454, | |
| "learning_rate": 4.038462095653071e-06, | |
| "loss": 0.0167, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 2.2195753495598134, | |
| "grad_norm": 0.08389929682016373, | |
| "learning_rate": 3.846345671030335e-06, | |
| "loss": 0.0157, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 2.240290005178664, | |
| "grad_norm": 0.07261879742145538, | |
| "learning_rate": 3.657817285332198e-06, | |
| "loss": 0.0163, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 2.2610046607975143, | |
| "grad_norm": 0.060720350593328476, | |
| "learning_rate": 3.472986864653004e-06, | |
| "loss": 0.0163, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 2.2817193164163645, | |
| "grad_norm": 0.07147302478551865, | |
| "learning_rate": 3.291962178897977e-06, | |
| "loss": 0.0149, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.3024339720352147, | |
| "grad_norm": 0.084414042532444, | |
| "learning_rate": 3.1148487789453573e-06, | |
| "loss": 0.0159, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 2.3231486276540654, | |
| "grad_norm": 0.0757676213979721, | |
| "learning_rate": 2.9417499351023683e-06, | |
| "loss": 0.0159, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 2.3438632832729156, | |
| "grad_norm": 0.06704848259687424, | |
| "learning_rate": 2.772766576890934e-06, | |
| "loss": 0.015, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.364577938891766, | |
| "grad_norm": 0.07221634685993195, | |
| "learning_rate": 2.6079972341982495e-06, | |
| "loss": 0.0145, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 2.3852925945106165, | |
| "grad_norm": 0.08170408010482788, | |
| "learning_rate": 2.4475379798265175e-06, | |
| "loss": 0.0161, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.4060072501294667, | |
| "grad_norm": 0.10875322669744492, | |
| "learning_rate": 2.29148237347534e-06, | |
| "loss": 0.0144, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 2.426721905748317, | |
| "grad_norm": 0.076971136033535, | |
| "learning_rate": 2.139921407189459e-06, | |
| "loss": 0.0156, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 2.447436561367167, | |
| "grad_norm": 0.07969781011343002, | |
| "learning_rate": 1.992943452303596e-06, | |
| "loss": 0.0149, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 2.4681512169860174, | |
| "grad_norm": 0.0808907300233841, | |
| "learning_rate": 1.8506342079153904e-06, | |
| "loss": 0.0162, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 2.488865872604868, | |
| "grad_norm": 0.07730656862258911, | |
| "learning_rate": 1.7130766509164464e-06, | |
| "loss": 0.0157, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.5095805282237182, | |
| "grad_norm": 0.07149532437324524, | |
| "learning_rate": 1.5803509876106094e-06, | |
| "loss": 0.0148, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.5302951838425685, | |
| "grad_norm": 0.080299012362957, | |
| "learning_rate": 1.4525346069477253e-06, | |
| "loss": 0.0155, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 2.551009839461419, | |
| "grad_norm": 0.06479936093091965, | |
| "learning_rate": 1.3297020354001121e-06, | |
| "loss": 0.0144, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 2.5717244950802693, | |
| "grad_norm": 0.07577677816152573, | |
| "learning_rate": 1.2119248935080863e-06, | |
| "loss": 0.0145, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 2.5924391506991196, | |
| "grad_norm": 0.06705212593078613, | |
| "learning_rate": 1.0992718541198323e-06, | |
| "loss": 0.015, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.6131538063179702, | |
| "grad_norm": 0.07738765329122543, | |
| "learning_rate": 9.918086023500318e-07, | |
| "loss": 0.0157, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 2.6338684619368204, | |
| "grad_norm": 0.06520453840494156, | |
| "learning_rate": 8.89597797280537e-07, | |
| "loss": 0.0148, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 2.6545831175556707, | |
| "grad_norm": 0.061123792082071304, | |
| "learning_rate": 7.926990354254749e-07, | |
| "loss": 0.0147, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 2.675297773174521, | |
| "grad_norm": 0.062305476516485214, | |
| "learning_rate": 7.011688159820152e-07, | |
| "loss": 0.0146, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.696012428793371, | |
| "grad_norm": 0.08513689786195755, | |
| "learning_rate": 6.150605078871486e-07, | |
| "loss": 0.0157, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.7167270844122218, | |
| "grad_norm": 0.06931740045547485, | |
| "learning_rate": 5.344243186996123e-07, | |
| "loss": 0.0141, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.737441740031072, | |
| "grad_norm": 0.06409303843975067, | |
| "learning_rate": 4.5930726532514956e-07, | |
| "loss": 0.0144, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.758156395649922, | |
| "grad_norm": 0.06241234764456749, | |
| "learning_rate": 3.897531466021387e-07, | |
| "loss": 0.0144, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.778871051268773, | |
| "grad_norm": 0.06524225324392319, | |
| "learning_rate": 3.2580251776361703e-07, | |
| "loss": 0.0144, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.799585706887623, | |
| "grad_norm": 0.06363238394260406, | |
| "learning_rate": 2.674926667905575e-07, | |
| "loss": 0.0138, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.8203003625064733, | |
| "grad_norm": 0.05807124823331833, | |
| "learning_rate": 2.148575926701957e-07, | |
| "loss": 0.0148, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.8410150181253235, | |
| "grad_norm": 0.06564666330814362, | |
| "learning_rate": 1.6792798557208612e-07, | |
| "loss": 0.0149, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.8617296737441738, | |
| "grad_norm": 0.06005607545375824, | |
| "learning_rate": 1.2673120895345002e-07, | |
| "loss": 0.0151, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.8824443293630244, | |
| "grad_norm": 0.08954086899757385, | |
| "learning_rate": 9.129128360424011e-08, | |
| "loss": 0.0151, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.9031589849818746, | |
| "grad_norm": 0.06695859879255295, | |
| "learning_rate": 6.162887364122961e-08, | |
| "loss": 0.0156, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.923873640600725, | |
| "grad_norm": 0.06316748261451721, | |
| "learning_rate": 3.7761274459293674e-08, | |
| "loss": 0.0148, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.9445882962195755, | |
| "grad_norm": 0.0705583244562149, | |
| "learning_rate": 1.970240264690615e-08, | |
| "loss": 0.0145, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.9653029518384257, | |
| "grad_norm": 0.07299486547708511, | |
| "learning_rate": 7.462787871729272e-09, | |
| "loss": 0.015, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.986017607457276, | |
| "grad_norm": 0.06303199380636215, | |
| "learning_rate": 1.0495667410359567e-09, | |
| "loss": 0.0149, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.9984464008285863, | |
| "step": 1446, | |
| "total_flos": 5.658275230980768e+19, | |
| "train_loss": 0.035626511010927446, | |
| "train_runtime": 182367.5166, | |
| "train_samples_per_second": 1.524, | |
| "train_steps_per_second": 0.008 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1446, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.658275230980768e+19, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |