{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 1869,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016051364365971106,
      "grad_norm": 0.5444720983505249,
      "learning_rate": 4.973247726056715e-05,
      "loss": 0.6974,
      "step": 10
    },
    {
      "epoch": 0.03210272873194221,
      "grad_norm": 0.46184447407722473,
      "learning_rate": 4.94649545211343e-05,
      "loss": 0.6938,
      "step": 20
    },
    {
      "epoch": 0.048154093097913325,
      "grad_norm": 0.6587526798248291,
      "learning_rate": 4.919743178170145e-05,
      "loss": 0.6926,
      "step": 30
    },
    {
      "epoch": 0.06420545746388442,
      "grad_norm": 2.1942267417907715,
      "learning_rate": 4.8929909042268596e-05,
      "loss": 0.6628,
      "step": 40
    },
    {
      "epoch": 0.08025682182985554,
      "grad_norm": 1.835483431816101,
      "learning_rate": 4.8662386302835744e-05,
      "loss": 0.6777,
      "step": 50
    },
    {
      "epoch": 0.09630818619582665,
      "grad_norm": 1.6423420906066895,
      "learning_rate": 4.839486356340289e-05,
      "loss": 0.6748,
      "step": 60
    },
    {
      "epoch": 0.11235955056179775,
      "grad_norm": 1.618177056312561,
      "learning_rate": 4.812734082397004e-05,
      "loss": 0.6585,
      "step": 70
    },
    {
      "epoch": 0.12841091492776885,
      "grad_norm": 2.7368061542510986,
      "learning_rate": 4.785981808453719e-05,
      "loss": 0.6208,
      "step": 80
    },
    {
      "epoch": 0.14446227929373998,
      "grad_norm": 2.0427486896514893,
      "learning_rate": 4.759229534510434e-05,
      "loss": 0.6447,
      "step": 90
    },
    {
      "epoch": 0.16051364365971107,
      "grad_norm": 2.044725179672241,
      "learning_rate": 4.7324772605671486e-05,
      "loss": 0.6543,
      "step": 100
    },
    {
      "epoch": 0.17656500802568217,
      "grad_norm": 1.4154126644134521,
      "learning_rate": 4.7057249866238635e-05,
      "loss": 0.6295,
      "step": 110
    },
    {
      "epoch": 0.1926163723916533,
      "grad_norm": 1.4071663618087769,
      "learning_rate": 4.678972712680578e-05,
      "loss": 0.6088,
      "step": 120
    },
    {
      "epoch": 0.2086677367576244,
      "grad_norm": 1.5164605379104614,
      "learning_rate": 4.652220438737293e-05,
      "loss": 0.6086,
      "step": 130
    },
    {
      "epoch": 0.2247191011235955,
      "grad_norm": 1.1694097518920898,
      "learning_rate": 4.625468164794008e-05,
      "loss": 0.6131,
      "step": 140
    },
    {
      "epoch": 0.24077046548956663,
      "grad_norm": 2.416905164718628,
      "learning_rate": 4.598715890850723e-05,
      "loss": 0.6392,
      "step": 150
    },
    {
      "epoch": 0.2568218298555377,
      "grad_norm": 1.8865312337875366,
      "learning_rate": 4.571963616907438e-05,
      "loss": 0.5955,
      "step": 160
    },
    {
      "epoch": 0.27287319422150885,
      "grad_norm": 1.4470267295837402,
      "learning_rate": 4.5452113429641525e-05,
      "loss": 0.6245,
      "step": 170
    },
    {
      "epoch": 0.28892455858747995,
      "grad_norm": 2.124974489212036,
      "learning_rate": 4.5184590690208673e-05,
      "loss": 0.6251,
      "step": 180
    },
    {
      "epoch": 0.30497592295345105,
      "grad_norm": 2.1896016597747803,
      "learning_rate": 4.491706795077582e-05,
      "loss": 0.631,
      "step": 190
    },
    {
      "epoch": 0.32102728731942215,
      "grad_norm": 1.449318766593933,
      "learning_rate": 4.4649545211342963e-05,
      "loss": 0.5939,
      "step": 200
    },
    {
      "epoch": 0.33707865168539325,
      "grad_norm": 2.2219743728637695,
      "learning_rate": 4.438202247191011e-05,
      "loss": 0.5746,
      "step": 210
    },
    {
      "epoch": 0.35313001605136435,
      "grad_norm": 2.2409071922302246,
      "learning_rate": 4.411449973247726e-05,
      "loss": 0.6065,
      "step": 220
    },
    {
      "epoch": 0.36918138041733545,
      "grad_norm": 1.1223793029785156,
      "learning_rate": 4.384697699304441e-05,
      "loss": 0.6123,
      "step": 230
    },
    {
      "epoch": 0.3852327447833066,
      "grad_norm": 1.2747622728347778,
      "learning_rate": 4.357945425361156e-05,
      "loss": 0.5952,
      "step": 240
    },
    {
      "epoch": 0.4012841091492777,
      "grad_norm": 1.8558810949325562,
      "learning_rate": 4.3311931514178705e-05,
      "loss": 0.6027,
      "step": 250
    },
    {
      "epoch": 0.4173354735152488,
      "grad_norm": 1.1629371643066406,
      "learning_rate": 4.3044408774745854e-05,
      "loss": 0.6228,
      "step": 260
    },
    {
      "epoch": 0.4333868378812199,
      "grad_norm": 1.4288586378097534,
      "learning_rate": 4.2776886035313e-05,
      "loss": 0.5734,
      "step": 270
    },
    {
      "epoch": 0.449438202247191,
      "grad_norm": 3.359997034072876,
      "learning_rate": 4.250936329588015e-05,
      "loss": 0.6018,
      "step": 280
    },
    {
      "epoch": 0.4654895666131621,
      "grad_norm": 1.8125280141830444,
      "learning_rate": 4.22418405564473e-05,
      "loss": 0.6288,
      "step": 290
    },
    {
      "epoch": 0.48154093097913325,
      "grad_norm": 1.3432456254959106,
      "learning_rate": 4.197431781701445e-05,
      "loss": 0.5756,
      "step": 300
    },
    {
      "epoch": 0.49759229534510435,
      "grad_norm": 3.372265100479126,
      "learning_rate": 4.1706795077581596e-05,
      "loss": 0.6197,
      "step": 310
    },
    {
      "epoch": 0.5136436597110754,
      "grad_norm": 1.7587655782699585,
      "learning_rate": 4.1439272338148744e-05,
      "loss": 0.5808,
      "step": 320
    },
    {
      "epoch": 0.5296950240770465,
      "grad_norm": 2.1404621601104736,
      "learning_rate": 4.117174959871589e-05,
      "loss": 0.6075,
      "step": 330
    },
    {
      "epoch": 0.5457463884430177,
      "grad_norm": 1.6256980895996094,
      "learning_rate": 4.090422685928304e-05,
      "loss": 0.5765,
      "step": 340
    },
    {
      "epoch": 0.5617977528089888,
      "grad_norm": 2.2659549713134766,
      "learning_rate": 4.063670411985019e-05,
      "loss": 0.6305,
      "step": 350
    },
    {
      "epoch": 0.5778491171749599,
      "grad_norm": 1.9907615184783936,
      "learning_rate": 4.036918138041734e-05,
      "loss": 0.5743,
      "step": 360
    },
    {
      "epoch": 0.593900481540931,
      "grad_norm": 2.26408314704895,
      "learning_rate": 4.0101658640984486e-05,
      "loss": 0.5842,
      "step": 370
    },
    {
      "epoch": 0.6099518459069021,
      "grad_norm": 1.9207652807235718,
      "learning_rate": 3.9834135901551634e-05,
      "loss": 0.5719,
      "step": 380
    },
    {
      "epoch": 0.6260032102728732,
      "grad_norm": 2.5978338718414307,
      "learning_rate": 3.956661316211878e-05,
      "loss": 0.5811,
      "step": 390
    },
    {
      "epoch": 0.6420545746388443,
      "grad_norm": 1.6617166996002197,
      "learning_rate": 3.929909042268593e-05,
      "loss": 0.5819,
      "step": 400
    },
    {
      "epoch": 0.6581059390048154,
      "grad_norm": 2.7522661685943604,
      "learning_rate": 3.903156768325308e-05,
      "loss": 0.605,
      "step": 410
    },
    {
      "epoch": 0.6741573033707865,
      "grad_norm": 1.6527293920516968,
      "learning_rate": 3.876404494382023e-05,
      "loss": 0.6036,
      "step": 420
    },
    {
      "epoch": 0.6902086677367576,
      "grad_norm": 1.6553492546081543,
      "learning_rate": 3.8496522204387376e-05,
      "loss": 0.5723,
      "step": 430
    },
    {
      "epoch": 0.7062600321027287,
      "grad_norm": 1.9113073348999023,
      "learning_rate": 3.8228999464954525e-05,
      "loss": 0.5869,
      "step": 440
    },
    {
      "epoch": 0.7223113964686998,
      "grad_norm": 1.493342399597168,
      "learning_rate": 3.796147672552167e-05,
      "loss": 0.5912,
      "step": 450
    },
    {
      "epoch": 0.7383627608346709,
      "grad_norm": 1.3368749618530273,
      "learning_rate": 3.769395398608882e-05,
      "loss": 0.613,
      "step": 460
    },
    {
      "epoch": 0.7544141252006421,
      "grad_norm": 2.699831008911133,
      "learning_rate": 3.742643124665597e-05,
      "loss": 0.5658,
      "step": 470
    },
    {
      "epoch": 0.7704654895666132,
      "grad_norm": 1.0824522972106934,
      "learning_rate": 3.715890850722312e-05,
      "loss": 0.6116,
      "step": 480
    },
    {
      "epoch": 0.7865168539325843,
      "grad_norm": 2.092763900756836,
      "learning_rate": 3.689138576779027e-05,
      "loss": 0.5711,
      "step": 490
    },
    {
      "epoch": 0.8025682182985554,
      "grad_norm": 2.2845699787139893,
      "learning_rate": 3.6623863028357415e-05,
      "loss": 0.5825,
      "step": 500
    },
    {
      "epoch": 0.8186195826645265,
      "grad_norm": 2.034006118774414,
      "learning_rate": 3.6356340288924564e-05,
      "loss": 0.5857,
      "step": 510
    },
    {
      "epoch": 0.8346709470304976,
      "grad_norm": 1.5809577703475952,
      "learning_rate": 3.608881754949171e-05,
      "loss": 0.5539,
      "step": 520
    },
    {
      "epoch": 0.8507223113964687,
      "grad_norm": 2.2914111614227295,
      "learning_rate": 3.582129481005886e-05,
      "loss": 0.5806,
      "step": 530
    },
    {
      "epoch": 0.8667736757624398,
      "grad_norm": 1.6067487001419067,
      "learning_rate": 3.555377207062601e-05,
      "loss": 0.5986,
      "step": 540
    },
    {
      "epoch": 0.8828250401284109,
      "grad_norm": 1.690928339958191,
      "learning_rate": 3.528624933119316e-05,
      "loss": 0.5713,
      "step": 550
    },
    {
      "epoch": 0.898876404494382,
      "grad_norm": 2.0696561336517334,
      "learning_rate": 3.5018726591760305e-05,
      "loss": 0.5601,
      "step": 560
    },
    {
      "epoch": 0.9149277688603531,
      "grad_norm": 1.693708896636963,
      "learning_rate": 3.4751203852327454e-05,
      "loss": 0.5812,
      "step": 570
    },
    {
      "epoch": 0.9309791332263242,
      "grad_norm": 1.5981098413467407,
      "learning_rate": 3.44836811128946e-05,
      "loss": 0.5699,
      "step": 580
    },
    {
      "epoch": 0.9470304975922953,
      "grad_norm": 1.4630780220031738,
      "learning_rate": 3.421615837346175e-05,
      "loss": 0.5917,
      "step": 590
    },
    {
      "epoch": 0.9630818619582665,
      "grad_norm": 1.8144547939300537,
      "learning_rate": 3.394863563402889e-05,
      "loss": 0.5969,
      "step": 600
    },
    {
      "epoch": 0.9791332263242376,
      "grad_norm": 2.242295265197754,
      "learning_rate": 3.368111289459604e-05,
      "loss": 0.5524,
      "step": 610
    },
    {
      "epoch": 0.9951845906902087,
      "grad_norm": 3.1214287281036377,
      "learning_rate": 3.341359015516319e-05,
      "loss": 0.5872,
      "step": 620
    },
    {
      "epoch": 1.0112359550561798,
      "grad_norm": 1.6238343715667725,
      "learning_rate": 3.314606741573034e-05,
      "loss": 0.5715,
      "step": 630
    },
    {
      "epoch": 1.0272873194221508,
      "grad_norm": 1.7471458911895752,
      "learning_rate": 3.2878544676297486e-05,
      "loss": 0.5377,
      "step": 640
    },
    {
      "epoch": 1.043338683788122,
      "grad_norm": 2.1529836654663086,
      "learning_rate": 3.2611021936864634e-05,
      "loss": 0.5542,
      "step": 650
    },
    {
      "epoch": 1.0593900481540932,
      "grad_norm": 2.278169870376587,
      "learning_rate": 3.234349919743178e-05,
      "loss": 0.5616,
      "step": 660
    },
    {
      "epoch": 1.0754414125200642,
      "grad_norm": 2.8219568729400635,
      "learning_rate": 3.207597645799893e-05,
      "loss": 0.5711,
      "step": 670
    },
    {
      "epoch": 1.0914927768860354,
      "grad_norm": 1.5639195442199707,
      "learning_rate": 3.180845371856608e-05,
      "loss": 0.5439,
      "step": 680
    },
    {
      "epoch": 1.1075441412520064,
      "grad_norm": 2.146303415298462,
      "learning_rate": 3.154093097913323e-05,
      "loss": 0.525,
      "step": 690
    },
    {
      "epoch": 1.1235955056179776,
      "grad_norm": 1.7315692901611328,
      "learning_rate": 3.1273408239700376e-05,
      "loss": 0.5521,
      "step": 700
    },
    {
      "epoch": 1.1396468699839486,
      "grad_norm": 2.457808494567871,
      "learning_rate": 3.1005885500267525e-05,
      "loss": 0.5209,
      "step": 710
    },
    {
      "epoch": 1.1556982343499198,
      "grad_norm": 2.4021546840667725,
      "learning_rate": 3.073836276083467e-05,
      "loss": 0.5056,
      "step": 720
    },
    {
      "epoch": 1.1717495987158908,
      "grad_norm": 3.025860548019409,
      "learning_rate": 3.047084002140182e-05,
      "loss": 0.4884,
      "step": 730
    },
    {
      "epoch": 1.187800963081862,
      "grad_norm": 3.1893551349639893,
      "learning_rate": 3.0203317281968966e-05,
      "loss": 0.5203,
      "step": 740
    },
    {
      "epoch": 1.203852327447833,
      "grad_norm": 4.527679443359375,
      "learning_rate": 2.9935794542536115e-05,
      "loss": 0.5114,
      "step": 750
    },
    {
      "epoch": 1.2199036918138042,
      "grad_norm": 3.021358013153076,
      "learning_rate": 2.9668271803103263e-05,
      "loss": 0.5408,
      "step": 760
    },
    {
      "epoch": 1.2359550561797752,
      "grad_norm": 3.0941548347473145,
      "learning_rate": 2.940074906367041e-05,
      "loss": 0.5122,
      "step": 770
    },
    {
      "epoch": 1.2520064205457464,
      "grad_norm": 2.5667285919189453,
      "learning_rate": 2.913322632423756e-05,
      "loss": 0.5192,
      "step": 780
    },
    {
      "epoch": 1.2680577849117176,
      "grad_norm": 3.7231733798980713,
      "learning_rate": 2.886570358480471e-05,
      "loss": 0.5609,
      "step": 790
    },
    {
      "epoch": 1.2841091492776886,
      "grad_norm": 1.9197113513946533,
      "learning_rate": 2.8598180845371857e-05,
      "loss": 0.4747,
      "step": 800
    },
    {
      "epoch": 1.3001605136436596,
      "grad_norm": 2.612793207168579,
      "learning_rate": 2.8330658105939005e-05,
      "loss": 0.5286,
      "step": 810
    },
    {
      "epoch": 1.3162118780096308,
      "grad_norm": 2.353598117828369,
      "learning_rate": 2.8063135366506153e-05,
      "loss": 0.535,
      "step": 820
    },
    {
      "epoch": 1.332263242375602,
      "grad_norm": 2.9524009227752686,
      "learning_rate": 2.7795612627073302e-05,
      "loss": 0.5402,
      "step": 830
    },
    {
      "epoch": 1.348314606741573,
      "grad_norm": 2.1400082111358643,
      "learning_rate": 2.752808988764045e-05,
      "loss": 0.5487,
      "step": 840
    },
    {
      "epoch": 1.3643659711075442,
      "grad_norm": 2.116978168487549,
      "learning_rate": 2.72605671482076e-05,
      "loss": 0.5586,
      "step": 850
    },
    {
      "epoch": 1.3804173354735152,
      "grad_norm": 2.3243775367736816,
      "learning_rate": 2.6993044408774747e-05,
      "loss": 0.5402,
      "step": 860
    },
    {
      "epoch": 1.3964686998394864,
      "grad_norm": 2.2850890159606934,
      "learning_rate": 2.6725521669341895e-05,
      "loss": 0.5151,
      "step": 870
    },
    {
      "epoch": 1.4125200642054574,
      "grad_norm": 2.305981397628784,
      "learning_rate": 2.6457998929909044e-05,
      "loss": 0.5494,
      "step": 880
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 1.8870021104812622,
      "learning_rate": 2.6190476190476192e-05,
      "loss": 0.5545,
      "step": 890
    },
    {
      "epoch": 1.4446227929373996,
      "grad_norm": 2.1851470470428467,
      "learning_rate": 2.592295345104334e-05,
      "loss": 0.5205,
      "step": 900
    },
    {
      "epoch": 1.4606741573033708,
      "grad_norm": 2.7136600017547607,
      "learning_rate": 2.565543071161049e-05,
      "loss": 0.5427,
      "step": 910
    },
    {
      "epoch": 1.476725521669342,
      "grad_norm": 3.9425771236419678,
      "learning_rate": 2.5387907972177637e-05,
      "loss": 0.5209,
      "step": 920
    },
    {
      "epoch": 1.492776886035313,
      "grad_norm": 2.84690260887146,
      "learning_rate": 2.5120385232744786e-05,
      "loss": 0.5562,
      "step": 930
    },
    {
      "epoch": 1.508828250401284,
      "grad_norm": 2.375824213027954,
      "learning_rate": 2.485286249331193e-05,
      "loss": 0.521,
      "step": 940
    },
    {
      "epoch": 1.5248796147672552,
      "grad_norm": 2.241267681121826,
      "learning_rate": 2.458533975387908e-05,
      "loss": 0.5576,
      "step": 950
    },
    {
      "epoch": 1.5409309791332264,
      "grad_norm": 2.209796190261841,
      "learning_rate": 2.4317817014446228e-05,
      "loss": 0.5034,
      "step": 960
    },
    {
      "epoch": 1.5569823434991974,
      "grad_norm": 2.9751803874969482,
      "learning_rate": 2.4050294275013376e-05,
      "loss": 0.5222,
      "step": 970
    },
    {
      "epoch": 1.5730337078651684,
      "grad_norm": 3.5506584644317627,
      "learning_rate": 2.3782771535580524e-05,
      "loss": 0.5067,
      "step": 980
    },
    {
      "epoch": 1.5890850722311396,
      "grad_norm": 2.4530675411224365,
      "learning_rate": 2.3515248796147673e-05,
      "loss": 0.5262,
      "step": 990
    },
    {
      "epoch": 1.6051364365971108,
      "grad_norm": 2.639045476913452,
      "learning_rate": 2.324772605671482e-05,
      "loss": 0.5396,
      "step": 1000
    },
    {
      "epoch": 1.621187800963082,
      "grad_norm": 3.676542043685913,
      "learning_rate": 2.298020331728197e-05,
      "loss": 0.5462,
      "step": 1010
    },
    {
      "epoch": 1.637239165329053,
      "grad_norm": 2.813171148300171,
      "learning_rate": 2.2712680577849118e-05,
      "loss": 0.5792,
      "step": 1020
    },
    {
      "epoch": 1.653290529695024,
      "grad_norm": 3.5937094688415527,
      "learning_rate": 2.2445157838416266e-05,
      "loss": 0.527,
      "step": 1030
    },
    {
      "epoch": 1.6693418940609952,
      "grad_norm": 2.3738603591918945,
      "learning_rate": 2.2177635098983415e-05,
      "loss": 0.5431,
      "step": 1040
    },
    {
      "epoch": 1.6853932584269664,
      "grad_norm": 2.7167625427246094,
      "learning_rate": 2.1910112359550563e-05,
      "loss": 0.5187,
      "step": 1050
    },
    {
      "epoch": 1.7014446227929374,
      "grad_norm": 2.8609092235565186,
      "learning_rate": 2.164258962011771e-05,
      "loss": 0.5061,
      "step": 1060
    },
    {
      "epoch": 1.7174959871589084,
      "grad_norm": 3.4374756813049316,
      "learning_rate": 2.137506688068486e-05,
      "loss": 0.5301,
      "step": 1070
    },
    {
      "epoch": 1.7335473515248796,
      "grad_norm": 2.3880395889282227,
      "learning_rate": 2.1107544141252008e-05,
      "loss": 0.5572,
      "step": 1080
    },
    {
      "epoch": 1.7495987158908508,
      "grad_norm": 2.2175981998443604,
      "learning_rate": 2.0840021401819157e-05,
      "loss": 0.4862,
      "step": 1090
    },
    {
      "epoch": 1.7656500802568218,
      "grad_norm": 1.8051007986068726,
      "learning_rate": 2.05724986623863e-05,
      "loss": 0.5108,
      "step": 1100
    },
    {
      "epoch": 1.7817014446227928,
      "grad_norm": 3.5168681144714355,
      "learning_rate": 2.030497592295345e-05,
      "loss": 0.538,
      "step": 1110
    },
    {
      "epoch": 1.797752808988764,
      "grad_norm": 2.5539281368255615,
      "learning_rate": 2.00374531835206e-05,
      "loss": 0.5489,
      "step": 1120
    },
    {
      "epoch": 1.8138041733547352,
      "grad_norm": 1.7118955850601196,
      "learning_rate": 1.9769930444087747e-05,
      "loss": 0.5242,
      "step": 1130
    },
    {
      "epoch": 1.8298555377207064,
      "grad_norm": 2.2016992568969727,
      "learning_rate": 1.9502407704654895e-05,
      "loss": 0.5172,
      "step": 1140
    },
    {
      "epoch": 1.8459069020866774,
      "grad_norm": 2.072165012359619,
      "learning_rate": 1.9234884965222044e-05,
      "loss": 0.5216,
      "step": 1150
    },
    {
      "epoch": 1.8619582664526484,
      "grad_norm": 2.446287155151367,
      "learning_rate": 1.8967362225789192e-05,
      "loss": 0.5391,
      "step": 1160
    },
    {
      "epoch": 1.8780096308186196,
      "grad_norm": 2.6297097206115723,
      "learning_rate": 1.869983948635634e-05,
      "loss": 0.5046,
      "step": 1170
    },
    {
      "epoch": 1.8940609951845908,
      "grad_norm": 2.429002285003662,
      "learning_rate": 1.843231674692349e-05,
      "loss": 0.5081,
      "step": 1180
    },
    {
      "epoch": 1.9101123595505618,
      "grad_norm": 2.9561805725097656,
      "learning_rate": 1.8164794007490637e-05,
      "loss": 0.5335,
      "step": 1190
    },
    {
      "epoch": 1.9261637239165328,
      "grad_norm": 2.9521896839141846,
      "learning_rate": 1.7897271268057786e-05,
      "loss": 0.5307,
      "step": 1200
    },
    {
      "epoch": 1.942215088282504,
      "grad_norm": 2.409397602081299,
      "learning_rate": 1.7629748528624934e-05,
      "loss": 0.5014,
      "step": 1210
    },
    {
      "epoch": 1.9582664526484752,
      "grad_norm": 2.046233892440796,
      "learning_rate": 1.7362225789192082e-05,
      "loss": 0.5637,
      "step": 1220
    },
    {
      "epoch": 1.9743178170144462,
      "grad_norm": 2.7884459495544434,
      "learning_rate": 1.709470304975923e-05,
      "loss": 0.5512,
      "step": 1230
    },
    {
      "epoch": 1.9903691813804172,
      "grad_norm": 2.0197086334228516,
      "learning_rate": 1.682718031032638e-05,
      "loss": 0.5112,
      "step": 1240
    },
    {
      "epoch": 2.0064205457463884,
      "grad_norm": 2.801968574523926,
      "learning_rate": 1.6559657570893527e-05,
      "loss": 0.5084,
      "step": 1250
    },
    {
      "epoch": 2.0224719101123596,
      "grad_norm": 2.099778652191162,
      "learning_rate": 1.6292134831460676e-05,
      "loss": 0.4558,
      "step": 1260
    },
    {
      "epoch": 2.038523274478331,
      "grad_norm": 3.728285789489746,
      "learning_rate": 1.6024612092027824e-05,
      "loss": 0.453,
      "step": 1270
    },
    {
      "epoch": 2.0545746388443016,
      "grad_norm": 3.892178773880005,
      "learning_rate": 1.5757089352594973e-05,
      "loss": 0.4347,
      "step": 1280
    },
    {
      "epoch": 2.070626003210273,
      "grad_norm": 3.3752293586730957,
      "learning_rate": 1.548956661316212e-05,
      "loss": 0.4382,
      "step": 1290
    },
    {
      "epoch": 2.086677367576244,
      "grad_norm": 4.784174919128418,
      "learning_rate": 1.5222043873729266e-05,
      "loss": 0.4416,
      "step": 1300
    },
    {
      "epoch": 2.102728731942215,
      "grad_norm": 2.877718448638916,
      "learning_rate": 1.4954521134296414e-05,
      "loss": 0.4402,
      "step": 1310
    },
    {
      "epoch": 2.1187800963081864,
      "grad_norm": 3.0553367137908936,
      "learning_rate": 1.4686998394863563e-05,
      "loss": 0.4508,
      "step": 1320
    },
    {
      "epoch": 2.134831460674157,
      "grad_norm": 3.8250510692596436,
      "learning_rate": 1.4419475655430711e-05,
      "loss": 0.4422,
      "step": 1330
    },
    {
      "epoch": 2.1508828250401284,
      "grad_norm": 3.143554210662842,
      "learning_rate": 1.415195291599786e-05,
      "loss": 0.4572,
      "step": 1340
    },
    {
      "epoch": 2.1669341894060996,
      "grad_norm": 1.8287140130996704,
      "learning_rate": 1.3884430176565008e-05,
      "loss": 0.4287,
      "step": 1350
    },
    {
      "epoch": 2.182985553772071,
      "grad_norm": 5.451256275177002,
      "learning_rate": 1.3616907437132156e-05,
      "loss": 0.4481,
      "step": 1360
    },
    {
      "epoch": 2.1990369181380416,
      "grad_norm": 4.044301986694336,
      "learning_rate": 1.3349384697699305e-05,
      "loss": 0.4448,
      "step": 1370
    },
    {
      "epoch": 2.215088282504013,
      "grad_norm": 4.877999782562256,
      "learning_rate": 1.3081861958266453e-05,
      "loss": 0.4457,
      "step": 1380
    },
    {
      "epoch": 2.231139646869984,
      "grad_norm": 4.929600238800049,
      "learning_rate": 1.2814339218833602e-05,
      "loss": 0.432,
      "step": 1390
    },
    {
      "epoch": 2.247191011235955,
      "grad_norm": 3.4969890117645264,
      "learning_rate": 1.254681647940075e-05,
      "loss": 0.4387,
      "step": 1400
    },
    {
      "epoch": 2.263242375601926,
      "grad_norm": 4.1156086921691895,
      "learning_rate": 1.2279293739967898e-05,
      "loss": 0.4406,
      "step": 1410
    },
    {
      "epoch": 2.279293739967897,
      "grad_norm": 4.0271148681640625,
      "learning_rate": 1.2011771000535047e-05,
      "loss": 0.4283,
      "step": 1420
    },
    {
      "epoch": 2.2953451043338684,
      "grad_norm": 3.6957242488861084,
      "learning_rate": 1.1744248261102195e-05,
      "loss": 0.4424,
      "step": 1430
    },
    {
      "epoch": 2.3113964686998396,
      "grad_norm": 3.1115617752075195,
      "learning_rate": 1.1476725521669344e-05,
      "loss": 0.4475,
      "step": 1440
    },
    {
      "epoch": 2.3274478330658104,
      "grad_norm": 4.844674587249756,
      "learning_rate": 1.120920278223649e-05,
      "loss": 0.4459,
      "step": 1450
    },
    {
      "epoch": 2.3434991974317816,
      "grad_norm": 2.712280750274658,
      "learning_rate": 1.0941680042803639e-05,
      "loss": 0.4219,
      "step": 1460
    },
    {
      "epoch": 2.359550561797753,
      "grad_norm": 2.9259140491485596,
      "learning_rate": 1.0674157303370787e-05,
      "loss": 0.4824,
      "step": 1470
    },
    {
      "epoch": 2.375601926163724,
      "grad_norm": 4.346639633178711,
      "learning_rate": 1.0406634563937935e-05,
      "loss": 0.4364,
      "step": 1480
    },
    {
      "epoch": 2.391653290529695,
      "grad_norm": 2.7212588787078857,
      "learning_rate": 1.0139111824505084e-05,
      "loss": 0.3956,
      "step": 1490
    },
    {
      "epoch": 2.407704654895666,
      "grad_norm": 4.124871253967285,
      "learning_rate": 9.871589085072232e-06,
      "loss": 0.483,
      "step": 1500
    },
    {
      "epoch": 2.423756019261637,
      "grad_norm": 4.835409641265869,
      "learning_rate": 9.60406634563938e-06,
      "loss": 0.4623,
      "step": 1510
    },
    {
      "epoch": 2.4398073836276084,
      "grad_norm": 2.9034523963928223,
      "learning_rate": 9.336543606206529e-06,
      "loss": 0.4208,
      "step": 1520
    },
    {
      "epoch": 2.4558587479935796,
      "grad_norm": 2.569786787033081,
      "learning_rate": 9.069020866773677e-06,
      "loss": 0.4141,
      "step": 1530
    },
    {
      "epoch": 2.4719101123595504,
      "grad_norm": 4.244718551635742,
      "learning_rate": 8.801498127340826e-06,
      "loss": 0.4583,
      "step": 1540
    },
    {
      "epoch": 2.4879614767255216,
      "grad_norm": 4.004569053649902,
      "learning_rate": 8.533975387907972e-06,
      "loss": 0.4266,
      "step": 1550
    },
    {
      "epoch": 2.504012841091493,
      "grad_norm": 4.286050796508789,
      "learning_rate": 8.26645264847512e-06,
      "loss": 0.4117,
      "step": 1560
    },
    {
      "epoch": 2.520064205457464,
      "grad_norm": 4.992115497589111,
      "learning_rate": 7.99892990904227e-06,
      "loss": 0.4103,
      "step": 1570
    },
    {
      "epoch": 2.5361155698234352,
      "grad_norm": 4.154786586761475,
      "learning_rate": 7.731407169609418e-06,
      "loss": 0.4566,
      "step": 1580
    },
    {
      "epoch": 2.552166934189406,
      "grad_norm": 4.701552391052246,
      "learning_rate": 7.463884430176565e-06,
      "loss": 0.4534,
      "step": 1590
    },
    {
      "epoch": 2.568218298555377,
      "grad_norm": 3.9032320976257324,
      "learning_rate": 7.1963616907437135e-06,
      "loss": 0.4586,
      "step": 1600
    },
    {
      "epoch": 2.5842696629213484,
      "grad_norm": 4.401456832885742,
      "learning_rate": 6.928838951310862e-06,
      "loss": 0.4219,
      "step": 1610
    },
    {
      "epoch": 2.600321027287319,
      "grad_norm": 3.317080497741699,
      "learning_rate": 6.66131621187801e-06,
      "loss": 0.4522,
      "step": 1620
    },
    {
      "epoch": 2.6163723916532904,
      "grad_norm": 3.893983840942383,
      "learning_rate": 6.393793472445159e-06,
      "loss": 0.4208,
      "step": 1630
    },
    {
      "epoch": 2.6324237560192616,
      "grad_norm": 2.585857391357422,
      "learning_rate": 6.126270733012306e-06,
      "loss": 0.4535,
      "step": 1640
    },
    {
      "epoch": 2.648475120385233,
      "grad_norm": 2.956127405166626,
      "learning_rate": 5.858747993579455e-06,
      "loss": 0.4587,
      "step": 1650
    },
    {
      "epoch": 2.664526484751204,
      "grad_norm": 4.7360992431640625,
      "learning_rate": 5.591225254146603e-06,
      "loss": 0.4317,
      "step": 1660
    },
    {
      "epoch": 2.6805778491171752,
      "grad_norm": 3.546750068664551,
      "learning_rate": 5.323702514713751e-06,
      "loss": 0.4707,
      "step": 1670
    },
    {
      "epoch": 2.696629213483146,
      "grad_norm": 3.985381841659546,
      "learning_rate": 5.056179775280899e-06,
      "loss": 0.448,
      "step": 1680
    },
    {
      "epoch": 2.712680577849117,
      "grad_norm": 3.1598966121673584,
      "learning_rate": 4.788657035848047e-06,
      "loss": 0.4656,
      "step": 1690
    },
    {
      "epoch": 2.7287319422150884,
      "grad_norm": 2.8233871459960938,
      "learning_rate": 4.521134296415196e-06,
      "loss": 0.4827,
      "step": 1700
    },
    {
      "epoch": 2.744783306581059,
      "grad_norm": 3.8089215755462646,
      "learning_rate": 4.253611556982344e-06,
      "loss": 0.4454,
      "step": 1710
    },
    {
      "epoch": 2.7608346709470304,
      "grad_norm": 3.793998956680298,
      "learning_rate": 3.986088817549492e-06,
      "loss": 0.4626,
      "step": 1720
    },
    {
      "epoch": 2.7768860353130016,
      "grad_norm": 4.2339768409729,
      "learning_rate": 3.71856607811664e-06,
      "loss": 0.4368,
      "step": 1730
    },
    {
      "epoch": 2.792937399678973,
      "grad_norm": 3.96157169342041,
      "learning_rate": 3.4510433386837885e-06,
      "loss": 0.4189,
      "step": 1740
    },
    {
      "epoch": 2.808988764044944,
      "grad_norm": 3.535388708114624,
      "learning_rate": 3.1835205992509364e-06,
      "loss": 0.4585,
      "step": 1750
    },
    {
      "epoch": 2.825040128410915,
      "grad_norm": 3.344831943511963,
      "learning_rate": 2.9159978598180844e-06,
      "loss": 0.4366,
      "step": 1760
    },
    {
      "epoch": 2.841091492776886,
      "grad_norm": 2.6424927711486816,
      "learning_rate": 2.648475120385233e-06,
      "loss": 0.4465,
      "step": 1770
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 3.244215250015259,
      "learning_rate": 2.3809523809523808e-06,
      "loss": 0.4404,
      "step": 1780
    },
    {
      "epoch": 2.8731942215088284,
      "grad_norm": 3.6791014671325684,
      "learning_rate": 2.113429641519529e-06,
      "loss": 0.4533,
      "step": 1790
    },
    {
      "epoch": 2.889245585874799,
      "grad_norm": 6.856778144836426,
      "learning_rate": 1.8459069020866775e-06,
      "loss": 0.4305,
      "step": 1800
    },
    {
      "epoch": 2.9052969502407704,
      "grad_norm": 5.275630474090576,
      "learning_rate": 1.5783841626538255e-06,
      "loss": 0.5065,
      "step": 1810
    },
    {
      "epoch": 2.9213483146067416,
      "grad_norm": 4.3441619873046875,
      "learning_rate": 1.310861423220974e-06,
      "loss": 0.3924,
      "step": 1820
    },
    {
      "epoch": 2.937399678972713,
      "grad_norm": 4.840381145477295,
      "learning_rate": 1.043338683788122e-06,
      "loss": 0.455,
      "step": 1830
    },
    {
      "epoch": 2.953451043338684,
      "grad_norm": 4.567574501037598,
      "learning_rate": 7.758159443552703e-07,
      "loss": 0.4415,
      "step": 1840
    },
    {
      "epoch": 2.969502407704655,
      "grad_norm": 3.6461331844329834,
      "learning_rate": 5.082932049224184e-07,
      "loss": 0.4363,
      "step": 1850
    },
    {
      "epoch": 2.985553772070626,
      "grad_norm": 5.579707145690918,
      "learning_rate": 2.407704654895666e-07,
      "loss": 0.4153,
      "step": 1860
    }
  ],
  "logging_steps": 10,
  "max_steps": 1869,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4324368475705500.0,
  "train_batch_size": 50,
  "trial_name": null,
  "trial_params": null
}