{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1869, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016051364365971106, "grad_norm": 0.5444720983505249, "learning_rate": 4.973247726056715e-05, "loss": 0.6974, "step": 10 }, { "epoch": 0.03210272873194221, "grad_norm": 0.46184447407722473, "learning_rate": 4.94649545211343e-05, "loss": 0.6938, "step": 20 }, { "epoch": 0.048154093097913325, "grad_norm": 0.6587526798248291, "learning_rate": 4.919743178170145e-05, "loss": 0.6926, "step": 30 }, { "epoch": 0.06420545746388442, "grad_norm": 2.1942267417907715, "learning_rate": 4.8929909042268596e-05, "loss": 0.6628, "step": 40 }, { "epoch": 0.08025682182985554, "grad_norm": 1.835483431816101, "learning_rate": 4.8662386302835744e-05, "loss": 0.6777, "step": 50 }, { "epoch": 0.09630818619582665, "grad_norm": 1.6423420906066895, "learning_rate": 4.839486356340289e-05, "loss": 0.6748, "step": 60 }, { "epoch": 0.11235955056179775, "grad_norm": 1.618177056312561, "learning_rate": 4.812734082397004e-05, "loss": 0.6585, "step": 70 }, { "epoch": 0.12841091492776885, "grad_norm": 2.7368061542510986, "learning_rate": 4.785981808453719e-05, "loss": 0.6208, "step": 80 }, { "epoch": 0.14446227929373998, "grad_norm": 2.0427486896514893, "learning_rate": 4.759229534510434e-05, "loss": 0.6447, "step": 90 }, { "epoch": 0.16051364365971107, "grad_norm": 2.044725179672241, "learning_rate": 4.7324772605671486e-05, "loss": 0.6543, "step": 100 }, { "epoch": 0.17656500802568217, "grad_norm": 1.4154126644134521, "learning_rate": 4.7057249866238635e-05, "loss": 0.6295, "step": 110 }, { "epoch": 0.1926163723916533, "grad_norm": 1.4071663618087769, "learning_rate": 4.678972712680578e-05, "loss": 0.6088, "step": 120 }, { "epoch": 0.2086677367576244, "grad_norm": 1.5164605379104614, "learning_rate": 4.652220438737293e-05, "loss": 0.6086, "step": 130 }, { "epoch": 0.2247191011235955, "grad_norm": 1.1694097518920898, "learning_rate": 4.625468164794008e-05, "loss": 0.6131, "step": 140 }, { "epoch": 0.24077046548956663, "grad_norm": 2.416905164718628, "learning_rate": 4.598715890850723e-05, "loss": 0.6392, "step": 150 }, { "epoch": 0.2568218298555377, "grad_norm": 1.8865312337875366, "learning_rate": 4.571963616907438e-05, "loss": 0.5955, "step": 160 }, { "epoch": 0.27287319422150885, "grad_norm": 1.4470267295837402, "learning_rate": 4.5452113429641525e-05, "loss": 0.6245, "step": 170 }, { "epoch": 0.28892455858747995, "grad_norm": 2.124974489212036, "learning_rate": 4.5184590690208673e-05, "loss": 0.6251, "step": 180 }, { "epoch": 0.30497592295345105, "grad_norm": 2.1896016597747803, "learning_rate": 4.491706795077582e-05, "loss": 0.631, "step": 190 }, { "epoch": 0.32102728731942215, "grad_norm": 1.449318766593933, "learning_rate": 4.4649545211342963e-05, "loss": 0.5939, "step": 200 }, { "epoch": 0.33707865168539325, "grad_norm": 2.2219743728637695, "learning_rate": 4.438202247191011e-05, "loss": 0.5746, "step": 210 }, { "epoch": 0.35313001605136435, "grad_norm": 2.2409071922302246, "learning_rate": 4.411449973247726e-05, "loss": 0.6065, "step": 220 }, { "epoch": 0.36918138041733545, "grad_norm": 1.1223793029785156, "learning_rate": 4.384697699304441e-05, "loss": 0.6123, "step": 230 }, { "epoch": 0.3852327447833066, "grad_norm": 1.2747622728347778, "learning_rate": 4.357945425361156e-05, "loss": 0.5952, "step": 240 }, { "epoch": 0.4012841091492777, "grad_norm": 1.8558810949325562, "learning_rate": 
4.3311931514178705e-05, "loss": 0.6027, "step": 250 }, { "epoch": 0.4173354735152488, "grad_norm": 1.1629371643066406, "learning_rate": 4.3044408774745854e-05, "loss": 0.6228, "step": 260 }, { "epoch": 0.4333868378812199, "grad_norm": 1.4288586378097534, "learning_rate": 4.2776886035313e-05, "loss": 0.5734, "step": 270 }, { "epoch": 0.449438202247191, "grad_norm": 3.359997034072876, "learning_rate": 4.250936329588015e-05, "loss": 0.6018, "step": 280 }, { "epoch": 0.4654895666131621, "grad_norm": 1.8125280141830444, "learning_rate": 4.22418405564473e-05, "loss": 0.6288, "step": 290 }, { "epoch": 0.48154093097913325, "grad_norm": 1.3432456254959106, "learning_rate": 4.197431781701445e-05, "loss": 0.5756, "step": 300 }, { "epoch": 0.49759229534510435, "grad_norm": 3.372265100479126, "learning_rate": 4.1706795077581596e-05, "loss": 0.6197, "step": 310 }, { "epoch": 0.5136436597110754, "grad_norm": 1.7587655782699585, "learning_rate": 4.1439272338148744e-05, "loss": 0.5808, "step": 320 }, { "epoch": 0.5296950240770465, "grad_norm": 2.1404621601104736, "learning_rate": 4.117174959871589e-05, "loss": 0.6075, "step": 330 }, { "epoch": 0.5457463884430177, "grad_norm": 1.6256980895996094, "learning_rate": 4.090422685928304e-05, "loss": 0.5765, "step": 340 }, { "epoch": 0.5617977528089888, "grad_norm": 2.2659549713134766, "learning_rate": 4.063670411985019e-05, "loss": 0.6305, "step": 350 }, { "epoch": 0.5778491171749599, "grad_norm": 1.9907615184783936, "learning_rate": 4.036918138041734e-05, "loss": 0.5743, "step": 360 }, { "epoch": 0.593900481540931, "grad_norm": 2.26408314704895, "learning_rate": 4.0101658640984486e-05, "loss": 0.5842, "step": 370 }, { "epoch": 0.6099518459069021, "grad_norm": 1.9207652807235718, "learning_rate": 3.9834135901551634e-05, "loss": 0.5719, "step": 380 }, { "epoch": 0.6260032102728732, "grad_norm": 2.5978338718414307, "learning_rate": 3.956661316211878e-05, "loss": 0.5811, "step": 390 }, { "epoch": 0.6420545746388443, "grad_norm": 1.6617166996002197, "learning_rate": 3.929909042268593e-05, "loss": 0.5819, "step": 400 }, { "epoch": 0.6581059390048154, "grad_norm": 2.7522661685943604, "learning_rate": 3.903156768325308e-05, "loss": 0.605, "step": 410 }, { "epoch": 0.6741573033707865, "grad_norm": 1.6527293920516968, "learning_rate": 3.876404494382023e-05, "loss": 0.6036, "step": 420 }, { "epoch": 0.6902086677367576, "grad_norm": 1.6553492546081543, "learning_rate": 3.8496522204387376e-05, "loss": 0.5723, "step": 430 }, { "epoch": 0.7062600321027287, "grad_norm": 1.9113073348999023, "learning_rate": 3.8228999464954525e-05, "loss": 0.5869, "step": 440 }, { "epoch": 0.7223113964686998, "grad_norm": 1.493342399597168, "learning_rate": 3.796147672552167e-05, "loss": 0.5912, "step": 450 }, { "epoch": 0.7383627608346709, "grad_norm": 1.3368749618530273, "learning_rate": 3.769395398608882e-05, "loss": 0.613, "step": 460 }, { "epoch": 0.7544141252006421, "grad_norm": 2.699831008911133, "learning_rate": 3.742643124665597e-05, "loss": 0.5658, "step": 470 }, { "epoch": 0.7704654895666132, "grad_norm": 1.0824522972106934, "learning_rate": 3.715890850722312e-05, "loss": 0.6116, "step": 480 }, { "epoch": 0.7865168539325843, "grad_norm": 2.092763900756836, "learning_rate": 3.689138576779027e-05, "loss": 0.5711, "step": 490 }, { "epoch": 0.8025682182985554, "grad_norm": 2.2845699787139893, "learning_rate": 3.6623863028357415e-05, "loss": 0.5825, "step": 500 }, { "epoch": 0.8186195826645265, "grad_norm": 2.034006118774414, "learning_rate": 3.6356340288924564e-05, "loss": 0.5857, "step": 
510 }, { "epoch": 0.8346709470304976, "grad_norm": 1.5809577703475952, "learning_rate": 3.608881754949171e-05, "loss": 0.5539, "step": 520 }, { "epoch": 0.8507223113964687, "grad_norm": 2.2914111614227295, "learning_rate": 3.582129481005886e-05, "loss": 0.5806, "step": 530 }, { "epoch": 0.8667736757624398, "grad_norm": 1.6067487001419067, "learning_rate": 3.555377207062601e-05, "loss": 0.5986, "step": 540 }, { "epoch": 0.8828250401284109, "grad_norm": 1.690928339958191, "learning_rate": 3.528624933119316e-05, "loss": 0.5713, "step": 550 }, { "epoch": 0.898876404494382, "grad_norm": 2.0696561336517334, "learning_rate": 3.5018726591760305e-05, "loss": 0.5601, "step": 560 }, { "epoch": 0.9149277688603531, "grad_norm": 1.693708896636963, "learning_rate": 3.4751203852327454e-05, "loss": 0.5812, "step": 570 }, { "epoch": 0.9309791332263242, "grad_norm": 1.5981098413467407, "learning_rate": 3.44836811128946e-05, "loss": 0.5699, "step": 580 }, { "epoch": 0.9470304975922953, "grad_norm": 1.4630780220031738, "learning_rate": 3.421615837346175e-05, "loss": 0.5917, "step": 590 }, { "epoch": 0.9630818619582665, "grad_norm": 1.8144547939300537, "learning_rate": 3.394863563402889e-05, "loss": 0.5969, "step": 600 }, { "epoch": 0.9791332263242376, "grad_norm": 2.242295265197754, "learning_rate": 3.368111289459604e-05, "loss": 0.5524, "step": 610 }, { "epoch": 0.9951845906902087, "grad_norm": 3.1214287281036377, "learning_rate": 3.341359015516319e-05, "loss": 0.5872, "step": 620 }, { "epoch": 1.0112359550561798, "grad_norm": 1.6238343715667725, "learning_rate": 3.314606741573034e-05, "loss": 0.5715, "step": 630 }, { "epoch": 1.0272873194221508, "grad_norm": 1.7471458911895752, "learning_rate": 3.2878544676297486e-05, "loss": 0.5377, "step": 640 }, { "epoch": 1.043338683788122, "grad_norm": 2.1529836654663086, "learning_rate": 3.2611021936864634e-05, "loss": 0.5542, "step": 650 }, { "epoch": 1.0593900481540932, "grad_norm": 2.278169870376587, "learning_rate": 3.234349919743178e-05, "loss": 0.5616, "step": 660 }, { "epoch": 1.0754414125200642, "grad_norm": 2.8219568729400635, "learning_rate": 3.207597645799893e-05, "loss": 0.5711, "step": 670 }, { "epoch": 1.0914927768860354, "grad_norm": 1.5639195442199707, "learning_rate": 3.180845371856608e-05, "loss": 0.5439, "step": 680 }, { "epoch": 1.1075441412520064, "grad_norm": 2.146303415298462, "learning_rate": 3.154093097913323e-05, "loss": 0.525, "step": 690 }, { "epoch": 1.1235955056179776, "grad_norm": 1.7315692901611328, "learning_rate": 3.1273408239700376e-05, "loss": 0.5521, "step": 700 }, { "epoch": 1.1396468699839486, "grad_norm": 2.457808494567871, "learning_rate": 3.1005885500267525e-05, "loss": 0.5209, "step": 710 }, { "epoch": 1.1556982343499198, "grad_norm": 2.4021546840667725, "learning_rate": 3.073836276083467e-05, "loss": 0.5056, "step": 720 }, { "epoch": 1.1717495987158908, "grad_norm": 3.025860548019409, "learning_rate": 3.047084002140182e-05, "loss": 0.4884, "step": 730 }, { "epoch": 1.187800963081862, "grad_norm": 3.1893551349639893, "learning_rate": 3.0203317281968966e-05, "loss": 0.5203, "step": 740 }, { "epoch": 1.203852327447833, "grad_norm": 4.527679443359375, "learning_rate": 2.9935794542536115e-05, "loss": 0.5114, "step": 750 }, { "epoch": 1.2199036918138042, "grad_norm": 3.021358013153076, "learning_rate": 2.9668271803103263e-05, "loss": 0.5408, "step": 760 }, { "epoch": 1.2359550561797752, "grad_norm": 3.0941548347473145, "learning_rate": 2.940074906367041e-05, "loss": 0.5122, "step": 770 }, { "epoch": 1.2520064205457464, "grad_norm": 
2.5667285919189453, "learning_rate": 2.913322632423756e-05, "loss": 0.5192, "step": 780 }, { "epoch": 1.2680577849117176, "grad_norm": 3.7231733798980713, "learning_rate": 2.886570358480471e-05, "loss": 0.5609, "step": 790 }, { "epoch": 1.2841091492776886, "grad_norm": 1.9197113513946533, "learning_rate": 2.8598180845371857e-05, "loss": 0.4747, "step": 800 }, { "epoch": 1.3001605136436596, "grad_norm": 2.612793207168579, "learning_rate": 2.8330658105939005e-05, "loss": 0.5286, "step": 810 }, { "epoch": 1.3162118780096308, "grad_norm": 2.353598117828369, "learning_rate": 2.8063135366506153e-05, "loss": 0.535, "step": 820 }, { "epoch": 1.332263242375602, "grad_norm": 2.9524009227752686, "learning_rate": 2.7795612627073302e-05, "loss": 0.5402, "step": 830 }, { "epoch": 1.348314606741573, "grad_norm": 2.1400082111358643, "learning_rate": 2.752808988764045e-05, "loss": 0.5487, "step": 840 }, { "epoch": 1.3643659711075442, "grad_norm": 2.116978168487549, "learning_rate": 2.72605671482076e-05, "loss": 0.5586, "step": 850 }, { "epoch": 1.3804173354735152, "grad_norm": 2.3243775367736816, "learning_rate": 2.6993044408774747e-05, "loss": 0.5402, "step": 860 }, { "epoch": 1.3964686998394864, "grad_norm": 2.2850890159606934, "learning_rate": 2.6725521669341895e-05, "loss": 0.5151, "step": 870 }, { "epoch": 1.4125200642054574, "grad_norm": 2.305981397628784, "learning_rate": 2.6457998929909044e-05, "loss": 0.5494, "step": 880 }, { "epoch": 1.4285714285714286, "grad_norm": 1.8870021104812622, "learning_rate": 2.6190476190476192e-05, "loss": 0.5545, "step": 890 }, { "epoch": 1.4446227929373996, "grad_norm": 2.1851470470428467, "learning_rate": 2.592295345104334e-05, "loss": 0.5205, "step": 900 }, { "epoch": 1.4606741573033708, "grad_norm": 2.7136600017547607, "learning_rate": 2.565543071161049e-05, "loss": 0.5427, "step": 910 }, { "epoch": 1.476725521669342, "grad_norm": 3.9425771236419678, "learning_rate": 2.5387907972177637e-05, "loss": 0.5209, "step": 920 }, { "epoch": 1.492776886035313, "grad_norm": 2.84690260887146, "learning_rate": 2.5120385232744786e-05, "loss": 0.5562, "step": 930 }, { "epoch": 1.508828250401284, "grad_norm": 2.375824213027954, "learning_rate": 2.485286249331193e-05, "loss": 0.521, "step": 940 }, { "epoch": 1.5248796147672552, "grad_norm": 2.241267681121826, "learning_rate": 2.458533975387908e-05, "loss": 0.5576, "step": 950 }, { "epoch": 1.5409309791332264, "grad_norm": 2.209796190261841, "learning_rate": 2.4317817014446228e-05, "loss": 0.5034, "step": 960 }, { "epoch": 1.5569823434991974, "grad_norm": 2.9751803874969482, "learning_rate": 2.4050294275013376e-05, "loss": 0.5222, "step": 970 }, { "epoch": 1.5730337078651684, "grad_norm": 3.5506584644317627, "learning_rate": 2.3782771535580524e-05, "loss": 0.5067, "step": 980 }, { "epoch": 1.5890850722311396, "grad_norm": 2.4530675411224365, "learning_rate": 2.3515248796147673e-05, "loss": 0.5262, "step": 990 }, { "epoch": 1.6051364365971108, "grad_norm": 2.639045476913452, "learning_rate": 2.324772605671482e-05, "loss": 0.5396, "step": 1000 }, { "epoch": 1.621187800963082, "grad_norm": 3.676542043685913, "learning_rate": 2.298020331728197e-05, "loss": 0.5462, "step": 1010 }, { "epoch": 1.637239165329053, "grad_norm": 2.813171148300171, "learning_rate": 2.2712680577849118e-05, "loss": 0.5792, "step": 1020 }, { "epoch": 1.653290529695024, "grad_norm": 3.5937094688415527, "learning_rate": 2.2445157838416266e-05, "loss": 0.527, "step": 1030 }, { "epoch": 1.6693418940609952, "grad_norm": 2.3738603591918945, "learning_rate": 
2.2177635098983415e-05, "loss": 0.5431, "step": 1040 }, { "epoch": 1.6853932584269664, "grad_norm": 2.7167625427246094, "learning_rate": 2.1910112359550563e-05, "loss": 0.5187, "step": 1050 }, { "epoch": 1.7014446227929374, "grad_norm": 2.8609092235565186, "learning_rate": 2.164258962011771e-05, "loss": 0.5061, "step": 1060 }, { "epoch": 1.7174959871589084, "grad_norm": 3.4374756813049316, "learning_rate": 2.137506688068486e-05, "loss": 0.5301, "step": 1070 }, { "epoch": 1.7335473515248796, "grad_norm": 2.3880395889282227, "learning_rate": 2.1107544141252008e-05, "loss": 0.5572, "step": 1080 }, { "epoch": 1.7495987158908508, "grad_norm": 2.2175981998443604, "learning_rate": 2.0840021401819157e-05, "loss": 0.4862, "step": 1090 }, { "epoch": 1.7656500802568218, "grad_norm": 1.8051007986068726, "learning_rate": 2.05724986623863e-05, "loss": 0.5108, "step": 1100 }, { "epoch": 1.7817014446227928, "grad_norm": 3.5168681144714355, "learning_rate": 2.030497592295345e-05, "loss": 0.538, "step": 1110 }, { "epoch": 1.797752808988764, "grad_norm": 2.5539281368255615, "learning_rate": 2.00374531835206e-05, "loss": 0.5489, "step": 1120 }, { "epoch": 1.8138041733547352, "grad_norm": 1.7118955850601196, "learning_rate": 1.9769930444087747e-05, "loss": 0.5242, "step": 1130 }, { "epoch": 1.8298555377207064, "grad_norm": 2.2016992568969727, "learning_rate": 1.9502407704654895e-05, "loss": 0.5172, "step": 1140 }, { "epoch": 1.8459069020866774, "grad_norm": 2.072165012359619, "learning_rate": 1.9234884965222044e-05, "loss": 0.5216, "step": 1150 }, { "epoch": 1.8619582664526484, "grad_norm": 2.446287155151367, "learning_rate": 1.8967362225789192e-05, "loss": 0.5391, "step": 1160 }, { "epoch": 1.8780096308186196, "grad_norm": 2.6297097206115723, "learning_rate": 1.869983948635634e-05, "loss": 0.5046, "step": 1170 }, { "epoch": 1.8940609951845908, "grad_norm": 2.429002285003662, "learning_rate": 1.843231674692349e-05, "loss": 0.5081, "step": 1180 }, { "epoch": 1.9101123595505618, "grad_norm": 2.9561805725097656, "learning_rate": 1.8164794007490637e-05, "loss": 0.5335, "step": 1190 }, { "epoch": 1.9261637239165328, "grad_norm": 2.9521896839141846, "learning_rate": 1.7897271268057786e-05, "loss": 0.5307, "step": 1200 }, { "epoch": 1.942215088282504, "grad_norm": 2.409397602081299, "learning_rate": 1.7629748528624934e-05, "loss": 0.5014, "step": 1210 }, { "epoch": 1.9582664526484752, "grad_norm": 2.046233892440796, "learning_rate": 1.7362225789192082e-05, "loss": 0.5637, "step": 1220 }, { "epoch": 1.9743178170144462, "grad_norm": 2.7884459495544434, "learning_rate": 1.709470304975923e-05, "loss": 0.5512, "step": 1230 }, { "epoch": 1.9903691813804172, "grad_norm": 2.0197086334228516, "learning_rate": 1.682718031032638e-05, "loss": 0.5112, "step": 1240 }, { "epoch": 2.0064205457463884, "grad_norm": 2.801968574523926, "learning_rate": 1.6559657570893527e-05, "loss": 0.5084, "step": 1250 }, { "epoch": 2.0224719101123596, "grad_norm": 2.099778652191162, "learning_rate": 1.6292134831460676e-05, "loss": 0.4558, "step": 1260 }, { "epoch": 2.038523274478331, "grad_norm": 3.728285789489746, "learning_rate": 1.6024612092027824e-05, "loss": 0.453, "step": 1270 }, { "epoch": 2.0545746388443016, "grad_norm": 3.892178773880005, "learning_rate": 1.5757089352594973e-05, "loss": 0.4347, "step": 1280 }, { "epoch": 2.070626003210273, "grad_norm": 3.3752293586730957, "learning_rate": 1.548956661316212e-05, "loss": 0.4382, "step": 1290 }, { "epoch": 2.086677367576244, "grad_norm": 4.784174919128418, "learning_rate": 
1.5222043873729266e-05, "loss": 0.4416, "step": 1300 }, { "epoch": 2.102728731942215, "grad_norm": 2.877718448638916, "learning_rate": 1.4954521134296414e-05, "loss": 0.4402, "step": 1310 }, { "epoch": 2.1187800963081864, "grad_norm": 3.0553367137908936, "learning_rate": 1.4686998394863563e-05, "loss": 0.4508, "step": 1320 }, { "epoch": 2.134831460674157, "grad_norm": 3.8250510692596436, "learning_rate": 1.4419475655430711e-05, "loss": 0.4422, "step": 1330 }, { "epoch": 2.1508828250401284, "grad_norm": 3.143554210662842, "learning_rate": 1.415195291599786e-05, "loss": 0.4572, "step": 1340 }, { "epoch": 2.1669341894060996, "grad_norm": 1.8287140130996704, "learning_rate": 1.3884430176565008e-05, "loss": 0.4287, "step": 1350 }, { "epoch": 2.182985553772071, "grad_norm": 5.451256275177002, "learning_rate": 1.3616907437132156e-05, "loss": 0.4481, "step": 1360 }, { "epoch": 2.1990369181380416, "grad_norm": 4.044301986694336, "learning_rate": 1.3349384697699305e-05, "loss": 0.4448, "step": 1370 }, { "epoch": 2.215088282504013, "grad_norm": 4.877999782562256, "learning_rate": 1.3081861958266453e-05, "loss": 0.4457, "step": 1380 }, { "epoch": 2.231139646869984, "grad_norm": 4.929600238800049, "learning_rate": 1.2814339218833602e-05, "loss": 0.432, "step": 1390 }, { "epoch": 2.247191011235955, "grad_norm": 3.4969890117645264, "learning_rate": 1.254681647940075e-05, "loss": 0.4387, "step": 1400 }, { "epoch": 2.263242375601926, "grad_norm": 4.1156086921691895, "learning_rate": 1.2279293739967898e-05, "loss": 0.4406, "step": 1410 }, { "epoch": 2.279293739967897, "grad_norm": 4.0271148681640625, "learning_rate": 1.2011771000535047e-05, "loss": 0.4283, "step": 1420 }, { "epoch": 2.2953451043338684, "grad_norm": 3.6957242488861084, "learning_rate": 1.1744248261102195e-05, "loss": 0.4424, "step": 1430 }, { "epoch": 2.3113964686998396, "grad_norm": 3.1115617752075195, "learning_rate": 1.1476725521669344e-05, "loss": 0.4475, "step": 1440 }, { "epoch": 2.3274478330658104, "grad_norm": 4.844674587249756, "learning_rate": 1.120920278223649e-05, "loss": 0.4459, "step": 1450 }, { "epoch": 2.3434991974317816, "grad_norm": 2.712280750274658, "learning_rate": 1.0941680042803639e-05, "loss": 0.4219, "step": 1460 }, { "epoch": 2.359550561797753, "grad_norm": 2.9259140491485596, "learning_rate": 1.0674157303370787e-05, "loss": 0.4824, "step": 1470 }, { "epoch": 2.375601926163724, "grad_norm": 4.346639633178711, "learning_rate": 1.0406634563937935e-05, "loss": 0.4364, "step": 1480 }, { "epoch": 2.391653290529695, "grad_norm": 2.7212588787078857, "learning_rate": 1.0139111824505084e-05, "loss": 0.3956, "step": 1490 }, { "epoch": 2.407704654895666, "grad_norm": 4.124871253967285, "learning_rate": 9.871589085072232e-06, "loss": 0.483, "step": 1500 }, { "epoch": 2.423756019261637, "grad_norm": 4.835409641265869, "learning_rate": 9.60406634563938e-06, "loss": 0.4623, "step": 1510 }, { "epoch": 2.4398073836276084, "grad_norm": 2.9034523963928223, "learning_rate": 9.336543606206529e-06, "loss": 0.4208, "step": 1520 }, { "epoch": 2.4558587479935796, "grad_norm": 2.569786787033081, "learning_rate": 9.069020866773677e-06, "loss": 0.4141, "step": 1530 }, { "epoch": 2.4719101123595504, "grad_norm": 4.244718551635742, "learning_rate": 8.801498127340826e-06, "loss": 0.4583, "step": 1540 }, { "epoch": 2.4879614767255216, "grad_norm": 4.004569053649902, "learning_rate": 8.533975387907972e-06, "loss": 0.4266, "step": 1550 }, { "epoch": 2.504012841091493, "grad_norm": 4.286050796508789, "learning_rate": 8.26645264847512e-06, "loss": 
0.4117, "step": 1560 }, { "epoch": 2.520064205457464, "grad_norm": 4.992115497589111, "learning_rate": 7.99892990904227e-06, "loss": 0.4103, "step": 1570 }, { "epoch": 2.5361155698234352, "grad_norm": 4.154786586761475, "learning_rate": 7.731407169609418e-06, "loss": 0.4566, "step": 1580 }, { "epoch": 2.552166934189406, "grad_norm": 4.701552391052246, "learning_rate": 7.463884430176565e-06, "loss": 0.4534, "step": 1590 }, { "epoch": 2.568218298555377, "grad_norm": 3.9032320976257324, "learning_rate": 7.1963616907437135e-06, "loss": 0.4586, "step": 1600 }, { "epoch": 2.5842696629213484, "grad_norm": 4.401456832885742, "learning_rate": 6.928838951310862e-06, "loss": 0.4219, "step": 1610 }, { "epoch": 2.600321027287319, "grad_norm": 3.317080497741699, "learning_rate": 6.66131621187801e-06, "loss": 0.4522, "step": 1620 }, { "epoch": 2.6163723916532904, "grad_norm": 3.893983840942383, "learning_rate": 6.393793472445159e-06, "loss": 0.4208, "step": 1630 }, { "epoch": 2.6324237560192616, "grad_norm": 2.585857391357422, "learning_rate": 6.126270733012306e-06, "loss": 0.4535, "step": 1640 }, { "epoch": 2.648475120385233, "grad_norm": 2.956127405166626, "learning_rate": 5.858747993579455e-06, "loss": 0.4587, "step": 1650 }, { "epoch": 2.664526484751204, "grad_norm": 4.7360992431640625, "learning_rate": 5.591225254146603e-06, "loss": 0.4317, "step": 1660 }, { "epoch": 2.6805778491171752, "grad_norm": 3.546750068664551, "learning_rate": 5.323702514713751e-06, "loss": 0.4707, "step": 1670 }, { "epoch": 2.696629213483146, "grad_norm": 3.985381841659546, "learning_rate": 5.056179775280899e-06, "loss": 0.448, "step": 1680 }, { "epoch": 2.712680577849117, "grad_norm": 3.1598966121673584, "learning_rate": 4.788657035848047e-06, "loss": 0.4656, "step": 1690 }, { "epoch": 2.7287319422150884, "grad_norm": 2.8233871459960938, "learning_rate": 4.521134296415196e-06, "loss": 0.4827, "step": 1700 }, { "epoch": 2.744783306581059, "grad_norm": 3.8089215755462646, "learning_rate": 4.253611556982344e-06, "loss": 0.4454, "step": 1710 }, { "epoch": 2.7608346709470304, "grad_norm": 3.793998956680298, "learning_rate": 3.986088817549492e-06, "loss": 0.4626, "step": 1720 }, { "epoch": 2.7768860353130016, "grad_norm": 4.2339768409729, "learning_rate": 3.71856607811664e-06, "loss": 0.4368, "step": 1730 }, { "epoch": 2.792937399678973, "grad_norm": 3.96157169342041, "learning_rate": 3.4510433386837885e-06, "loss": 0.4189, "step": 1740 }, { "epoch": 2.808988764044944, "grad_norm": 3.535388708114624, "learning_rate": 3.1835205992509364e-06, "loss": 0.4585, "step": 1750 }, { "epoch": 2.825040128410915, "grad_norm": 3.344831943511963, "learning_rate": 2.9159978598180844e-06, "loss": 0.4366, "step": 1760 }, { "epoch": 2.841091492776886, "grad_norm": 2.6424927711486816, "learning_rate": 2.648475120385233e-06, "loss": 0.4465, "step": 1770 }, { "epoch": 2.857142857142857, "grad_norm": 3.244215250015259, "learning_rate": 2.3809523809523808e-06, "loss": 0.4404, "step": 1780 }, { "epoch": 2.8731942215088284, "grad_norm": 3.6791014671325684, "learning_rate": 2.113429641519529e-06, "loss": 0.4533, "step": 1790 }, { "epoch": 2.889245585874799, "grad_norm": 6.856778144836426, "learning_rate": 1.8459069020866775e-06, "loss": 0.4305, "step": 1800 }, { "epoch": 2.9052969502407704, "grad_norm": 5.275630474090576, "learning_rate": 1.5783841626538255e-06, "loss": 0.5065, "step": 1810 }, { "epoch": 2.9213483146067416, "grad_norm": 4.3441619873046875, "learning_rate": 1.310861423220974e-06, "loss": 0.3924, "step": 1820 }, { "epoch": 
2.937399678972713, "grad_norm": 4.840381145477295, "learning_rate": 1.043338683788122e-06, "loss": 0.455, "step": 1830 }, { "epoch": 2.953451043338684, "grad_norm": 4.567574501037598, "learning_rate": 7.758159443552703e-07, "loss": 0.4415, "step": 1840 }, { "epoch": 2.969502407704655, "grad_norm": 3.6461331844329834, "learning_rate": 5.082932049224184e-07, "loss": 0.4363, "step": 1850 }, { "epoch": 2.985553772070626, "grad_norm": 5.579707145690918, "learning_rate": 2.407704654895666e-07, "loss": 0.4153, "step": 1860 } ], "logging_steps": 10, "max_steps": 1869, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4324368475705500.0, "train_batch_size": 50, "trial_name": null, "trial_params": null }