diff --git "a/limo/full/checkpoint-1545/trainer_state.json" "b/limo/full/checkpoint-1545/trainer_state.json" new file mode 100644--- /dev/null +++ "b/limo/full/checkpoint-1545/trainer_state.json" @@ -0,0 +1,10848 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 15.0, + "eval_steps": 500, + "global_step": 1545, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.009708737864077669, + "grad_norm": 4.52614688873291, + "learning_rate": 4.999994831641374e-06, + "loss": 0.8521, + "step": 1 + }, + { + "epoch": 0.019417475728155338, + "grad_norm": 3.6663663387298584, + "learning_rate": 4.9999793265868636e-06, + "loss": 0.6915, + "step": 2 + }, + { + "epoch": 0.02912621359223301, + "grad_norm": 2.7436909675598145, + "learning_rate": 4.999953484900578e-06, + "loss": 0.8366, + "step": 3 + }, + { + "epoch": 0.038834951456310676, + "grad_norm": 2.088653326034546, + "learning_rate": 4.9999173066893655e-06, + "loss": 0.7118, + "step": 4 + }, + { + "epoch": 0.04854368932038835, + "grad_norm": 2.199723958969116, + "learning_rate": 4.9998707921028104e-06, + "loss": 0.7858, + "step": 5 + }, + { + "epoch": 0.05825242718446602, + "grad_norm": 2.0426740646362305, + "learning_rate": 4.999813941333237e-06, + "loss": 0.6171, + "step": 6 + }, + { + "epoch": 0.06796116504854369, + "grad_norm": 1.950857400894165, + "learning_rate": 4.999746754615704e-06, + "loss": 0.7283, + "step": 7 + }, + { + "epoch": 0.07766990291262135, + "grad_norm": 1.5054230690002441, + "learning_rate": 4.9996692322280085e-06, + "loss": 0.6389, + "step": 8 + }, + { + "epoch": 0.08737864077669903, + "grad_norm": 1.304260015487671, + "learning_rate": 4.999581374490681e-06, + "loss": 0.6234, + "step": 9 + }, + { + "epoch": 0.0970873786407767, + "grad_norm": 1.4494102001190186, + "learning_rate": 4.999483181766986e-06, + "loss": 0.8027, + "step": 10 + }, + { + "epoch": 0.10679611650485436, + "grad_norm": 1.3682693243026733, + "learning_rate": 4.999374654462919e-06, + "loss": 0.7446, + "step": 11 + }, + { + "epoch": 0.11650485436893204, + "grad_norm": 1.1007122993469238, + "learning_rate": 4.999255793027207e-06, + "loss": 0.7415, + "step": 12 + }, + { + "epoch": 0.1262135922330097, + "grad_norm": 1.0817819833755493, + "learning_rate": 4.999126597951305e-06, + "loss": 0.5366, + "step": 13 + }, + { + "epoch": 0.13592233009708737, + "grad_norm": 1.2209179401397705, + "learning_rate": 4.998987069769394e-06, + "loss": 0.5151, + "step": 14 + }, + { + "epoch": 0.14563106796116504, + "grad_norm": 1.3821749687194824, + "learning_rate": 4.998837209058379e-06, + "loss": 0.6989, + "step": 15 + }, + { + "epoch": 0.1553398058252427, + "grad_norm": 1.2181178331375122, + "learning_rate": 4.998677016437888e-06, + "loss": 0.5743, + "step": 16 + }, + { + "epoch": 0.1650485436893204, + "grad_norm": 1.045208215713501, + "learning_rate": 4.998506492570266e-06, + "loss": 0.4859, + "step": 17 + }, + { + "epoch": 0.17475728155339806, + "grad_norm": 1.3972703218460083, + "learning_rate": 4.998325638160576e-06, + "loss": 0.7708, + "step": 18 + }, + { + "epoch": 0.18446601941747573, + "grad_norm": 1.2529926300048828, + "learning_rate": 4.998134453956596e-06, + "loss": 0.5568, + "step": 19 + }, + { + "epoch": 0.1941747572815534, + "grad_norm": 1.3563785552978516, + "learning_rate": 4.997932940748811e-06, + "loss": 0.6435, + "step": 20 + }, + { + "epoch": 0.20388349514563106, + "grad_norm": 1.1624324321746826, + "learning_rate": 4.997721099370416e-06, + "loss": 0.6125, + "step": 21 + }, + { + "epoch": 0.21359223300970873, + "grad_norm": 1.2726877927780151, + "learning_rate": 4.997498930697308e-06, + "loss": 0.5921, + "step": 22 + }, + { + "epoch": 0.22330097087378642, + "grad_norm": 1.0864430665969849, + "learning_rate": 4.997266435648086e-06, + "loss": 0.5279, + "step": 23 + }, + { + "epoch": 0.23300970873786409, + "grad_norm": 1.2319843769073486, + "learning_rate": 4.997023615184044e-06, + "loss": 0.799, + "step": 24 + }, + { + "epoch": 0.24271844660194175, + "grad_norm": 1.12433922290802, + "learning_rate": 4.996770470309167e-06, + "loss": 0.7428, + "step": 25 + }, + { + "epoch": 0.2524271844660194, + "grad_norm": 1.1892931461334229, + "learning_rate": 4.996507002070131e-06, + "loss": 0.6064, + "step": 26 + }, + { + "epoch": 0.2621359223300971, + "grad_norm": 0.9294205904006958, + "learning_rate": 4.996233211556295e-06, + "loss": 0.6833, + "step": 27 + }, + { + "epoch": 0.27184466019417475, + "grad_norm": 1.1600544452667236, + "learning_rate": 4.9959490998996974e-06, + "loss": 0.4822, + "step": 28 + }, + { + "epoch": 0.2815533980582524, + "grad_norm": 0.8306522965431213, + "learning_rate": 4.995654668275049e-06, + "loss": 0.5528, + "step": 29 + }, + { + "epoch": 0.2912621359223301, + "grad_norm": 0.9367732405662537, + "learning_rate": 4.995349917899735e-06, + "loss": 0.4968, + "step": 30 + }, + { + "epoch": 0.30097087378640774, + "grad_norm": 0.9846139550209045, + "learning_rate": 4.9950348500338005e-06, + "loss": 0.4915, + "step": 31 + }, + { + "epoch": 0.3106796116504854, + "grad_norm": 1.137281894683838, + "learning_rate": 4.994709465979954e-06, + "loss": 0.6108, + "step": 32 + }, + { + "epoch": 0.32038834951456313, + "grad_norm": 0.9563445448875427, + "learning_rate": 4.994373767083556e-06, + "loss": 0.5569, + "step": 33 + }, + { + "epoch": 0.3300970873786408, + "grad_norm": 1.1258145570755005, + "learning_rate": 4.994027754732616e-06, + "loss": 0.6422, + "step": 34 + }, + { + "epoch": 0.33980582524271846, + "grad_norm": 1.071385145187378, + "learning_rate": 4.993671430357788e-06, + "loss": 0.5001, + "step": 35 + }, + { + "epoch": 0.34951456310679613, + "grad_norm": 1.0511752367019653, + "learning_rate": 4.99330479543236e-06, + "loss": 0.5523, + "step": 36 + }, + { + "epoch": 0.3592233009708738, + "grad_norm": 1.044432282447815, + "learning_rate": 4.992927851472254e-06, + "loss": 0.6826, + "step": 37 + }, + { + "epoch": 0.36893203883495146, + "grad_norm": 0.8882154226303101, + "learning_rate": 4.992540600036014e-06, + "loss": 0.7096, + "step": 38 + }, + { + "epoch": 0.3786407766990291, + "grad_norm": 1.0710127353668213, + "learning_rate": 4.992143042724805e-06, + "loss": 0.5487, + "step": 39 + }, + { + "epoch": 0.3883495145631068, + "grad_norm": 2.931645154953003, + "learning_rate": 4.991735181182401e-06, + "loss": 0.819, + "step": 40 + }, + { + "epoch": 0.39805825242718446, + "grad_norm": 1.0483241081237793, + "learning_rate": 4.991317017095182e-06, + "loss": 0.5693, + "step": 41 + }, + { + "epoch": 0.4077669902912621, + "grad_norm": 1.897265911102295, + "learning_rate": 4.990888552192126e-06, + "loss": 0.5371, + "step": 42 + }, + { + "epoch": 0.4174757281553398, + "grad_norm": 0.9810941815376282, + "learning_rate": 4.9904497882448004e-06, + "loss": 0.6299, + "step": 43 + }, + { + "epoch": 0.42718446601941745, + "grad_norm": 0.7764018774032593, + "learning_rate": 4.990000727067357e-06, + "loss": 0.5914, + "step": 44 + }, + { + "epoch": 0.4368932038834951, + "grad_norm": 0.7854965925216675, + "learning_rate": 4.989541370516523e-06, + "loss": 0.5044, + "step": 45 + }, + { + "epoch": 0.44660194174757284, + "grad_norm": 0.8319512009620667, + "learning_rate": 4.989071720491595e-06, + "loss": 0.4736, + "step": 46 + }, + { + "epoch": 0.4563106796116505, + "grad_norm": 0.9436545372009277, + "learning_rate": 4.988591778934428e-06, + "loss": 0.476, + "step": 47 + }, + { + "epoch": 0.46601941747572817, + "grad_norm": 1.3040553331375122, + "learning_rate": 4.9881015478294294e-06, + "loss": 0.5776, + "step": 48 + }, + { + "epoch": 0.47572815533980584, + "grad_norm": 0.933722198009491, + "learning_rate": 4.987601029203553e-06, + "loss": 0.4888, + "step": 49 + }, + { + "epoch": 0.4854368932038835, + "grad_norm": 0.858695387840271, + "learning_rate": 4.987090225126285e-06, + "loss": 0.599, + "step": 50 + }, + { + "epoch": 0.49514563106796117, + "grad_norm": 0.8849862217903137, + "learning_rate": 4.98656913770964e-06, + "loss": 0.5087, + "step": 51 + }, + { + "epoch": 0.5048543689320388, + "grad_norm": 0.8477684855461121, + "learning_rate": 4.986037769108154e-06, + "loss": 0.5507, + "step": 52 + }, + { + "epoch": 0.5145631067961165, + "grad_norm": 1.0618562698364258, + "learning_rate": 4.9854961215188676e-06, + "loss": 0.7117, + "step": 53 + }, + { + "epoch": 0.5242718446601942, + "grad_norm": 0.9216580390930176, + "learning_rate": 4.984944197181324e-06, + "loss": 0.4642, + "step": 54 + }, + { + "epoch": 0.5339805825242718, + "grad_norm": 0.9276666641235352, + "learning_rate": 4.9843819983775575e-06, + "loss": 0.4878, + "step": 55 + }, + { + "epoch": 0.5436893203883495, + "grad_norm": 0.9354626536369324, + "learning_rate": 4.983809527432086e-06, + "loss": 0.4856, + "step": 56 + }, + { + "epoch": 0.5533980582524272, + "grad_norm": 1.0111173391342163, + "learning_rate": 4.983226786711895e-06, + "loss": 0.5896, + "step": 57 + }, + { + "epoch": 0.5631067961165048, + "grad_norm": 0.8906674385070801, + "learning_rate": 4.982633778626437e-06, + "loss": 0.5374, + "step": 58 + }, + { + "epoch": 0.5728155339805825, + "grad_norm": 0.7241902947425842, + "learning_rate": 4.982030505627613e-06, + "loss": 0.6278, + "step": 59 + }, + { + "epoch": 0.5825242718446602, + "grad_norm": 0.9223883748054504, + "learning_rate": 4.98141697020977e-06, + "loss": 0.5967, + "step": 60 + }, + { + "epoch": 0.5922330097087378, + "grad_norm": 0.9214841723442078, + "learning_rate": 4.9807931749096836e-06, + "loss": 0.4895, + "step": 61 + }, + { + "epoch": 0.6019417475728155, + "grad_norm": 1.0902329683303833, + "learning_rate": 4.980159122306551e-06, + "loss": 0.6806, + "step": 62 + }, + { + "epoch": 0.6116504854368932, + "grad_norm": 0.9778655171394348, + "learning_rate": 4.979514815021984e-06, + "loss": 0.7513, + "step": 63 + }, + { + "epoch": 0.6213592233009708, + "grad_norm": 0.9169423580169678, + "learning_rate": 4.978860255719989e-06, + "loss": 0.5485, + "step": 64 + }, + { + "epoch": 0.6310679611650486, + "grad_norm": 0.8950898051261902, + "learning_rate": 4.978195447106965e-06, + "loss": 0.69, + "step": 65 + }, + { + "epoch": 0.6407766990291263, + "grad_norm": 0.9813127517700195, + "learning_rate": 4.9775203919316864e-06, + "loss": 0.5714, + "step": 66 + }, + { + "epoch": 0.6504854368932039, + "grad_norm": 1.1932649612426758, + "learning_rate": 4.976835092985297e-06, + "loss": 0.7081, + "step": 67 + }, + { + "epoch": 0.6601941747572816, + "grad_norm": 0.8608429431915283, + "learning_rate": 4.976139553101291e-06, + "loss": 0.6091, + "step": 68 + }, + { + "epoch": 0.6699029126213593, + "grad_norm": 0.9685009717941284, + "learning_rate": 4.975433775155509e-06, + "loss": 0.5879, + "step": 69 + }, + { + "epoch": 0.6796116504854369, + "grad_norm": 0.72447669506073, + "learning_rate": 4.974717762066123e-06, + "loss": 0.5094, + "step": 70 + }, + { + "epoch": 0.6893203883495146, + "grad_norm": 1.0234659910202026, + "learning_rate": 4.973991516793621e-06, + "loss": 0.7861, + "step": 71 + }, + { + "epoch": 0.6990291262135923, + "grad_norm": 0.8732417225837708, + "learning_rate": 4.973255042340801e-06, + "loss": 0.5652, + "step": 72 + }, + { + "epoch": 0.7087378640776699, + "grad_norm": 0.9726948142051697, + "learning_rate": 4.972508341752754e-06, + "loss": 0.7384, + "step": 73 + }, + { + "epoch": 0.7184466019417476, + "grad_norm": 0.9769085645675659, + "learning_rate": 4.9717514181168534e-06, + "loss": 0.5375, + "step": 74 + }, + { + "epoch": 0.7281553398058253, + "grad_norm": 0.8254098296165466, + "learning_rate": 4.970984274562741e-06, + "loss": 0.448, + "step": 75 + }, + { + "epoch": 0.7378640776699029, + "grad_norm": 1.040366530418396, + "learning_rate": 4.970206914262315e-06, + "loss": 0.6201, + "step": 76 + }, + { + "epoch": 0.7475728155339806, + "grad_norm": 0.928465723991394, + "learning_rate": 4.969419340429717e-06, + "loss": 0.5439, + "step": 77 + }, + { + "epoch": 0.7572815533980582, + "grad_norm": 0.8173317909240723, + "learning_rate": 4.968621556321319e-06, + "loss": 0.5596, + "step": 78 + }, + { + "epoch": 0.7669902912621359, + "grad_norm": 1.272718906402588, + "learning_rate": 4.967813565235708e-06, + "loss": 0.653, + "step": 79 + }, + { + "epoch": 0.7766990291262136, + "grad_norm": 0.8894286751747131, + "learning_rate": 4.966995370513675e-06, + "loss": 0.4878, + "step": 80 + }, + { + "epoch": 0.7864077669902912, + "grad_norm": 1.0618493556976318, + "learning_rate": 4.966166975538197e-06, + "loss": 0.7623, + "step": 81 + }, + { + "epoch": 0.7961165048543689, + "grad_norm": 0.9374388456344604, + "learning_rate": 4.965328383734429e-06, + "loss": 0.5505, + "step": 82 + }, + { + "epoch": 0.8058252427184466, + "grad_norm": 1.0057796239852905, + "learning_rate": 4.964479598569686e-06, + "loss": 0.7241, + "step": 83 + }, + { + "epoch": 0.8155339805825242, + "grad_norm": 0.9318037033081055, + "learning_rate": 4.963620623553428e-06, + "loss": 0.6787, + "step": 84 + }, + { + "epoch": 0.8252427184466019, + "grad_norm": 0.8819167017936707, + "learning_rate": 4.962751462237248e-06, + "loss": 0.5103, + "step": 85 + }, + { + "epoch": 0.8349514563106796, + "grad_norm": 1.0126394033432007, + "learning_rate": 4.9618721182148564e-06, + "loss": 0.4538, + "step": 86 + }, + { + "epoch": 0.8446601941747572, + "grad_norm": 0.9401935338973999, + "learning_rate": 4.960982595122064e-06, + "loss": 0.4892, + "step": 87 + }, + { + "epoch": 0.8543689320388349, + "grad_norm": 0.875956118106842, + "learning_rate": 4.960082896636773e-06, + "loss": 0.6734, + "step": 88 + }, + { + "epoch": 0.8640776699029126, + "grad_norm": 0.9892510175704956, + "learning_rate": 4.959173026478952e-06, + "loss": 0.5805, + "step": 89 + }, + { + "epoch": 0.8737864077669902, + "grad_norm": 0.8165844082832336, + "learning_rate": 4.958252988410631e-06, + "loss": 0.4292, + "step": 90 + }, + { + "epoch": 0.883495145631068, + "grad_norm": 1.2144172191619873, + "learning_rate": 4.9573227862358794e-06, + "loss": 0.5701, + "step": 91 + }, + { + "epoch": 0.8932038834951457, + "grad_norm": 0.8784387111663818, + "learning_rate": 4.956382423800791e-06, + "loss": 0.5766, + "step": 92 + }, + { + "epoch": 0.9029126213592233, + "grad_norm": 0.8603959679603577, + "learning_rate": 4.955431904993471e-06, + "loss": 0.6135, + "step": 93 + }, + { + "epoch": 0.912621359223301, + "grad_norm": 0.9591665863990784, + "learning_rate": 4.954471233744015e-06, + "loss": 0.4231, + "step": 94 + }, + { + "epoch": 0.9223300970873787, + "grad_norm": 0.9828278422355652, + "learning_rate": 4.9535004140245005e-06, + "loss": 0.6176, + "step": 95 + }, + { + "epoch": 0.9320388349514563, + "grad_norm": 0.9605256915092468, + "learning_rate": 4.952519449848962e-06, + "loss": 0.5469, + "step": 96 + }, + { + "epoch": 0.941747572815534, + "grad_norm": 1.0379314422607422, + "learning_rate": 4.951528345273379e-06, + "loss": 0.5562, + "step": 97 + }, + { + "epoch": 0.9514563106796117, + "grad_norm": 0.9734949469566345, + "learning_rate": 4.950527104395659e-06, + "loss": 0.6546, + "step": 98 + }, + { + "epoch": 0.9611650485436893, + "grad_norm": 1.0504772663116455, + "learning_rate": 4.9495157313556185e-06, + "loss": 0.4437, + "step": 99 + }, + { + "epoch": 0.970873786407767, + "grad_norm": 0.8973089456558228, + "learning_rate": 4.94849423033497e-06, + "loss": 0.3843, + "step": 100 + }, + { + "epoch": 0.9805825242718447, + "grad_norm": 0.9390032887458801, + "learning_rate": 4.9474626055573e-06, + "loss": 0.7418, + "step": 101 + }, + { + "epoch": 0.9902912621359223, + "grad_norm": 0.830512523651123, + "learning_rate": 4.946420861288051e-06, + "loss": 0.5576, + "step": 102 + }, + { + "epoch": 1.0, + "grad_norm": 0.9454949498176575, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.5272, + "step": 103 + }, + { + "epoch": 1.0097087378640777, + "grad_norm": 1.0955694913864136, + "learning_rate": 4.944307031545797e-06, + "loss": 0.4085, + "step": 104 + }, + { + "epoch": 1.0194174757281553, + "grad_norm": 1.0182733535766602, + "learning_rate": 4.943234954812812e-06, + "loss": 0.6185, + "step": 105 + }, + { + "epoch": 1.029126213592233, + "grad_norm": 1.0509144067764282, + "learning_rate": 4.942152776068264e-06, + "loss": 0.4346, + "step": 106 + }, + { + "epoch": 1.0388349514563107, + "grad_norm": 1.108575701713562, + "learning_rate": 4.941060499786622e-06, + "loss": 0.6482, + "step": 107 + }, + { + "epoch": 1.0485436893203883, + "grad_norm": 1.0695301294326782, + "learning_rate": 4.939958130484106e-06, + "loss": 0.4688, + "step": 108 + }, + { + "epoch": 1.058252427184466, + "grad_norm": 0.8110963106155396, + "learning_rate": 4.938845672718668e-06, + "loss": 0.4636, + "step": 109 + }, + { + "epoch": 1.0679611650485437, + "grad_norm": 1.0377169847488403, + "learning_rate": 4.937723131089974e-06, + "loss": 0.5277, + "step": 110 + }, + { + "epoch": 1.0776699029126213, + "grad_norm": 1.179370641708374, + "learning_rate": 4.93659051023938e-06, + "loss": 0.5084, + "step": 111 + }, + { + "epoch": 1.087378640776699, + "grad_norm": 0.7915495038032532, + "learning_rate": 4.93544781484992e-06, + "loss": 0.3111, + "step": 112 + }, + { + "epoch": 1.0970873786407767, + "grad_norm": 0.8695587515830994, + "learning_rate": 4.9342950496462815e-06, + "loss": 0.5161, + "step": 113 + }, + { + "epoch": 1.1067961165048543, + "grad_norm": 1.138805866241455, + "learning_rate": 4.933132219394786e-06, + "loss": 0.5075, + "step": 114 + }, + { + "epoch": 1.116504854368932, + "grad_norm": 0.8836705088615417, + "learning_rate": 4.931959328903376e-06, + "loss": 0.639, + "step": 115 + }, + { + "epoch": 1.1262135922330097, + "grad_norm": 0.8428300023078918, + "learning_rate": 4.930776383021584e-06, + "loss": 0.4566, + "step": 116 + }, + { + "epoch": 1.1359223300970873, + "grad_norm": 1.0511326789855957, + "learning_rate": 4.92958338664052e-06, + "loss": 0.3212, + "step": 117 + }, + { + "epoch": 1.145631067961165, + "grad_norm": 0.7950966954231262, + "learning_rate": 4.928380344692853e-06, + "loss": 0.3876, + "step": 118 + }, + { + "epoch": 1.1553398058252426, + "grad_norm": 0.8359726071357727, + "learning_rate": 4.927167262152784e-06, + "loss": 0.491, + "step": 119 + }, + { + "epoch": 1.1650485436893203, + "grad_norm": 0.8725942969322205, + "learning_rate": 4.925944144036027e-06, + "loss": 0.4993, + "step": 120 + }, + { + "epoch": 1.174757281553398, + "grad_norm": 0.8538187742233276, + "learning_rate": 4.924710995399796e-06, + "loss": 0.4621, + "step": 121 + }, + { + "epoch": 1.1844660194174756, + "grad_norm": 0.8445570468902588, + "learning_rate": 4.923467821342773e-06, + "loss": 0.4652, + "step": 122 + }, + { + "epoch": 1.1941747572815533, + "grad_norm": 0.8798047304153442, + "learning_rate": 4.922214627005092e-06, + "loss": 0.5119, + "step": 123 + }, + { + "epoch": 1.203883495145631, + "grad_norm": 0.9884085059165955, + "learning_rate": 4.920951417568323e-06, + "loss": 0.5963, + "step": 124 + }, + { + "epoch": 1.2135922330097086, + "grad_norm": 0.7976876497268677, + "learning_rate": 4.919678198255438e-06, + "loss": 0.409, + "step": 125 + }, + { + "epoch": 1.2233009708737863, + "grad_norm": 0.8497043251991272, + "learning_rate": 4.918394974330801e-06, + "loss": 0.5069, + "step": 126 + }, + { + "epoch": 1.233009708737864, + "grad_norm": 0.9168796539306641, + "learning_rate": 4.917101751100142e-06, + "loss": 0.4699, + "step": 127 + }, + { + "epoch": 1.2427184466019416, + "grad_norm": 0.9214611053466797, + "learning_rate": 4.915798533910534e-06, + "loss": 0.4605, + "step": 128 + }, + { + "epoch": 1.2524271844660193, + "grad_norm": 0.916888415813446, + "learning_rate": 4.9144853281503715e-06, + "loss": 0.3928, + "step": 129 + }, + { + "epoch": 1.262135922330097, + "grad_norm": 1.035357117652893, + "learning_rate": 4.91316213924935e-06, + "loss": 0.5672, + "step": 130 + }, + { + "epoch": 1.2718446601941746, + "grad_norm": 0.8245042562484741, + "learning_rate": 4.911828972678441e-06, + "loss": 0.3609, + "step": 131 + }, + { + "epoch": 1.2815533980582523, + "grad_norm": 0.9270608425140381, + "learning_rate": 4.91048583394987e-06, + "loss": 0.4302, + "step": 132 + }, + { + "epoch": 1.29126213592233, + "grad_norm": 0.8770717978477478, + "learning_rate": 4.909132728617095e-06, + "loss": 0.7062, + "step": 133 + }, + { + "epoch": 1.3009708737864076, + "grad_norm": 0.9357777833938599, + "learning_rate": 4.907769662274785e-06, + "loss": 0.4277, + "step": 134 + }, + { + "epoch": 1.3106796116504853, + "grad_norm": 0.8497308492660522, + "learning_rate": 4.90639664055879e-06, + "loss": 0.3775, + "step": 135 + }, + { + "epoch": 1.3203883495145632, + "grad_norm": 0.8059908151626587, + "learning_rate": 4.905013669146127e-06, + "loss": 0.4194, + "step": 136 + }, + { + "epoch": 1.3300970873786409, + "grad_norm": 0.9206295609474182, + "learning_rate": 4.903620753754949e-06, + "loss": 0.5536, + "step": 137 + }, + { + "epoch": 1.3398058252427185, + "grad_norm": 1.0086661577224731, + "learning_rate": 4.902217900144524e-06, + "loss": 0.7059, + "step": 138 + }, + { + "epoch": 1.3495145631067962, + "grad_norm": 0.829765260219574, + "learning_rate": 4.900805114115214e-06, + "loss": 0.6443, + "step": 139 + }, + { + "epoch": 1.3592233009708738, + "grad_norm": 1.0548204183578491, + "learning_rate": 4.899382401508446e-06, + "loss": 0.3785, + "step": 140 + }, + { + "epoch": 1.3689320388349515, + "grad_norm": 0.8123111128807068, + "learning_rate": 4.8979497682066916e-06, + "loss": 0.5454, + "step": 141 + }, + { + "epoch": 1.3786407766990292, + "grad_norm": 0.7998213171958923, + "learning_rate": 4.89650722013344e-06, + "loss": 0.3798, + "step": 142 + }, + { + "epoch": 1.3883495145631068, + "grad_norm": 0.7438819408416748, + "learning_rate": 4.895054763253177e-06, + "loss": 0.4741, + "step": 143 + }, + { + "epoch": 1.3980582524271845, + "grad_norm": 0.9747439622879028, + "learning_rate": 4.8935924035713564e-06, + "loss": 0.398, + "step": 144 + }, + { + "epoch": 1.4077669902912622, + "grad_norm": 0.8595287799835205, + "learning_rate": 4.892120147134378e-06, + "loss": 0.4111, + "step": 145 + }, + { + "epoch": 1.4174757281553398, + "grad_norm": 1.0879790782928467, + "learning_rate": 4.8906380000295615e-06, + "loss": 0.5177, + "step": 146 + }, + { + "epoch": 1.4271844660194175, + "grad_norm": 0.8619869947433472, + "learning_rate": 4.889145968385121e-06, + "loss": 0.4312, + "step": 147 + }, + { + "epoch": 1.4368932038834952, + "grad_norm": 0.8725226521492004, + "learning_rate": 4.887644058370139e-06, + "loss": 0.3597, + "step": 148 + }, + { + "epoch": 1.4466019417475728, + "grad_norm": 0.913958728313446, + "learning_rate": 4.886132276194544e-06, + "loss": 0.3836, + "step": 149 + }, + { + "epoch": 1.4563106796116505, + "grad_norm": 0.9644704461097717, + "learning_rate": 4.884610628109082e-06, + "loss": 0.3876, + "step": 150 + }, + { + "epoch": 1.4660194174757282, + "grad_norm": 0.7304561138153076, + "learning_rate": 4.883079120405292e-06, + "loss": 0.401, + "step": 151 + }, + { + "epoch": 1.4757281553398058, + "grad_norm": 0.9006849527359009, + "learning_rate": 4.881537759415478e-06, + "loss": 0.3514, + "step": 152 + }, + { + "epoch": 1.4854368932038835, + "grad_norm": 0.9095068573951721, + "learning_rate": 4.879986551512684e-06, + "loss": 0.5258, + "step": 153 + }, + { + "epoch": 1.4951456310679612, + "grad_norm": 0.8439248204231262, + "learning_rate": 4.878425503110672e-06, + "loss": 0.4044, + "step": 154 + }, + { + "epoch": 1.5048543689320388, + "grad_norm": 0.9092299938201904, + "learning_rate": 4.876854620663887e-06, + "loss": 0.4692, + "step": 155 + }, + { + "epoch": 1.5145631067961165, + "grad_norm": 1.0393599271774292, + "learning_rate": 4.875273910667434e-06, + "loss": 0.4612, + "step": 156 + }, + { + "epoch": 1.5242718446601942, + "grad_norm": 0.9718154072761536, + "learning_rate": 4.873683379657057e-06, + "loss": 0.3271, + "step": 157 + }, + { + "epoch": 1.5339805825242718, + "grad_norm": 0.8826969265937805, + "learning_rate": 4.8720830342091015e-06, + "loss": 0.719, + "step": 158 + }, + { + "epoch": 1.5436893203883495, + "grad_norm": 0.8973050713539124, + "learning_rate": 4.870472880940496e-06, + "loss": 0.5879, + "step": 159 + }, + { + "epoch": 1.5533980582524272, + "grad_norm": 1.1856048107147217, + "learning_rate": 4.868852926508721e-06, + "loss": 0.6377, + "step": 160 + }, + { + "epoch": 1.5631067961165048, + "grad_norm": 1.0391895771026611, + "learning_rate": 4.867223177611779e-06, + "loss": 0.5097, + "step": 161 + }, + { + "epoch": 1.5728155339805825, + "grad_norm": 1.1057053804397583, + "learning_rate": 4.865583640988173e-06, + "loss": 0.3714, + "step": 162 + }, + { + "epoch": 1.5825242718446602, + "grad_norm": 0.9357965588569641, + "learning_rate": 4.863934323416871e-06, + "loss": 0.4876, + "step": 163 + }, + { + "epoch": 1.5922330097087378, + "grad_norm": 0.8261438012123108, + "learning_rate": 4.862275231717288e-06, + "loss": 0.4145, + "step": 164 + }, + { + "epoch": 1.6019417475728155, + "grad_norm": 1.001351237297058, + "learning_rate": 4.860606372749247e-06, + "loss": 0.3615, + "step": 165 + }, + { + "epoch": 1.6116504854368932, + "grad_norm": 0.8855088353157043, + "learning_rate": 4.858927753412958e-06, + "loss": 0.4166, + "step": 166 + }, + { + "epoch": 1.6213592233009708, + "grad_norm": 0.9390084743499756, + "learning_rate": 4.857239380648985e-06, + "loss": 0.4242, + "step": 167 + }, + { + "epoch": 1.6310679611650487, + "grad_norm": 0.86589115858078, + "learning_rate": 4.855541261438223e-06, + "loss": 0.5007, + "step": 168 + }, + { + "epoch": 1.6407766990291264, + "grad_norm": 0.8739226460456848, + "learning_rate": 4.8538334028018605e-06, + "loss": 0.5952, + "step": 169 + }, + { + "epoch": 1.650485436893204, + "grad_norm": 0.759211540222168, + "learning_rate": 4.8521158118013605e-06, + "loss": 0.3322, + "step": 170 + }, + { + "epoch": 1.6601941747572817, + "grad_norm": 1.1307138204574585, + "learning_rate": 4.850388495538423e-06, + "loss": 0.4473, + "step": 171 + }, + { + "epoch": 1.6699029126213594, + "grad_norm": 1.050327181816101, + "learning_rate": 4.84865146115496e-06, + "loss": 0.4032, + "step": 172 + }, + { + "epoch": 1.679611650485437, + "grad_norm": 1.0714014768600464, + "learning_rate": 4.846904715833066e-06, + "loss": 0.3714, + "step": 173 + }, + { + "epoch": 1.6893203883495147, + "grad_norm": 0.7853191494941711, + "learning_rate": 4.8451482667949836e-06, + "loss": 0.5307, + "step": 174 + }, + { + "epoch": 1.6990291262135924, + "grad_norm": 0.9957652688026428, + "learning_rate": 4.843382121303082e-06, + "loss": 0.4776, + "step": 175 + }, + { + "epoch": 1.70873786407767, + "grad_norm": 0.888883650302887, + "learning_rate": 4.841606286659819e-06, + "loss": 0.3613, + "step": 176 + }, + { + "epoch": 1.7184466019417477, + "grad_norm": 0.9413757920265198, + "learning_rate": 4.839820770207714e-06, + "loss": 0.3656, + "step": 177 + }, + { + "epoch": 1.7281553398058254, + "grad_norm": 0.9671488404273987, + "learning_rate": 4.8380255793293195e-06, + "loss": 0.4002, + "step": 178 + }, + { + "epoch": 1.737864077669903, + "grad_norm": 1.2052301168441772, + "learning_rate": 4.8362207214471864e-06, + "loss": 0.3936, + "step": 179 + }, + { + "epoch": 1.7475728155339807, + "grad_norm": 0.9285777807235718, + "learning_rate": 4.83440620402384e-06, + "loss": 0.469, + "step": 180 + }, + { + "epoch": 1.7572815533980584, + "grad_norm": 0.8492085933685303, + "learning_rate": 4.832582034561738e-06, + "loss": 0.4749, + "step": 181 + }, + { + "epoch": 1.766990291262136, + "grad_norm": 0.8664201498031616, + "learning_rate": 4.830748220603251e-06, + "loss": 0.4987, + "step": 182 + }, + { + "epoch": 1.7766990291262137, + "grad_norm": 0.7961488366127014, + "learning_rate": 4.828904769730628e-06, + "loss": 0.5023, + "step": 183 + }, + { + "epoch": 1.7864077669902914, + "grad_norm": 0.9886943101882935, + "learning_rate": 4.827051689565958e-06, + "loss": 0.4413, + "step": 184 + }, + { + "epoch": 1.796116504854369, + "grad_norm": 0.8723036646842957, + "learning_rate": 4.825188987771149e-06, + "loss": 0.5691, + "step": 185 + }, + { + "epoch": 1.8058252427184467, + "grad_norm": 0.9434221982955933, + "learning_rate": 4.82331667204789e-06, + "loss": 0.3126, + "step": 186 + }, + { + "epoch": 1.8155339805825244, + "grad_norm": 0.9722500443458557, + "learning_rate": 4.821434750137619e-06, + "loss": 0.4777, + "step": 187 + }, + { + "epoch": 1.825242718446602, + "grad_norm": 0.9892464876174927, + "learning_rate": 4.819543229821494e-06, + "loss": 0.535, + "step": 188 + }, + { + "epoch": 1.8349514563106797, + "grad_norm": 0.8184608221054077, + "learning_rate": 4.8176421189203605e-06, + "loss": 0.3786, + "step": 189 + }, + { + "epoch": 1.8446601941747574, + "grad_norm": 0.9847081899642944, + "learning_rate": 4.815731425294716e-06, + "loss": 0.495, + "step": 190 + }, + { + "epoch": 1.854368932038835, + "grad_norm": 0.8743010759353638, + "learning_rate": 4.813811156844681e-06, + "loss": 0.5254, + "step": 191 + }, + { + "epoch": 1.8640776699029127, + "grad_norm": 1.206464409828186, + "learning_rate": 4.811881321509964e-06, + "loss": 0.521, + "step": 192 + }, + { + "epoch": 1.8737864077669903, + "grad_norm": 0.8556103110313416, + "learning_rate": 4.809941927269829e-06, + "loss": 0.6493, + "step": 193 + }, + { + "epoch": 1.883495145631068, + "grad_norm": 1.0219848155975342, + "learning_rate": 4.807992982143064e-06, + "loss": 0.468, + "step": 194 + }, + { + "epoch": 1.8932038834951457, + "grad_norm": 1.0638519525527954, + "learning_rate": 4.806034494187949e-06, + "loss": 0.6301, + "step": 195 + }, + { + "epoch": 1.9029126213592233, + "grad_norm": 0.9077790379524231, + "learning_rate": 4.804066471502216e-06, + "loss": 0.419, + "step": 196 + }, + { + "epoch": 1.912621359223301, + "grad_norm": 0.9752828478813171, + "learning_rate": 4.802088922223024e-06, + "loss": 0.3641, + "step": 197 + }, + { + "epoch": 1.9223300970873787, + "grad_norm": 0.8168025016784668, + "learning_rate": 4.80010185452692e-06, + "loss": 0.4678, + "step": 198 + }, + { + "epoch": 1.9320388349514563, + "grad_norm": 0.9176221489906311, + "learning_rate": 4.798105276629806e-06, + "loss": 0.3585, + "step": 199 + }, + { + "epoch": 1.941747572815534, + "grad_norm": 0.9507747888565063, + "learning_rate": 4.796099196786908e-06, + "loss": 0.3969, + "step": 200 + }, + { + "epoch": 1.9514563106796117, + "grad_norm": 0.7999218106269836, + "learning_rate": 4.794083623292737e-06, + "loss": 0.5478, + "step": 201 + }, + { + "epoch": 1.9611650485436893, + "grad_norm": 0.9281819462776184, + "learning_rate": 4.792058564481058e-06, + "loss": 0.3663, + "step": 202 + }, + { + "epoch": 1.970873786407767, + "grad_norm": 0.9587502479553223, + "learning_rate": 4.7900240287248554e-06, + "loss": 0.5232, + "step": 203 + }, + { + "epoch": 1.9805825242718447, + "grad_norm": 0.8954132795333862, + "learning_rate": 4.7879800244362975e-06, + "loss": 0.5567, + "step": 204 + }, + { + "epoch": 1.9902912621359223, + "grad_norm": 0.9513785243034363, + "learning_rate": 4.785926560066703e-06, + "loss": 0.555, + "step": 205 + }, + { + "epoch": 2.0, + "grad_norm": 0.8883495330810547, + "learning_rate": 4.783863644106502e-06, + "loss": 0.4269, + "step": 206 + }, + { + "epoch": 2.0097087378640777, + "grad_norm": 1.053060531616211, + "learning_rate": 4.781791285085209e-06, + "loss": 0.3394, + "step": 207 + }, + { + "epoch": 2.0194174757281553, + "grad_norm": 0.967287003993988, + "learning_rate": 4.779709491571378e-06, + "loss": 0.2951, + "step": 208 + }, + { + "epoch": 2.029126213592233, + "grad_norm": 0.8971940279006958, + "learning_rate": 4.777618272172573e-06, + "loss": 0.4388, + "step": 209 + }, + { + "epoch": 2.0388349514563107, + "grad_norm": 0.9578713178634644, + "learning_rate": 4.775517635535332e-06, + "loss": 0.4951, + "step": 210 + }, + { + "epoch": 2.0485436893203883, + "grad_norm": 1.1447572708129883, + "learning_rate": 4.77340759034513e-06, + "loss": 0.4791, + "step": 211 + }, + { + "epoch": 2.058252427184466, + "grad_norm": 1.332855224609375, + "learning_rate": 4.771288145326343e-06, + "loss": 0.4653, + "step": 212 + }, + { + "epoch": 2.0679611650485437, + "grad_norm": 1.055438756942749, + "learning_rate": 4.769159309242213e-06, + "loss": 0.2767, + "step": 213 + }, + { + "epoch": 2.0776699029126213, + "grad_norm": 1.3705360889434814, + "learning_rate": 4.767021090894809e-06, + "loss": 0.4402, + "step": 214 + }, + { + "epoch": 2.087378640776699, + "grad_norm": 1.3517563343048096, + "learning_rate": 4.764873499124997e-06, + "loss": 0.4773, + "step": 215 + }, + { + "epoch": 2.0970873786407767, + "grad_norm": 1.0658684968948364, + "learning_rate": 4.762716542812395e-06, + "loss": 0.3238, + "step": 216 + }, + { + "epoch": 2.1067961165048543, + "grad_norm": 0.9824750423431396, + "learning_rate": 4.7605502308753415e-06, + "loss": 0.3833, + "step": 217 + }, + { + "epoch": 2.116504854368932, + "grad_norm": 1.1418805122375488, + "learning_rate": 4.758374572270859e-06, + "loss": 0.4576, + "step": 218 + }, + { + "epoch": 2.1262135922330097, + "grad_norm": 0.9111520051956177, + "learning_rate": 4.756189575994614e-06, + "loss": 0.1972, + "step": 219 + }, + { + "epoch": 2.1359223300970873, + "grad_norm": 1.0712579488754272, + "learning_rate": 4.753995251080884e-06, + "loss": 0.3815, + "step": 220 + }, + { + "epoch": 2.145631067961165, + "grad_norm": 0.7820471525192261, + "learning_rate": 4.7517916066025126e-06, + "loss": 0.2703, + "step": 221 + }, + { + "epoch": 2.1553398058252426, + "grad_norm": 0.9843001961708069, + "learning_rate": 4.7495786516708806e-06, + "loss": 0.4997, + "step": 222 + }, + { + "epoch": 2.1650485436893203, + "grad_norm": 1.0501731634140015, + "learning_rate": 4.747356395435865e-06, + "loss": 0.2681, + "step": 223 + }, + { + "epoch": 2.174757281553398, + "grad_norm": 1.0242283344268799, + "learning_rate": 4.745124847085799e-06, + "loss": 0.3817, + "step": 224 + }, + { + "epoch": 2.1844660194174756, + "grad_norm": 1.1368554830551147, + "learning_rate": 4.742884015847436e-06, + "loss": 0.3516, + "step": 225 + }, + { + "epoch": 2.1941747572815533, + "grad_norm": 1.0896131992340088, + "learning_rate": 4.740633910985911e-06, + "loss": 0.3989, + "step": 226 + }, + { + "epoch": 2.203883495145631, + "grad_norm": 0.8043588399887085, + "learning_rate": 4.738374541804704e-06, + "loss": 0.2405, + "step": 227 + }, + { + "epoch": 2.2135922330097086, + "grad_norm": 0.9075763821601868, + "learning_rate": 4.7361059176456e-06, + "loss": 0.3208, + "step": 228 + }, + { + "epoch": 2.2233009708737863, + "grad_norm": 0.9826246500015259, + "learning_rate": 4.733828047888647e-06, + "loss": 0.4524, + "step": 229 + }, + { + "epoch": 2.233009708737864, + "grad_norm": 1.135237693786621, + "learning_rate": 4.731540941952126e-06, + "loss": 0.2596, + "step": 230 + }, + { + "epoch": 2.2427184466019416, + "grad_norm": 0.924315869808197, + "learning_rate": 4.7292446092925016e-06, + "loss": 0.3294, + "step": 231 + }, + { + "epoch": 2.2524271844660193, + "grad_norm": 0.9574539065361023, + "learning_rate": 4.726939059404392e-06, + "loss": 0.2563, + "step": 232 + }, + { + "epoch": 2.262135922330097, + "grad_norm": 0.8917362689971924, + "learning_rate": 4.724624301820524e-06, + "loss": 0.2961, + "step": 233 + }, + { + "epoch": 2.2718446601941746, + "grad_norm": 1.1570966243743896, + "learning_rate": 4.722300346111695e-06, + "loss": 0.4054, + "step": 234 + }, + { + "epoch": 2.2815533980582523, + "grad_norm": 1.1190950870513916, + "learning_rate": 4.719967201886734e-06, + "loss": 0.348, + "step": 235 + }, + { + "epoch": 2.29126213592233, + "grad_norm": 0.9346051216125488, + "learning_rate": 4.717624878792461e-06, + "loss": 0.516, + "step": 236 + }, + { + "epoch": 2.3009708737864076, + "grad_norm": 0.9378127455711365, + "learning_rate": 4.715273386513651e-06, + "loss": 0.4853, + "step": 237 + }, + { + "epoch": 2.3106796116504853, + "grad_norm": 1.0024279356002808, + "learning_rate": 4.712912734772988e-06, + "loss": 0.3455, + "step": 238 + }, + { + "epoch": 2.320388349514563, + "grad_norm": 0.7446038722991943, + "learning_rate": 4.710542933331025e-06, + "loss": 0.3764, + "step": 239 + }, + { + "epoch": 2.3300970873786406, + "grad_norm": 0.8997549414634705, + "learning_rate": 4.708163991986152e-06, + "loss": 0.4377, + "step": 240 + }, + { + "epoch": 2.3398058252427183, + "grad_norm": 0.877932071685791, + "learning_rate": 4.705775920574546e-06, + "loss": 0.4254, + "step": 241 + }, + { + "epoch": 2.349514563106796, + "grad_norm": 1.1586366891860962, + "learning_rate": 4.703378728970134e-06, + "loss": 0.2999, + "step": 242 + }, + { + "epoch": 2.3592233009708736, + "grad_norm": 0.8719814419746399, + "learning_rate": 4.700972427084551e-06, + "loss": 0.4169, + "step": 243 + }, + { + "epoch": 2.3689320388349513, + "grad_norm": 1.0403155088424683, + "learning_rate": 4.698557024867105e-06, + "loss": 0.2651, + "step": 244 + }, + { + "epoch": 2.378640776699029, + "grad_norm": 1.027769684791565, + "learning_rate": 4.696132532304727e-06, + "loss": 0.4728, + "step": 245 + }, + { + "epoch": 2.3883495145631066, + "grad_norm": 1.0018792152404785, + "learning_rate": 4.693698959421935e-06, + "loss": 0.349, + "step": 246 + }, + { + "epoch": 2.3980582524271843, + "grad_norm": 1.0755680799484253, + "learning_rate": 4.691256316280789e-06, + "loss": 0.5781, + "step": 247 + }, + { + "epoch": 2.407766990291262, + "grad_norm": 0.8174302577972412, + "learning_rate": 4.688804612980855e-06, + "loss": 0.2627, + "step": 248 + }, + { + "epoch": 2.4174757281553396, + "grad_norm": 0.8010028600692749, + "learning_rate": 4.686343859659158e-06, + "loss": 0.4919, + "step": 249 + }, + { + "epoch": 2.4271844660194173, + "grad_norm": 0.9900104403495789, + "learning_rate": 4.683874066490143e-06, + "loss": 0.3611, + "step": 250 + }, + { + "epoch": 2.436893203883495, + "grad_norm": 1.2603962421417236, + "learning_rate": 4.681395243685631e-06, + "loss": 0.3744, + "step": 251 + }, + { + "epoch": 2.4466019417475726, + "grad_norm": 0.9518654346466064, + "learning_rate": 4.67890740149478e-06, + "loss": 0.345, + "step": 252 + }, + { + "epoch": 2.4563106796116507, + "grad_norm": 0.819825291633606, + "learning_rate": 4.676410550204036e-06, + "loss": 0.4169, + "step": 253 + }, + { + "epoch": 2.466019417475728, + "grad_norm": 0.8665376305580139, + "learning_rate": 4.673904700137098e-06, + "loss": 0.2292, + "step": 254 + }, + { + "epoch": 2.475728155339806, + "grad_norm": 0.8893418908119202, + "learning_rate": 4.671389861654873e-06, + "loss": 0.304, + "step": 255 + }, + { + "epoch": 2.4854368932038833, + "grad_norm": 0.915643572807312, + "learning_rate": 4.668866045155428e-06, + "loss": 0.3982, + "step": 256 + }, + { + "epoch": 2.4951456310679614, + "grad_norm": 1.0549052953720093, + "learning_rate": 4.666333261073956e-06, + "loss": 0.4209, + "step": 257 + }, + { + "epoch": 2.5048543689320386, + "grad_norm": 1.2616932392120361, + "learning_rate": 4.6637915198827265e-06, + "loss": 0.4782, + "step": 258 + }, + { + "epoch": 2.5145631067961167, + "grad_norm": 1.1108514070510864, + "learning_rate": 4.661240832091042e-06, + "loss": 0.3456, + "step": 259 + }, + { + "epoch": 2.524271844660194, + "grad_norm": 0.8557240962982178, + "learning_rate": 4.658681208245198e-06, + "loss": 0.386, + "step": 260 + }, + { + "epoch": 2.533980582524272, + "grad_norm": 0.8849290609359741, + "learning_rate": 4.65611265892844e-06, + "loss": 0.3983, + "step": 261 + }, + { + "epoch": 2.5436893203883493, + "grad_norm": 0.9907529950141907, + "learning_rate": 4.653535194760912e-06, + "loss": 0.366, + "step": 262 + }, + { + "epoch": 2.5533980582524274, + "grad_norm": 0.9792741537094116, + "learning_rate": 4.650948826399624e-06, + "loss": 0.2161, + "step": 263 + }, + { + "epoch": 2.5631067961165046, + "grad_norm": 0.9505898356437683, + "learning_rate": 4.648353564538397e-06, + "loss": 0.3018, + "step": 264 + }, + { + "epoch": 2.5728155339805827, + "grad_norm": 1.1814008951187134, + "learning_rate": 4.645749419907829e-06, + "loss": 0.3034, + "step": 265 + }, + { + "epoch": 2.58252427184466, + "grad_norm": 1.0968157052993774, + "learning_rate": 4.64313640327524e-06, + "loss": 0.2418, + "step": 266 + }, + { + "epoch": 2.592233009708738, + "grad_norm": 0.9502717852592468, + "learning_rate": 4.640514525444637e-06, + "loss": 0.4245, + "step": 267 + }, + { + "epoch": 2.6019417475728153, + "grad_norm": 1.1883552074432373, + "learning_rate": 4.637883797256663e-06, + "loss": 0.6322, + "step": 268 + }, + { + "epoch": 2.6116504854368934, + "grad_norm": 0.9291763305664062, + "learning_rate": 4.635244229588558e-06, + "loss": 0.3531, + "step": 269 + }, + { + "epoch": 2.6213592233009706, + "grad_norm": 0.68864905834198, + "learning_rate": 4.632595833354105e-06, + "loss": 0.3944, + "step": 270 + }, + { + "epoch": 2.6310679611650487, + "grad_norm": 1.281590223312378, + "learning_rate": 4.629938619503593e-06, + "loss": 0.3025, + "step": 271 + }, + { + "epoch": 2.6407766990291264, + "grad_norm": 1.1126227378845215, + "learning_rate": 4.627272599023772e-06, + "loss": 0.4944, + "step": 272 + }, + { + "epoch": 2.650485436893204, + "grad_norm": 0.91072678565979, + "learning_rate": 4.6245977829378e-06, + "loss": 0.5801, + "step": 273 + }, + { + "epoch": 2.6601941747572817, + "grad_norm": 0.9558573365211487, + "learning_rate": 4.6219141823052035e-06, + "loss": 0.2808, + "step": 274 + }, + { + "epoch": 2.6699029126213594, + "grad_norm": 0.9114917516708374, + "learning_rate": 4.619221808221833e-06, + "loss": 0.5259, + "step": 275 + }, + { + "epoch": 2.679611650485437, + "grad_norm": 0.9031375050544739, + "learning_rate": 4.616520671819812e-06, + "loss": 0.2877, + "step": 276 + }, + { + "epoch": 2.6893203883495147, + "grad_norm": 1.0576499700546265, + "learning_rate": 4.613810784267492e-06, + "loss": 0.42, + "step": 277 + }, + { + "epoch": 2.6990291262135924, + "grad_norm": 0.8092033267021179, + "learning_rate": 4.61109215676941e-06, + "loss": 0.2962, + "step": 278 + }, + { + "epoch": 2.70873786407767, + "grad_norm": 0.9271148443222046, + "learning_rate": 4.608364800566241e-06, + "loss": 0.4057, + "step": 279 + }, + { + "epoch": 2.7184466019417477, + "grad_norm": 0.7499092817306519, + "learning_rate": 4.605628726934747e-06, + "loss": 0.4567, + "step": 280 + }, + { + "epoch": 2.7281553398058254, + "grad_norm": 0.8146029710769653, + "learning_rate": 4.602883947187738e-06, + "loss": 0.4149, + "step": 281 + }, + { + "epoch": 2.737864077669903, + "grad_norm": 0.888672947883606, + "learning_rate": 4.600130472674017e-06, + "loss": 0.2856, + "step": 282 + }, + { + "epoch": 2.7475728155339807, + "grad_norm": 0.9286831021308899, + "learning_rate": 4.5973683147783405e-06, + "loss": 0.5477, + "step": 283 + }, + { + "epoch": 2.7572815533980584, + "grad_norm": 1.3980002403259277, + "learning_rate": 4.594597484921365e-06, + "loss": 0.3656, + "step": 284 + }, + { + "epoch": 2.766990291262136, + "grad_norm": 1.0174404382705688, + "learning_rate": 4.5918179945596055e-06, + "loss": 0.3542, + "step": 285 + }, + { + "epoch": 2.7766990291262137, + "grad_norm": 0.9232505559921265, + "learning_rate": 4.589029855185384e-06, + "loss": 0.3179, + "step": 286 + }, + { + "epoch": 2.7864077669902914, + "grad_norm": 1.0175881385803223, + "learning_rate": 4.586233078326785e-06, + "loss": 0.6136, + "step": 287 + }, + { + "epoch": 2.796116504854369, + "grad_norm": 0.9283950328826904, + "learning_rate": 4.583427675547602e-06, + "loss": 0.4247, + "step": 288 + }, + { + "epoch": 2.8058252427184467, + "grad_norm": 0.9678206443786621, + "learning_rate": 4.580613658447301e-06, + "loss": 0.502, + "step": 289 + }, + { + "epoch": 2.8155339805825244, + "grad_norm": 0.9418540000915527, + "learning_rate": 4.577791038660959e-06, + "loss": 0.3092, + "step": 290 + }, + { + "epoch": 2.825242718446602, + "grad_norm": 0.958787202835083, + "learning_rate": 4.574959827859226e-06, + "loss": 0.2833, + "step": 291 + }, + { + "epoch": 2.8349514563106797, + "grad_norm": 0.9094011783599854, + "learning_rate": 4.572120037748273e-06, + "loss": 0.3078, + "step": 292 + }, + { + "epoch": 2.8446601941747574, + "grad_norm": 0.8375478982925415, + "learning_rate": 4.5692716800697415e-06, + "loss": 0.4403, + "step": 293 + }, + { + "epoch": 2.854368932038835, + "grad_norm": 0.8053695559501648, + "learning_rate": 4.566414766600698e-06, + "loss": 0.3585, + "step": 294 + }, + { + "epoch": 2.8640776699029127, + "grad_norm": 0.871597945690155, + "learning_rate": 4.563549309153589e-06, + "loss": 0.238, + "step": 295 + }, + { + "epoch": 2.8737864077669903, + "grad_norm": 0.8860502243041992, + "learning_rate": 4.56067531957618e-06, + "loss": 0.2631, + "step": 296 + }, + { + "epoch": 2.883495145631068, + "grad_norm": 0.7494659423828125, + "learning_rate": 4.557792809751519e-06, + "loss": 0.3956, + "step": 297 + }, + { + "epoch": 2.8932038834951457, + "grad_norm": 0.8635668158531189, + "learning_rate": 4.554901791597883e-06, + "loss": 0.3569, + "step": 298 + }, + { + "epoch": 2.9029126213592233, + "grad_norm": 1.0071423053741455, + "learning_rate": 4.552002277068725e-06, + "loss": 0.3698, + "step": 299 + }, + { + "epoch": 2.912621359223301, + "grad_norm": 1.0618561506271362, + "learning_rate": 4.549094278152631e-06, + "loss": 0.4215, + "step": 300 + }, + { + "epoch": 2.9223300970873787, + "grad_norm": 1.3928142786026, + "learning_rate": 4.546177806873266e-06, + "loss": 0.3282, + "step": 301 + }, + { + "epoch": 2.9320388349514563, + "grad_norm": 1.0209264755249023, + "learning_rate": 4.543252875289326e-06, + "loss": 0.3716, + "step": 302 + }, + { + "epoch": 2.941747572815534, + "grad_norm": 0.9945548176765442, + "learning_rate": 4.540319495494486e-06, + "loss": 0.3973, + "step": 303 + }, + { + "epoch": 2.9514563106796117, + "grad_norm": 1.065015435218811, + "learning_rate": 4.537377679617353e-06, + "loss": 0.3996, + "step": 304 + }, + { + "epoch": 2.9611650485436893, + "grad_norm": 1.1069443225860596, + "learning_rate": 4.534427439821416e-06, + "loss": 0.2877, + "step": 305 + }, + { + "epoch": 2.970873786407767, + "grad_norm": 1.0089571475982666, + "learning_rate": 4.531468788304992e-06, + "loss": 0.2799, + "step": 306 + }, + { + "epoch": 2.9805825242718447, + "grad_norm": 1.0506935119628906, + "learning_rate": 4.5285017373011784e-06, + "loss": 0.3221, + "step": 307 + }, + { + "epoch": 2.9902912621359223, + "grad_norm": 0.7255403995513916, + "learning_rate": 4.5255262990778024e-06, + "loss": 0.2756, + "step": 308 + }, + { + "epoch": 3.0, + "grad_norm": 0.8445290923118591, + "learning_rate": 4.522542485937369e-06, + "loss": 0.2747, + "step": 309 + }, + { + "epoch": 3.0097087378640777, + "grad_norm": 0.9877444505691528, + "learning_rate": 4.519550310217013e-06, + "loss": 0.3152, + "step": 310 + }, + { + "epoch": 3.0194174757281553, + "grad_norm": 1.0058015584945679, + "learning_rate": 4.516549784288442e-06, + "loss": 0.2565, + "step": 311 + }, + { + "epoch": 3.029126213592233, + "grad_norm": 0.9974623322486877, + "learning_rate": 4.513540920557892e-06, + "loss": 0.2612, + "step": 312 + }, + { + "epoch": 3.0388349514563107, + "grad_norm": 0.9780676960945129, + "learning_rate": 4.510523731466072e-06, + "loss": 0.431, + "step": 313 + }, + { + "epoch": 3.0485436893203883, + "grad_norm": 1.2715204954147339, + "learning_rate": 4.507498229488116e-06, + "loss": 0.1685, + "step": 314 + }, + { + "epoch": 3.058252427184466, + "grad_norm": 2.212153673171997, + "learning_rate": 4.504464427133527e-06, + "loss": 0.2729, + "step": 315 + }, + { + "epoch": 3.0679611650485437, + "grad_norm": 1.3692691326141357, + "learning_rate": 4.501422336946126e-06, + "loss": 0.2697, + "step": 316 + }, + { + "epoch": 3.0776699029126213, + "grad_norm": 1.1158126592636108, + "learning_rate": 4.498371971504005e-06, + "loss": 0.2113, + "step": 317 + }, + { + "epoch": 3.087378640776699, + "grad_norm": 0.8955278396606445, + "learning_rate": 4.49531334341947e-06, + "loss": 0.2378, + "step": 318 + }, + { + "epoch": 3.0970873786407767, + "grad_norm": 0.9961569309234619, + "learning_rate": 4.49224646533899e-06, + "loss": 0.2678, + "step": 319 + }, + { + "epoch": 3.1067961165048543, + "grad_norm": 0.8884671926498413, + "learning_rate": 4.489171349943144e-06, + "loss": 0.2445, + "step": 320 + }, + { + "epoch": 3.116504854368932, + "grad_norm": 1.1398262977600098, + "learning_rate": 4.486088009946575e-06, + "loss": 0.1827, + "step": 321 + }, + { + "epoch": 3.1262135922330097, + "grad_norm": 1.2452175617218018, + "learning_rate": 4.482996458097926e-06, + "loss": 0.3053, + "step": 322 + }, + { + "epoch": 3.1359223300970873, + "grad_norm": 1.0263898372650146, + "learning_rate": 4.479896707179796e-06, + "loss": 0.462, + "step": 323 + }, + { + "epoch": 3.145631067961165, + "grad_norm": 1.0178437232971191, + "learning_rate": 4.476788770008685e-06, + "loss": 0.4596, + "step": 324 + }, + { + "epoch": 3.1553398058252426, + "grad_norm": 1.0502902269363403, + "learning_rate": 4.473672659434941e-06, + "loss": 0.3861, + "step": 325 + }, + { + "epoch": 3.1650485436893203, + "grad_norm": 1.2341052293777466, + "learning_rate": 4.470548388342704e-06, + "loss": 0.296, + "step": 326 + }, + { + "epoch": 3.174757281553398, + "grad_norm": 1.025456190109253, + "learning_rate": 4.467415969649858e-06, + "loss": 0.2583, + "step": 327 + }, + { + "epoch": 3.1844660194174756, + "grad_norm": 1.0375516414642334, + "learning_rate": 4.464275416307973e-06, + "loss": 0.3423, + "step": 328 + }, + { + "epoch": 3.1941747572815533, + "grad_norm": 1.2566360235214233, + "learning_rate": 4.461126741302253e-06, + "loss": 0.3176, + "step": 329 + }, + { + "epoch": 3.203883495145631, + "grad_norm": 1.177641749382019, + "learning_rate": 4.457969957651485e-06, + "loss": 0.2558, + "step": 330 + }, + { + "epoch": 3.2135922330097086, + "grad_norm": 1.0258479118347168, + "learning_rate": 4.454805078407979e-06, + "loss": 0.4843, + "step": 331 + }, + { + "epoch": 3.2233009708737863, + "grad_norm": 0.9031769037246704, + "learning_rate": 4.451632116657521e-06, + "loss": 0.2528, + "step": 332 + }, + { + "epoch": 3.233009708737864, + "grad_norm": 0.9155856370925903, + "learning_rate": 4.448451085519314e-06, + "loss": 0.2014, + "step": 333 + }, + { + "epoch": 3.2427184466019416, + "grad_norm": 1.1005407571792603, + "learning_rate": 4.445261998145927e-06, + "loss": 0.3033, + "step": 334 + }, + { + "epoch": 3.2524271844660193, + "grad_norm": 0.9126582741737366, + "learning_rate": 4.442064867723236e-06, + "loss": 0.2785, + "step": 335 + }, + { + "epoch": 3.262135922330097, + "grad_norm": 1.0484691858291626, + "learning_rate": 4.438859707470376e-06, + "loss": 0.293, + "step": 336 + }, + { + "epoch": 3.2718446601941746, + "grad_norm": 0.9439131021499634, + "learning_rate": 4.435646530639679e-06, + "loss": 0.1952, + "step": 337 + }, + { + "epoch": 3.2815533980582523, + "grad_norm": 1.2514874935150146, + "learning_rate": 4.432425350516627e-06, + "loss": 0.2916, + "step": 338 + }, + { + "epoch": 3.29126213592233, + "grad_norm": 1.256081223487854, + "learning_rate": 4.42919618041979e-06, + "loss": 0.1883, + "step": 339 + }, + { + "epoch": 3.3009708737864076, + "grad_norm": 0.9661153554916382, + "learning_rate": 4.425959033700776e-06, + "loss": 0.1276, + "step": 340 + }, + { + "epoch": 3.3106796116504853, + "grad_norm": 0.9738348126411438, + "learning_rate": 4.422713923744174e-06, + "loss": 0.348, + "step": 341 + }, + { + "epoch": 3.320388349514563, + "grad_norm": 1.2630469799041748, + "learning_rate": 4.419460863967496e-06, + "loss": 0.375, + "step": 342 + }, + { + "epoch": 3.3300970873786406, + "grad_norm": 0.866310179233551, + "learning_rate": 4.416199867821126e-06, + "loss": 0.3097, + "step": 343 + }, + { + "epoch": 3.3398058252427183, + "grad_norm": 1.4241950511932373, + "learning_rate": 4.412930948788263e-06, + "loss": 0.2579, + "step": 344 + }, + { + "epoch": 3.349514563106796, + "grad_norm": 0.9571874737739563, + "learning_rate": 4.409654120384863e-06, + "loss": 0.436, + "step": 345 + }, + { + "epoch": 3.3592233009708736, + "grad_norm": 1.0651613473892212, + "learning_rate": 4.406369396159585e-06, + "loss": 0.3728, + "step": 346 + }, + { + "epoch": 3.3689320388349513, + "grad_norm": 1.2672362327575684, + "learning_rate": 4.403076789693735e-06, + "loss": 0.448, + "step": 347 + }, + { + "epoch": 3.378640776699029, + "grad_norm": 1.0908823013305664, + "learning_rate": 4.399776314601212e-06, + "loss": 0.2399, + "step": 348 + }, + { + "epoch": 3.3883495145631066, + "grad_norm": 1.2599388360977173, + "learning_rate": 4.396467984528445e-06, + "loss": 0.3701, + "step": 349 + }, + { + "epoch": 3.3980582524271843, + "grad_norm": 1.1919026374816895, + "learning_rate": 4.393151813154345e-06, + "loss": 0.2457, + "step": 350 + }, + { + "epoch": 3.407766990291262, + "grad_norm": 1.035890817642212, + "learning_rate": 4.3898278141902396e-06, + "loss": 0.2621, + "step": 351 + }, + { + "epoch": 3.4174757281553396, + "grad_norm": 1.1830110549926758, + "learning_rate": 4.386496001379826e-06, + "loss": 0.3427, + "step": 352 + }, + { + "epoch": 3.4271844660194173, + "grad_norm": 0.8301703929901123, + "learning_rate": 4.383156388499106e-06, + "loss": 0.3473, + "step": 353 + }, + { + "epoch": 3.436893203883495, + "grad_norm": 0.8561685085296631, + "learning_rate": 4.3798089893563335e-06, + "loss": 0.2333, + "step": 354 + }, + { + "epoch": 3.4466019417475726, + "grad_norm": 1.0540318489074707, + "learning_rate": 4.3764538177919555e-06, + "loss": 0.1516, + "step": 355 + }, + { + "epoch": 3.4563106796116507, + "grad_norm": 1.0419858694076538, + "learning_rate": 4.3730908876785574e-06, + "loss": 0.2708, + "step": 356 + }, + { + "epoch": 3.466019417475728, + "grad_norm": 1.171985149383545, + "learning_rate": 4.3697202129208e-06, + "loss": 0.3133, + "step": 357 + }, + { + "epoch": 3.475728155339806, + "grad_norm": 1.0638673305511475, + "learning_rate": 4.36634180745537e-06, + "loss": 0.1745, + "step": 358 + }, + { + "epoch": 3.4854368932038833, + "grad_norm": 0.8694558143615723, + "learning_rate": 4.3629556852509145e-06, + "loss": 0.4621, + "step": 359 + }, + { + "epoch": 3.4951456310679614, + "grad_norm": 0.932741105556488, + "learning_rate": 4.35956186030799e-06, + "loss": 0.2332, + "step": 360 + }, + { + "epoch": 3.5048543689320386, + "grad_norm": 1.1963168382644653, + "learning_rate": 4.356160346659001e-06, + "loss": 0.3011, + "step": 361 + }, + { + "epoch": 3.5145631067961167, + "grad_norm": 1.1899361610412598, + "learning_rate": 4.3527511583681384e-06, + "loss": 0.6784, + "step": 362 + }, + { + "epoch": 3.524271844660194, + "grad_norm": 1.1308809518814087, + "learning_rate": 4.34933430953133e-06, + "loss": 0.3338, + "step": 363 + }, + { + "epoch": 3.533980582524272, + "grad_norm": 1.0777356624603271, + "learning_rate": 4.345909814276177e-06, + "loss": 0.362, + "step": 364 + }, + { + "epoch": 3.5436893203883493, + "grad_norm": 0.9632010459899902, + "learning_rate": 4.3424776867618935e-06, + "loss": 0.3633, + "step": 365 + }, + { + "epoch": 3.5533980582524274, + "grad_norm": 1.0557489395141602, + "learning_rate": 4.339037941179253e-06, + "loss": 0.3633, + "step": 366 + }, + { + "epoch": 3.5631067961165046, + "grad_norm": 0.9937204122543335, + "learning_rate": 4.335590591750526e-06, + "loss": 0.2539, + "step": 367 + }, + { + "epoch": 3.5728155339805827, + "grad_norm": 1.128485083580017, + "learning_rate": 4.332135652729423e-06, + "loss": 0.2688, + "step": 368 + }, + { + "epoch": 3.58252427184466, + "grad_norm": 0.9485536217689514, + "learning_rate": 4.328673138401036e-06, + "loss": 0.3675, + "step": 369 + }, + { + "epoch": 3.592233009708738, + "grad_norm": 1.0004498958587646, + "learning_rate": 4.325203063081776e-06, + "loss": 0.1809, + "step": 370 + }, + { + "epoch": 3.6019417475728153, + "grad_norm": 1.2935380935668945, + "learning_rate": 4.32172544111932e-06, + "loss": 0.2047, + "step": 371 + }, + { + "epoch": 3.6116504854368934, + "grad_norm": 0.8594838976860046, + "learning_rate": 4.318240286892544e-06, + "loss": 0.2426, + "step": 372 + }, + { + "epoch": 3.6213592233009706, + "grad_norm": 0.865341305732727, + "learning_rate": 4.314747614811471e-06, + "loss": 0.269, + "step": 373 + }, + { + "epoch": 3.6310679611650487, + "grad_norm": 0.975472629070282, + "learning_rate": 4.3112474393172055e-06, + "loss": 0.2057, + "step": 374 + }, + { + "epoch": 3.6407766990291264, + "grad_norm": 0.9958301782608032, + "learning_rate": 4.307739774881878e-06, + "loss": 0.2612, + "step": 375 + }, + { + "epoch": 3.650485436893204, + "grad_norm": 1.216472864151001, + "learning_rate": 4.304224636008582e-06, + "loss": 0.1837, + "step": 376 + }, + { + "epoch": 3.6601941747572817, + "grad_norm": 1.432723879814148, + "learning_rate": 4.300702037231318e-06, + "loss": 0.4011, + "step": 377 + }, + { + "epoch": 3.6699029126213594, + "grad_norm": 0.9345541596412659, + "learning_rate": 4.297171993114927e-06, + "loss": 0.1405, + "step": 378 + }, + { + "epoch": 3.679611650485437, + "grad_norm": 0.8612492084503174, + "learning_rate": 4.2936345182550365e-06, + "loss": 0.4025, + "step": 379 + }, + { + "epoch": 3.6893203883495147, + "grad_norm": 0.9304858446121216, + "learning_rate": 4.290089627277998e-06, + "loss": 0.2106, + "step": 380 + }, + { + "epoch": 3.6990291262135924, + "grad_norm": 0.8536747694015503, + "learning_rate": 4.286537334840825e-06, + "loss": 0.3325, + "step": 381 + }, + { + "epoch": 3.70873786407767, + "grad_norm": 1.4801360368728638, + "learning_rate": 4.2829776556311355e-06, + "loss": 0.2465, + "step": 382 + }, + { + "epoch": 3.7184466019417477, + "grad_norm": 1.1944482326507568, + "learning_rate": 4.279410604367088e-06, + "loss": 0.4483, + "step": 383 + }, + { + "epoch": 3.7281553398058254, + "grad_norm": 1.1957206726074219, + "learning_rate": 4.275836195797323e-06, + "loss": 0.3758, + "step": 384 + }, + { + "epoch": 3.737864077669903, + "grad_norm": 0.9933880567550659, + "learning_rate": 4.2722544447008995e-06, + "loss": 0.4136, + "step": 385 + }, + { + "epoch": 3.7475728155339807, + "grad_norm": 1.062076210975647, + "learning_rate": 4.268665365887238e-06, + "loss": 0.441, + "step": 386 + }, + { + "epoch": 3.7572815533980584, + "grad_norm": 0.8356176018714905, + "learning_rate": 4.265068974196056e-06, + "loss": 0.1671, + "step": 387 + }, + { + "epoch": 3.766990291262136, + "grad_norm": 1.0670630931854248, + "learning_rate": 4.261465284497307e-06, + "loss": 0.2107, + "step": 388 + }, + { + "epoch": 3.7766990291262137, + "grad_norm": 0.9141836762428284, + "learning_rate": 4.257854311691118e-06, + "loss": 0.3743, + "step": 389 + }, + { + "epoch": 3.7864077669902914, + "grad_norm": 1.2973644733428955, + "learning_rate": 4.254236070707734e-06, + "loss": 0.2203, + "step": 390 + }, + { + "epoch": 3.796116504854369, + "grad_norm": 0.9033822417259216, + "learning_rate": 4.250610576507445e-06, + "loss": 0.2088, + "step": 391 + }, + { + "epoch": 3.8058252427184467, + "grad_norm": 1.1606167554855347, + "learning_rate": 4.246977844080537e-06, + "loss": 0.3493, + "step": 392 + }, + { + "epoch": 3.8155339805825244, + "grad_norm": 1.1568549871444702, + "learning_rate": 4.24333788844722e-06, + "loss": 0.2675, + "step": 393 + }, + { + "epoch": 3.825242718446602, + "grad_norm": 1.2251572608947754, + "learning_rate": 4.239690724657571e-06, + "loss": 0.2949, + "step": 394 + }, + { + "epoch": 3.8349514563106797, + "grad_norm": 0.7754173278808594, + "learning_rate": 4.236036367791471e-06, + "loss": 0.3149, + "step": 395 + }, + { + "epoch": 3.8446601941747574, + "grad_norm": 1.0242606401443481, + "learning_rate": 4.23237483295854e-06, + "loss": 0.4014, + "step": 396 + }, + { + "epoch": 3.854368932038835, + "grad_norm": 1.0819196701049805, + "learning_rate": 4.228706135298081e-06, + "loss": 0.2485, + "step": 397 + }, + { + "epoch": 3.8640776699029127, + "grad_norm": 1.0384984016418457, + "learning_rate": 4.225030289979006e-06, + "loss": 0.2049, + "step": 398 + }, + { + "epoch": 3.8737864077669903, + "grad_norm": 0.9361140131950378, + "learning_rate": 4.221347312199788e-06, + "loss": 0.2095, + "step": 399 + }, + { + "epoch": 3.883495145631068, + "grad_norm": 0.7564887404441833, + "learning_rate": 4.2176572171883865e-06, + "loss": 0.3075, + "step": 400 + }, + { + "epoch": 3.8932038834951457, + "grad_norm": 1.040762186050415, + "learning_rate": 4.213960020202187e-06, + "loss": 0.3612, + "step": 401 + }, + { + "epoch": 3.9029126213592233, + "grad_norm": 1.23052179813385, + "learning_rate": 4.2102557365279435e-06, + "loss": 0.4987, + "step": 402 + }, + { + "epoch": 3.912621359223301, + "grad_norm": 1.1888084411621094, + "learning_rate": 4.206544381481708e-06, + "loss": 0.3761, + "step": 403 + }, + { + "epoch": 3.9223300970873787, + "grad_norm": 1.0521200895309448, + "learning_rate": 4.202825970408772e-06, + "loss": 0.1774, + "step": 404 + }, + { + "epoch": 3.9320388349514563, + "grad_norm": 1.4193365573883057, + "learning_rate": 4.199100518683601e-06, + "loss": 0.2208, + "step": 405 + }, + { + "epoch": 3.941747572815534, + "grad_norm": 0.8504016399383545, + "learning_rate": 4.195368041709772e-06, + "loss": 0.3786, + "step": 406 + }, + { + "epoch": 3.9514563106796117, + "grad_norm": 0.9185003042221069, + "learning_rate": 4.191628554919907e-06, + "loss": 0.2445, + "step": 407 + }, + { + "epoch": 3.9611650485436893, + "grad_norm": 1.0671206712722778, + "learning_rate": 4.187882073775615e-06, + "loss": 0.2674, + "step": 408 + }, + { + "epoch": 3.970873786407767, + "grad_norm": 0.9746595025062561, + "learning_rate": 4.184128613767422e-06, + "loss": 0.2695, + "step": 409 + }, + { + "epoch": 3.9805825242718447, + "grad_norm": 0.9572174549102783, + "learning_rate": 4.18036819041471e-06, + "loss": 0.3031, + "step": 410 + }, + { + "epoch": 3.9902912621359223, + "grad_norm": 1.1593903303146362, + "learning_rate": 4.17660081926565e-06, + "loss": 0.1894, + "step": 411 + }, + { + "epoch": 4.0, + "grad_norm": 1.0100287199020386, + "learning_rate": 4.172826515897146e-06, + "loss": 0.1792, + "step": 412 + }, + { + "epoch": 4.009708737864078, + "grad_norm": 0.8554271459579468, + "learning_rate": 4.169045295914757e-06, + "loss": 0.3282, + "step": 413 + }, + { + "epoch": 4.019417475728155, + "grad_norm": 1.0497305393218994, + "learning_rate": 4.165257174952647e-06, + "loss": 0.1456, + "step": 414 + }, + { + "epoch": 4.029126213592233, + "grad_norm": 0.9351025223731995, + "learning_rate": 4.161462168673508e-06, + "loss": 0.2288, + "step": 415 + }, + { + "epoch": 4.038834951456311, + "grad_norm": 1.175843596458435, + "learning_rate": 4.157660292768502e-06, + "loss": 0.3184, + "step": 416 + }, + { + "epoch": 4.048543689320389, + "grad_norm": 1.3104645013809204, + "learning_rate": 4.1538515629571985e-06, + "loss": 0.2433, + "step": 417 + }, + { + "epoch": 4.058252427184466, + "grad_norm": 1.4812334775924683, + "learning_rate": 4.1500359949875e-06, + "loss": 0.2837, + "step": 418 + }, + { + "epoch": 4.067961165048544, + "grad_norm": 1.3549306392669678, + "learning_rate": 4.1462136046355864e-06, + "loss": 0.2224, + "step": 419 + }, + { + "epoch": 4.077669902912621, + "grad_norm": 1.1158041954040527, + "learning_rate": 4.142384407705846e-06, + "loss": 0.4027, + "step": 420 + }, + { + "epoch": 4.087378640776699, + "grad_norm": 1.0280787944793701, + "learning_rate": 4.138548420030808e-06, + "loss": 0.1437, + "step": 421 + }, + { + "epoch": 4.097087378640777, + "grad_norm": 1.043931245803833, + "learning_rate": 4.13470565747108e-06, + "loss": 0.2594, + "step": 422 + }, + { + "epoch": 4.106796116504855, + "grad_norm": 0.8889212608337402, + "learning_rate": 4.130856135915282e-06, + "loss": 0.1997, + "step": 423 + }, + { + "epoch": 4.116504854368932, + "grad_norm": 1.3302550315856934, + "learning_rate": 4.126999871279982e-06, + "loss": 0.2012, + "step": 424 + }, + { + "epoch": 4.12621359223301, + "grad_norm": 1.0227890014648438, + "learning_rate": 4.123136879509626e-06, + "loss": 0.2964, + "step": 425 + }, + { + "epoch": 4.135922330097087, + "grad_norm": 1.2447762489318848, + "learning_rate": 4.119267176576475e-06, + "loss": 0.2678, + "step": 426 + }, + { + "epoch": 4.145631067961165, + "grad_norm": 0.8202763795852661, + "learning_rate": 4.11539077848054e-06, + "loss": 0.3348, + "step": 427 + }, + { + "epoch": 4.155339805825243, + "grad_norm": 1.2533812522888184, + "learning_rate": 4.111507701249513e-06, + "loss": 0.2985, + "step": 428 + }, + { + "epoch": 4.165048543689321, + "grad_norm": 1.6550928354263306, + "learning_rate": 4.107617960938702e-06, + "loss": 0.1144, + "step": 429 + }, + { + "epoch": 4.174757281553398, + "grad_norm": 1.470839023590088, + "learning_rate": 4.103721573630965e-06, + "loss": 0.2227, + "step": 430 + }, + { + "epoch": 4.184466019417476, + "grad_norm": 1.1045992374420166, + "learning_rate": 4.099818555436645e-06, + "loss": 0.2515, + "step": 431 + }, + { + "epoch": 4.194174757281553, + "grad_norm": 1.691807746887207, + "learning_rate": 4.095908922493499e-06, + "loss": 0.463, + "step": 432 + }, + { + "epoch": 4.203883495145631, + "grad_norm": 0.9316760897636414, + "learning_rate": 4.091992690966636e-06, + "loss": 0.0993, + "step": 433 + }, + { + "epoch": 4.213592233009709, + "grad_norm": 1.1786795854568481, + "learning_rate": 4.088069877048447e-06, + "loss": 0.1766, + "step": 434 + }, + { + "epoch": 4.223300970873787, + "grad_norm": 0.8992094397544861, + "learning_rate": 4.084140496958539e-06, + "loss": 0.1756, + "step": 435 + }, + { + "epoch": 4.233009708737864, + "grad_norm": 0.9905827045440674, + "learning_rate": 4.080204566943668e-06, + "loss": 0.1459, + "step": 436 + }, + { + "epoch": 4.242718446601942, + "grad_norm": 0.9543220400810242, + "learning_rate": 4.076262103277673e-06, + "loss": 0.1493, + "step": 437 + }, + { + "epoch": 4.252427184466019, + "grad_norm": 0.9142147302627563, + "learning_rate": 4.072313122261406e-06, + "loss": 0.4272, + "step": 438 + }, + { + "epoch": 4.262135922330097, + "grad_norm": 0.8952410817146301, + "learning_rate": 4.068357640222668e-06, + "loss": 0.121, + "step": 439 + }, + { + "epoch": 4.271844660194175, + "grad_norm": 0.8736719489097595, + "learning_rate": 4.06439567351614e-06, + "loss": 0.1525, + "step": 440 + }, + { + "epoch": 4.281553398058253, + "grad_norm": 0.9316339492797852, + "learning_rate": 4.0604272385233105e-06, + "loss": 0.2146, + "step": 441 + }, + { + "epoch": 4.29126213592233, + "grad_norm": 1.1137328147888184, + "learning_rate": 4.056452351652418e-06, + "loss": 0.1259, + "step": 442 + }, + { + "epoch": 4.300970873786408, + "grad_norm": 1.1264760494232178, + "learning_rate": 4.052471029338375e-06, + "loss": 0.2629, + "step": 443 + }, + { + "epoch": 4.310679611650485, + "grad_norm": 0.9680051803588867, + "learning_rate": 4.048483288042703e-06, + "loss": 0.1495, + "step": 444 + }, + { + "epoch": 4.320388349514563, + "grad_norm": 1.0331041812896729, + "learning_rate": 4.0444891442534615e-06, + "loss": 0.1509, + "step": 445 + }, + { + "epoch": 4.330097087378641, + "grad_norm": 0.9411431550979614, + "learning_rate": 4.040488614485187e-06, + "loss": 0.2739, + "step": 446 + }, + { + "epoch": 4.339805825242719, + "grad_norm": 1.2953507900238037, + "learning_rate": 4.036481715278818e-06, + "loss": 0.2616, + "step": 447 + }, + { + "epoch": 4.349514563106796, + "grad_norm": 0.8018221855163574, + "learning_rate": 4.032468463201626e-06, + "loss": 0.2696, + "step": 448 + }, + { + "epoch": 4.359223300970874, + "grad_norm": 0.9439960718154907, + "learning_rate": 4.028448874847152e-06, + "loss": 0.0845, + "step": 449 + }, + { + "epoch": 4.368932038834951, + "grad_norm": 1.1227456331253052, + "learning_rate": 4.024422966835137e-06, + "loss": 0.2061, + "step": 450 + }, + { + "epoch": 4.378640776699029, + "grad_norm": 1.0585674047470093, + "learning_rate": 4.0203907558114475e-06, + "loss": 0.1585, + "step": 451 + }, + { + "epoch": 4.388349514563107, + "grad_norm": 0.9007663130760193, + "learning_rate": 4.016352258448016e-06, + "loss": 0.1067, + "step": 452 + }, + { + "epoch": 4.398058252427185, + "grad_norm": 1.3700408935546875, + "learning_rate": 4.0123074914427635e-06, + "loss": 0.148, + "step": 453 + }, + { + "epoch": 4.407766990291262, + "grad_norm": 0.8373147249221802, + "learning_rate": 4.008256471519536e-06, + "loss": 0.1144, + "step": 454 + }, + { + "epoch": 4.41747572815534, + "grad_norm": 1.0520269870758057, + "learning_rate": 4.004199215428032e-06, + "loss": 0.2349, + "step": 455 + }, + { + "epoch": 4.427184466019417, + "grad_norm": 1.3709717988967896, + "learning_rate": 4.000135739943735e-06, + "loss": 0.3069, + "step": 456 + }, + { + "epoch": 4.436893203883495, + "grad_norm": 1.0186606645584106, + "learning_rate": 3.996066061867844e-06, + "loss": 0.2285, + "step": 457 + }, + { + "epoch": 4.446601941747573, + "grad_norm": 1.0011564493179321, + "learning_rate": 3.991990198027203e-06, + "loss": 0.1904, + "step": 458 + }, + { + "epoch": 4.456310679611651, + "grad_norm": 1.174538493156433, + "learning_rate": 3.987908165274233e-06, + "loss": 0.2158, + "step": 459 + }, + { + "epoch": 4.466019417475728, + "grad_norm": 0.9717496037483215, + "learning_rate": 3.9838199804868635e-06, + "loss": 0.3015, + "step": 460 + }, + { + "epoch": 4.475728155339806, + "grad_norm": 1.2071607112884521, + "learning_rate": 3.979725660568456e-06, + "loss": 0.2892, + "step": 461 + }, + { + "epoch": 4.485436893203883, + "grad_norm": 0.8957129120826721, + "learning_rate": 3.975625222447742e-06, + "loss": 0.1432, + "step": 462 + }, + { + "epoch": 4.495145631067961, + "grad_norm": 1.402980923652649, + "learning_rate": 3.97151868307875e-06, + "loss": 0.2913, + "step": 463 + }, + { + "epoch": 4.504854368932039, + "grad_norm": 1.0455483198165894, + "learning_rate": 3.9674060594407345e-06, + "loss": 0.1864, + "step": 464 + }, + { + "epoch": 4.514563106796117, + "grad_norm": 0.9044820666313171, + "learning_rate": 3.963287368538105e-06, + "loss": 0.0945, + "step": 465 + }, + { + "epoch": 4.524271844660194, + "grad_norm": 1.2196074724197388, + "learning_rate": 3.959162627400361e-06, + "loss": 0.3229, + "step": 466 + }, + { + "epoch": 4.533980582524272, + "grad_norm": 1.430355191230774, + "learning_rate": 3.9550318530820145e-06, + "loss": 0.2709, + "step": 467 + }, + { + "epoch": 4.543689320388349, + "grad_norm": 1.0211271047592163, + "learning_rate": 3.9508950626625244e-06, + "loss": 0.1399, + "step": 468 + }, + { + "epoch": 4.553398058252427, + "grad_norm": 1.0876238346099854, + "learning_rate": 3.946752273246224e-06, + "loss": 0.2654, + "step": 469 + }, + { + "epoch": 4.563106796116505, + "grad_norm": 0.975965678691864, + "learning_rate": 3.942603501962249e-06, + "loss": 0.1921, + "step": 470 + }, + { + "epoch": 4.572815533980583, + "grad_norm": 1.0028088092803955, + "learning_rate": 3.9384487659644716e-06, + "loss": 0.1228, + "step": 471 + }, + { + "epoch": 4.58252427184466, + "grad_norm": 1.1247432231903076, + "learning_rate": 3.934288082431423e-06, + "loss": 0.2308, + "step": 472 + }, + { + "epoch": 4.592233009708738, + "grad_norm": 0.99698805809021, + "learning_rate": 3.930121468566227e-06, + "loss": 0.3171, + "step": 473 + }, + { + "epoch": 4.601941747572815, + "grad_norm": 1.0016257762908936, + "learning_rate": 3.925948941596528e-06, + "loss": 0.1961, + "step": 474 + }, + { + "epoch": 4.611650485436893, + "grad_norm": 0.7484853863716125, + "learning_rate": 3.92177051877442e-06, + "loss": 0.1746, + "step": 475 + }, + { + "epoch": 4.621359223300971, + "grad_norm": 0.9711058139801025, + "learning_rate": 3.917586217376369e-06, + "loss": 0.2399, + "step": 476 + }, + { + "epoch": 4.631067961165049, + "grad_norm": 0.9540738463401794, + "learning_rate": 3.913396054703155e-06, + "loss": 0.2647, + "step": 477 + }, + { + "epoch": 4.640776699029126, + "grad_norm": 0.9963274002075195, + "learning_rate": 3.909200048079786e-06, + "loss": 0.2405, + "step": 478 + }, + { + "epoch": 4.650485436893204, + "grad_norm": 1.0638726949691772, + "learning_rate": 3.9049982148554384e-06, + "loss": 0.3632, + "step": 479 + }, + { + "epoch": 4.660194174757281, + "grad_norm": 1.0812081098556519, + "learning_rate": 3.900790572403376e-06, + "loss": 0.2, + "step": 480 + }, + { + "epoch": 4.669902912621359, + "grad_norm": 0.8857947587966919, + "learning_rate": 3.896577138120881e-06, + "loss": 0.2758, + "step": 481 + }, + { + "epoch": 4.679611650485437, + "grad_norm": 1.2224204540252686, + "learning_rate": 3.892357929429187e-06, + "loss": 0.1827, + "step": 482 + }, + { + "epoch": 4.689320388349515, + "grad_norm": 1.1760519742965698, + "learning_rate": 3.8881329637734e-06, + "loss": 0.1317, + "step": 483 + }, + { + "epoch": 4.699029126213592, + "grad_norm": 0.9335840344429016, + "learning_rate": 3.883902258622431e-06, + "loss": 0.2218, + "step": 484 + }, + { + "epoch": 4.70873786407767, + "grad_norm": 1.1263084411621094, + "learning_rate": 3.8796658314689205e-06, + "loss": 0.2378, + "step": 485 + }, + { + "epoch": 4.718446601941747, + "grad_norm": 1.0587648153305054, + "learning_rate": 3.875423699829168e-06, + "loss": 0.1328, + "step": 486 + }, + { + "epoch": 4.728155339805825, + "grad_norm": 0.9542792439460754, + "learning_rate": 3.871175881243061e-06, + "loss": 0.1703, + "step": 487 + }, + { + "epoch": 4.737864077669903, + "grad_norm": 0.9771331548690796, + "learning_rate": 3.866922393273999e-06, + "loss": 0.3161, + "step": 488 + }, + { + "epoch": 4.747572815533981, + "grad_norm": 1.167136788368225, + "learning_rate": 3.862663253508822e-06, + "loss": 0.1772, + "step": 489 + }, + { + "epoch": 4.757281553398058, + "grad_norm": 0.9686987400054932, + "learning_rate": 3.858398479557739e-06, + "loss": 0.1755, + "step": 490 + }, + { + "epoch": 4.766990291262136, + "grad_norm": 1.1323494911193848, + "learning_rate": 3.8541280890542565e-06, + "loss": 0.2034, + "step": 491 + }, + { + "epoch": 4.776699029126213, + "grad_norm": 1.456445574760437, + "learning_rate": 3.849852099655102e-06, + "loss": 0.3151, + "step": 492 + }, + { + "epoch": 4.786407766990291, + "grad_norm": 0.9582480788230896, + "learning_rate": 3.845570529040151e-06, + "loss": 0.1992, + "step": 493 + }, + { + "epoch": 4.796116504854369, + "grad_norm": 0.9113503098487854, + "learning_rate": 3.841283394912361e-06, + "loss": 0.1958, + "step": 494 + }, + { + "epoch": 4.805825242718447, + "grad_norm": 1.0523179769515991, + "learning_rate": 3.836990714997686e-06, + "loss": 0.2865, + "step": 495 + }, + { + "epoch": 4.815533980582524, + "grad_norm": 1.2636815309524536, + "learning_rate": 3.832692507045015e-06, + "loss": 0.2601, + "step": 496 + }, + { + "epoch": 4.825242718446602, + "grad_norm": 0.8661506175994873, + "learning_rate": 3.828388788826091e-06, + "loss": 0.312, + "step": 497 + }, + { + "epoch": 4.834951456310679, + "grad_norm": 1.0353248119354248, + "learning_rate": 3.824079578135442e-06, + "loss": 0.2472, + "step": 498 + }, + { + "epoch": 4.844660194174757, + "grad_norm": 1.523149847984314, + "learning_rate": 3.819764892790307e-06, + "loss": 0.2525, + "step": 499 + }, + { + "epoch": 4.854368932038835, + "grad_norm": 1.2362757921218872, + "learning_rate": 3.815444750630555e-06, + "loss": 0.2148, + "step": 500 + }, + { + "epoch": 4.864077669902913, + "grad_norm": 1.7284575700759888, + "learning_rate": 3.811119169518624e-06, + "loss": 0.3737, + "step": 501 + }, + { + "epoch": 4.87378640776699, + "grad_norm": 1.220207929611206, + "learning_rate": 3.8067881673394363e-06, + "loss": 0.4219, + "step": 502 + }, + { + "epoch": 4.883495145631068, + "grad_norm": 1.0464245080947876, + "learning_rate": 3.802451762000331e-06, + "loss": 0.1622, + "step": 503 + }, + { + "epoch": 4.893203883495145, + "grad_norm": 1.5916672945022583, + "learning_rate": 3.7981099714309856e-06, + "loss": 0.149, + "step": 504 + }, + { + "epoch": 4.902912621359223, + "grad_norm": 1.0251778364181519, + "learning_rate": 3.7937628135833453e-06, + "loss": 0.2037, + "step": 505 + }, + { + "epoch": 4.9126213592233015, + "grad_norm": 1.2007070779800415, + "learning_rate": 3.7894103064315463e-06, + "loss": 0.197, + "step": 506 + }, + { + "epoch": 4.922330097087379, + "grad_norm": 1.1666109561920166, + "learning_rate": 3.7850524679718424e-06, + "loss": 0.155, + "step": 507 + }, + { + "epoch": 4.932038834951456, + "grad_norm": 0.9731155633926392, + "learning_rate": 3.7806893162225328e-06, + "loss": 0.1791, + "step": 508 + }, + { + "epoch": 4.941747572815534, + "grad_norm": 1.1099647283554077, + "learning_rate": 3.7763208692238818e-06, + "loss": 0.2402, + "step": 509 + }, + { + "epoch": 4.951456310679612, + "grad_norm": 1.1254994869232178, + "learning_rate": 3.7719471450380518e-06, + "loss": 0.1703, + "step": 510 + }, + { + "epoch": 4.961165048543689, + "grad_norm": 1.1321799755096436, + "learning_rate": 3.7675681617490212e-06, + "loss": 0.3277, + "step": 511 + }, + { + "epoch": 4.970873786407767, + "grad_norm": 0.9587062001228333, + "learning_rate": 3.7631839374625167e-06, + "loss": 0.3316, + "step": 512 + }, + { + "epoch": 4.980582524271845, + "grad_norm": 0.9437461495399475, + "learning_rate": 3.758794490305932e-06, + "loss": 0.166, + "step": 513 + }, + { + "epoch": 4.990291262135923, + "grad_norm": 1.0000038146972656, + "learning_rate": 3.7543998384282565e-06, + "loss": 0.2137, + "step": 514 + }, + { + "epoch": 5.0, + "grad_norm": 0.8058333396911621, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.2744, + "step": 515 + }, + { + "epoch": 5.009708737864078, + "grad_norm": 1.1363271474838257, + "learning_rate": 3.745594993213118e-06, + "loss": 0.2244, + "step": 516 + }, + { + "epoch": 5.019417475728155, + "grad_norm": 1.0915714502334595, + "learning_rate": 3.7411848362809324e-06, + "loss": 0.3713, + "step": 517 + }, + { + "epoch": 5.029126213592233, + "grad_norm": 1.0293701887130737, + "learning_rate": 3.7367695474380623e-06, + "loss": 0.2063, + "step": 518 + }, + { + "epoch": 5.038834951456311, + "grad_norm": 1.7663441896438599, + "learning_rate": 3.7323491449403444e-06, + "loss": 0.1835, + "step": 519 + }, + { + "epoch": 5.048543689320389, + "grad_norm": 1.2588750123977661, + "learning_rate": 3.7279236470647593e-06, + "loss": 0.0947, + "step": 520 + }, + { + "epoch": 5.058252427184466, + "grad_norm": 1.9125629663467407, + "learning_rate": 3.723493072109355e-06, + "loss": 0.3472, + "step": 521 + }, + { + "epoch": 5.067961165048544, + "grad_norm": 1.6187901496887207, + "learning_rate": 3.719057438393172e-06, + "loss": 0.1696, + "step": 522 + }, + { + "epoch": 5.077669902912621, + "grad_norm": 1.0210967063903809, + "learning_rate": 3.714616764256166e-06, + "loss": 0.2502, + "step": 523 + }, + { + "epoch": 5.087378640776699, + "grad_norm": 1.0009822845458984, + "learning_rate": 3.7101710680591353e-06, + "loss": 0.1797, + "step": 524 + }, + { + "epoch": 5.097087378640777, + "grad_norm": 1.2591980695724487, + "learning_rate": 3.7057203681836407e-06, + "loss": 0.2277, + "step": 525 + }, + { + "epoch": 5.106796116504855, + "grad_norm": 1.310861349105835, + "learning_rate": 3.701264683031934e-06, + "loss": 0.2296, + "step": 526 + }, + { + "epoch": 5.116504854368932, + "grad_norm": 1.2342560291290283, + "learning_rate": 3.6968040310268766e-06, + "loss": 0.1746, + "step": 527 + }, + { + "epoch": 5.12621359223301, + "grad_norm": 1.3253822326660156, + "learning_rate": 3.692338430611869e-06, + "loss": 0.2296, + "step": 528 + }, + { + "epoch": 5.135922330097087, + "grad_norm": 1.0516308546066284, + "learning_rate": 3.687867900250771e-06, + "loss": 0.1245, + "step": 529 + }, + { + "epoch": 5.145631067961165, + "grad_norm": 1.3177831172943115, + "learning_rate": 3.683392458427825e-06, + "loss": 0.1418, + "step": 530 + }, + { + "epoch": 5.155339805825243, + "grad_norm": 1.109287142753601, + "learning_rate": 3.6789121236475818e-06, + "loss": 0.2233, + "step": 531 + }, + { + "epoch": 5.165048543689321, + "grad_norm": 1.1260926723480225, + "learning_rate": 3.674426914434824e-06, + "loss": 0.2019, + "step": 532 + }, + { + "epoch": 5.174757281553398, + "grad_norm": 1.3299827575683594, + "learning_rate": 3.6699368493344856e-06, + "loss": 0.2099, + "step": 533 + }, + { + "epoch": 5.184466019417476, + "grad_norm": 0.9847913980484009, + "learning_rate": 3.665441946911582e-06, + "loss": 0.0921, + "step": 534 + }, + { + "epoch": 5.194174757281553, + "grad_norm": 1.4914610385894775, + "learning_rate": 3.660942225751126e-06, + "loss": 0.1286, + "step": 535 + }, + { + "epoch": 5.203883495145631, + "grad_norm": 1.0563063621520996, + "learning_rate": 3.6564377044580558e-06, + "loss": 0.1488, + "step": 536 + }, + { + "epoch": 5.213592233009709, + "grad_norm": 1.1781753301620483, + "learning_rate": 3.6519284016571567e-06, + "loss": 0.1075, + "step": 537 + }, + { + "epoch": 5.223300970873787, + "grad_norm": 1.2290960550308228, + "learning_rate": 3.647414335992985e-06, + "loss": 0.2092, + "step": 538 + }, + { + "epoch": 5.233009708737864, + "grad_norm": 1.2922903299331665, + "learning_rate": 3.642895526129787e-06, + "loss": 0.216, + "step": 539 + }, + { + "epoch": 5.242718446601942, + "grad_norm": 1.2318174839019775, + "learning_rate": 3.638371990751428e-06, + "loss": 0.1835, + "step": 540 + }, + { + "epoch": 5.252427184466019, + "grad_norm": 1.2748051881790161, + "learning_rate": 3.63384374856131e-06, + "loss": 0.1683, + "step": 541 + }, + { + "epoch": 5.262135922330097, + "grad_norm": 1.7720893621444702, + "learning_rate": 3.629310818282297e-06, + "loss": 0.3087, + "step": 542 + }, + { + "epoch": 5.271844660194175, + "grad_norm": 0.8520978093147278, + "learning_rate": 3.6247732186566365e-06, + "loss": 0.1854, + "step": 543 + }, + { + "epoch": 5.281553398058253, + "grad_norm": 0.9303215742111206, + "learning_rate": 3.6202309684458813e-06, + "loss": 0.117, + "step": 544 + }, + { + "epoch": 5.29126213592233, + "grad_norm": 1.3285412788391113, + "learning_rate": 3.615684086430815e-06, + "loss": 0.1257, + "step": 545 + }, + { + "epoch": 5.300970873786408, + "grad_norm": 1.0170793533325195, + "learning_rate": 3.61113259141137e-06, + "loss": 0.1576, + "step": 546 + }, + { + "epoch": 5.310679611650485, + "grad_norm": 0.8529905080795288, + "learning_rate": 3.606576502206554e-06, + "loss": 0.0613, + "step": 547 + }, + { + "epoch": 5.320388349514563, + "grad_norm": 0.7129078507423401, + "learning_rate": 3.602015837654369e-06, + "loss": 0.0713, + "step": 548 + }, + { + "epoch": 5.330097087378641, + "grad_norm": 0.9281150102615356, + "learning_rate": 3.5974506166117355e-06, + "loss": 0.0702, + "step": 549 + }, + { + "epoch": 5.339805825242719, + "grad_norm": 1.1607550382614136, + "learning_rate": 3.592880857954413e-06, + "loss": 0.2438, + "step": 550 + }, + { + "epoch": 5.349514563106796, + "grad_norm": 2.1823811531066895, + "learning_rate": 3.588306580576922e-06, + "loss": 0.0838, + "step": 551 + }, + { + "epoch": 5.359223300970874, + "grad_norm": 0.9428465366363525, + "learning_rate": 3.583727803392468e-06, + "loss": 0.0444, + "step": 552 + }, + { + "epoch": 5.368932038834951, + "grad_norm": 0.8492387533187866, + "learning_rate": 3.57914454533286e-06, + "loss": 0.1829, + "step": 553 + }, + { + "epoch": 5.378640776699029, + "grad_norm": 1.1316030025482178, + "learning_rate": 3.5745568253484363e-06, + "loss": 0.1218, + "step": 554 + }, + { + "epoch": 5.388349514563107, + "grad_norm": 0.8767411708831787, + "learning_rate": 3.5699646624079824e-06, + "loss": 0.1396, + "step": 555 + }, + { + "epoch": 5.398058252427185, + "grad_norm": 1.06037437915802, + "learning_rate": 3.5653680754986543e-06, + "loss": 0.105, + "step": 556 + }, + { + "epoch": 5.407766990291262, + "grad_norm": 1.111646056175232, + "learning_rate": 3.560767083625899e-06, + "loss": 0.3099, + "step": 557 + }, + { + "epoch": 5.41747572815534, + "grad_norm": 1.120338797569275, + "learning_rate": 3.556161705813378e-06, + "loss": 0.1221, + "step": 558 + }, + { + "epoch": 5.427184466019417, + "grad_norm": 1.3789657354354858, + "learning_rate": 3.5515519611028863e-06, + "loss": 0.2341, + "step": 559 + }, + { + "epoch": 5.436893203883495, + "grad_norm": 0.9921037554740906, + "learning_rate": 3.5469378685542742e-06, + "loss": 0.2036, + "step": 560 + }, + { + "epoch": 5.446601941747573, + "grad_norm": 1.2706873416900635, + "learning_rate": 3.542319447245372e-06, + "loss": 0.3167, + "step": 561 + }, + { + "epoch": 5.456310679611651, + "grad_norm": 1.579001784324646, + "learning_rate": 3.537696716271904e-06, + "loss": 0.2752, + "step": 562 + }, + { + "epoch": 5.466019417475728, + "grad_norm": 1.1241706609725952, + "learning_rate": 3.533069694747415e-06, + "loss": 0.1022, + "step": 563 + }, + { + "epoch": 5.475728155339806, + "grad_norm": 0.8345101475715637, + "learning_rate": 3.528438401803192e-06, + "loss": 0.1361, + "step": 564 + }, + { + "epoch": 5.485436893203883, + "grad_norm": 0.9768244028091431, + "learning_rate": 3.52380285658818e-06, + "loss": 0.1032, + "step": 565 + }, + { + "epoch": 5.495145631067961, + "grad_norm": 1.464938998222351, + "learning_rate": 3.5191630782689074e-06, + "loss": 0.1876, + "step": 566 + }, + { + "epoch": 5.504854368932039, + "grad_norm": 0.9765818119049072, + "learning_rate": 3.5145190860294043e-06, + "loss": 0.1057, + "step": 567 + }, + { + "epoch": 5.514563106796117, + "grad_norm": 0.9651658535003662, + "learning_rate": 3.5098708990711254e-06, + "loss": 0.0653, + "step": 568 + }, + { + "epoch": 5.524271844660194, + "grad_norm": 1.2055721282958984, + "learning_rate": 3.505218536612869e-06, + "loss": 0.1895, + "step": 569 + }, + { + "epoch": 5.533980582524272, + "grad_norm": 1.0697849988937378, + "learning_rate": 3.500562017890695e-06, + "loss": 0.0845, + "step": 570 + }, + { + "epoch": 5.543689320388349, + "grad_norm": 0.6971541047096252, + "learning_rate": 3.495901362157853e-06, + "loss": 0.0609, + "step": 571 + }, + { + "epoch": 5.553398058252427, + "grad_norm": 1.18365478515625, + "learning_rate": 3.4912365886846934e-06, + "loss": 0.3142, + "step": 572 + }, + { + "epoch": 5.563106796116505, + "grad_norm": 1.1030703783035278, + "learning_rate": 3.4865677167585942e-06, + "loss": 0.1735, + "step": 573 + }, + { + "epoch": 5.572815533980583, + "grad_norm": 1.0149526596069336, + "learning_rate": 3.4818947656838796e-06, + "loss": 0.1038, + "step": 574 + }, + { + "epoch": 5.58252427184466, + "grad_norm": 0.9888204336166382, + "learning_rate": 3.4772177547817387e-06, + "loss": 0.0588, + "step": 575 + }, + { + "epoch": 5.592233009708738, + "grad_norm": 1.2715967893600464, + "learning_rate": 3.472536703390148e-06, + "loss": 0.108, + "step": 576 + }, + { + "epoch": 5.601941747572815, + "grad_norm": 0.8117223381996155, + "learning_rate": 3.467851630863789e-06, + "loss": 0.0809, + "step": 577 + }, + { + "epoch": 5.611650485436893, + "grad_norm": 0.9409249424934387, + "learning_rate": 3.463162556573969e-06, + "loss": 0.2129, + "step": 578 + }, + { + "epoch": 5.621359223300971, + "grad_norm": 1.2371786832809448, + "learning_rate": 3.4584694999085424e-06, + "loss": 0.2876, + "step": 579 + }, + { + "epoch": 5.631067961165049, + "grad_norm": 0.9954182505607605, + "learning_rate": 3.4537724802718294e-06, + "loss": 0.1089, + "step": 580 + }, + { + "epoch": 5.640776699029126, + "grad_norm": 0.8118783831596375, + "learning_rate": 3.4490715170845356e-06, + "loss": 0.1262, + "step": 581 + }, + { + "epoch": 5.650485436893204, + "grad_norm": 0.7283351421356201, + "learning_rate": 3.4443666297836715e-06, + "loss": 0.1363, + "step": 582 + }, + { + "epoch": 5.660194174757281, + "grad_norm": 1.0266672372817993, + "learning_rate": 3.4396578378224734e-06, + "loss": 0.1708, + "step": 583 + }, + { + "epoch": 5.669902912621359, + "grad_norm": 0.8701723217964172, + "learning_rate": 3.4349451606703214e-06, + "loss": 0.1945, + "step": 584 + }, + { + "epoch": 5.679611650485437, + "grad_norm": 1.1535297632217407, + "learning_rate": 3.430228617812661e-06, + "loss": 0.1021, + "step": 585 + }, + { + "epoch": 5.689320388349515, + "grad_norm": 0.7845752239227295, + "learning_rate": 3.4255082287509183e-06, + "loss": 0.0567, + "step": 586 + }, + { + "epoch": 5.699029126213592, + "grad_norm": 1.0981193780899048, + "learning_rate": 3.420784013002426e-06, + "loss": 0.1126, + "step": 587 + }, + { + "epoch": 5.70873786407767, + "grad_norm": 1.113115906715393, + "learning_rate": 3.416055990100336e-06, + "loss": 0.0997, + "step": 588 + }, + { + "epoch": 5.718446601941747, + "grad_norm": 1.045980453491211, + "learning_rate": 3.4113241795935427e-06, + "loss": 0.1004, + "step": 589 + }, + { + "epoch": 5.728155339805825, + "grad_norm": 0.9982129335403442, + "learning_rate": 3.4065886010466014e-06, + "loss": 0.1795, + "step": 590 + }, + { + "epoch": 5.737864077669903, + "grad_norm": 1.106252670288086, + "learning_rate": 3.401849274039647e-06, + "loss": 0.1923, + "step": 591 + }, + { + "epoch": 5.747572815533981, + "grad_norm": 0.9183070063591003, + "learning_rate": 3.3971062181683117e-06, + "loss": 0.1105, + "step": 592 + }, + { + "epoch": 5.757281553398058, + "grad_norm": 1.006274938583374, + "learning_rate": 3.3923594530436477e-06, + "loss": 0.198, + "step": 593 + }, + { + "epoch": 5.766990291262136, + "grad_norm": 1.1224919557571411, + "learning_rate": 3.387608998292041e-06, + "loss": 0.1351, + "step": 594 + }, + { + "epoch": 5.776699029126213, + "grad_norm": 0.8881707191467285, + "learning_rate": 3.382854873555137e-06, + "loss": 0.0772, + "step": 595 + }, + { + "epoch": 5.786407766990291, + "grad_norm": 1.3855921030044556, + "learning_rate": 3.3780970984897504e-06, + "loss": 0.0927, + "step": 596 + }, + { + "epoch": 5.796116504854369, + "grad_norm": 1.5147404670715332, + "learning_rate": 3.373335692767793e-06, + "loss": 0.2161, + "step": 597 + }, + { + "epoch": 5.805825242718447, + "grad_norm": 1.1337159872055054, + "learning_rate": 3.3685706760761865e-06, + "loss": 0.1655, + "step": 598 + }, + { + "epoch": 5.815533980582524, + "grad_norm": 1.1046392917633057, + "learning_rate": 3.3638020681167827e-06, + "loss": 0.1096, + "step": 599 + }, + { + "epoch": 5.825242718446602, + "grad_norm": 0.7068084478378296, + "learning_rate": 3.3590298886062833e-06, + "loss": 0.1372, + "step": 600 + }, + { + "epoch": 5.834951456310679, + "grad_norm": 1.148061990737915, + "learning_rate": 3.354254157276155e-06, + "loss": 0.2396, + "step": 601 + }, + { + "epoch": 5.844660194174757, + "grad_norm": 0.9790175557136536, + "learning_rate": 3.3494748938725525e-06, + "loss": 0.1436, + "step": 602 + }, + { + "epoch": 5.854368932038835, + "grad_norm": 1.1377002000808716, + "learning_rate": 3.3446921181562326e-06, + "loss": 0.1633, + "step": 603 + }, + { + "epoch": 5.864077669902913, + "grad_norm": 0.8733611702919006, + "learning_rate": 3.3399058499024767e-06, + "loss": 0.0667, + "step": 604 + }, + { + "epoch": 5.87378640776699, + "grad_norm": 1.4806839227676392, + "learning_rate": 3.3351161089010055e-06, + "loss": 0.186, + "step": 605 + }, + { + "epoch": 5.883495145631068, + "grad_norm": 1.0615733861923218, + "learning_rate": 3.330322914955897e-06, + "loss": 0.1802, + "step": 606 + }, + { + "epoch": 5.893203883495145, + "grad_norm": 1.1709858179092407, + "learning_rate": 3.325526287885509e-06, + "loss": 0.1916, + "step": 607 + }, + { + "epoch": 5.902912621359223, + "grad_norm": 1.0924482345581055, + "learning_rate": 3.3207262475223913e-06, + "loss": 0.1862, + "step": 608 + }, + { + "epoch": 5.9126213592233015, + "grad_norm": 1.733832836151123, + "learning_rate": 3.315922813713209e-06, + "loss": 0.3237, + "step": 609 + }, + { + "epoch": 5.922330097087379, + "grad_norm": 0.9331390857696533, + "learning_rate": 3.3111160063186553e-06, + "loss": 0.1006, + "step": 610 + }, + { + "epoch": 5.932038834951456, + "grad_norm": 1.0888906717300415, + "learning_rate": 3.3063058452133756e-06, + "loss": 0.2272, + "step": 611 + }, + { + "epoch": 5.941747572815534, + "grad_norm": 0.9720425605773926, + "learning_rate": 3.301492350285879e-06, + "loss": 0.1531, + "step": 612 + }, + { + "epoch": 5.951456310679612, + "grad_norm": 1.0783334970474243, + "learning_rate": 3.296675541438461e-06, + "loss": 0.0999, + "step": 613 + }, + { + "epoch": 5.961165048543689, + "grad_norm": 1.107725739479065, + "learning_rate": 3.2918554385871163e-06, + "loss": 0.2651, + "step": 614 + }, + { + "epoch": 5.970873786407767, + "grad_norm": 0.7212538719177246, + "learning_rate": 3.2870320616614626e-06, + "loss": 0.0738, + "step": 615 + }, + { + "epoch": 5.980582524271845, + "grad_norm": 0.9390976428985596, + "learning_rate": 3.282205430604653e-06, + "loss": 0.1179, + "step": 616 + }, + { + "epoch": 5.990291262135923, + "grad_norm": 1.0487676858901978, + "learning_rate": 3.2773755653732954e-06, + "loss": 0.1506, + "step": 617 + }, + { + "epoch": 6.0, + "grad_norm": 0.952416718006134, + "learning_rate": 3.272542485937369e-06, + "loss": 0.1571, + "step": 618 + }, + { + "epoch": 6.009708737864078, + "grad_norm": 1.1496444940567017, + "learning_rate": 3.267706212280146e-06, + "loss": 0.0913, + "step": 619 + }, + { + "epoch": 6.019417475728155, + "grad_norm": 1.0084459781646729, + "learning_rate": 3.2628667643981036e-06, + "loss": 0.0861, + "step": 620 + }, + { + "epoch": 6.029126213592233, + "grad_norm": 1.3622573614120483, + "learning_rate": 3.2580241623008426e-06, + "loss": 0.1241, + "step": 621 + }, + { + "epoch": 6.038834951456311, + "grad_norm": 1.3345288038253784, + "learning_rate": 3.2531784260110067e-06, + "loss": 0.173, + "step": 622 + }, + { + "epoch": 6.048543689320389, + "grad_norm": 1.3314571380615234, + "learning_rate": 3.2483295755641986e-06, + "loss": 0.1099, + "step": 623 + }, + { + "epoch": 6.058252427184466, + "grad_norm": 1.4012583494186401, + "learning_rate": 3.243477631008897e-06, + "loss": 0.1808, + "step": 624 + }, + { + "epoch": 6.067961165048544, + "grad_norm": 1.1909350156784058, + "learning_rate": 3.238622612406373e-06, + "loss": 0.0395, + "step": 625 + }, + { + "epoch": 6.077669902912621, + "grad_norm": 1.5543804168701172, + "learning_rate": 3.233764539830608e-06, + "loss": 0.1112, + "step": 626 + }, + { + "epoch": 6.087378640776699, + "grad_norm": 0.7781279683113098, + "learning_rate": 3.228903433368212e-06, + "loss": 0.0314, + "step": 627 + }, + { + "epoch": 6.097087378640777, + "grad_norm": 0.5457009077072144, + "learning_rate": 3.224039313118338e-06, + "loss": 0.0346, + "step": 628 + }, + { + "epoch": 6.106796116504855, + "grad_norm": 0.9449105262756348, + "learning_rate": 3.2191721991925993e-06, + "loss": 0.0701, + "step": 629 + }, + { + "epoch": 6.116504854368932, + "grad_norm": 1.0498558282852173, + "learning_rate": 3.21430211171499e-06, + "loss": 0.1421, + "step": 630 + }, + { + "epoch": 6.12621359223301, + "grad_norm": 1.1856399774551392, + "learning_rate": 3.209429070821795e-06, + "loss": 0.2414, + "step": 631 + }, + { + "epoch": 6.135922330097087, + "grad_norm": 1.2752889394760132, + "learning_rate": 3.2045530966615136e-06, + "loss": 0.1537, + "step": 632 + }, + { + "epoch": 6.145631067961165, + "grad_norm": 0.9089634418487549, + "learning_rate": 3.1996742093947724e-06, + "loss": 0.1174, + "step": 633 + }, + { + "epoch": 6.155339805825243, + "grad_norm": 48.404693603515625, + "learning_rate": 3.1947924291942423e-06, + "loss": 0.2074, + "step": 634 + }, + { + "epoch": 6.165048543689321, + "grad_norm": 1.4127225875854492, + "learning_rate": 3.189907776244556e-06, + "loss": 0.0639, + "step": 635 + }, + { + "epoch": 6.174757281553398, + "grad_norm": 1.9454190731048584, + "learning_rate": 3.185020270742225e-06, + "loss": 0.145, + "step": 636 + }, + { + "epoch": 6.184466019417476, + "grad_norm": 0.8144808411598206, + "learning_rate": 3.180129932895553e-06, + "loss": 0.0605, + "step": 637 + }, + { + "epoch": 6.194174757281553, + "grad_norm": 1.178910732269287, + "learning_rate": 3.1752367829245563e-06, + "loss": 0.2024, + "step": 638 + }, + { + "epoch": 6.203883495145631, + "grad_norm": 1.5138559341430664, + "learning_rate": 3.1703408410608777e-06, + "loss": 0.2836, + "step": 639 + }, + { + "epoch": 6.213592233009709, + "grad_norm": 1.081778883934021, + "learning_rate": 3.1654421275477045e-06, + "loss": 0.0874, + "step": 640 + }, + { + "epoch": 6.223300970873787, + "grad_norm": 1.0488624572753906, + "learning_rate": 3.1605406626396826e-06, + "loss": 0.081, + "step": 641 + }, + { + "epoch": 6.233009708737864, + "grad_norm": 1.043975591659546, + "learning_rate": 3.155636466602836e-06, + "loss": 0.1343, + "step": 642 + }, + { + "epoch": 6.242718446601942, + "grad_norm": 1.2351950407028198, + "learning_rate": 3.150729559714478e-06, + "loss": 0.1853, + "step": 643 + }, + { + "epoch": 6.252427184466019, + "grad_norm": 1.028315544128418, + "learning_rate": 3.145819962263134e-06, + "loss": 0.1328, + "step": 644 + }, + { + "epoch": 6.262135922330097, + "grad_norm": 0.966011643409729, + "learning_rate": 3.1409076945484513e-06, + "loss": 0.0671, + "step": 645 + }, + { + "epoch": 6.271844660194175, + "grad_norm": 1.0541956424713135, + "learning_rate": 3.135992776881119e-06, + "loss": 0.0451, + "step": 646 + }, + { + "epoch": 6.281553398058253, + "grad_norm": 1.1905815601348877, + "learning_rate": 3.1310752295827818e-06, + "loss": 0.0375, + "step": 647 + }, + { + "epoch": 6.29126213592233, + "grad_norm": 1.232605218887329, + "learning_rate": 3.1261550729859602e-06, + "loss": 0.1158, + "step": 648 + }, + { + "epoch": 6.300970873786408, + "grad_norm": 1.013516902923584, + "learning_rate": 3.12123232743396e-06, + "loss": 0.0596, + "step": 649 + }, + { + "epoch": 6.310679611650485, + "grad_norm": 1.258267879486084, + "learning_rate": 3.116307013280793e-06, + "loss": 0.2079, + "step": 650 + }, + { + "epoch": 6.320388349514563, + "grad_norm": 0.8476516008377075, + "learning_rate": 3.1113791508910913e-06, + "loss": 0.1106, + "step": 651 + }, + { + "epoch": 6.330097087378641, + "grad_norm": 1.2680771350860596, + "learning_rate": 3.106448760640022e-06, + "loss": 0.1778, + "step": 652 + }, + { + "epoch": 6.339805825242719, + "grad_norm": 0.8958115577697754, + "learning_rate": 3.1015158629132066e-06, + "loss": 0.0379, + "step": 653 + }, + { + "epoch": 6.349514563106796, + "grad_norm": 1.1448755264282227, + "learning_rate": 3.096580478106631e-06, + "loss": 0.1273, + "step": 654 + }, + { + "epoch": 6.359223300970874, + "grad_norm": 0.9504626393318176, + "learning_rate": 3.0916426266265676e-06, + "loss": 0.077, + "step": 655 + }, + { + "epoch": 6.368932038834951, + "grad_norm": 1.1011178493499756, + "learning_rate": 3.086702328889486e-06, + "loss": 0.0519, + "step": 656 + }, + { + "epoch": 6.378640776699029, + "grad_norm": 0.9548404216766357, + "learning_rate": 3.0817596053219697e-06, + "loss": 0.0942, + "step": 657 + }, + { + "epoch": 6.388349514563107, + "grad_norm": 0.8062845468521118, + "learning_rate": 3.076814476360634e-06, + "loss": 0.0985, + "step": 658 + }, + { + "epoch": 6.398058252427185, + "grad_norm": 0.608669102191925, + "learning_rate": 3.071866962452038e-06, + "loss": 0.0501, + "step": 659 + }, + { + "epoch": 6.407766990291262, + "grad_norm": 1.308223009109497, + "learning_rate": 3.066917084052603e-06, + "loss": 0.1855, + "step": 660 + }, + { + "epoch": 6.41747572815534, + "grad_norm": 1.656266450881958, + "learning_rate": 3.061964861628527e-06, + "loss": 0.2386, + "step": 661 + }, + { + "epoch": 6.427184466019417, + "grad_norm": 0.7621467709541321, + "learning_rate": 3.057010315655698e-06, + "loss": 0.0692, + "step": 662 + }, + { + "epoch": 6.436893203883495, + "grad_norm": 1.2241672277450562, + "learning_rate": 3.0520534666196134e-06, + "loss": 0.1118, + "step": 663 + }, + { + "epoch": 6.446601941747573, + "grad_norm": 0.9063080549240112, + "learning_rate": 3.0470943350152914e-06, + "loss": 0.1127, + "step": 664 + }, + { + "epoch": 6.456310679611651, + "grad_norm": 0.880898654460907, + "learning_rate": 3.042132941347189e-06, + "loss": 0.189, + "step": 665 + }, + { + "epoch": 6.466019417475728, + "grad_norm": 0.8484269380569458, + "learning_rate": 3.037169306129115e-06, + "loss": 0.0735, + "step": 666 + }, + { + "epoch": 6.475728155339806, + "grad_norm": 0.9555432200431824, + "learning_rate": 3.0322034498841475e-06, + "loss": 0.122, + "step": 667 + }, + { + "epoch": 6.485436893203883, + "grad_norm": 0.981014609336853, + "learning_rate": 3.027235393144547e-06, + "loss": 0.125, + "step": 668 + }, + { + "epoch": 6.495145631067961, + "grad_norm": 1.0492215156555176, + "learning_rate": 3.0222651564516715e-06, + "loss": 0.1992, + "step": 669 + }, + { + "epoch": 6.504854368932039, + "grad_norm": 1.2064902782440186, + "learning_rate": 3.017292760355896e-06, + "loss": 0.0663, + "step": 670 + }, + { + "epoch": 6.514563106796117, + "grad_norm": 1.1765069961547852, + "learning_rate": 3.0123182254165194e-06, + "loss": 0.0401, + "step": 671 + }, + { + "epoch": 6.524271844660194, + "grad_norm": 1.5717885494232178, + "learning_rate": 3.0073415722016875e-06, + "loss": 0.2454, + "step": 672 + }, + { + "epoch": 6.533980582524272, + "grad_norm": 1.1897467374801636, + "learning_rate": 3.002362821288302e-06, + "loss": 0.061, + "step": 673 + }, + { + "epoch": 6.543689320388349, + "grad_norm": 1.0062282085418701, + "learning_rate": 2.9973819932619404e-06, + "loss": 0.1018, + "step": 674 + }, + { + "epoch": 6.553398058252427, + "grad_norm": 1.0144904851913452, + "learning_rate": 2.9923991087167657e-06, + "loss": 0.1027, + "step": 675 + }, + { + "epoch": 6.563106796116505, + "grad_norm": 0.9750237464904785, + "learning_rate": 2.987414188255446e-06, + "loss": 0.1482, + "step": 676 + }, + { + "epoch": 6.572815533980583, + "grad_norm": 1.158136010169983, + "learning_rate": 2.9824272524890664e-06, + "loss": 0.1345, + "step": 677 + }, + { + "epoch": 6.58252427184466, + "grad_norm": 1.1052918434143066, + "learning_rate": 2.977438322037046e-06, + "loss": 0.0791, + "step": 678 + }, + { + "epoch": 6.592233009708738, + "grad_norm": 0.8905234932899475, + "learning_rate": 2.9724474175270485e-06, + "loss": 0.0684, + "step": 679 + }, + { + "epoch": 6.601941747572815, + "grad_norm": 1.0031498670578003, + "learning_rate": 2.967454559594903e-06, + "loss": 0.0628, + "step": 680 + }, + { + "epoch": 6.611650485436893, + "grad_norm": 2.124293327331543, + "learning_rate": 2.9624597688845126e-06, + "loss": 0.1071, + "step": 681 + }, + { + "epoch": 6.621359223300971, + "grad_norm": 1.101279854774475, + "learning_rate": 2.957463066047773e-06, + "loss": 0.0892, + "step": 682 + }, + { + "epoch": 6.631067961165049, + "grad_norm": 0.6379441618919373, + "learning_rate": 2.9524644717444866e-06, + "loss": 0.0469, + "step": 683 + }, + { + "epoch": 6.640776699029126, + "grad_norm": 1.4411451816558838, + "learning_rate": 2.9474640066422757e-06, + "loss": 0.2329, + "step": 684 + }, + { + "epoch": 6.650485436893204, + "grad_norm": 1.02129065990448, + "learning_rate": 2.9424616914164982e-06, + "loss": 0.0619, + "step": 685 + }, + { + "epoch": 6.660194174757281, + "grad_norm": 0.7449476718902588, + "learning_rate": 2.9374575467501605e-06, + "loss": 0.063, + "step": 686 + }, + { + "epoch": 6.669902912621359, + "grad_norm": 1.1670392751693726, + "learning_rate": 2.9324515933338343e-06, + "loss": 0.0926, + "step": 687 + }, + { + "epoch": 6.679611650485437, + "grad_norm": 1.2292739152908325, + "learning_rate": 2.9274438518655703e-06, + "loss": 0.1585, + "step": 688 + }, + { + "epoch": 6.689320388349515, + "grad_norm": 0.9029463529586792, + "learning_rate": 2.9224343430508105e-06, + "loss": 0.0726, + "step": 689 + }, + { + "epoch": 6.699029126213592, + "grad_norm": 0.9147337079048157, + "learning_rate": 2.917423087602306e-06, + "loss": 0.1295, + "step": 690 + }, + { + "epoch": 6.70873786407767, + "grad_norm": 0.9737975597381592, + "learning_rate": 2.9124101062400283e-06, + "loss": 0.153, + "step": 691 + }, + { + "epoch": 6.718446601941747, + "grad_norm": 1.2561050653457642, + "learning_rate": 2.907395419691087e-06, + "loss": 0.178, + "step": 692 + }, + { + "epoch": 6.728155339805825, + "grad_norm": 1.113938808441162, + "learning_rate": 2.9023790486896404e-06, + "loss": 0.081, + "step": 693 + }, + { + "epoch": 6.737864077669903, + "grad_norm": 0.82021164894104, + "learning_rate": 2.8973610139768114e-06, + "loss": 0.0724, + "step": 694 + }, + { + "epoch": 6.747572815533981, + "grad_norm": 9.422320365905762, + "learning_rate": 2.8923413363006038e-06, + "loss": 0.1952, + "step": 695 + }, + { + "epoch": 6.757281553398058, + "grad_norm": 0.9243801236152649, + "learning_rate": 2.887320036415811e-06, + "loss": 0.0708, + "step": 696 + }, + { + "epoch": 6.766990291262136, + "grad_norm": 1.160498857498169, + "learning_rate": 2.882297135083937e-06, + "loss": 0.0429, + "step": 697 + }, + { + "epoch": 6.776699029126213, + "grad_norm": 0.9941173791885376, + "learning_rate": 2.877272653073107e-06, + "loss": 0.0608, + "step": 698 + }, + { + "epoch": 6.786407766990291, + "grad_norm": 1.100027084350586, + "learning_rate": 2.87224661115798e-06, + "loss": 0.096, + "step": 699 + }, + { + "epoch": 6.796116504854369, + "grad_norm": 0.8699636459350586, + "learning_rate": 2.8672190301196655e-06, + "loss": 0.1668, + "step": 700 + }, + { + "epoch": 6.805825242718447, + "grad_norm": 1.1691793203353882, + "learning_rate": 2.8621899307456376e-06, + "loss": 0.1234, + "step": 701 + }, + { + "epoch": 6.815533980582524, + "grad_norm": 1.030217170715332, + "learning_rate": 2.8571593338296473e-06, + "loss": 0.09, + "step": 702 + }, + { + "epoch": 6.825242718446602, + "grad_norm": 0.5228599905967712, + "learning_rate": 2.8521272601716376e-06, + "loss": 0.0317, + "step": 703 + }, + { + "epoch": 6.834951456310679, + "grad_norm": 1.4643433094024658, + "learning_rate": 2.8470937305776567e-06, + "loss": 0.2609, + "step": 704 + }, + { + "epoch": 6.844660194174757, + "grad_norm": 0.9849478006362915, + "learning_rate": 2.842058765859776e-06, + "loss": 0.0848, + "step": 705 + }, + { + "epoch": 6.854368932038835, + "grad_norm": 0.9210445284843445, + "learning_rate": 2.837022386835996e-06, + "loss": 0.0809, + "step": 706 + }, + { + "epoch": 6.864077669902913, + "grad_norm": 0.8688995242118835, + "learning_rate": 2.8319846143301676e-06, + "loss": 0.064, + "step": 707 + }, + { + "epoch": 6.87378640776699, + "grad_norm": 0.8066009879112244, + "learning_rate": 2.826945469171903e-06, + "loss": 0.0467, + "step": 708 + }, + { + "epoch": 6.883495145631068, + "grad_norm": 0.6212665438652039, + "learning_rate": 2.82190497219649e-06, + "loss": 0.0793, + "step": 709 + }, + { + "epoch": 6.893203883495145, + "grad_norm": 1.0815949440002441, + "learning_rate": 2.8168631442448046e-06, + "loss": 0.1624, + "step": 710 + }, + { + "epoch": 6.902912621359223, + "grad_norm": 0.7521616816520691, + "learning_rate": 2.8118200061632273e-06, + "loss": 0.0374, + "step": 711 + }, + { + "epoch": 6.9126213592233015, + "grad_norm": 1.0690211057662964, + "learning_rate": 2.8067755788035544e-06, + "loss": 0.1545, + "step": 712 + }, + { + "epoch": 6.922330097087379, + "grad_norm": 1.1906397342681885, + "learning_rate": 2.801729883022915e-06, + "loss": 0.0859, + "step": 713 + }, + { + "epoch": 6.932038834951456, + "grad_norm": 0.871376633644104, + "learning_rate": 2.7966829396836804e-06, + "loss": 0.1442, + "step": 714 + }, + { + "epoch": 6.941747572815534, + "grad_norm": 1.062356948852539, + "learning_rate": 2.791634769653381e-06, + "loss": 0.0734, + "step": 715 + }, + { + "epoch": 6.951456310679612, + "grad_norm": 0.9127151966094971, + "learning_rate": 2.78658539380462e-06, + "loss": 0.2102, + "step": 716 + }, + { + "epoch": 6.961165048543689, + "grad_norm": 0.8193743228912354, + "learning_rate": 2.781534833014985e-06, + "loss": 0.1019, + "step": 717 + }, + { + "epoch": 6.970873786407767, + "grad_norm": 0.989223301410675, + "learning_rate": 2.7764831081669635e-06, + "loss": 0.1714, + "step": 718 + }, + { + "epoch": 6.980582524271845, + "grad_norm": 0.6328715085983276, + "learning_rate": 2.771430240147856e-06, + "loss": 0.0497, + "step": 719 + }, + { + "epoch": 6.990291262135923, + "grad_norm": 1.1424680948257446, + "learning_rate": 2.7663762498496905e-06, + "loss": 0.1421, + "step": 720 + }, + { + "epoch": 7.0, + "grad_norm": 1.1053463220596313, + "learning_rate": 2.761321158169134e-06, + "loss": 0.0579, + "step": 721 + }, + { + "epoch": 7.009708737864078, + "grad_norm": 0.916090726852417, + "learning_rate": 2.7562649860074077e-06, + "loss": 0.051, + "step": 722 + }, + { + "epoch": 7.019417475728155, + "grad_norm": 1.599184513092041, + "learning_rate": 2.7512077542702005e-06, + "loss": 0.2136, + "step": 723 + }, + { + "epoch": 7.029126213592233, + "grad_norm": 0.8245609998703003, + "learning_rate": 2.746149483867582e-06, + "loss": 0.0343, + "step": 724 + }, + { + "epoch": 7.038834951456311, + "grad_norm": 0.9597436785697937, + "learning_rate": 2.741090195713917e-06, + "loss": 0.0577, + "step": 725 + }, + { + "epoch": 7.048543689320389, + "grad_norm": 1.0133090019226074, + "learning_rate": 2.736029910727777e-06, + "loss": 0.0816, + "step": 726 + }, + { + "epoch": 7.058252427184466, + "grad_norm": 1.2042464017868042, + "learning_rate": 2.730968649831858e-06, + "loss": 0.064, + "step": 727 + }, + { + "epoch": 7.067961165048544, + "grad_norm": 1.2887951135635376, + "learning_rate": 2.7259064339528875e-06, + "loss": 0.1278, + "step": 728 + }, + { + "epoch": 7.077669902912621, + "grad_norm": 1.1621922254562378, + "learning_rate": 2.720843284021543e-06, + "loss": 0.107, + "step": 729 + }, + { + "epoch": 7.087378640776699, + "grad_norm": 0.6943169832229614, + "learning_rate": 2.7157792209723654e-06, + "loss": 0.0554, + "step": 730 + }, + { + "epoch": 7.097087378640777, + "grad_norm": 0.9973244667053223, + "learning_rate": 2.7107142657436696e-06, + "loss": 0.0963, + "step": 731 + }, + { + "epoch": 7.106796116504855, + "grad_norm": 0.9302243590354919, + "learning_rate": 2.705648439277459e-06, + "loss": 0.043, + "step": 732 + }, + { + "epoch": 7.116504854368932, + "grad_norm": 0.6210493445396423, + "learning_rate": 2.7005817625193398e-06, + "loss": 0.0524, + "step": 733 + }, + { + "epoch": 7.12621359223301, + "grad_norm": 0.792597770690918, + "learning_rate": 2.695514256418435e-06, + "loss": 0.0359, + "step": 734 + }, + { + "epoch": 7.135922330097087, + "grad_norm": 1.1926745176315308, + "learning_rate": 2.6904459419272955e-06, + "loss": 0.0771, + "step": 735 + }, + { + "epoch": 7.145631067961165, + "grad_norm": 1.0876519680023193, + "learning_rate": 2.685376840001814e-06, + "loss": 0.0669, + "step": 736 + }, + { + "epoch": 7.155339805825243, + "grad_norm": 1.0764493942260742, + "learning_rate": 2.6803069716011405e-06, + "loss": 0.0242, + "step": 737 + }, + { + "epoch": 7.165048543689321, + "grad_norm": 1.308888554573059, + "learning_rate": 2.6752363576875933e-06, + "loss": 0.0749, + "step": 738 + }, + { + "epoch": 7.174757281553398, + "grad_norm": 1.2870591878890991, + "learning_rate": 2.6701650192265734e-06, + "loss": 0.0776, + "step": 739 + }, + { + "epoch": 7.184466019417476, + "grad_norm": 1.0227237939834595, + "learning_rate": 2.6650929771864776e-06, + "loss": 0.0491, + "step": 740 + }, + { + "epoch": 7.194174757281553, + "grad_norm": 1.2830699682235718, + "learning_rate": 2.660020252538611e-06, + "loss": 0.121, + "step": 741 + }, + { + "epoch": 7.203883495145631, + "grad_norm": 0.7571777701377869, + "learning_rate": 2.6549468662571026e-06, + "loss": 0.0598, + "step": 742 + }, + { + "epoch": 7.213592233009709, + "grad_norm": 1.2842772006988525, + "learning_rate": 2.6498728393188157e-06, + "loss": 0.1016, + "step": 743 + }, + { + "epoch": 7.223300970873787, + "grad_norm": 0.9173499345779419, + "learning_rate": 2.6447981927032634e-06, + "loss": 0.0308, + "step": 744 + }, + { + "epoch": 7.233009708737864, + "grad_norm": 1.0887157917022705, + "learning_rate": 2.639722947392521e-06, + "loss": 0.1625, + "step": 745 + }, + { + "epoch": 7.242718446601942, + "grad_norm": 0.8496439456939697, + "learning_rate": 2.6346471243711376e-06, + "loss": 0.061, + "step": 746 + }, + { + "epoch": 7.252427184466019, + "grad_norm": 0.7860584855079651, + "learning_rate": 2.629570744626052e-06, + "loss": 0.1061, + "step": 747 + }, + { + "epoch": 7.262135922330097, + "grad_norm": 0.8548651933670044, + "learning_rate": 2.624493829146507e-06, + "loss": 0.0929, + "step": 748 + }, + { + "epoch": 7.271844660194175, + "grad_norm": 1.2701689004898071, + "learning_rate": 2.619416398923957e-06, + "loss": 0.1547, + "step": 749 + }, + { + "epoch": 7.281553398058253, + "grad_norm": 0.8744873404502869, + "learning_rate": 2.614338474951987e-06, + "loss": 0.022, + "step": 750 + }, + { + "epoch": 7.29126213592233, + "grad_norm": 0.7233145236968994, + "learning_rate": 2.6092600782262213e-06, + "loss": 0.0523, + "step": 751 + }, + { + "epoch": 7.300970873786408, + "grad_norm": 0.7879371643066406, + "learning_rate": 2.6041812297442417e-06, + "loss": 0.044, + "step": 752 + }, + { + "epoch": 7.310679611650485, + "grad_norm": 1.221299409866333, + "learning_rate": 2.5991019505054965e-06, + "loss": 0.0541, + "step": 753 + }, + { + "epoch": 7.320388349514563, + "grad_norm": 0.7391517758369446, + "learning_rate": 2.5940222615112143e-06, + "loss": 0.0332, + "step": 754 + }, + { + "epoch": 7.330097087378641, + "grad_norm": 0.7787492871284485, + "learning_rate": 2.5889421837643186e-06, + "loss": 0.0446, + "step": 755 + }, + { + "epoch": 7.339805825242719, + "grad_norm": 6.488763332366943, + "learning_rate": 2.5838617382693415e-06, + "loss": 0.1187, + "step": 756 + }, + { + "epoch": 7.349514563106796, + "grad_norm": 1.3808194398880005, + "learning_rate": 2.5787809460323337e-06, + "loss": 0.0555, + "step": 757 + }, + { + "epoch": 7.359223300970874, + "grad_norm": 1.1066080331802368, + "learning_rate": 2.57369982806078e-06, + "loss": 0.105, + "step": 758 + }, + { + "epoch": 7.368932038834951, + "grad_norm": 0.9817208647727966, + "learning_rate": 2.5686184053635127e-06, + "loss": 0.0499, + "step": 759 + }, + { + "epoch": 7.378640776699029, + "grad_norm": 0.6625164747238159, + "learning_rate": 2.563536698950624e-06, + "loss": 0.0348, + "step": 760 + }, + { + "epoch": 7.388349514563107, + "grad_norm": 0.5409085750579834, + "learning_rate": 2.5584547298333772e-06, + "loss": 0.0123, + "step": 761 + }, + { + "epoch": 7.398058252427185, + "grad_norm": 0.6516930460929871, + "learning_rate": 2.5533725190241255e-06, + "loss": 0.0101, + "step": 762 + }, + { + "epoch": 7.407766990291262, + "grad_norm": 0.7117874026298523, + "learning_rate": 2.5482900875362184e-06, + "loss": 0.0352, + "step": 763 + }, + { + "epoch": 7.41747572815534, + "grad_norm": 1.0977354049682617, + "learning_rate": 2.543207456383919e-06, + "loss": 0.0468, + "step": 764 + }, + { + "epoch": 7.427184466019417, + "grad_norm": 1.0205830335617065, + "learning_rate": 2.538124646582315e-06, + "loss": 0.1731, + "step": 765 + }, + { + "epoch": 7.436893203883495, + "grad_norm": 0.7433698177337646, + "learning_rate": 2.533041679147235e-06, + "loss": 0.0431, + "step": 766 + }, + { + "epoch": 7.446601941747573, + "grad_norm": 0.7881643176078796, + "learning_rate": 2.527958575095157e-06, + "loss": 0.0361, + "step": 767 + }, + { + "epoch": 7.456310679611651, + "grad_norm": 1.1216914653778076, + "learning_rate": 2.522875355443124e-06, + "loss": 0.1596, + "step": 768 + }, + { + "epoch": 7.466019417475728, + "grad_norm": 1.3237402439117432, + "learning_rate": 2.5177920412086586e-06, + "loss": 0.1079, + "step": 769 + }, + { + "epoch": 7.475728155339806, + "grad_norm": 1.0378979444503784, + "learning_rate": 2.512708653409674e-06, + "loss": 0.0652, + "step": 770 + }, + { + "epoch": 7.485436893203883, + "grad_norm": 1.159886121749878, + "learning_rate": 2.507625213064386e-06, + "loss": 0.0465, + "step": 771 + }, + { + "epoch": 7.495145631067961, + "grad_norm": 0.673026978969574, + "learning_rate": 2.5025417411912307e-06, + "loss": 0.034, + "step": 772 + }, + { + "epoch": 7.504854368932039, + "grad_norm": 1.011098861694336, + "learning_rate": 2.4974582588087697e-06, + "loss": 0.0951, + "step": 773 + }, + { + "epoch": 7.514563106796117, + "grad_norm": 1.035995602607727, + "learning_rate": 2.492374786935614e-06, + "loss": 0.0827, + "step": 774 + }, + { + "epoch": 7.524271844660194, + "grad_norm": 1.1238621473312378, + "learning_rate": 2.487291346590326e-06, + "loss": 0.1215, + "step": 775 + }, + { + "epoch": 7.533980582524272, + "grad_norm": 0.7399255037307739, + "learning_rate": 2.4822079587913414e-06, + "loss": 0.0792, + "step": 776 + }, + { + "epoch": 7.543689320388349, + "grad_norm": 0.7715346217155457, + "learning_rate": 2.4771246445568763e-06, + "loss": 0.0311, + "step": 777 + }, + { + "epoch": 7.553398058252427, + "grad_norm": 0.9662187099456787, + "learning_rate": 2.472041424904844e-06, + "loss": 0.0511, + "step": 778 + }, + { + "epoch": 7.563106796116505, + "grad_norm": 1.0473203659057617, + "learning_rate": 2.466958320852766e-06, + "loss": 0.0905, + "step": 779 + }, + { + "epoch": 7.572815533980583, + "grad_norm": 1.532143473625183, + "learning_rate": 2.4618753534176854e-06, + "loss": 0.0664, + "step": 780 + }, + { + "epoch": 7.58252427184466, + "grad_norm": 1.3228340148925781, + "learning_rate": 2.4567925436160823e-06, + "loss": 0.0745, + "step": 781 + }, + { + "epoch": 7.592233009708738, + "grad_norm": 1.191823124885559, + "learning_rate": 2.4517099124637824e-06, + "loss": 0.1207, + "step": 782 + }, + { + "epoch": 7.601941747572815, + "grad_norm": 1.2542591094970703, + "learning_rate": 2.4466274809758757e-06, + "loss": 0.0542, + "step": 783 + }, + { + "epoch": 7.611650485436893, + "grad_norm": 0.9405433535575867, + "learning_rate": 2.4415452701666236e-06, + "loss": 0.0919, + "step": 784 + }, + { + "epoch": 7.621359223300971, + "grad_norm": 0.9204425811767578, + "learning_rate": 2.436463301049378e-06, + "loss": 0.0929, + "step": 785 + }, + { + "epoch": 7.631067961165049, + "grad_norm": 1.5501075983047485, + "learning_rate": 2.431381594636488e-06, + "loss": 0.1076, + "step": 786 + }, + { + "epoch": 7.640776699029126, + "grad_norm": 1.5708661079406738, + "learning_rate": 2.42630017193922e-06, + "loss": 0.2274, + "step": 787 + }, + { + "epoch": 7.650485436893204, + "grad_norm": 0.5069414377212524, + "learning_rate": 2.4212190539676667e-06, + "loss": 0.0152, + "step": 788 + }, + { + "epoch": 7.660194174757281, + "grad_norm": 1.0907994508743286, + "learning_rate": 2.4161382617306585e-06, + "loss": 0.0799, + "step": 789 + }, + { + "epoch": 7.669902912621359, + "grad_norm": 1.0139005184173584, + "learning_rate": 2.4110578162356814e-06, + "loss": 0.0385, + "step": 790 + }, + { + "epoch": 7.679611650485437, + "grad_norm": 0.8794927597045898, + "learning_rate": 2.405977738488786e-06, + "loss": 0.0673, + "step": 791 + }, + { + "epoch": 7.689320388349515, + "grad_norm": 0.8962538242340088, + "learning_rate": 2.4008980494945044e-06, + "loss": 0.0576, + "step": 792 + }, + { + "epoch": 7.699029126213592, + "grad_norm": 0.9313005805015564, + "learning_rate": 2.3958187702557587e-06, + "loss": 0.0263, + "step": 793 + }, + { + "epoch": 7.70873786407767, + "grad_norm": 0.6752762198448181, + "learning_rate": 2.39073992177378e-06, + "loss": 0.0334, + "step": 794 + }, + { + "epoch": 7.718446601941747, + "grad_norm": 1.2299002408981323, + "learning_rate": 2.385661525048014e-06, + "loss": 0.0686, + "step": 795 + }, + { + "epoch": 7.728155339805825, + "grad_norm": 1.4804145097732544, + "learning_rate": 2.3805836010760435e-06, + "loss": 0.1083, + "step": 796 + }, + { + "epoch": 7.737864077669903, + "grad_norm": 0.7202876210212708, + "learning_rate": 2.375506170853494e-06, + "loss": 0.0239, + "step": 797 + }, + { + "epoch": 7.747572815533981, + "grad_norm": 0.6486926674842834, + "learning_rate": 2.3704292553739487e-06, + "loss": 0.0279, + "step": 798 + }, + { + "epoch": 7.757281553398058, + "grad_norm": 0.7923670411109924, + "learning_rate": 2.3653528756288636e-06, + "loss": 0.075, + "step": 799 + }, + { + "epoch": 7.766990291262136, + "grad_norm": 1.2188405990600586, + "learning_rate": 2.3602770526074804e-06, + "loss": 0.1341, + "step": 800 + }, + { + "epoch": 7.776699029126213, + "grad_norm": 1.213319182395935, + "learning_rate": 2.3552018072967375e-06, + "loss": 0.0744, + "step": 801 + }, + { + "epoch": 7.786407766990291, + "grad_norm": 0.9601984620094299, + "learning_rate": 2.3501271606811848e-06, + "loss": 0.0378, + "step": 802 + }, + { + "epoch": 7.796116504854369, + "grad_norm": 1.3879088163375854, + "learning_rate": 2.345053133742898e-06, + "loss": 0.1409, + "step": 803 + }, + { + "epoch": 7.805825242718447, + "grad_norm": 0.746889054775238, + "learning_rate": 2.3399797474613894e-06, + "loss": 0.0254, + "step": 804 + }, + { + "epoch": 7.815533980582524, + "grad_norm": 1.1101627349853516, + "learning_rate": 2.334907022813523e-06, + "loss": 0.0435, + "step": 805 + }, + { + "epoch": 7.825242718446602, + "grad_norm": 0.9770099520683289, + "learning_rate": 2.329834980773427e-06, + "loss": 0.1422, + "step": 806 + }, + { + "epoch": 7.834951456310679, + "grad_norm": 1.0562567710876465, + "learning_rate": 2.324763642312407e-06, + "loss": 0.065, + "step": 807 + }, + { + "epoch": 7.844660194174757, + "grad_norm": 1.1696139574050903, + "learning_rate": 2.3196930283988603e-06, + "loss": 0.0962, + "step": 808 + }, + { + "epoch": 7.854368932038835, + "grad_norm": 0.7921373248100281, + "learning_rate": 2.3146231599981865e-06, + "loss": 0.1024, + "step": 809 + }, + { + "epoch": 7.864077669902913, + "grad_norm": 1.2344034910202026, + "learning_rate": 2.3095540580727054e-06, + "loss": 0.1732, + "step": 810 + }, + { + "epoch": 7.87378640776699, + "grad_norm": 1.0766966342926025, + "learning_rate": 2.304485743581566e-06, + "loss": 0.1065, + "step": 811 + }, + { + "epoch": 7.883495145631068, + "grad_norm": 1.139400839805603, + "learning_rate": 2.299418237480661e-06, + "loss": 0.0175, + "step": 812 + }, + { + "epoch": 7.893203883495145, + "grad_norm": 0.9371456503868103, + "learning_rate": 2.294351560722542e-06, + "loss": 0.0914, + "step": 813 + }, + { + "epoch": 7.902912621359223, + "grad_norm": 0.5872753262519836, + "learning_rate": 2.2892857342563316e-06, + "loss": 0.0351, + "step": 814 + }, + { + "epoch": 7.9126213592233015, + "grad_norm": 1.1175590753555298, + "learning_rate": 2.2842207790276355e-06, + "loss": 0.0764, + "step": 815 + }, + { + "epoch": 7.922330097087379, + "grad_norm": 1.2251633405685425, + "learning_rate": 2.279156715978457e-06, + "loss": 0.1079, + "step": 816 + }, + { + "epoch": 7.932038834951456, + "grad_norm": 1.0800000429153442, + "learning_rate": 2.274093566047113e-06, + "loss": 0.0533, + "step": 817 + }, + { + "epoch": 7.941747572815534, + "grad_norm": 0.7505971789360046, + "learning_rate": 2.2690313501681426e-06, + "loss": 0.0411, + "step": 818 + }, + { + "epoch": 7.951456310679612, + "grad_norm": 0.9410497546195984, + "learning_rate": 2.263970089272223e-06, + "loss": 0.0541, + "step": 819 + }, + { + "epoch": 7.961165048543689, + "grad_norm": 0.8549265265464783, + "learning_rate": 2.2589098042860838e-06, + "loss": 0.1001, + "step": 820 + }, + { + "epoch": 7.970873786407767, + "grad_norm": 0.736650824546814, + "learning_rate": 2.2538505161324186e-06, + "loss": 0.0271, + "step": 821 + }, + { + "epoch": 7.980582524271845, + "grad_norm": 0.8051927089691162, + "learning_rate": 2.2487922457298007e-06, + "loss": 0.0299, + "step": 822 + }, + { + "epoch": 7.990291262135923, + "grad_norm": 1.251429557800293, + "learning_rate": 2.243735013992593e-06, + "loss": 0.1019, + "step": 823 + }, + { + "epoch": 8.0, + "grad_norm": 0.8267395496368408, + "learning_rate": 2.238678841830867e-06, + "loss": 0.0444, + "step": 824 + }, + { + "epoch": 8.009708737864077, + "grad_norm": 1.0311859846115112, + "learning_rate": 2.2336237501503103e-06, + "loss": 0.0393, + "step": 825 + }, + { + "epoch": 8.019417475728156, + "grad_norm": 0.6791521906852722, + "learning_rate": 2.2285697598521446e-06, + "loss": 0.0173, + "step": 826 + }, + { + "epoch": 8.029126213592233, + "grad_norm": 0.6706322431564331, + "learning_rate": 2.2235168918330374e-06, + "loss": 0.036, + "step": 827 + }, + { + "epoch": 8.03883495145631, + "grad_norm": 0.9080128073692322, + "learning_rate": 2.2184651669850164e-06, + "loss": 0.0482, + "step": 828 + }, + { + "epoch": 8.048543689320388, + "grad_norm": 0.49124234914779663, + "learning_rate": 2.2134146061953814e-06, + "loss": 0.0033, + "step": 829 + }, + { + "epoch": 8.058252427184467, + "grad_norm": 1.4356873035430908, + "learning_rate": 2.2083652303466196e-06, + "loss": 0.1166, + "step": 830 + }, + { + "epoch": 8.067961165048544, + "grad_norm": 1.9703857898712158, + "learning_rate": 2.20331706031632e-06, + "loss": 0.1, + "step": 831 + }, + { + "epoch": 8.077669902912621, + "grad_norm": 0.6531624794006348, + "learning_rate": 2.1982701169770853e-06, + "loss": 0.0243, + "step": 832 + }, + { + "epoch": 8.087378640776699, + "grad_norm": 0.9504621624946594, + "learning_rate": 2.1932244211964456e-06, + "loss": 0.0462, + "step": 833 + }, + { + "epoch": 8.097087378640778, + "grad_norm": 0.9275246858596802, + "learning_rate": 2.1881799938367735e-06, + "loss": 0.0403, + "step": 834 + }, + { + "epoch": 8.106796116504855, + "grad_norm": 0.7024449706077576, + "learning_rate": 2.1831368557551962e-06, + "loss": 0.0266, + "step": 835 + }, + { + "epoch": 8.116504854368932, + "grad_norm": 1.1542211771011353, + "learning_rate": 2.1780950278035114e-06, + "loss": 0.062, + "step": 836 + }, + { + "epoch": 8.12621359223301, + "grad_norm": 0.7213531136512756, + "learning_rate": 2.173054530828098e-06, + "loss": 0.0134, + "step": 837 + }, + { + "epoch": 8.135922330097088, + "grad_norm": 0.6973257660865784, + "learning_rate": 2.168015385669833e-06, + "loss": 0.0167, + "step": 838 + }, + { + "epoch": 8.145631067961165, + "grad_norm": 1.4313327074050903, + "learning_rate": 2.162977613164005e-06, + "loss": 0.1217, + "step": 839 + }, + { + "epoch": 8.155339805825243, + "grad_norm": 1.4857988357543945, + "learning_rate": 2.157941234140225e-06, + "loss": 0.1104, + "step": 840 + }, + { + "epoch": 8.16504854368932, + "grad_norm": 1.080141544342041, + "learning_rate": 2.1529062694223437e-06, + "loss": 0.0566, + "step": 841 + }, + { + "epoch": 8.174757281553399, + "grad_norm": 0.6205718517303467, + "learning_rate": 2.147872739828364e-06, + "loss": 0.012, + "step": 842 + }, + { + "epoch": 8.184466019417476, + "grad_norm": 0.6710229516029358, + "learning_rate": 2.142840666170354e-06, + "loss": 0.0163, + "step": 843 + }, + { + "epoch": 8.194174757281553, + "grad_norm": 0.6269445419311523, + "learning_rate": 2.1378100692543637e-06, + "loss": 0.017, + "step": 844 + }, + { + "epoch": 8.20388349514563, + "grad_norm": 0.7391650676727295, + "learning_rate": 2.1327809698803354e-06, + "loss": 0.0296, + "step": 845 + }, + { + "epoch": 8.21359223300971, + "grad_norm": 0.861961305141449, + "learning_rate": 2.1277533888420203e-06, + "loss": 0.0403, + "step": 846 + }, + { + "epoch": 8.223300970873787, + "grad_norm": 0.6323930025100708, + "learning_rate": 2.1227273469268932e-06, + "loss": 0.0148, + "step": 847 + }, + { + "epoch": 8.233009708737864, + "grad_norm": 0.9467630386352539, + "learning_rate": 2.117702864916063e-06, + "loss": 0.0272, + "step": 848 + }, + { + "epoch": 8.242718446601941, + "grad_norm": 1.1341761350631714, + "learning_rate": 2.1126799635841897e-06, + "loss": 0.0373, + "step": 849 + }, + { + "epoch": 8.25242718446602, + "grad_norm": 0.8145177960395813, + "learning_rate": 2.1076586636993975e-06, + "loss": 0.039, + "step": 850 + }, + { + "epoch": 8.262135922330097, + "grad_norm": 0.7213461995124817, + "learning_rate": 2.102638986023189e-06, + "loss": 0.023, + "step": 851 + }, + { + "epoch": 8.271844660194175, + "grad_norm": 1.1217619180679321, + "learning_rate": 2.0976209513103604e-06, + "loss": 0.067, + "step": 852 + }, + { + "epoch": 8.281553398058252, + "grad_norm": 0.49746865034103394, + "learning_rate": 2.0926045803089135e-06, + "loss": 0.0148, + "step": 853 + }, + { + "epoch": 8.29126213592233, + "grad_norm": 0.917957067489624, + "learning_rate": 2.087589893759972e-06, + "loss": 0.0287, + "step": 854 + }, + { + "epoch": 8.300970873786408, + "grad_norm": 0.7017801403999329, + "learning_rate": 2.0825769123976954e-06, + "loss": 0.0574, + "step": 855 + }, + { + "epoch": 8.310679611650485, + "grad_norm": 0.9924259781837463, + "learning_rate": 2.077565656949191e-06, + "loss": 0.0996, + "step": 856 + }, + { + "epoch": 8.320388349514563, + "grad_norm": 0.8891076445579529, + "learning_rate": 2.072556148134431e-06, + "loss": 0.0578, + "step": 857 + }, + { + "epoch": 8.330097087378642, + "grad_norm": 0.8946473598480225, + "learning_rate": 2.0675484066661666e-06, + "loss": 0.0427, + "step": 858 + }, + { + "epoch": 8.339805825242719, + "grad_norm": 0.6834955215454102, + "learning_rate": 2.0625424532498407e-06, + "loss": 0.0176, + "step": 859 + }, + { + "epoch": 8.349514563106796, + "grad_norm": 0.9527811408042908, + "learning_rate": 2.057538308583502e-06, + "loss": 0.0226, + "step": 860 + }, + { + "epoch": 8.359223300970873, + "grad_norm": 1.418888807296753, + "learning_rate": 2.0525359933577243e-06, + "loss": 0.0945, + "step": 861 + }, + { + "epoch": 8.368932038834952, + "grad_norm": 0.8171473741531372, + "learning_rate": 2.047535528255514e-06, + "loss": 0.0348, + "step": 862 + }, + { + "epoch": 8.37864077669903, + "grad_norm": 0.38823753595352173, + "learning_rate": 2.0425369339522276e-06, + "loss": 0.0034, + "step": 863 + }, + { + "epoch": 8.388349514563107, + "grad_norm": 0.68483966588974, + "learning_rate": 2.0375402311154886e-06, + "loss": 0.0425, + "step": 864 + }, + { + "epoch": 8.398058252427184, + "grad_norm": 0.6909242272377014, + "learning_rate": 2.0325454404050983e-06, + "loss": 0.0043, + "step": 865 + }, + { + "epoch": 8.407766990291263, + "grad_norm": 0.9698901772499084, + "learning_rate": 2.0275525824729523e-06, + "loss": 0.0846, + "step": 866 + }, + { + "epoch": 8.41747572815534, + "grad_norm": 0.7008579969406128, + "learning_rate": 2.022561677962955e-06, + "loss": 0.0503, + "step": 867 + }, + { + "epoch": 8.427184466019417, + "grad_norm": 0.823596179485321, + "learning_rate": 2.017572747510934e-06, + "loss": 0.0309, + "step": 868 + }, + { + "epoch": 8.436893203883495, + "grad_norm": 1.0459940433502197, + "learning_rate": 2.012585811744555e-06, + "loss": 0.0496, + "step": 869 + }, + { + "epoch": 8.446601941747574, + "grad_norm": 0.9439486861228943, + "learning_rate": 2.0076008912832355e-06, + "loss": 0.0703, + "step": 870 + }, + { + "epoch": 8.45631067961165, + "grad_norm": 1.2154759168624878, + "learning_rate": 2.002618006738061e-06, + "loss": 0.0716, + "step": 871 + }, + { + "epoch": 8.466019417475728, + "grad_norm": 0.9697406888008118, + "learning_rate": 1.9976371787116992e-06, + "loss": 0.037, + "step": 872 + }, + { + "epoch": 8.475728155339805, + "grad_norm": 0.6543058753013611, + "learning_rate": 1.9926584277983134e-06, + "loss": 0.0158, + "step": 873 + }, + { + "epoch": 8.485436893203884, + "grad_norm": 1.0881898403167725, + "learning_rate": 1.9876817745834805e-06, + "loss": 0.0691, + "step": 874 + }, + { + "epoch": 8.495145631067961, + "grad_norm": 1.221283197402954, + "learning_rate": 1.9827072396441044e-06, + "loss": 0.1064, + "step": 875 + }, + { + "epoch": 8.504854368932039, + "grad_norm": 1.0721032619476318, + "learning_rate": 1.9777348435483285e-06, + "loss": 0.0776, + "step": 876 + }, + { + "epoch": 8.514563106796116, + "grad_norm": 1.2138664722442627, + "learning_rate": 1.972764606855454e-06, + "loss": 0.0738, + "step": 877 + }, + { + "epoch": 8.524271844660195, + "grad_norm": 0.8295838236808777, + "learning_rate": 1.9677965501158534e-06, + "loss": 0.0355, + "step": 878 + }, + { + "epoch": 8.533980582524272, + "grad_norm": 0.9322443008422852, + "learning_rate": 1.9628306938708857e-06, + "loss": 0.0639, + "step": 879 + }, + { + "epoch": 8.54368932038835, + "grad_norm": 0.7268901467323303, + "learning_rate": 1.957867058652812e-06, + "loss": 0.0355, + "step": 880 + }, + { + "epoch": 8.553398058252426, + "grad_norm": 0.9112371802330017, + "learning_rate": 1.952905664984709e-06, + "loss": 0.1148, + "step": 881 + }, + { + "epoch": 8.563106796116505, + "grad_norm": 0.8290453553199768, + "learning_rate": 1.947946533380387e-06, + "loss": 0.0247, + "step": 882 + }, + { + "epoch": 8.572815533980583, + "grad_norm": 1.105477213859558, + "learning_rate": 1.9429896843443025e-06, + "loss": 0.0115, + "step": 883 + }, + { + "epoch": 8.58252427184466, + "grad_norm": 0.9778973460197449, + "learning_rate": 1.938035138371474e-06, + "loss": 0.1297, + "step": 884 + }, + { + "epoch": 8.592233009708737, + "grad_norm": 0.5014190673828125, + "learning_rate": 1.933082915947398e-06, + "loss": 0.0712, + "step": 885 + }, + { + "epoch": 8.601941747572816, + "grad_norm": 0.7944493293762207, + "learning_rate": 1.928133037547963e-06, + "loss": 0.0222, + "step": 886 + }, + { + "epoch": 8.611650485436893, + "grad_norm": 0.9558936953544617, + "learning_rate": 1.9231855236393677e-06, + "loss": 0.0311, + "step": 887 + }, + { + "epoch": 8.62135922330097, + "grad_norm": 0.9489585757255554, + "learning_rate": 1.9182403946780316e-06, + "loss": 0.0544, + "step": 888 + }, + { + "epoch": 8.631067961165048, + "grad_norm": 1.0101840496063232, + "learning_rate": 1.9132976711105146e-06, + "loss": 0.0858, + "step": 889 + }, + { + "epoch": 8.640776699029127, + "grad_norm": 0.6758891940116882, + "learning_rate": 1.9083573733734328e-06, + "loss": 0.0452, + "step": 890 + }, + { + "epoch": 8.650485436893204, + "grad_norm": 0.9936819076538086, + "learning_rate": 1.903419521893369e-06, + "loss": 0.0409, + "step": 891 + }, + { + "epoch": 8.660194174757281, + "grad_norm": 0.8402771949768066, + "learning_rate": 1.898484137086794e-06, + "loss": 0.0413, + "step": 892 + }, + { + "epoch": 8.669902912621358, + "grad_norm": 0.6089380979537964, + "learning_rate": 1.8935512393599784e-06, + "loss": 0.0082, + "step": 893 + }, + { + "epoch": 8.679611650485437, + "grad_norm": 0.9442456960678101, + "learning_rate": 1.8886208491089095e-06, + "loss": 0.0168, + "step": 894 + }, + { + "epoch": 8.689320388349515, + "grad_norm": 0.9080673456192017, + "learning_rate": 1.8836929867192077e-06, + "loss": 0.0168, + "step": 895 + }, + { + "epoch": 8.699029126213592, + "grad_norm": 0.7783305048942566, + "learning_rate": 1.8787676725660405e-06, + "loss": 0.0228, + "step": 896 + }, + { + "epoch": 8.70873786407767, + "grad_norm": 1.021910548210144, + "learning_rate": 1.8738449270140404e-06, + "loss": 0.1082, + "step": 897 + }, + { + "epoch": 8.718446601941748, + "grad_norm": 0.9895579218864441, + "learning_rate": 1.8689247704172187e-06, + "loss": 0.0542, + "step": 898 + }, + { + "epoch": 8.728155339805825, + "grad_norm": 0.8246378898620605, + "learning_rate": 1.8640072231188825e-06, + "loss": 0.0667, + "step": 899 + }, + { + "epoch": 8.737864077669903, + "grad_norm": 0.9075983762741089, + "learning_rate": 1.8590923054515504e-06, + "loss": 0.0458, + "step": 900 + }, + { + "epoch": 8.74757281553398, + "grad_norm": 0.8279749751091003, + "learning_rate": 1.8541800377368673e-06, + "loss": 0.052, + "step": 901 + }, + { + "epoch": 8.757281553398059, + "grad_norm": 0.9606509804725647, + "learning_rate": 1.8492704402855229e-06, + "loss": 0.0584, + "step": 902 + }, + { + "epoch": 8.766990291262136, + "grad_norm": 0.42115524411201477, + "learning_rate": 1.8443635333971643e-06, + "loss": 0.0156, + "step": 903 + }, + { + "epoch": 8.776699029126213, + "grad_norm": 1.1302638053894043, + "learning_rate": 1.8394593373603173e-06, + "loss": 0.0397, + "step": 904 + }, + { + "epoch": 8.78640776699029, + "grad_norm": 1.0244662761688232, + "learning_rate": 1.8345578724522957e-06, + "loss": 0.0385, + "step": 905 + }, + { + "epoch": 8.79611650485437, + "grad_norm": 1.0088191032409668, + "learning_rate": 1.8296591589391227e-06, + "loss": 0.0297, + "step": 906 + }, + { + "epoch": 8.805825242718447, + "grad_norm": 0.9084392189979553, + "learning_rate": 1.8247632170754443e-06, + "loss": 0.0368, + "step": 907 + }, + { + "epoch": 8.815533980582524, + "grad_norm": 0.488815575838089, + "learning_rate": 1.8198700671044477e-06, + "loss": 0.0094, + "step": 908 + }, + { + "epoch": 8.825242718446601, + "grad_norm": 1.1691783666610718, + "learning_rate": 1.8149797292577757e-06, + "loss": 0.0653, + "step": 909 + }, + { + "epoch": 8.83495145631068, + "grad_norm": 1.1218113899230957, + "learning_rate": 1.8100922237554442e-06, + "loss": 0.087, + "step": 910 + }, + { + "epoch": 8.844660194174757, + "grad_norm": 0.9231531023979187, + "learning_rate": 1.8052075708057581e-06, + "loss": 0.0097, + "step": 911 + }, + { + "epoch": 8.854368932038835, + "grad_norm": 0.40473511815071106, + "learning_rate": 1.8003257906052284e-06, + "loss": 0.0056, + "step": 912 + }, + { + "epoch": 8.864077669902912, + "grad_norm": 1.0644712448120117, + "learning_rate": 1.7954469033384868e-06, + "loss": 0.057, + "step": 913 + }, + { + "epoch": 8.87378640776699, + "grad_norm": 0.9894399642944336, + "learning_rate": 1.790570929178206e-06, + "loss": 0.0567, + "step": 914 + }, + { + "epoch": 8.883495145631068, + "grad_norm": 0.7185762524604797, + "learning_rate": 1.7856978882850112e-06, + "loss": 0.0388, + "step": 915 + }, + { + "epoch": 8.893203883495145, + "grad_norm": 0.9258529543876648, + "learning_rate": 1.780827800807401e-06, + "loss": 0.0512, + "step": 916 + }, + { + "epoch": 8.902912621359224, + "grad_norm": 1.1820943355560303, + "learning_rate": 1.7759606868816623e-06, + "loss": 0.0576, + "step": 917 + }, + { + "epoch": 8.912621359223301, + "grad_norm": 0.7165997624397278, + "learning_rate": 1.771096566631788e-06, + "loss": 0.0326, + "step": 918 + }, + { + "epoch": 8.922330097087379, + "grad_norm": 1.1988775730133057, + "learning_rate": 1.766235460169392e-06, + "loss": 0.0602, + "step": 919 + }, + { + "epoch": 8.932038834951456, + "grad_norm": 0.915666937828064, + "learning_rate": 1.7613773875936274e-06, + "loss": 0.0139, + "step": 920 + }, + { + "epoch": 8.941747572815533, + "grad_norm": 0.922701895236969, + "learning_rate": 1.7565223689911038e-06, + "loss": 0.0131, + "step": 921 + }, + { + "epoch": 8.951456310679612, + "grad_norm": 0.5915096402168274, + "learning_rate": 1.7516704244358018e-06, + "loss": 0.0354, + "step": 922 + }, + { + "epoch": 8.96116504854369, + "grad_norm": 0.5728499293327332, + "learning_rate": 1.7468215739889941e-06, + "loss": 0.0277, + "step": 923 + }, + { + "epoch": 8.970873786407767, + "grad_norm": 1.0376454591751099, + "learning_rate": 1.741975837699158e-06, + "loss": 0.0891, + "step": 924 + }, + { + "epoch": 8.980582524271846, + "grad_norm": 0.5719737410545349, + "learning_rate": 1.7371332356018972e-06, + "loss": 0.0702, + "step": 925 + }, + { + "epoch": 8.990291262135923, + "grad_norm": 0.9259313344955444, + "learning_rate": 1.7322937877198545e-06, + "loss": 0.0859, + "step": 926 + }, + { + "epoch": 9.0, + "grad_norm": 0.8504927754402161, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.0235, + "step": 927 + }, + { + "epoch": 9.009708737864077, + "grad_norm": 0.8123329281806946, + "learning_rate": 1.7226244346267063e-06, + "loss": 0.0469, + "step": 928 + }, + { + "epoch": 9.019417475728156, + "grad_norm": 0.7285428047180176, + "learning_rate": 1.7177945693953486e-06, + "loss": 0.0227, + "step": 929 + }, + { + "epoch": 9.029126213592233, + "grad_norm": 0.857140064239502, + "learning_rate": 1.7129679383385384e-06, + "loss": 0.0319, + "step": 930 + }, + { + "epoch": 9.03883495145631, + "grad_norm": 0.5479164123535156, + "learning_rate": 1.7081445614128845e-06, + "loss": 0.0118, + "step": 931 + }, + { + "epoch": 9.048543689320388, + "grad_norm": 0.8059089183807373, + "learning_rate": 1.7033244585615393e-06, + "loss": 0.0663, + "step": 932 + }, + { + "epoch": 9.058252427184467, + "grad_norm": 1.0029854774475098, + "learning_rate": 1.698507649714121e-06, + "loss": 0.0417, + "step": 933 + }, + { + "epoch": 9.067961165048544, + "grad_norm": 0.69701087474823, + "learning_rate": 1.6936941547866248e-06, + "loss": 0.0103, + "step": 934 + }, + { + "epoch": 9.077669902912621, + "grad_norm": 0.5342279076576233, + "learning_rate": 1.688883993681345e-06, + "loss": 0.0278, + "step": 935 + }, + { + "epoch": 9.087378640776699, + "grad_norm": 0.7861754298210144, + "learning_rate": 1.6840771862867922e-06, + "loss": 0.0186, + "step": 936 + }, + { + "epoch": 9.097087378640778, + "grad_norm": 1.3431063890457153, + "learning_rate": 1.6792737524776093e-06, + "loss": 0.0341, + "step": 937 + }, + { + "epoch": 9.106796116504855, + "grad_norm": 0.44472604990005493, + "learning_rate": 1.674473712114492e-06, + "loss": 0.0082, + "step": 938 + }, + { + "epoch": 9.116504854368932, + "grad_norm": 0.5097460746765137, + "learning_rate": 1.6696770850441036e-06, + "loss": 0.0043, + "step": 939 + }, + { + "epoch": 9.12621359223301, + "grad_norm": 1.295851707458496, + "learning_rate": 1.6648838910989955e-06, + "loss": 0.0904, + "step": 940 + }, + { + "epoch": 9.135922330097088, + "grad_norm": 1.0630472898483276, + "learning_rate": 1.6600941500975237e-06, + "loss": 0.0222, + "step": 941 + }, + { + "epoch": 9.145631067961165, + "grad_norm": 1.1637173891067505, + "learning_rate": 1.6553078818437678e-06, + "loss": 0.0509, + "step": 942 + }, + { + "epoch": 9.155339805825243, + "grad_norm": 0.9822437763214111, + "learning_rate": 1.6505251061274492e-06, + "loss": 0.0379, + "step": 943 + }, + { + "epoch": 9.16504854368932, + "grad_norm": 0.8363843560218811, + "learning_rate": 1.6457458427238464e-06, + "loss": 0.107, + "step": 944 + }, + { + "epoch": 9.174757281553399, + "grad_norm": 1.046858549118042, + "learning_rate": 1.6409701113937182e-06, + "loss": 0.0228, + "step": 945 + }, + { + "epoch": 9.184466019417476, + "grad_norm": 0.7185032367706299, + "learning_rate": 1.6361979318832173e-06, + "loss": 0.0168, + "step": 946 + }, + { + "epoch": 9.194174757281553, + "grad_norm": 0.6683576703071594, + "learning_rate": 1.6314293239238134e-06, + "loss": 0.0401, + "step": 947 + }, + { + "epoch": 9.20388349514563, + "grad_norm": 0.619415283203125, + "learning_rate": 1.626664307232207e-06, + "loss": 0.0252, + "step": 948 + }, + { + "epoch": 9.21359223300971, + "grad_norm": 0.7388278841972351, + "learning_rate": 1.62190290151025e-06, + "loss": 0.0444, + "step": 949 + }, + { + "epoch": 9.223300970873787, + "grad_norm": 0.7596797943115234, + "learning_rate": 1.617145126444864e-06, + "loss": 0.0315, + "step": 950 + }, + { + "epoch": 9.233009708737864, + "grad_norm": 0.7415765523910522, + "learning_rate": 1.6123910017079591e-06, + "loss": 0.0363, + "step": 951 + }, + { + "epoch": 9.242718446601941, + "grad_norm": 0.38191917538642883, + "learning_rate": 1.6076405469563533e-06, + "loss": 0.0189, + "step": 952 + }, + { + "epoch": 9.25242718446602, + "grad_norm": 0.7858003973960876, + "learning_rate": 1.6028937818316889e-06, + "loss": 0.0297, + "step": 953 + }, + { + "epoch": 9.262135922330097, + "grad_norm": 0.45921990275382996, + "learning_rate": 1.598150725960354e-06, + "loss": 0.0047, + "step": 954 + }, + { + "epoch": 9.271844660194175, + "grad_norm": 0.7559367418289185, + "learning_rate": 1.5934113989533992e-06, + "loss": 0.0199, + "step": 955 + }, + { + "epoch": 9.281553398058252, + "grad_norm": 0.7790589928627014, + "learning_rate": 1.5886758204064582e-06, + "loss": 0.0174, + "step": 956 + }, + { + "epoch": 9.29126213592233, + "grad_norm": 0.3947029411792755, + "learning_rate": 1.583944009899665e-06, + "loss": 0.0047, + "step": 957 + }, + { + "epoch": 9.300970873786408, + "grad_norm": 0.7276475429534912, + "learning_rate": 1.579215986997575e-06, + "loss": 0.0155, + "step": 958 + }, + { + "epoch": 9.310679611650485, + "grad_norm": 1.0900495052337646, + "learning_rate": 1.5744917712490821e-06, + "loss": 0.0772, + "step": 959 + }, + { + "epoch": 9.320388349514563, + "grad_norm": 0.44895508885383606, + "learning_rate": 1.5697713821873401e-06, + "loss": 0.0038, + "step": 960 + }, + { + "epoch": 9.330097087378642, + "grad_norm": 0.6020745635032654, + "learning_rate": 1.5650548393296788e-06, + "loss": 0.0069, + "step": 961 + }, + { + "epoch": 9.339805825242719, + "grad_norm": 1.2439451217651367, + "learning_rate": 1.5603421621775273e-06, + "loss": 0.0321, + "step": 962 + }, + { + "epoch": 9.349514563106796, + "grad_norm": 0.9518343806266785, + "learning_rate": 1.555633370216329e-06, + "loss": 0.0215, + "step": 963 + }, + { + "epoch": 9.359223300970873, + "grad_norm": 0.6604970693588257, + "learning_rate": 1.5509284829154652e-06, + "loss": 0.0341, + "step": 964 + }, + { + "epoch": 9.368932038834952, + "grad_norm": 0.6232668161392212, + "learning_rate": 1.5462275197281717e-06, + "loss": 0.0087, + "step": 965 + }, + { + "epoch": 9.37864077669903, + "grad_norm": 1.5144613981246948, + "learning_rate": 1.5415305000914587e-06, + "loss": 0.063, + "step": 966 + }, + { + "epoch": 9.388349514563107, + "grad_norm": 0.38772061467170715, + "learning_rate": 1.536837443426032e-06, + "loss": 0.0041, + "step": 967 + }, + { + "epoch": 9.398058252427184, + "grad_norm": 0.34827372431755066, + "learning_rate": 1.5321483691362121e-06, + "loss": 0.0166, + "step": 968 + }, + { + "epoch": 9.407766990291263, + "grad_norm": 0.5997104644775391, + "learning_rate": 1.5274632966098527e-06, + "loss": 0.0051, + "step": 969 + }, + { + "epoch": 9.41747572815534, + "grad_norm": 0.39720046520233154, + "learning_rate": 1.5227822452182617e-06, + "loss": 0.0072, + "step": 970 + }, + { + "epoch": 9.427184466019417, + "grad_norm": 0.4819827079772949, + "learning_rate": 1.5181052343161212e-06, + "loss": 0.0221, + "step": 971 + }, + { + "epoch": 9.436893203883495, + "grad_norm": 0.3216429650783539, + "learning_rate": 1.5134322832414066e-06, + "loss": 0.0051, + "step": 972 + }, + { + "epoch": 9.446601941747574, + "grad_norm": 0.6433607339859009, + "learning_rate": 1.508763411315308e-06, + "loss": 0.0217, + "step": 973 + }, + { + "epoch": 9.45631067961165, + "grad_norm": 0.44555774331092834, + "learning_rate": 1.5040986378421485e-06, + "loss": 0.0143, + "step": 974 + }, + { + "epoch": 9.466019417475728, + "grad_norm": 0.37716561555862427, + "learning_rate": 1.499437982109305e-06, + "loss": 0.0132, + "step": 975 + }, + { + "epoch": 9.475728155339805, + "grad_norm": 0.930448591709137, + "learning_rate": 1.4947814633871316e-06, + "loss": 0.0136, + "step": 976 + }, + { + "epoch": 9.485436893203884, + "grad_norm": 0.3260258436203003, + "learning_rate": 1.4901291009288748e-06, + "loss": 0.0093, + "step": 977 + }, + { + "epoch": 9.495145631067961, + "grad_norm": 0.9126110672950745, + "learning_rate": 1.4854809139705961e-06, + "loss": 0.0547, + "step": 978 + }, + { + "epoch": 9.504854368932039, + "grad_norm": 0.5943132638931274, + "learning_rate": 1.4808369217310937e-06, + "loss": 0.0202, + "step": 979 + }, + { + "epoch": 9.514563106796116, + "grad_norm": 1.0365543365478516, + "learning_rate": 1.4761971434118207e-06, + "loss": 0.045, + "step": 980 + }, + { + "epoch": 9.524271844660195, + "grad_norm": 0.8611214756965637, + "learning_rate": 1.4715615981968088e-06, + "loss": 0.0206, + "step": 981 + }, + { + "epoch": 9.533980582524272, + "grad_norm": 0.49241188168525696, + "learning_rate": 1.4669303052525852e-06, + "loss": 0.0222, + "step": 982 + }, + { + "epoch": 9.54368932038835, + "grad_norm": 0.34316757321357727, + "learning_rate": 1.4623032837280971e-06, + "loss": 0.0018, + "step": 983 + }, + { + "epoch": 9.553398058252426, + "grad_norm": 1.0957579612731934, + "learning_rate": 1.4576805527546293e-06, + "loss": 0.0436, + "step": 984 + }, + { + "epoch": 9.563106796116505, + "grad_norm": 1.0365829467773438, + "learning_rate": 1.4530621314457255e-06, + "loss": 0.0305, + "step": 985 + }, + { + "epoch": 9.572815533980583, + "grad_norm": 1.2111074924468994, + "learning_rate": 1.4484480388971141e-06, + "loss": 0.0607, + "step": 986 + }, + { + "epoch": 9.58252427184466, + "grad_norm": 1.0916733741760254, + "learning_rate": 1.4438382941866224e-06, + "loss": 0.0365, + "step": 987 + }, + { + "epoch": 9.592233009708737, + "grad_norm": 0.630837082862854, + "learning_rate": 1.4392329163741015e-06, + "loss": 0.0192, + "step": 988 + }, + { + "epoch": 9.601941747572816, + "grad_norm": 0.5758682489395142, + "learning_rate": 1.4346319245013463e-06, + "loss": 0.0894, + "step": 989 + }, + { + "epoch": 9.611650485436893, + "grad_norm": 0.761637806892395, + "learning_rate": 1.430035337592018e-06, + "loss": 0.0084, + "step": 990 + }, + { + "epoch": 9.62135922330097, + "grad_norm": 0.431892067193985, + "learning_rate": 1.425443174651564e-06, + "loss": 0.0118, + "step": 991 + }, + { + "epoch": 9.631067961165048, + "grad_norm": 0.5555681586265564, + "learning_rate": 1.4208554546671407e-06, + "loss": 0.0249, + "step": 992 + }, + { + "epoch": 9.640776699029127, + "grad_norm": 0.8118094205856323, + "learning_rate": 1.4162721966075323e-06, + "loss": 0.0362, + "step": 993 + }, + { + "epoch": 9.650485436893204, + "grad_norm": 0.3614587187767029, + "learning_rate": 1.411693419423078e-06, + "loss": 0.0181, + "step": 994 + }, + { + "epoch": 9.660194174757281, + "grad_norm": 0.6472867131233215, + "learning_rate": 1.4071191420455873e-06, + "loss": 0.0171, + "step": 995 + }, + { + "epoch": 9.669902912621358, + "grad_norm": 0.46341660618782043, + "learning_rate": 1.4025493833882645e-06, + "loss": 0.0078, + "step": 996 + }, + { + "epoch": 9.679611650485437, + "grad_norm": 0.46548253297805786, + "learning_rate": 1.3979841623456309e-06, + "loss": 0.0041, + "step": 997 + }, + { + "epoch": 9.689320388349515, + "grad_norm": 1.1735632419586182, + "learning_rate": 1.3934234977934463e-06, + "loss": 0.0297, + "step": 998 + }, + { + "epoch": 9.699029126213592, + "grad_norm": 0.49554625153541565, + "learning_rate": 1.3888674085886302e-06, + "loss": 0.0159, + "step": 999 + }, + { + "epoch": 9.70873786407767, + "grad_norm": 0.47762471437454224, + "learning_rate": 1.3843159135691859e-06, + "loss": 0.0232, + "step": 1000 + }, + { + "epoch": 9.718446601941748, + "grad_norm": 0.4060843586921692, + "learning_rate": 1.3797690315541193e-06, + "loss": 0.0168, + "step": 1001 + }, + { + "epoch": 9.728155339805825, + "grad_norm": 0.7846500873565674, + "learning_rate": 1.3752267813433645e-06, + "loss": 0.0289, + "step": 1002 + }, + { + "epoch": 9.737864077669903, + "grad_norm": 0.39978328347206116, + "learning_rate": 1.3706891817177036e-06, + "loss": 0.0081, + "step": 1003 + }, + { + "epoch": 9.74757281553398, + "grad_norm": 0.6862725019454956, + "learning_rate": 1.3661562514386895e-06, + "loss": 0.061, + "step": 1004 + }, + { + "epoch": 9.757281553398059, + "grad_norm": 0.9297090768814087, + "learning_rate": 1.3616280092485719e-06, + "loss": 0.0315, + "step": 1005 + }, + { + "epoch": 9.766990291262136, + "grad_norm": 1.188409686088562, + "learning_rate": 1.357104473870213e-06, + "loss": 0.0807, + "step": 1006 + }, + { + "epoch": 9.776699029126213, + "grad_norm": 0.6750743985176086, + "learning_rate": 1.3525856640070156e-06, + "loss": 0.0165, + "step": 1007 + }, + { + "epoch": 9.78640776699029, + "grad_norm": 0.6655329465866089, + "learning_rate": 1.3480715983428433e-06, + "loss": 0.0164, + "step": 1008 + }, + { + "epoch": 9.79611650485437, + "grad_norm": 0.8620071411132812, + "learning_rate": 1.3435622955419447e-06, + "loss": 0.0428, + "step": 1009 + }, + { + "epoch": 9.805825242718447, + "grad_norm": 0.4294649064540863, + "learning_rate": 1.3390577742488747e-06, + "loss": 0.0133, + "step": 1010 + }, + { + "epoch": 9.815533980582524, + "grad_norm": 1.072709321975708, + "learning_rate": 1.334558053088419e-06, + "loss": 0.0211, + "step": 1011 + }, + { + "epoch": 9.825242718446601, + "grad_norm": 0.9287241101264954, + "learning_rate": 1.3300631506655148e-06, + "loss": 0.0255, + "step": 1012 + }, + { + "epoch": 9.83495145631068, + "grad_norm": 1.010530948638916, + "learning_rate": 1.3255730855651772e-06, + "loss": 0.0495, + "step": 1013 + }, + { + "epoch": 9.844660194174757, + "grad_norm": 0.4488737881183624, + "learning_rate": 1.3210878763524186e-06, + "loss": 0.0232, + "step": 1014 + }, + { + "epoch": 9.854368932038835, + "grad_norm": 1.137284755706787, + "learning_rate": 1.3166075415721762e-06, + "loss": 0.0491, + "step": 1015 + }, + { + "epoch": 9.864077669902912, + "grad_norm": 0.5317333936691284, + "learning_rate": 1.3121320997492305e-06, + "loss": 0.014, + "step": 1016 + }, + { + "epoch": 9.87378640776699, + "grad_norm": 0.8078583478927612, + "learning_rate": 1.307661569388132e-06, + "loss": 0.0242, + "step": 1017 + }, + { + "epoch": 9.883495145631068, + "grad_norm": 0.6492327451705933, + "learning_rate": 1.3031959689731236e-06, + "loss": 0.0162, + "step": 1018 + }, + { + "epoch": 9.893203883495145, + "grad_norm": 0.6473954916000366, + "learning_rate": 1.2987353169680667e-06, + "loss": 0.0526, + "step": 1019 + }, + { + "epoch": 9.902912621359224, + "grad_norm": 0.710628092288971, + "learning_rate": 1.2942796318163595e-06, + "loss": 0.015, + "step": 1020 + }, + { + "epoch": 9.912621359223301, + "grad_norm": 0.4777820408344269, + "learning_rate": 1.2898289319408653e-06, + "loss": 0.0178, + "step": 1021 + }, + { + "epoch": 9.922330097087379, + "grad_norm": 0.9042239189147949, + "learning_rate": 1.2853832357438346e-06, + "loss": 0.0242, + "step": 1022 + }, + { + "epoch": 9.932038834951456, + "grad_norm": 0.5531700849533081, + "learning_rate": 1.2809425616068288e-06, + "loss": 0.0339, + "step": 1023 + }, + { + "epoch": 9.941747572815533, + "grad_norm": 1.1424311399459839, + "learning_rate": 1.2765069278906456e-06, + "loss": 0.0472, + "step": 1024 + }, + { + "epoch": 9.951456310679612, + "grad_norm": 0.8695238828659058, + "learning_rate": 1.2720763529352415e-06, + "loss": 0.0071, + "step": 1025 + }, + { + "epoch": 9.96116504854369, + "grad_norm": 0.4688761234283447, + "learning_rate": 1.2676508550596562e-06, + "loss": 0.0061, + "step": 1026 + }, + { + "epoch": 9.970873786407767, + "grad_norm": 0.5627487301826477, + "learning_rate": 1.2632304525619388e-06, + "loss": 0.011, + "step": 1027 + }, + { + "epoch": 9.980582524271846, + "grad_norm": 1.3065028190612793, + "learning_rate": 1.2588151637190687e-06, + "loss": 0.0695, + "step": 1028 + }, + { + "epoch": 9.990291262135923, + "grad_norm": 0.5717766284942627, + "learning_rate": 1.2544050067868834e-06, + "loss": 0.01, + "step": 1029 + }, + { + "epoch": 10.0, + "grad_norm": 0.7852147817611694, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.0306, + "step": 1030 + }, + { + "epoch": 10.009708737864077, + "grad_norm": 0.40971171855926514, + "learning_rate": 1.2456001615717445e-06, + "loss": 0.0079, + "step": 1031 + }, + { + "epoch": 10.019417475728156, + "grad_norm": 0.39056044816970825, + "learning_rate": 1.2412055096940692e-06, + "loss": 0.0129, + "step": 1032 + }, + { + "epoch": 10.029126213592233, + "grad_norm": 0.9858739376068115, + "learning_rate": 1.2368160625374835e-06, + "loss": 0.0261, + "step": 1033 + }, + { + "epoch": 10.03883495145631, + "grad_norm": 0.2877596616744995, + "learning_rate": 1.2324318382509787e-06, + "loss": 0.0103, + "step": 1034 + }, + { + "epoch": 10.048543689320388, + "grad_norm": 0.29269400238990784, + "learning_rate": 1.2280528549619487e-06, + "loss": 0.0041, + "step": 1035 + }, + { + "epoch": 10.058252427184467, + "grad_norm": 0.5697181820869446, + "learning_rate": 1.2236791307761184e-06, + "loss": 0.0243, + "step": 1036 + }, + { + "epoch": 10.067961165048544, + "grad_norm": 0.6563681364059448, + "learning_rate": 1.2193106837774678e-06, + "loss": 0.0055, + "step": 1037 + }, + { + "epoch": 10.077669902912621, + "grad_norm": 0.33351874351501465, + "learning_rate": 1.2149475320281578e-06, + "loss": 0.0033, + "step": 1038 + }, + { + "epoch": 10.087378640776699, + "grad_norm": 0.906010091304779, + "learning_rate": 1.2105896935684545e-06, + "loss": 0.016, + "step": 1039 + }, + { + "epoch": 10.097087378640778, + "grad_norm": 0.5395269393920898, + "learning_rate": 1.2062371864166553e-06, + "loss": 0.0265, + "step": 1040 + }, + { + "epoch": 10.106796116504855, + "grad_norm": 0.5314574837684631, + "learning_rate": 1.2018900285690148e-06, + "loss": 0.0065, + "step": 1041 + }, + { + "epoch": 10.116504854368932, + "grad_norm": 0.41846874356269836, + "learning_rate": 1.1975482379996697e-06, + "loss": 0.0175, + "step": 1042 + }, + { + "epoch": 10.12621359223301, + "grad_norm": 0.6140009760856628, + "learning_rate": 1.1932118326605644e-06, + "loss": 0.0279, + "step": 1043 + }, + { + "epoch": 10.135922330097088, + "grad_norm": 0.260271281003952, + "learning_rate": 1.188880830481377e-06, + "loss": 0.0017, + "step": 1044 + }, + { + "epoch": 10.145631067961165, + "grad_norm": 0.30545926094055176, + "learning_rate": 1.1845552493694462e-06, + "loss": 0.0069, + "step": 1045 + }, + { + "epoch": 10.155339805825243, + "grad_norm": 0.42265552282333374, + "learning_rate": 1.1802351072096948e-06, + "loss": 0.0043, + "step": 1046 + }, + { + "epoch": 10.16504854368932, + "grad_norm": 0.24939106404781342, + "learning_rate": 1.1759204218645577e-06, + "loss": 0.0037, + "step": 1047 + }, + { + "epoch": 10.174757281553399, + "grad_norm": 0.4412282705307007, + "learning_rate": 1.1716112111739095e-06, + "loss": 0.0057, + "step": 1048 + }, + { + "epoch": 10.184466019417476, + "grad_norm": 1.1326662302017212, + "learning_rate": 1.167307492954986e-06, + "loss": 0.036, + "step": 1049 + }, + { + "epoch": 10.194174757281553, + "grad_norm": 0.5459797382354736, + "learning_rate": 1.1630092850023148e-06, + "loss": 0.0164, + "step": 1050 + }, + { + "epoch": 10.20388349514563, + "grad_norm": 1.0390825271606445, + "learning_rate": 1.15871660508764e-06, + "loss": 0.0239, + "step": 1051 + }, + { + "epoch": 10.21359223300971, + "grad_norm": 0.6237937211990356, + "learning_rate": 1.1544294709598491e-06, + "loss": 0.0182, + "step": 1052 + }, + { + "epoch": 10.223300970873787, + "grad_norm": 0.4918019473552704, + "learning_rate": 1.1501479003448992e-06, + "loss": 0.0145, + "step": 1053 + }, + { + "epoch": 10.233009708737864, + "grad_norm": 0.3002013564109802, + "learning_rate": 1.1458719109457445e-06, + "loss": 0.0049, + "step": 1054 + }, + { + "epoch": 10.242718446601941, + "grad_norm": 0.6791167259216309, + "learning_rate": 1.141601520442262e-06, + "loss": 0.0175, + "step": 1055 + }, + { + "epoch": 10.25242718446602, + "grad_norm": 0.25156375765800476, + "learning_rate": 1.1373367464911798e-06, + "loss": 0.0045, + "step": 1056 + }, + { + "epoch": 10.262135922330097, + "grad_norm": 0.5771428346633911, + "learning_rate": 1.1330776067260026e-06, + "loss": 0.0454, + "step": 1057 + }, + { + "epoch": 10.271844660194175, + "grad_norm": 0.40346211194992065, + "learning_rate": 1.12882411875694e-06, + "loss": 0.0025, + "step": 1058 + }, + { + "epoch": 10.281553398058252, + "grad_norm": 0.2417290210723877, + "learning_rate": 1.1245763001708326e-06, + "loss": 0.0032, + "step": 1059 + }, + { + "epoch": 10.29126213592233, + "grad_norm": 0.37248948216438293, + "learning_rate": 1.120334168531081e-06, + "loss": 0.0069, + "step": 1060 + }, + { + "epoch": 10.300970873786408, + "grad_norm": 0.6066130995750427, + "learning_rate": 1.1160977413775704e-06, + "loss": 0.0322, + "step": 1061 + }, + { + "epoch": 10.310679611650485, + "grad_norm": 0.22785338759422302, + "learning_rate": 1.1118670362266003e-06, + "loss": 0.0068, + "step": 1062 + }, + { + "epoch": 10.320388349514563, + "grad_norm": 0.4489402174949646, + "learning_rate": 1.1076420705708137e-06, + "loss": 0.0047, + "step": 1063 + }, + { + "epoch": 10.330097087378642, + "grad_norm": 0.37608322501182556, + "learning_rate": 1.1034228618791197e-06, + "loss": 0.0007, + "step": 1064 + }, + { + "epoch": 10.339805825242719, + "grad_norm": 0.34204986691474915, + "learning_rate": 1.0992094275966256e-06, + "loss": 0.0058, + "step": 1065 + }, + { + "epoch": 10.349514563106796, + "grad_norm": 0.7375351190567017, + "learning_rate": 1.0950017851445624e-06, + "loss": 0.0098, + "step": 1066 + }, + { + "epoch": 10.359223300970873, + "grad_norm": 0.7323979735374451, + "learning_rate": 1.0907999519202142e-06, + "loss": 0.0252, + "step": 1067 + }, + { + "epoch": 10.368932038834952, + "grad_norm": 0.549560546875, + "learning_rate": 1.0866039452968464e-06, + "loss": 0.0105, + "step": 1068 + }, + { + "epoch": 10.37864077669903, + "grad_norm": 0.565077006816864, + "learning_rate": 1.0824137826236318e-06, + "loss": 0.0008, + "step": 1069 + }, + { + "epoch": 10.388349514563107, + "grad_norm": 0.17279475927352905, + "learning_rate": 1.078229481225582e-06, + "loss": 0.0021, + "step": 1070 + }, + { + "epoch": 10.398058252427184, + "grad_norm": 0.5564970374107361, + "learning_rate": 1.074051058403472e-06, + "loss": 0.0166, + "step": 1071 + }, + { + "epoch": 10.407766990291263, + "grad_norm": 0.49482810497283936, + "learning_rate": 1.069878531433773e-06, + "loss": 0.0057, + "step": 1072 + }, + { + "epoch": 10.41747572815534, + "grad_norm": 0.3255504369735718, + "learning_rate": 1.0657119175685776e-06, + "loss": 0.0053, + "step": 1073 + }, + { + "epoch": 10.427184466019417, + "grad_norm": 0.18100124597549438, + "learning_rate": 1.061551234035529e-06, + "loss": 0.0015, + "step": 1074 + }, + { + "epoch": 10.436893203883495, + "grad_norm": 0.6678428649902344, + "learning_rate": 1.0573964980377517e-06, + "loss": 0.0254, + "step": 1075 + }, + { + "epoch": 10.446601941747574, + "grad_norm": 0.5514142513275146, + "learning_rate": 1.0532477267537772e-06, + "loss": 0.0571, + "step": 1076 + }, + { + "epoch": 10.45631067961165, + "grad_norm": 0.6988746523857117, + "learning_rate": 1.0491049373374762e-06, + "loss": 0.0575, + "step": 1077 + }, + { + "epoch": 10.466019417475728, + "grad_norm": 0.6181220412254333, + "learning_rate": 1.044968146917986e-06, + "loss": 0.0136, + "step": 1078 + }, + { + "epoch": 10.475728155339805, + "grad_norm": 0.2651762068271637, + "learning_rate": 1.0408373725996386e-06, + "loss": 0.0024, + "step": 1079 + }, + { + "epoch": 10.485436893203884, + "grad_norm": 0.5839412212371826, + "learning_rate": 1.0367126314618946e-06, + "loss": 0.0071, + "step": 1080 + }, + { + "epoch": 10.495145631067961, + "grad_norm": 0.629555881023407, + "learning_rate": 1.0325939405592661e-06, + "loss": 0.023, + "step": 1081 + }, + { + "epoch": 10.504854368932039, + "grad_norm": 0.2746008038520813, + "learning_rate": 1.0284813169212502e-06, + "loss": 0.0038, + "step": 1082 + }, + { + "epoch": 10.514563106796116, + "grad_norm": 0.5150997042655945, + "learning_rate": 1.024374777552258e-06, + "loss": 0.0062, + "step": 1083 + }, + { + "epoch": 10.524271844660195, + "grad_norm": 0.5501676201820374, + "learning_rate": 1.0202743394315444e-06, + "loss": 0.0069, + "step": 1084 + }, + { + "epoch": 10.533980582524272, + "grad_norm": 0.5312268733978271, + "learning_rate": 1.0161800195131372e-06, + "loss": 0.0074, + "step": 1085 + }, + { + "epoch": 10.54368932038835, + "grad_norm": 0.4340658187866211, + "learning_rate": 1.0120918347257669e-06, + "loss": 0.0101, + "step": 1086 + }, + { + "epoch": 10.553398058252426, + "grad_norm": 0.27946993708610535, + "learning_rate": 1.0080098019727979e-06, + "loss": 0.0071, + "step": 1087 + }, + { + "epoch": 10.563106796116505, + "grad_norm": 1.1418503522872925, + "learning_rate": 1.0039339381321572e-06, + "loss": 0.0417, + "step": 1088 + }, + { + "epoch": 10.572815533980583, + "grad_norm": 0.98741614818573, + "learning_rate": 9.998642600562664e-07, + "loss": 0.0409, + "step": 1089 + }, + { + "epoch": 10.58252427184466, + "grad_norm": 0.7151482105255127, + "learning_rate": 9.95800784571969e-07, + "loss": 0.0205, + "step": 1090 + }, + { + "epoch": 10.592233009708737, + "grad_norm": 0.4826299250125885, + "learning_rate": 9.91743528480464e-07, + "loss": 0.0229, + "step": 1091 + }, + { + "epoch": 10.601941747572816, + "grad_norm": 0.7173331379890442, + "learning_rate": 9.876925085572365e-07, + "loss": 0.0098, + "step": 1092 + }, + { + "epoch": 10.611650485436893, + "grad_norm": 0.4329473376274109, + "learning_rate": 9.836477415519843e-07, + "loss": 0.0136, + "step": 1093 + }, + { + "epoch": 10.62135922330097, + "grad_norm": 0.4952574074268341, + "learning_rate": 9.79609244188553e-07, + "loss": 0.0101, + "step": 1094 + }, + { + "epoch": 10.631067961165048, + "grad_norm": 0.3168066442012787, + "learning_rate": 9.755770331648642e-07, + "loss": 0.0132, + "step": 1095 + }, + { + "epoch": 10.640776699029127, + "grad_norm": 0.4660727381706238, + "learning_rate": 9.715511251528486e-07, + "loss": 0.0275, + "step": 1096 + }, + { + "epoch": 10.650485436893204, + "grad_norm": 0.6679536700248718, + "learning_rate": 9.67531536798375e-07, + "loss": 0.0236, + "step": 1097 + }, + { + "epoch": 10.660194174757281, + "grad_norm": 0.6162199378013611, + "learning_rate": 9.635182847211827e-07, + "loss": 0.0219, + "step": 1098 + }, + { + "epoch": 10.669902912621358, + "grad_norm": 0.9909278154373169, + "learning_rate": 9.595113855148128e-07, + "loss": 0.0132, + "step": 1099 + }, + { + "epoch": 10.679611650485437, + "grad_norm": 1.3139898777008057, + "learning_rate": 9.555108557465383e-07, + "loss": 0.0774, + "step": 1100 + }, + { + "epoch": 10.689320388349515, + "grad_norm": 0.6796632409095764, + "learning_rate": 9.51516711957298e-07, + "loss": 0.0432, + "step": 1101 + }, + { + "epoch": 10.699029126213592, + "grad_norm": 0.4316011965274811, + "learning_rate": 9.475289706616256e-07, + "loss": 0.0149, + "step": 1102 + }, + { + "epoch": 10.70873786407767, + "grad_norm": 0.5164605975151062, + "learning_rate": 9.435476483475825e-07, + "loss": 0.0199, + "step": 1103 + }, + { + "epoch": 10.718446601941748, + "grad_norm": 0.45947888493537903, + "learning_rate": 9.395727614766903e-07, + "loss": 0.0142, + "step": 1104 + }, + { + "epoch": 10.728155339805825, + "grad_norm": 0.651980459690094, + "learning_rate": 9.356043264838607e-07, + "loss": 0.0112, + "step": 1105 + }, + { + "epoch": 10.737864077669903, + "grad_norm": 0.3955351710319519, + "learning_rate": 9.316423597773316e-07, + "loss": 0.004, + "step": 1106 + }, + { + "epoch": 10.74757281553398, + "grad_norm": 0.6277558207511902, + "learning_rate": 9.276868777385942e-07, + "loss": 0.0408, + "step": 1107 + }, + { + "epoch": 10.757281553398059, + "grad_norm": 0.4933006465435028, + "learning_rate": 9.237378967223279e-07, + "loss": 0.0151, + "step": 1108 + }, + { + "epoch": 10.766990291262136, + "grad_norm": 0.590498149394989, + "learning_rate": 9.197954330563327e-07, + "loss": 0.0178, + "step": 1109 + }, + { + "epoch": 10.776699029126213, + "grad_norm": 0.2939163148403168, + "learning_rate": 9.158595030414621e-07, + "loss": 0.0114, + "step": 1110 + }, + { + "epoch": 10.78640776699029, + "grad_norm": 0.6207811832427979, + "learning_rate": 9.11930122951554e-07, + "loss": 0.0099, + "step": 1111 + }, + { + "epoch": 10.79611650485437, + "grad_norm": 0.6594740152359009, + "learning_rate": 9.080073090333646e-07, + "loss": 0.0104, + "step": 1112 + }, + { + "epoch": 10.805825242718447, + "grad_norm": 0.7800750732421875, + "learning_rate": 9.040910775065015e-07, + "loss": 0.0211, + "step": 1113 + }, + { + "epoch": 10.815533980582524, + "grad_norm": 0.402415007352829, + "learning_rate": 9.001814445633558e-07, + "loss": 0.004, + "step": 1114 + }, + { + "epoch": 10.825242718446601, + "grad_norm": 0.7148779630661011, + "learning_rate": 8.962784263690358e-07, + "loss": 0.0198, + "step": 1115 + }, + { + "epoch": 10.83495145631068, + "grad_norm": 0.647210419178009, + "learning_rate": 8.923820390612991e-07, + "loss": 0.0121, + "step": 1116 + }, + { + "epoch": 10.844660194174757, + "grad_norm": 0.512809157371521, + "learning_rate": 8.884922987504882e-07, + "loss": 0.0097, + "step": 1117 + }, + { + "epoch": 10.854368932038835, + "grad_norm": 0.6944689750671387, + "learning_rate": 8.846092215194607e-07, + "loss": 0.0308, + "step": 1118 + }, + { + "epoch": 10.864077669902912, + "grad_norm": 0.5818324685096741, + "learning_rate": 8.807328234235254e-07, + "loss": 0.0058, + "step": 1119 + }, + { + "epoch": 10.87378640776699, + "grad_norm": 0.8093370795249939, + "learning_rate": 8.768631204903738e-07, + "loss": 0.024, + "step": 1120 + }, + { + "epoch": 10.883495145631068, + "grad_norm": 0.8604028820991516, + "learning_rate": 8.730001287200177e-07, + "loss": 0.0256, + "step": 1121 + }, + { + "epoch": 10.893203883495145, + "grad_norm": 0.33591437339782715, + "learning_rate": 8.691438640847177e-07, + "loss": 0.005, + "step": 1122 + }, + { + "epoch": 10.902912621359224, + "grad_norm": 1.100368618965149, + "learning_rate": 8.652943425289206e-07, + "loss": 0.026, + "step": 1123 + }, + { + "epoch": 10.912621359223301, + "grad_norm": 0.29850178956985474, + "learning_rate": 8.61451579969193e-07, + "loss": 0.0122, + "step": 1124 + }, + { + "epoch": 10.922330097087379, + "grad_norm": 0.5824030041694641, + "learning_rate": 8.576155922941548e-07, + "loss": 0.0179, + "step": 1125 + }, + { + "epoch": 10.932038834951456, + "grad_norm": 0.3939349949359894, + "learning_rate": 8.537863953644138e-07, + "loss": 0.0134, + "step": 1126 + }, + { + "epoch": 10.941747572815533, + "grad_norm": 0.5187537670135498, + "learning_rate": 8.499640050125007e-07, + "loss": 0.0127, + "step": 1127 + }, + { + "epoch": 10.951456310679612, + "grad_norm": 0.5530067086219788, + "learning_rate": 8.461484370428025e-07, + "loss": 0.0273, + "step": 1128 + }, + { + "epoch": 10.96116504854369, + "grad_norm": 0.8861692547798157, + "learning_rate": 8.423397072314985e-07, + "loss": 0.0151, + "step": 1129 + }, + { + "epoch": 10.970873786407767, + "grad_norm": 0.39885714650154114, + "learning_rate": 8.385378313264933e-07, + "loss": 0.0088, + "step": 1130 + }, + { + "epoch": 10.980582524271846, + "grad_norm": 0.6531423330307007, + "learning_rate": 8.347428250473541e-07, + "loss": 0.007, + "step": 1131 + }, + { + "epoch": 10.990291262135923, + "grad_norm": 0.36868107318878174, + "learning_rate": 8.309547040852434e-07, + "loss": 0.0031, + "step": 1132 + }, + { + "epoch": 11.0, + "grad_norm": 0.3329010307788849, + "learning_rate": 8.271734841028553e-07, + "loss": 0.0075, + "step": 1133 + }, + { + "epoch": 11.009708737864077, + "grad_norm": 0.30606314539909363, + "learning_rate": 8.233991807343497e-07, + "loss": 0.0086, + "step": 1134 + }, + { + "epoch": 11.019417475728156, + "grad_norm": 0.2135513722896576, + "learning_rate": 8.196318095852909e-07, + "loss": 0.0026, + "step": 1135 + }, + { + "epoch": 11.029126213592233, + "grad_norm": 0.2945385277271271, + "learning_rate": 8.158713862325782e-07, + "loss": 0.0131, + "step": 1136 + }, + { + "epoch": 11.03883495145631, + "grad_norm": 0.2576274275779724, + "learning_rate": 8.12117926224385e-07, + "loss": 0.0038, + "step": 1137 + }, + { + "epoch": 11.048543689320388, + "grad_norm": 0.1427377313375473, + "learning_rate": 8.08371445080093e-07, + "loss": 0.0027, + "step": 1138 + }, + { + "epoch": 11.058252427184467, + "grad_norm": 0.20165517926216125, + "learning_rate": 8.04631958290229e-07, + "loss": 0.0017, + "step": 1139 + }, + { + "epoch": 11.067961165048544, + "grad_norm": 0.5164278149604797, + "learning_rate": 8.008994813163995e-07, + "loss": 0.0131, + "step": 1140 + }, + { + "epoch": 11.077669902912621, + "grad_norm": 0.18127082288265228, + "learning_rate": 7.971740295912289e-07, + "loss": 0.0053, + "step": 1141 + }, + { + "epoch": 11.087378640776699, + "grad_norm": 0.21812501549720764, + "learning_rate": 7.934556185182928e-07, + "loss": 0.0052, + "step": 1142 + }, + { + "epoch": 11.097087378640778, + "grad_norm": 0.2931056320667267, + "learning_rate": 7.897442634720576e-07, + "loss": 0.0077, + "step": 1143 + }, + { + "epoch": 11.106796116504855, + "grad_norm": 0.7248398065567017, + "learning_rate": 7.860399797978138e-07, + "loss": 0.0627, + "step": 1144 + }, + { + "epoch": 11.116504854368932, + "grad_norm": 0.37893545627593994, + "learning_rate": 7.823427828116148e-07, + "loss": 0.0108, + "step": 1145 + }, + { + "epoch": 11.12621359223301, + "grad_norm": 0.2130487859249115, + "learning_rate": 7.786526878002126e-07, + "loss": 0.0019, + "step": 1146 + }, + { + "epoch": 11.135922330097088, + "grad_norm": 0.3752915859222412, + "learning_rate": 7.749697100209947e-07, + "loss": 0.0082, + "step": 1147 + }, + { + "epoch": 11.145631067961165, + "grad_norm": 0.27648448944091797, + "learning_rate": 7.7129386470192e-07, + "loss": 0.0031, + "step": 1148 + }, + { + "epoch": 11.155339805825243, + "grad_norm": 0.3961423933506012, + "learning_rate": 7.6762516704146e-07, + "loss": 0.0107, + "step": 1149 + }, + { + "epoch": 11.16504854368932, + "grad_norm": 0.5809851288795471, + "learning_rate": 7.6396363220853e-07, + "loss": 0.0158, + "step": 1150 + }, + { + "epoch": 11.174757281553399, + "grad_norm": 0.3201679587364197, + "learning_rate": 7.603092753424298e-07, + "loss": 0.0087, + "step": 1151 + }, + { + "epoch": 11.184466019417476, + "grad_norm": 0.388967365026474, + "learning_rate": 7.566621115527811e-07, + "loss": 0.0027, + "step": 1152 + }, + { + "epoch": 11.194174757281553, + "grad_norm": 0.22992132604122162, + "learning_rate": 7.530221559194643e-07, + "loss": 0.0016, + "step": 1153 + }, + { + "epoch": 11.20388349514563, + "grad_norm": 0.5471656322479248, + "learning_rate": 7.493894234925558e-07, + "loss": 0.0028, + "step": 1154 + }, + { + "epoch": 11.21359223300971, + "grad_norm": 0.40595000982284546, + "learning_rate": 7.457639292922675e-07, + "loss": 0.0142, + "step": 1155 + }, + { + "epoch": 11.223300970873787, + "grad_norm": 0.19051553308963776, + "learning_rate": 7.421456883088826e-07, + "loss": 0.0008, + "step": 1156 + }, + { + "epoch": 11.233009708737864, + "grad_norm": 0.3705807030200958, + "learning_rate": 7.385347155026934e-07, + "loss": 0.0096, + "step": 1157 + }, + { + "epoch": 11.242718446601941, + "grad_norm": 0.5367113947868347, + "learning_rate": 7.349310258039441e-07, + "loss": 0.0106, + "step": 1158 + }, + { + "epoch": 11.25242718446602, + "grad_norm": 0.7418363094329834, + "learning_rate": 7.31334634112762e-07, + "loss": 0.0365, + "step": 1159 + }, + { + "epoch": 11.262135922330097, + "grad_norm": 0.6008816957473755, + "learning_rate": 7.277455552991011e-07, + "loss": 0.0241, + "step": 1160 + }, + { + "epoch": 11.271844660194175, + "grad_norm": 0.24267537891864777, + "learning_rate": 7.241638042026783e-07, + "loss": 0.0045, + "step": 1161 + }, + { + "epoch": 11.281553398058252, + "grad_norm": 0.17620030045509338, + "learning_rate": 7.20589395632913e-07, + "loss": 0.0041, + "step": 1162 + }, + { + "epoch": 11.29126213592233, + "grad_norm": 0.7443981170654297, + "learning_rate": 7.170223443688654e-07, + "loss": 0.0167, + "step": 1163 + }, + { + "epoch": 11.300970873786408, + "grad_norm": 0.4085586369037628, + "learning_rate": 7.134626651591758e-07, + "loss": 0.0103, + "step": 1164 + }, + { + "epoch": 11.310679611650485, + "grad_norm": 0.10864268243312836, + "learning_rate": 7.099103727220024e-07, + "loss": 0.0017, + "step": 1165 + }, + { + "epoch": 11.320388349514563, + "grad_norm": 0.9929929375648499, + "learning_rate": 7.063654817449638e-07, + "loss": 0.0271, + "step": 1166 + }, + { + "epoch": 11.330097087378642, + "grad_norm": 0.14670035243034363, + "learning_rate": 7.028280068850734e-07, + "loss": 0.0018, + "step": 1167 + }, + { + "epoch": 11.339805825242719, + "grad_norm": 0.8203486204147339, + "learning_rate": 6.992979627686821e-07, + "loss": 0.0103, + "step": 1168 + }, + { + "epoch": 11.349514563106796, + "grad_norm": 0.5324729681015015, + "learning_rate": 6.957753639914175e-07, + "loss": 0.0137, + "step": 1169 + }, + { + "epoch": 11.359223300970873, + "grad_norm": 0.31972795724868774, + "learning_rate": 6.922602251181221e-07, + "loss": 0.0054, + "step": 1170 + }, + { + "epoch": 11.368932038834952, + "grad_norm": 0.8650928139686584, + "learning_rate": 6.887525606827947e-07, + "loss": 0.0189, + "step": 1171 + }, + { + "epoch": 11.37864077669903, + "grad_norm": 0.09976081550121307, + "learning_rate": 6.852523851885295e-07, + "loss": 0.0013, + "step": 1172 + }, + { + "epoch": 11.388349514563107, + "grad_norm": 0.3876700699329376, + "learning_rate": 6.817597131074566e-07, + "loss": 0.0032, + "step": 1173 + }, + { + "epoch": 11.398058252427184, + "grad_norm": 0.2802983820438385, + "learning_rate": 6.782745588806811e-07, + "loss": 0.0039, + "step": 1174 + }, + { + "epoch": 11.407766990291263, + "grad_norm": 0.2554693818092346, + "learning_rate": 6.747969369182248e-07, + "loss": 0.0071, + "step": 1175 + }, + { + "epoch": 11.41747572815534, + "grad_norm": 0.6343085169792175, + "learning_rate": 6.713268615989654e-07, + "loss": 0.0075, + "step": 1176 + }, + { + "epoch": 11.427184466019417, + "grad_norm": 0.46486198902130127, + "learning_rate": 6.678643472705773e-07, + "loss": 0.0209, + "step": 1177 + }, + { + "epoch": 11.436893203883495, + "grad_norm": 0.12777316570281982, + "learning_rate": 6.644094082494746e-07, + "loss": 0.0014, + "step": 1178 + }, + { + "epoch": 11.446601941747574, + "grad_norm": 0.43883708119392395, + "learning_rate": 6.609620588207474e-07, + "loss": 0.0191, + "step": 1179 + }, + { + "epoch": 11.45631067961165, + "grad_norm": 0.40817007422447205, + "learning_rate": 6.575223132381067e-07, + "loss": 0.0101, + "step": 1180 + }, + { + "epoch": 11.466019417475728, + "grad_norm": 0.35821834206581116, + "learning_rate": 6.540901857238233e-07, + "loss": 0.0087, + "step": 1181 + }, + { + "epoch": 11.475728155339805, + "grad_norm": 0.4091426432132721, + "learning_rate": 6.506656904686698e-07, + "loss": 0.0107, + "step": 1182 + }, + { + "epoch": 11.485436893203884, + "grad_norm": 0.40708640217781067, + "learning_rate": 6.472488416318621e-07, + "loss": 0.0104, + "step": 1183 + }, + { + "epoch": 11.495145631067961, + "grad_norm": 0.9649088978767395, + "learning_rate": 6.438396533410002e-07, + "loss": 0.0066, + "step": 1184 + }, + { + "epoch": 11.504854368932039, + "grad_norm": 0.22220736742019653, + "learning_rate": 6.4043813969201e-07, + "loss": 0.0018, + "step": 1185 + }, + { + "epoch": 11.514563106796116, + "grad_norm": 0.9987980723381042, + "learning_rate": 6.370443147490857e-07, + "loss": 0.0144, + "step": 1186 + }, + { + "epoch": 11.524271844660195, + "grad_norm": 0.24521130323410034, + "learning_rate": 6.336581925446309e-07, + "loss": 0.0067, + "step": 1187 + }, + { + "epoch": 11.533980582524272, + "grad_norm": 0.4199708104133606, + "learning_rate": 6.302797870792007e-07, + "loss": 0.0094, + "step": 1188 + }, + { + "epoch": 11.54368932038835, + "grad_norm": 0.2658780813217163, + "learning_rate": 6.269091123214438e-07, + "loss": 0.0038, + "step": 1189 + }, + { + "epoch": 11.553398058252426, + "grad_norm": 0.5286270380020142, + "learning_rate": 6.235461822080449e-07, + "loss": 0.0121, + "step": 1190 + }, + { + "epoch": 11.563106796116505, + "grad_norm": 0.3407003581523895, + "learning_rate": 6.201910106436673e-07, + "loss": 0.0075, + "step": 1191 + }, + { + "epoch": 11.572815533980583, + "grad_norm": 0.36705833673477173, + "learning_rate": 6.168436115008941e-07, + "loss": 0.0117, + "step": 1192 + }, + { + "epoch": 11.58252427184466, + "grad_norm": 0.31512248516082764, + "learning_rate": 6.135039986201744e-07, + "loss": 0.0025, + "step": 1193 + }, + { + "epoch": 11.592233009708737, + "grad_norm": 0.6591824889183044, + "learning_rate": 6.101721858097606e-07, + "loss": 0.0171, + "step": 1194 + }, + { + "epoch": 11.601941747572816, + "grad_norm": 0.27943211793899536, + "learning_rate": 6.068481868456558e-07, + "loss": 0.0107, + "step": 1195 + }, + { + "epoch": 11.611650485436893, + "grad_norm": 0.17899493873119354, + "learning_rate": 6.035320154715549e-07, + "loss": 0.0048, + "step": 1196 + }, + { + "epoch": 11.62135922330097, + "grad_norm": 0.17442212998867035, + "learning_rate": 6.00223685398788e-07, + "loss": 0.0036, + "step": 1197 + }, + { + "epoch": 11.631067961165048, + "grad_norm": 0.47512590885162354, + "learning_rate": 5.969232103062647e-07, + "loss": 0.0191, + "step": 1198 + }, + { + "epoch": 11.640776699029127, + "grad_norm": 0.12684416770935059, + "learning_rate": 5.936306038404158e-07, + "loss": 0.0005, + "step": 1199 + }, + { + "epoch": 11.650485436893204, + "grad_norm": 0.7837037444114685, + "learning_rate": 5.903458796151382e-07, + "loss": 0.046, + "step": 1200 + }, + { + "epoch": 11.660194174757281, + "grad_norm": 0.18797621130943298, + "learning_rate": 5.870690512117377e-07, + "loss": 0.004, + "step": 1201 + }, + { + "epoch": 11.669902912621358, + "grad_norm": 0.367638498544693, + "learning_rate": 5.838001321788744e-07, + "loss": 0.0032, + "step": 1202 + }, + { + "epoch": 11.679611650485437, + "grad_norm": 0.3879675269126892, + "learning_rate": 5.80539136032505e-07, + "loss": 0.0078, + "step": 1203 + }, + { + "epoch": 11.689320388349515, + "grad_norm": 0.648831307888031, + "learning_rate": 5.772860762558269e-07, + "loss": 0.0085, + "step": 1204 + }, + { + "epoch": 11.699029126213592, + "grad_norm": 0.20488865673542023, + "learning_rate": 5.740409662992244e-07, + "loss": 0.0043, + "step": 1205 + }, + { + "epoch": 11.70873786407767, + "grad_norm": 0.39734843373298645, + "learning_rate": 5.708038195802098e-07, + "loss": 0.0017, + "step": 1206 + }, + { + "epoch": 11.718446601941748, + "grad_norm": 0.42358076572418213, + "learning_rate": 5.675746494833733e-07, + "loss": 0.0043, + "step": 1207 + }, + { + "epoch": 11.728155339805825, + "grad_norm": 0.15531939268112183, + "learning_rate": 5.643534693603214e-07, + "loss": 0.0019, + "step": 1208 + }, + { + "epoch": 11.737864077669903, + "grad_norm": 0.4669681489467621, + "learning_rate": 5.61140292529625e-07, + "loss": 0.0082, + "step": 1209 + }, + { + "epoch": 11.74757281553398, + "grad_norm": 0.3049868047237396, + "learning_rate": 5.579351322767643e-07, + "loss": 0.0045, + "step": 1210 + }, + { + "epoch": 11.757281553398059, + "grad_norm": 0.4822055399417877, + "learning_rate": 5.547380018540735e-07, + "loss": 0.0268, + "step": 1211 + }, + { + "epoch": 11.766990291262136, + "grad_norm": 0.1961973011493683, + "learning_rate": 5.515489144806862e-07, + "loss": 0.0029, + "step": 1212 + }, + { + "epoch": 11.776699029126213, + "grad_norm": 0.39638909697532654, + "learning_rate": 5.483678833424796e-07, + "loss": 0.0104, + "step": 1213 + }, + { + "epoch": 11.78640776699029, + "grad_norm": 0.2661190330982208, + "learning_rate": 5.451949215920221e-07, + "loss": 0.0062, + "step": 1214 + }, + { + "epoch": 11.79611650485437, + "grad_norm": 0.3237748444080353, + "learning_rate": 5.420300423485167e-07, + "loss": 0.0022, + "step": 1215 + }, + { + "epoch": 11.805825242718447, + "grad_norm": 0.37730419635772705, + "learning_rate": 5.38873258697748e-07, + "loss": 0.003, + "step": 1216 + }, + { + "epoch": 11.815533980582524, + "grad_norm": 0.4794592559337616, + "learning_rate": 5.357245836920286e-07, + "loss": 0.0081, + "step": 1217 + }, + { + "epoch": 11.825242718446601, + "grad_norm": 0.3878485858440399, + "learning_rate": 5.325840303501431e-07, + "loss": 0.0062, + "step": 1218 + }, + { + "epoch": 11.83495145631068, + "grad_norm": 0.3394526243209839, + "learning_rate": 5.29451611657297e-07, + "loss": 0.0075, + "step": 1219 + }, + { + "epoch": 11.844660194174757, + "grad_norm": 0.46053069829940796, + "learning_rate": 5.263273405650601e-07, + "loss": 0.0339, + "step": 1220 + }, + { + "epoch": 11.854368932038835, + "grad_norm": 0.3216699957847595, + "learning_rate": 5.232112299913151e-07, + "loss": 0.0056, + "step": 1221 + }, + { + "epoch": 11.864077669902912, + "grad_norm": 0.24092155694961548, + "learning_rate": 5.201032928202043e-07, + "loss": 0.0095, + "step": 1222 + }, + { + "epoch": 11.87378640776699, + "grad_norm": 0.9538231492042542, + "learning_rate": 5.17003541902075e-07, + "loss": 0.0429, + "step": 1223 + }, + { + "epoch": 11.883495145631068, + "grad_norm": 0.4639842212200165, + "learning_rate": 5.139119900534259e-07, + "loss": 0.021, + "step": 1224 + }, + { + "epoch": 11.893203883495145, + "grad_norm": 0.3017670810222626, + "learning_rate": 5.108286500568562e-07, + "loss": 0.0019, + "step": 1225 + }, + { + "epoch": 11.902912621359224, + "grad_norm": 0.4440242350101471, + "learning_rate": 5.077535346610115e-07, + "loss": 0.0061, + "step": 1226 + }, + { + "epoch": 11.912621359223301, + "grad_norm": 0.16742651164531708, + "learning_rate": 5.046866565805311e-07, + "loss": 0.0014, + "step": 1227 + }, + { + "epoch": 11.922330097087379, + "grad_norm": 0.22309471666812897, + "learning_rate": 5.016280284959957e-07, + "loss": 0.0027, + "step": 1228 + }, + { + "epoch": 11.932038834951456, + "grad_norm": 0.39898568391799927, + "learning_rate": 4.985776630538746e-07, + "loss": 0.008, + "step": 1229 + }, + { + "epoch": 11.941747572815533, + "grad_norm": 0.2526729702949524, + "learning_rate": 4.95535572866474e-07, + "loss": 0.0013, + "step": 1230 + }, + { + "epoch": 11.951456310679612, + "grad_norm": 0.1417032927274704, + "learning_rate": 4.925017705118843e-07, + "loss": 0.0014, + "step": 1231 + }, + { + "epoch": 11.96116504854369, + "grad_norm": 0.39468225836753845, + "learning_rate": 4.89476268533928e-07, + "loss": 0.0049, + "step": 1232 + }, + { + "epoch": 11.970873786407767, + "grad_norm": 0.33007320761680603, + "learning_rate": 4.864590794421092e-07, + "loss": 0.0044, + "step": 1233 + }, + { + "epoch": 11.980582524271846, + "grad_norm": 0.4248305857181549, + "learning_rate": 4.834502157115597e-07, + "loss": 0.0226, + "step": 1234 + }, + { + "epoch": 11.990291262135923, + "grad_norm": 0.09050054848194122, + "learning_rate": 4.804496897829883e-07, + "loss": 0.0009, + "step": 1235 + }, + { + "epoch": 12.0, + "grad_norm": 0.3121471107006073, + "learning_rate": 4.774575140626317e-07, + "loss": 0.0067, + "step": 1236 + }, + { + "epoch": 12.009708737864077, + "grad_norm": 0.20965978503227234, + "learning_rate": 4.744737009221986e-07, + "loss": 0.0045, + "step": 1237 + }, + { + "epoch": 12.019417475728156, + "grad_norm": 0.2554788887500763, + "learning_rate": 4.7149826269882294e-07, + "loss": 0.0044, + "step": 1238 + }, + { + "epoch": 12.029126213592233, + "grad_norm": 0.22696569561958313, + "learning_rate": 4.6853121169500914e-07, + "loss": 0.0076, + "step": 1239 + }, + { + "epoch": 12.03883495145631, + "grad_norm": 0.2505325973033905, + "learning_rate": 4.6557256017858485e-07, + "loss": 0.0019, + "step": 1240 + }, + { + "epoch": 12.048543689320388, + "grad_norm": 0.13231754302978516, + "learning_rate": 4.626223203826477e-07, + "loss": 0.0019, + "step": 1241 + }, + { + "epoch": 12.058252427184467, + "grad_norm": 0.15132735669612885, + "learning_rate": 4.5968050450551527e-07, + "loss": 0.0017, + "step": 1242 + }, + { + "epoch": 12.067961165048544, + "grad_norm": 0.16848385334014893, + "learning_rate": 4.56747124710675e-07, + "loss": 0.0042, + "step": 1243 + }, + { + "epoch": 12.077669902912621, + "grad_norm": 0.2351788580417633, + "learning_rate": 4.5382219312673364e-07, + "loss": 0.0087, + "step": 1244 + }, + { + "epoch": 12.087378640776699, + "grad_norm": 0.08185896277427673, + "learning_rate": 4.5090572184736863e-07, + "loss": 0.0013, + "step": 1245 + }, + { + "epoch": 12.097087378640778, + "grad_norm": 0.33229878544807434, + "learning_rate": 4.4799772293127486e-07, + "loss": 0.0087, + "step": 1246 + }, + { + "epoch": 12.106796116504855, + "grad_norm": 0.5459170937538147, + "learning_rate": 4.4509820840211745e-07, + "loss": 0.0075, + "step": 1247 + }, + { + "epoch": 12.116504854368932, + "grad_norm": 0.22844095528125763, + "learning_rate": 4.422071902484812e-07, + "loss": 0.0059, + "step": 1248 + }, + { + "epoch": 12.12621359223301, + "grad_norm": 0.1771807223558426, + "learning_rate": 4.3932468042382075e-07, + "loss": 0.0051, + "step": 1249 + }, + { + "epoch": 12.135922330097088, + "grad_norm": 0.3462653160095215, + "learning_rate": 4.3645069084641195e-07, + "loss": 0.0053, + "step": 1250 + }, + { + "epoch": 12.145631067961165, + "grad_norm": 0.23488548398017883, + "learning_rate": 4.335852333993018e-07, + "loss": 0.0107, + "step": 1251 + }, + { + "epoch": 12.155339805825243, + "grad_norm": 0.20178791880607605, + "learning_rate": 4.3072831993025895e-07, + "loss": 0.0024, + "step": 1252 + }, + { + "epoch": 12.16504854368932, + "grad_norm": 0.28149035573005676, + "learning_rate": 4.278799622517274e-07, + "loss": 0.0042, + "step": 1253 + }, + { + "epoch": 12.174757281553399, + "grad_norm": 0.22097015380859375, + "learning_rate": 4.2504017214077374e-07, + "loss": 0.0053, + "step": 1254 + }, + { + "epoch": 12.184466019417476, + "grad_norm": 0.3525868356227875, + "learning_rate": 4.222089613390412e-07, + "loss": 0.0009, + "step": 1255 + }, + { + "epoch": 12.194174757281553, + "grad_norm": 0.2622714936733246, + "learning_rate": 4.1938634155269944e-07, + "loss": 0.0049, + "step": 1256 + }, + { + "epoch": 12.20388349514563, + "grad_norm": 0.19608217477798462, + "learning_rate": 4.165723244523978e-07, + "loss": 0.0071, + "step": 1257 + }, + { + "epoch": 12.21359223300971, + "grad_norm": 0.22802308201789856, + "learning_rate": 4.1376692167321626e-07, + "loss": 0.0059, + "step": 1258 + }, + { + "epoch": 12.223300970873787, + "grad_norm": 0.20340679585933685, + "learning_rate": 4.109701448146164e-07, + "loss": 0.0042, + "step": 1259 + }, + { + "epoch": 12.233009708737864, + "grad_norm": 0.22457940876483917, + "learning_rate": 4.0818200544039484e-07, + "loss": 0.0045, + "step": 1260 + }, + { + "epoch": 12.242718446601941, + "grad_norm": 0.09476789087057114, + "learning_rate": 4.054025150786356e-07, + "loss": 0.0009, + "step": 1261 + }, + { + "epoch": 12.25242718446602, + "grad_norm": 0.16748803853988647, + "learning_rate": 4.026316852216605e-07, + "loss": 0.0043, + "step": 1262 + }, + { + "epoch": 12.262135922330097, + "grad_norm": 0.1957675963640213, + "learning_rate": 3.998695273259834e-07, + "loss": 0.004, + "step": 1263 + }, + { + "epoch": 12.271844660194175, + "grad_norm": 0.35359010100364685, + "learning_rate": 3.971160528122622e-07, + "loss": 0.0053, + "step": 1264 + }, + { + "epoch": 12.281553398058252, + "grad_norm": 0.36833491921424866, + "learning_rate": 3.9437127306525295e-07, + "loss": 0.0243, + "step": 1265 + }, + { + "epoch": 12.29126213592233, + "grad_norm": 0.13272319734096527, + "learning_rate": 3.9163519943375973e-07, + "loss": 0.003, + "step": 1266 + }, + { + "epoch": 12.300970873786408, + "grad_norm": 0.4329506456851959, + "learning_rate": 3.889078432305904e-07, + "loss": 0.0333, + "step": 1267 + }, + { + "epoch": 12.310679611650485, + "grad_norm": 0.2429269254207611, + "learning_rate": 3.8618921573250896e-07, + "loss": 0.0083, + "step": 1268 + }, + { + "epoch": 12.320388349514563, + "grad_norm": 0.11397936940193176, + "learning_rate": 3.834793281801891e-07, + "loss": 0.0017, + "step": 1269 + }, + { + "epoch": 12.330097087378642, + "grad_norm": 0.6022169589996338, + "learning_rate": 3.8077819177816695e-07, + "loss": 0.0253, + "step": 1270 + }, + { + "epoch": 12.339805825242719, + "grad_norm": 0.1681753695011139, + "learning_rate": 3.780858176947963e-07, + "loss": 0.0033, + "step": 1271 + }, + { + "epoch": 12.349514563106796, + "grad_norm": 0.1316511482000351, + "learning_rate": 3.754022170622007e-07, + "loss": 0.0007, + "step": 1272 + }, + { + "epoch": 12.359223300970873, + "grad_norm": 0.33542078733444214, + "learning_rate": 3.7272740097622884e-07, + "loss": 0.0119, + "step": 1273 + }, + { + "epoch": 12.368932038834952, + "grad_norm": 0.38612493872642517, + "learning_rate": 3.700613804964073e-07, + "loss": 0.0106, + "step": 1274 + }, + { + "epoch": 12.37864077669903, + "grad_norm": 0.5517392754554749, + "learning_rate": 3.6740416664589634e-07, + "loss": 0.0111, + "step": 1275 + }, + { + "epoch": 12.388349514563107, + "grad_norm": 0.3845731317996979, + "learning_rate": 3.6475577041144324e-07, + "loss": 0.0117, + "step": 1276 + }, + { + "epoch": 12.398058252427184, + "grad_norm": 0.04387803003191948, + "learning_rate": 3.6211620274333727e-07, + "loss": 0.0004, + "step": 1277 + }, + { + "epoch": 12.407766990291263, + "grad_norm": 0.5418988466262817, + "learning_rate": 3.594854745553636e-07, + "loss": 0.0065, + "step": 1278 + }, + { + "epoch": 12.41747572815534, + "grad_norm": 0.5132270455360413, + "learning_rate": 3.568635967247605e-07, + "loss": 0.0273, + "step": 1279 + }, + { + "epoch": 12.427184466019417, + "grad_norm": 0.11886625736951828, + "learning_rate": 3.5425058009217193e-07, + "loss": 0.0019, + "step": 1280 + }, + { + "epoch": 12.436893203883495, + "grad_norm": 0.19105517864227295, + "learning_rate": 3.516464354616031e-07, + "loss": 0.0058, + "step": 1281 + }, + { + "epoch": 12.446601941747574, + "grad_norm": 0.457914263010025, + "learning_rate": 3.4905117360037683e-07, + "loss": 0.0115, + "step": 1282 + }, + { + "epoch": 12.45631067961165, + "grad_norm": 0.20160461962223053, + "learning_rate": 3.4646480523908813e-07, + "loss": 0.002, + "step": 1283 + }, + { + "epoch": 12.466019417475728, + "grad_norm": 0.4353931248188019, + "learning_rate": 3.43887341071561e-07, + "loss": 0.039, + "step": 1284 + }, + { + "epoch": 12.475728155339805, + "grad_norm": 0.07777180522680283, + "learning_rate": 3.413187917548019e-07, + "loss": 0.0008, + "step": 1285 + }, + { + "epoch": 12.485436893203884, + "grad_norm": 0.7818756699562073, + "learning_rate": 3.3875916790895883e-07, + "loss": 0.0242, + "step": 1286 + }, + { + "epoch": 12.495145631067961, + "grad_norm": 0.10199937224388123, + "learning_rate": 3.3620848011727437e-07, + "loss": 0.0009, + "step": 1287 + }, + { + "epoch": 12.504854368932039, + "grad_norm": 0.06977365165948868, + "learning_rate": 3.336667389260445e-07, + "loss": 0.0007, + "step": 1288 + }, + { + "epoch": 12.514563106796116, + "grad_norm": 0.11607404798269272, + "learning_rate": 3.311339548445727e-07, + "loss": 0.001, + "step": 1289 + }, + { + "epoch": 12.524271844660195, + "grad_norm": 0.18200449645519257, + "learning_rate": 3.2861013834512844e-07, + "loss": 0.0034, + "step": 1290 + }, + { + "epoch": 12.533980582524272, + "grad_norm": 0.34803321957588196, + "learning_rate": 3.2609529986290246e-07, + "loss": 0.0148, + "step": 1291 + }, + { + "epoch": 12.54368932038835, + "grad_norm": 0.20186111330986023, + "learning_rate": 3.235894497959649e-07, + "loss": 0.002, + "step": 1292 + }, + { + "epoch": 12.553398058252426, + "grad_norm": 0.3056010603904724, + "learning_rate": 3.2109259850522045e-07, + "loss": 0.0032, + "step": 1293 + }, + { + "epoch": 12.563106796116505, + "grad_norm": 0.35836365818977356, + "learning_rate": 3.186047563143685e-07, + "loss": 0.0064, + "step": 1294 + }, + { + "epoch": 12.572815533980583, + "grad_norm": 0.17417111992835999, + "learning_rate": 3.161259335098571e-07, + "loss": 0.0023, + "step": 1295 + }, + { + "epoch": 12.58252427184466, + "grad_norm": 0.06176523491740227, + "learning_rate": 3.1365614034084224e-07, + "loss": 0.0007, + "step": 1296 + }, + { + "epoch": 12.592233009708737, + "grad_norm": 0.1570029854774475, + "learning_rate": 3.111953870191459e-07, + "loss": 0.0025, + "step": 1297 + }, + { + "epoch": 12.601941747572816, + "grad_norm": 0.40762364864349365, + "learning_rate": 3.087436837192118e-07, + "loss": 0.0094, + "step": 1298 + }, + { + "epoch": 12.611650485436893, + "grad_norm": 0.6000169515609741, + "learning_rate": 3.0630104057806616e-07, + "loss": 0.0307, + "step": 1299 + }, + { + "epoch": 12.62135922330097, + "grad_norm": 0.22923287749290466, + "learning_rate": 3.0386746769527323e-07, + "loss": 0.0072, + "step": 1300 + }, + { + "epoch": 12.631067961165048, + "grad_norm": 0.09936048835515976, + "learning_rate": 3.0144297513289483e-07, + "loss": 0.0013, + "step": 1301 + }, + { + "epoch": 12.640776699029127, + "grad_norm": 0.17220628261566162, + "learning_rate": 2.9902757291544905e-07, + "loss": 0.0019, + "step": 1302 + }, + { + "epoch": 12.650485436893204, + "grad_norm": 0.0635104775428772, + "learning_rate": 2.966212710298674e-07, + "loss": 0.0006, + "step": 1303 + }, + { + "epoch": 12.660194174757281, + "grad_norm": 0.25115472078323364, + "learning_rate": 2.94224079425455e-07, + "loss": 0.0026, + "step": 1304 + }, + { + "epoch": 12.669902912621358, + "grad_norm": 0.4713001847267151, + "learning_rate": 2.9183600801384853e-07, + "loss": 0.0014, + "step": 1305 + }, + { + "epoch": 12.679611650485437, + "grad_norm": 0.0625031441450119, + "learning_rate": 2.8945706666897555e-07, + "loss": 0.0005, + "step": 1306 + }, + { + "epoch": 12.689320388349515, + "grad_norm": 0.16515658795833588, + "learning_rate": 2.870872652270129e-07, + "loss": 0.0047, + "step": 1307 + }, + { + "epoch": 12.699029126213592, + "grad_norm": 0.08110034465789795, + "learning_rate": 2.8472661348634883e-07, + "loss": 0.0011, + "step": 1308 + }, + { + "epoch": 12.70873786407767, + "grad_norm": 0.17738069593906403, + "learning_rate": 2.82375121207539e-07, + "loss": 0.0053, + "step": 1309 + }, + { + "epoch": 12.718446601941748, + "grad_norm": 0.18028077483177185, + "learning_rate": 2.8003279811326724e-07, + "loss": 0.0046, + "step": 1310 + }, + { + "epoch": 12.728155339805825, + "grad_norm": 0.07242338359355927, + "learning_rate": 2.776996538883062e-07, + "loss": 0.0005, + "step": 1311 + }, + { + "epoch": 12.737864077669903, + "grad_norm": 0.2609570026397705, + "learning_rate": 2.7537569817947694e-07, + "loss": 0.006, + "step": 1312 + }, + { + "epoch": 12.74757281553398, + "grad_norm": 0.19686202704906464, + "learning_rate": 2.730609405956083e-07, + "loss": 0.0068, + "step": 1313 + }, + { + "epoch": 12.757281553398059, + "grad_norm": 0.20468905568122864, + "learning_rate": 2.707553907074989e-07, + "loss": 0.0063, + "step": 1314 + }, + { + "epoch": 12.766990291262136, + "grad_norm": 0.3390023708343506, + "learning_rate": 2.684590580478749e-07, + "loss": 0.0037, + "step": 1315 + }, + { + "epoch": 12.776699029126213, + "grad_norm": 0.5263711214065552, + "learning_rate": 2.6617195211135343e-07, + "loss": 0.0063, + "step": 1316 + }, + { + "epoch": 12.78640776699029, + "grad_norm": 0.23772577941417694, + "learning_rate": 2.638940823544012e-07, + "loss": 0.0053, + "step": 1317 + }, + { + "epoch": 12.79611650485437, + "grad_norm": 0.12934847176074982, + "learning_rate": 2.6162545819529624e-07, + "loss": 0.0027, + "step": 1318 + }, + { + "epoch": 12.805825242718447, + "grad_norm": 0.19521230459213257, + "learning_rate": 2.593660890140895e-07, + "loss": 0.0051, + "step": 1319 + }, + { + "epoch": 12.815533980582524, + "grad_norm": 0.2085026204586029, + "learning_rate": 2.57115984152565e-07, + "loss": 0.0051, + "step": 1320 + }, + { + "epoch": 12.825242718446601, + "grad_norm": 0.4785703122615814, + "learning_rate": 2.548751529142018e-07, + "loss": 0.0142, + "step": 1321 + }, + { + "epoch": 12.83495145631068, + "grad_norm": 0.47134339809417725, + "learning_rate": 2.526436045641351e-07, + "loss": 0.0128, + "step": 1322 + }, + { + "epoch": 12.844660194174757, + "grad_norm": 0.15493355691432953, + "learning_rate": 2.504213483291193e-07, + "loss": 0.0024, + "step": 1323 + }, + { + "epoch": 12.854368932038835, + "grad_norm": 0.06153065711259842, + "learning_rate": 2.482083933974883e-07, + "loss": 0.0007, + "step": 1324 + }, + { + "epoch": 12.864077669902912, + "grad_norm": 0.2635805606842041, + "learning_rate": 2.4600474891911696e-07, + "loss": 0.0055, + "step": 1325 + }, + { + "epoch": 12.87378640776699, + "grad_norm": 0.16463325917720795, + "learning_rate": 2.43810424005386e-07, + "loss": 0.0043, + "step": 1326 + }, + { + "epoch": 12.883495145631068, + "grad_norm": 0.2017255276441574, + "learning_rate": 2.416254277291416e-07, + "loss": 0.0071, + "step": 1327 + }, + { + "epoch": 12.893203883495145, + "grad_norm": 0.039516631513834, + "learning_rate": 2.3944976912465916e-07, + "loss": 0.0004, + "step": 1328 + }, + { + "epoch": 12.902912621359224, + "grad_norm": 0.19528639316558838, + "learning_rate": 2.3728345718760622e-07, + "loss": 0.0057, + "step": 1329 + }, + { + "epoch": 12.912621359223301, + "grad_norm": 0.38023102283477783, + "learning_rate": 2.3512650087500338e-07, + "loss": 0.018, + "step": 1330 + }, + { + "epoch": 12.922330097087379, + "grad_norm": 0.13987840712070465, + "learning_rate": 2.3297890910519093e-07, + "loss": 0.0019, + "step": 1331 + }, + { + "epoch": 12.932038834951456, + "grad_norm": 0.05007342994213104, + "learning_rate": 2.3084069075778758e-07, + "loss": 0.0004, + "step": 1332 + }, + { + "epoch": 12.941747572815533, + "grad_norm": 0.19607791304588318, + "learning_rate": 2.287118546736572e-07, + "loss": 0.0036, + "step": 1333 + }, + { + "epoch": 12.951456310679612, + "grad_norm": 0.4152860641479492, + "learning_rate": 2.2659240965487023e-07, + "loss": 0.005, + "step": 1334 + }, + { + "epoch": 12.96116504854369, + "grad_norm": 0.31173020601272583, + "learning_rate": 2.2448236446466847e-07, + "loss": 0.0028, + "step": 1335 + }, + { + "epoch": 12.970873786407767, + "grad_norm": 0.31397274136543274, + "learning_rate": 2.2238172782742763e-07, + "loss": 0.014, + "step": 1336 + }, + { + "epoch": 12.980582524271846, + "grad_norm": 0.06522321701049805, + "learning_rate": 2.2029050842862277e-07, + "loss": 0.0008, + "step": 1337 + }, + { + "epoch": 12.990291262135923, + "grad_norm": 0.15653946995735168, + "learning_rate": 2.1820871491479102e-07, + "loss": 0.0027, + "step": 1338 + }, + { + "epoch": 13.0, + "grad_norm": 0.2695692479610443, + "learning_rate": 2.1613635589349756e-07, + "loss": 0.0042, + "step": 1339 + }, + { + "epoch": 13.009708737864077, + "grad_norm": 0.3138914108276367, + "learning_rate": 2.140734399332975e-07, + "loss": 0.0115, + "step": 1340 + }, + { + "epoch": 13.019417475728156, + "grad_norm": 0.2193198800086975, + "learning_rate": 2.1201997556370284e-07, + "loss": 0.0048, + "step": 1341 + }, + { + "epoch": 13.029126213592233, + "grad_norm": 0.08152951300144196, + "learning_rate": 2.0997597127514507e-07, + "loss": 0.0012, + "step": 1342 + }, + { + "epoch": 13.03883495145631, + "grad_norm": 0.37986671924591064, + "learning_rate": 2.079414355189427e-07, + "loss": 0.007, + "step": 1343 + }, + { + "epoch": 13.048543689320388, + "grad_norm": 0.15214113891124725, + "learning_rate": 2.059163767072639e-07, + "loss": 0.0032, + "step": 1344 + }, + { + "epoch": 13.058252427184467, + "grad_norm": 0.09272062033414841, + "learning_rate": 2.0390080321309236e-07, + "loss": 0.0013, + "step": 1345 + }, + { + "epoch": 13.067961165048544, + "grad_norm": 0.02976040169596672, + "learning_rate": 2.01894723370194e-07, + "loss": 0.0003, + "step": 1346 + }, + { + "epoch": 13.077669902912621, + "grad_norm": 0.12307222932577133, + "learning_rate": 1.9989814547308056e-07, + "loss": 0.0024, + "step": 1347 + }, + { + "epoch": 13.087378640776699, + "grad_norm": 0.3986649513244629, + "learning_rate": 1.9791107777697633e-07, + "loss": 0.0171, + "step": 1348 + }, + { + "epoch": 13.097087378640778, + "grad_norm": 0.11371457576751709, + "learning_rate": 1.9593352849778453e-07, + "loss": 0.0014, + "step": 1349 + }, + { + "epoch": 13.106796116504855, + "grad_norm": 0.08083812892436981, + "learning_rate": 1.9396550581205208e-07, + "loss": 0.0011, + "step": 1350 + }, + { + "epoch": 13.116504854368932, + "grad_norm": 0.19208478927612305, + "learning_rate": 1.920070178569361e-07, + "loss": 0.0028, + "step": 1351 + }, + { + "epoch": 13.12621359223301, + "grad_norm": 0.23043976724147797, + "learning_rate": 1.900580727301718e-07, + "loss": 0.0024, + "step": 1352 + }, + { + "epoch": 13.135922330097088, + "grad_norm": 0.1643299162387848, + "learning_rate": 1.8811867849003684e-07, + "loss": 0.0046, + "step": 1353 + }, + { + "epoch": 13.145631067961165, + "grad_norm": 0.18977408111095428, + "learning_rate": 1.8618884315531939e-07, + "loss": 0.0044, + "step": 1354 + }, + { + "epoch": 13.155339805825243, + "grad_norm": 0.39201587438583374, + "learning_rate": 1.8426857470528414e-07, + "loss": 0.025, + "step": 1355 + }, + { + "epoch": 13.16504854368932, + "grad_norm": 0.41091713309288025, + "learning_rate": 1.8235788107963948e-07, + "loss": 0.0184, + "step": 1356 + }, + { + "epoch": 13.174757281553399, + "grad_norm": 0.04530488699674606, + "learning_rate": 1.8045677017850595e-07, + "loss": 0.0004, + "step": 1357 + }, + { + "epoch": 13.184466019417476, + "grad_norm": 0.1892017126083374, + "learning_rate": 1.785652498623816e-07, + "loss": 0.006, + "step": 1358 + }, + { + "epoch": 13.194174757281553, + "grad_norm": 0.26920121908187866, + "learning_rate": 1.7668332795211074e-07, + "loss": 0.0055, + "step": 1359 + }, + { + "epoch": 13.20388349514563, + "grad_norm": 0.08278663456439972, + "learning_rate": 1.7481101222885126e-07, + "loss": 0.0012, + "step": 1360 + }, + { + "epoch": 13.21359223300971, + "grad_norm": 0.17943482100963593, + "learning_rate": 1.7294831043404264e-07, + "loss": 0.0046, + "step": 1361 + }, + { + "epoch": 13.223300970873787, + "grad_norm": 0.28155428171157837, + "learning_rate": 1.7109523026937302e-07, + "loss": 0.0108, + "step": 1362 + }, + { + "epoch": 13.233009708737864, + "grad_norm": 0.14628250896930695, + "learning_rate": 1.6925177939674936e-07, + "loss": 0.0042, + "step": 1363 + }, + { + "epoch": 13.242718446601941, + "grad_norm": 0.21149447560310364, + "learning_rate": 1.6741796543826321e-07, + "loss": 0.0083, + "step": 1364 + }, + { + "epoch": 13.25242718446602, + "grad_norm": 0.17508278787136078, + "learning_rate": 1.6559379597616136e-07, + "loss": 0.0041, + "step": 1365 + }, + { + "epoch": 13.262135922330097, + "grad_norm": 0.2160256952047348, + "learning_rate": 1.6377927855281362e-07, + "loss": 0.0049, + "step": 1366 + }, + { + "epoch": 13.271844660194175, + "grad_norm": 0.7788318991661072, + "learning_rate": 1.6197442067068136e-07, + "loss": 0.0106, + "step": 1367 + }, + { + "epoch": 13.281553398058252, + "grad_norm": 0.1593695729970932, + "learning_rate": 1.6017922979228662e-07, + "loss": 0.0042, + "step": 1368 + }, + { + "epoch": 13.29126213592233, + "grad_norm": 0.13514795899391174, + "learning_rate": 1.5839371334018193e-07, + "loss": 0.0026, + "step": 1369 + }, + { + "epoch": 13.300970873786408, + "grad_norm": 0.19250337779521942, + "learning_rate": 1.5661787869691858e-07, + "loss": 0.0043, + "step": 1370 + }, + { + "epoch": 13.310679611650485, + "grad_norm": 0.18198005855083466, + "learning_rate": 1.5485173320501673e-07, + "loss": 0.0037, + "step": 1371 + }, + { + "epoch": 13.320388349514563, + "grad_norm": 0.18506045639514923, + "learning_rate": 1.5309528416693503e-07, + "loss": 0.0039, + "step": 1372 + }, + { + "epoch": 13.330097087378642, + "grad_norm": 0.09922155737876892, + "learning_rate": 1.513485388450403e-07, + "loss": 0.0013, + "step": 1373 + }, + { + "epoch": 13.339805825242719, + "grad_norm": 0.4548637270927429, + "learning_rate": 1.4961150446157759e-07, + "loss": 0.0123, + "step": 1374 + }, + { + "epoch": 13.349514563106796, + "grad_norm": 0.1478271484375, + "learning_rate": 1.4788418819864037e-07, + "loss": 0.0025, + "step": 1375 + }, + { + "epoch": 13.359223300970873, + "grad_norm": 0.2919103801250458, + "learning_rate": 1.461665971981402e-07, + "loss": 0.0069, + "step": 1376 + }, + { + "epoch": 13.368932038834952, + "grad_norm": 0.0701352208852768, + "learning_rate": 1.444587385617785e-07, + "loss": 0.0008, + "step": 1377 + }, + { + "epoch": 13.37864077669903, + "grad_norm": 0.15089794993400574, + "learning_rate": 1.4276061935101586e-07, + "loss": 0.0031, + "step": 1378 + }, + { + "epoch": 13.388349514563107, + "grad_norm": 0.08464773744344711, + "learning_rate": 1.4107224658704288e-07, + "loss": 0.0009, + "step": 1379 + }, + { + "epoch": 13.398058252427184, + "grad_norm": 0.0651252344250679, + "learning_rate": 1.3939362725075344e-07, + "loss": 0.0006, + "step": 1380 + }, + { + "epoch": 13.407766990291263, + "grad_norm": 0.43149158358573914, + "learning_rate": 1.3772476828271236e-07, + "loss": 0.038, + "step": 1381 + }, + { + "epoch": 13.41747572815534, + "grad_norm": 0.2249184548854828, + "learning_rate": 1.360656765831289e-07, + "loss": 0.0051, + "step": 1382 + }, + { + "epoch": 13.427184466019417, + "grad_norm": 0.13268357515335083, + "learning_rate": 1.3441635901182803e-07, + "loss": 0.0016, + "step": 1383 + }, + { + "epoch": 13.436893203883495, + "grad_norm": 0.10658839344978333, + "learning_rate": 1.3277682238822142e-07, + "loss": 0.0015, + "step": 1384 + }, + { + "epoch": 13.446601941747574, + "grad_norm": 0.1615607887506485, + "learning_rate": 1.3114707349127954e-07, + "loss": 0.0026, + "step": 1385 + }, + { + "epoch": 13.45631067961165, + "grad_norm": 0.1557530164718628, + "learning_rate": 1.2952711905950377e-07, + "loss": 0.003, + "step": 1386 + }, + { + "epoch": 13.466019417475728, + "grad_norm": 0.286628782749176, + "learning_rate": 1.279169657908988e-07, + "loss": 0.0054, + "step": 1387 + }, + { + "epoch": 13.475728155339805, + "grad_norm": 0.12180517613887787, + "learning_rate": 1.263166203429439e-07, + "loss": 0.0018, + "step": 1388 + }, + { + "epoch": 13.485436893203884, + "grad_norm": 0.08528687059879303, + "learning_rate": 1.2472608933256637e-07, + "loss": 0.001, + "step": 1389 + }, + { + "epoch": 13.495145631067961, + "grad_norm": 0.12740404903888702, + "learning_rate": 1.2314537933611425e-07, + "loss": 0.0028, + "step": 1390 + }, + { + "epoch": 13.504854368932039, + "grad_norm": 0.14788556098937988, + "learning_rate": 1.2157449688932872e-07, + "loss": 0.0038, + "step": 1391 + }, + { + "epoch": 13.514563106796116, + "grad_norm": 0.15698152780532837, + "learning_rate": 1.2001344848731612e-07, + "loss": 0.0018, + "step": 1392 + }, + { + "epoch": 13.524271844660195, + "grad_norm": 0.13510096073150635, + "learning_rate": 1.1846224058452316e-07, + "loss": 0.0026, + "step": 1393 + }, + { + "epoch": 13.533980582524272, + "grad_norm": 0.13202345371246338, + "learning_rate": 1.1692087959470882e-07, + "loss": 0.0019, + "step": 1394 + }, + { + "epoch": 13.54368932038835, + "grad_norm": 0.27190133929252625, + "learning_rate": 1.1538937189091825e-07, + "loss": 0.007, + "step": 1395 + }, + { + "epoch": 13.553398058252426, + "grad_norm": 0.1908790022134781, + "learning_rate": 1.1386772380545669e-07, + "loss": 0.0046, + "step": 1396 + }, + { + "epoch": 13.563106796116505, + "grad_norm": 0.17838077247142792, + "learning_rate": 1.1235594162986168e-07, + "loss": 0.0045, + "step": 1397 + }, + { + "epoch": 13.572815533980583, + "grad_norm": 0.23380354046821594, + "learning_rate": 1.1085403161488012e-07, + "loss": 0.0043, + "step": 1398 + }, + { + "epoch": 13.58252427184466, + "grad_norm": 0.04911648854613304, + "learning_rate": 1.09361999970439e-07, + "loss": 0.0005, + "step": 1399 + }, + { + "epoch": 13.592233009708737, + "grad_norm": 0.12357534468173981, + "learning_rate": 1.0787985286562219e-07, + "loss": 0.0023, + "step": 1400 + }, + { + "epoch": 13.601941747572816, + "grad_norm": 0.11849277466535568, + "learning_rate": 1.0640759642864401e-07, + "loss": 0.0015, + "step": 1401 + }, + { + "epoch": 13.611650485436893, + "grad_norm": 0.20324498414993286, + "learning_rate": 1.0494523674682372e-07, + "loss": 0.0062, + "step": 1402 + }, + { + "epoch": 13.62135922330097, + "grad_norm": 0.16543932259082794, + "learning_rate": 1.0349277986656081e-07, + "loss": 0.0041, + "step": 1403 + }, + { + "epoch": 13.631067961165048, + "grad_norm": 0.14420188963413239, + "learning_rate": 1.0205023179330975e-07, + "loss": 0.0028, + "step": 1404 + }, + { + "epoch": 13.640776699029127, + "grad_norm": 0.47798407077789307, + "learning_rate": 1.00617598491555e-07, + "loss": 0.0376, + "step": 1405 + }, + { + "epoch": 13.650485436893204, + "grad_norm": 0.09702909737825394, + "learning_rate": 9.919488588478715e-08, + "loss": 0.0011, + "step": 1406 + }, + { + "epoch": 13.660194174757281, + "grad_norm": 0.24692678451538086, + "learning_rate": 9.778209985547682e-08, + "loss": 0.0054, + "step": 1407 + }, + { + "epoch": 13.669902912621358, + "grad_norm": 0.17881527543067932, + "learning_rate": 9.637924624505191e-08, + "loss": 0.0035, + "step": 1408 + }, + { + "epoch": 13.679611650485437, + "grad_norm": 0.1385125368833542, + "learning_rate": 9.498633085387343e-08, + "loss": 0.0031, + "step": 1409 + }, + { + "epoch": 13.689320388349515, + "grad_norm": 0.15435045957565308, + "learning_rate": 9.360335944121029e-08, + "loss": 0.0035, + "step": 1410 + }, + { + "epoch": 13.699029126213592, + "grad_norm": 0.09265675395727158, + "learning_rate": 9.223033772521594e-08, + "loss": 0.0014, + "step": 1411 + }, + { + "epoch": 13.70873786407767, + "grad_norm": 0.19740736484527588, + "learning_rate": 9.086727138290535e-08, + "loss": 0.0074, + "step": 1412 + }, + { + "epoch": 13.718446601941748, + "grad_norm": 0.14353950321674347, + "learning_rate": 8.951416605013114e-08, + "loss": 0.0033, + "step": 1413 + }, + { + "epoch": 13.728155339805825, + "grad_norm": 0.11986155807971954, + "learning_rate": 8.817102732155996e-08, + "loss": 0.0017, + "step": 1414 + }, + { + "epoch": 13.737864077669903, + "grad_norm": 0.16766305267810822, + "learning_rate": 8.683786075065065e-08, + "loss": 0.0031, + "step": 1415 + }, + { + "epoch": 13.74757281553398, + "grad_norm": 0.27344322204589844, + "learning_rate": 8.55146718496283e-08, + "loss": 0.0132, + "step": 1416 + }, + { + "epoch": 13.757281553398059, + "grad_norm": 0.21480204164981842, + "learning_rate": 8.420146608946605e-08, + "loss": 0.0055, + "step": 1417 + }, + { + "epoch": 13.766990291262136, + "grad_norm": 0.09167057275772095, + "learning_rate": 8.28982488998581e-08, + "loss": 0.0015, + "step": 1418 + }, + { + "epoch": 13.776699029126213, + "grad_norm": 0.25587520003318787, + "learning_rate": 8.160502566919942e-08, + "loss": 0.0044, + "step": 1419 + }, + { + "epoch": 13.78640776699029, + "grad_norm": 0.10709410905838013, + "learning_rate": 8.032180174456283e-08, + "loss": 0.0014, + "step": 1420 + }, + { + "epoch": 13.79611650485437, + "grad_norm": 0.15303583443164825, + "learning_rate": 7.904858243167806e-08, + "loss": 0.0043, + "step": 1421 + }, + { + "epoch": 13.805825242718447, + "grad_norm": 0.43564650416374207, + "learning_rate": 7.778537299490796e-08, + "loss": 0.0134, + "step": 1422 + }, + { + "epoch": 13.815533980582524, + "grad_norm": 0.16968989372253418, + "learning_rate": 7.653217865722817e-08, + "loss": 0.0037, + "step": 1423 + }, + { + "epoch": 13.825242718446601, + "grad_norm": 0.15621624886989594, + "learning_rate": 7.528900460020444e-08, + "loss": 0.0029, + "step": 1424 + }, + { + "epoch": 13.83495145631068, + "grad_norm": 0.25654706358909607, + "learning_rate": 7.405585596397314e-08, + "loss": 0.0023, + "step": 1425 + }, + { + "epoch": 13.844660194174757, + "grad_norm": 0.6071222424507141, + "learning_rate": 7.283273784721739e-08, + "loss": 0.0169, + "step": 1426 + }, + { + "epoch": 13.854368932038835, + "grad_norm": 0.09629572927951813, + "learning_rate": 7.161965530714743e-08, + "loss": 0.0012, + "step": 1427 + }, + { + "epoch": 13.864077669902912, + "grad_norm": 0.13124921917915344, + "learning_rate": 7.041661335948024e-08, + "loss": 0.002, + "step": 1428 + }, + { + "epoch": 13.87378640776699, + "grad_norm": 0.14983834326267242, + "learning_rate": 6.92236169784169e-08, + "loss": 0.0027, + "step": 1429 + }, + { + "epoch": 13.883495145631068, + "grad_norm": 0.16207829117774963, + "learning_rate": 6.804067109662443e-08, + "loss": 0.0036, + "step": 1430 + }, + { + "epoch": 13.893203883495145, + "grad_norm": 0.23562447726726532, + "learning_rate": 6.68677806052137e-08, + "loss": 0.0079, + "step": 1431 + }, + { + "epoch": 13.902912621359224, + "grad_norm": 0.2869737148284912, + "learning_rate": 6.57049503537191e-08, + "loss": 0.0069, + "step": 1432 + }, + { + "epoch": 13.912621359223301, + "grad_norm": 0.13367785513401031, + "learning_rate": 6.455218515008049e-08, + "loss": 0.0022, + "step": 1433 + }, + { + "epoch": 13.922330097087379, + "grad_norm": 0.21501076221466064, + "learning_rate": 6.340948976062023e-08, + "loss": 0.0055, + "step": 1434 + }, + { + "epoch": 13.932038834951456, + "grad_norm": 0.2994862198829651, + "learning_rate": 6.227686891002671e-08, + "loss": 0.0078, + "step": 1435 + }, + { + "epoch": 13.941747572815533, + "grad_norm": 0.1983334869146347, + "learning_rate": 6.115432728133198e-08, + "loss": 0.004, + "step": 1436 + }, + { + "epoch": 13.951456310679612, + "grad_norm": 0.2330927550792694, + "learning_rate": 6.004186951589414e-08, + "loss": 0.0093, + "step": 1437 + }, + { + "epoch": 13.96116504854369, + "grad_norm": 0.04621034115552902, + "learning_rate": 5.8939500213378296e-08, + "loss": 0.0004, + "step": 1438 + }, + { + "epoch": 13.970873786407767, + "grad_norm": 0.15329445898532867, + "learning_rate": 5.7847223931735974e-08, + "loss": 0.0033, + "step": 1439 + }, + { + "epoch": 13.980582524271846, + "grad_norm": 0.18739864230155945, + "learning_rate": 5.6765045187187614e-08, + "loss": 0.0033, + "step": 1440 + }, + { + "epoch": 13.990291262135923, + "grad_norm": 0.2754325270652771, + "learning_rate": 5.569296845420375e-08, + "loss": 0.0069, + "step": 1441 + }, + { + "epoch": 14.0, + "grad_norm": 0.28285136818885803, + "learning_rate": 5.463099816548578e-08, + "loss": 0.0098, + "step": 1442 + }, + { + "epoch": 14.009708737864077, + "grad_norm": 0.0704316720366478, + "learning_rate": 5.3579138711948587e-08, + "loss": 0.0007, + "step": 1443 + }, + { + "epoch": 14.019417475728156, + "grad_norm": 0.22070623934268951, + "learning_rate": 5.253739444270128e-08, + "loss": 0.0049, + "step": 1444 + }, + { + "epoch": 14.029126213592233, + "grad_norm": 0.25098851323127747, + "learning_rate": 5.150576966503063e-08, + "loss": 0.005, + "step": 1445 + }, + { + "epoch": 14.03883495145631, + "grad_norm": 0.21593202650547028, + "learning_rate": 5.048426864438183e-08, + "loss": 0.0048, + "step": 1446 + }, + { + "epoch": 14.048543689320388, + "grad_norm": 0.07581067085266113, + "learning_rate": 4.9472895604341655e-08, + "loss": 0.001, + "step": 1447 + }, + { + "epoch": 14.058252427184467, + "grad_norm": 0.3260473310947418, + "learning_rate": 4.8471654726621464e-08, + "loss": 0.0166, + "step": 1448 + }, + { + "epoch": 14.067961165048544, + "grad_norm": 0.12206926196813583, + "learning_rate": 4.7480550151038365e-08, + "loss": 0.0029, + "step": 1449 + }, + { + "epoch": 14.077669902912621, + "grad_norm": 0.08410502225160599, + "learning_rate": 4.649958597549964e-08, + "loss": 0.0012, + "step": 1450 + }, + { + "epoch": 14.087378640776699, + "grad_norm": 0.16693179309368134, + "learning_rate": 4.552876625598501e-08, + "loss": 0.0047, + "step": 1451 + }, + { + "epoch": 14.097087378640778, + "grad_norm": 0.1822391152381897, + "learning_rate": 4.4568095006529975e-08, + "loss": 0.0054, + "step": 1452 + }, + { + "epoch": 14.106796116504855, + "grad_norm": 0.05579308047890663, + "learning_rate": 4.361757619920942e-08, + "loss": 0.0006, + "step": 1453 + }, + { + "epoch": 14.116504854368932, + "grad_norm": 0.07022775709629059, + "learning_rate": 4.2677213764120986e-08, + "loss": 0.0007, + "step": 1454 + }, + { + "epoch": 14.12621359223301, + "grad_norm": 0.08001091331243515, + "learning_rate": 4.174701158936895e-08, + "loss": 0.0011, + "step": 1455 + }, + { + "epoch": 14.135922330097088, + "grad_norm": 0.20097486674785614, + "learning_rate": 4.082697352104814e-08, + "loss": 0.007, + "step": 1456 + }, + { + "epoch": 14.145631067961165, + "grad_norm": 0.1303526610136032, + "learning_rate": 3.991710336322757e-08, + "loss": 0.0029, + "step": 1457 + }, + { + "epoch": 14.155339805825243, + "grad_norm": 0.10144368559122086, + "learning_rate": 3.9017404877935986e-08, + "loss": 0.0014, + "step": 1458 + }, + { + "epoch": 14.16504854368932, + "grad_norm": 0.29990750551223755, + "learning_rate": 3.812788178514437e-08, + "loss": 0.0089, + "step": 1459 + }, + { + "epoch": 14.174757281553399, + "grad_norm": 0.08991169184446335, + "learning_rate": 3.7248537762752666e-08, + "loss": 0.001, + "step": 1460 + }, + { + "epoch": 14.184466019417476, + "grad_norm": 0.12679709494113922, + "learning_rate": 3.637937644657308e-08, + "loss": 0.0026, + "step": 1461 + }, + { + "epoch": 14.194174757281553, + "grad_norm": 0.21683473885059357, + "learning_rate": 3.55204014303151e-08, + "loss": 0.006, + "step": 1462 + }, + { + "epoch": 14.20388349514563, + "grad_norm": 0.5879408121109009, + "learning_rate": 3.467161626557164e-08, + "loss": 0.0465, + "step": 1463 + }, + { + "epoch": 14.21359223300971, + "grad_norm": 0.12856099009513855, + "learning_rate": 3.3833024461803756e-08, + "loss": 0.0026, + "step": 1464 + }, + { + "epoch": 14.223300970873787, + "grad_norm": 0.22118137776851654, + "learning_rate": 3.300462948632593e-08, + "loss": 0.0089, + "step": 1465 + }, + { + "epoch": 14.233009708737864, + "grad_norm": 0.3074754774570465, + "learning_rate": 3.218643476429167e-08, + "loss": 0.0165, + "step": 1466 + }, + { + "epoch": 14.242718446601941, + "grad_norm": 0.12004687637090683, + "learning_rate": 3.1378443678680706e-08, + "loss": 0.0026, + "step": 1467 + }, + { + "epoch": 14.25242718446602, + "grad_norm": 0.14797070622444153, + "learning_rate": 3.0580659570282886e-08, + "loss": 0.0034, + "step": 1468 + }, + { + "epoch": 14.262135922330097, + "grad_norm": 0.2764511704444885, + "learning_rate": 2.979308573768547e-08, + "loss": 0.0077, + "step": 1469 + }, + { + "epoch": 14.271844660194175, + "grad_norm": 0.15794546902179718, + "learning_rate": 2.9015725437259724e-08, + "loss": 0.0036, + "step": 1470 + }, + { + "epoch": 14.281553398058252, + "grad_norm": 0.056184958666563034, + "learning_rate": 2.8248581883147387e-08, + "loss": 0.0005, + "step": 1471 + }, + { + "epoch": 14.29126213592233, + "grad_norm": 0.20542171597480774, + "learning_rate": 2.7491658247246478e-08, + "loss": 0.0058, + "step": 1472 + }, + { + "epoch": 14.300970873786408, + "grad_norm": 0.1507173627614975, + "learning_rate": 2.6744957659199376e-08, + "loss": 0.003, + "step": 1473 + }, + { + "epoch": 14.310679611650485, + "grad_norm": 0.1602557897567749, + "learning_rate": 2.6008483206379497e-08, + "loss": 0.0041, + "step": 1474 + }, + { + "epoch": 14.320388349514563, + "grad_norm": 0.1496906727552414, + "learning_rate": 2.5282237933877962e-08, + "loss": 0.0022, + "step": 1475 + }, + { + "epoch": 14.330097087378642, + "grad_norm": 0.1473391056060791, + "learning_rate": 2.4566224844491393e-08, + "loss": 0.003, + "step": 1476 + }, + { + "epoch": 14.339805825242719, + "grad_norm": 0.16431382298469543, + "learning_rate": 2.38604468987097e-08, + "loss": 0.0042, + "step": 1477 + }, + { + "epoch": 14.349514563106796, + "grad_norm": 0.06767686456441879, + "learning_rate": 2.316490701470414e-08, + "loss": 0.0009, + "step": 1478 + }, + { + "epoch": 14.359223300970873, + "grad_norm": 0.27724364399909973, + "learning_rate": 2.247960806831373e-08, + "loss": 0.0035, + "step": 1479 + }, + { + "epoch": 14.368932038834952, + "grad_norm": 0.30783212184906006, + "learning_rate": 2.180455289303579e-08, + "loss": 0.0127, + "step": 1480 + }, + { + "epoch": 14.37864077669903, + "grad_norm": 0.15318472683429718, + "learning_rate": 2.113974428001153e-08, + "loss": 0.004, + "step": 1481 + }, + { + "epoch": 14.388349514563107, + "grad_norm": 0.06371254473924637, + "learning_rate": 2.0485184978016604e-08, + "loss": 0.0008, + "step": 1482 + }, + { + "epoch": 14.398058252427184, + "grad_norm": 0.14508719742298126, + "learning_rate": 1.984087769344889e-08, + "loss": 0.0029, + "step": 1483 + }, + { + "epoch": 14.407766990291263, + "grad_norm": 0.2267545610666275, + "learning_rate": 1.9206825090317126e-08, + "loss": 0.0051, + "step": 1484 + }, + { + "epoch": 14.41747572815534, + "grad_norm": 0.27744215726852417, + "learning_rate": 1.8583029790230356e-08, + "loss": 0.0085, + "step": 1485 + }, + { + "epoch": 14.427184466019417, + "grad_norm": 0.24904626607894897, + "learning_rate": 1.796949437238682e-08, + "loss": 0.0063, + "step": 1486 + }, + { + "epoch": 14.436893203883495, + "grad_norm": 0.19574686884880066, + "learning_rate": 1.736622137356342e-08, + "loss": 0.0062, + "step": 1487 + }, + { + "epoch": 14.446601941747574, + "grad_norm": 0.06552054733037949, + "learning_rate": 1.677321328810516e-08, + "loss": 0.0007, + "step": 1488 + }, + { + "epoch": 14.45631067961165, + "grad_norm": 0.23957927525043488, + "learning_rate": 1.6190472567914617e-08, + "loss": 0.0097, + "step": 1489 + }, + { + "epoch": 14.466019417475728, + "grad_norm": 0.10903111845254898, + "learning_rate": 1.561800162244248e-08, + "loss": 0.0018, + "step": 1490 + }, + { + "epoch": 14.475728155339805, + "grad_norm": 0.24251382052898407, + "learning_rate": 1.5055802818676745e-08, + "loss": 0.0086, + "step": 1491 + }, + { + "epoch": 14.485436893203884, + "grad_norm": 0.15319575369358063, + "learning_rate": 1.450387848113327e-08, + "loss": 0.0045, + "step": 1492 + }, + { + "epoch": 14.495145631067961, + "grad_norm": 0.10059983283281326, + "learning_rate": 1.3962230891846618e-08, + "loss": 0.0017, + "step": 1493 + }, + { + "epoch": 14.504854368932039, + "grad_norm": 0.1764296442270279, + "learning_rate": 1.3430862290359781e-08, + "loss": 0.0033, + "step": 1494 + }, + { + "epoch": 14.514563106796116, + "grad_norm": 0.4749487042427063, + "learning_rate": 1.2909774873715585e-08, + "loss": 0.0158, + "step": 1495 + }, + { + "epoch": 14.524271844660195, + "grad_norm": 0.18315227329730988, + "learning_rate": 1.2398970796447807e-08, + "loss": 0.004, + "step": 1496 + }, + { + "epoch": 14.533980582524272, + "grad_norm": 0.36333203315734863, + "learning_rate": 1.1898452170570618e-08, + "loss": 0.0238, + "step": 1497 + }, + { + "epoch": 14.54368932038835, + "grad_norm": 0.06321419030427933, + "learning_rate": 1.140822106557249e-08, + "loss": 0.0005, + "step": 1498 + }, + { + "epoch": 14.553398058252426, + "grad_norm": 0.0857374295592308, + "learning_rate": 1.0928279508405082e-08, + "loss": 0.0006, + "step": 1499 + }, + { + "epoch": 14.563106796116505, + "grad_norm": 0.2555052638053894, + "learning_rate": 1.0458629483476868e-08, + "loss": 0.0063, + "step": 1500 + }, + { + "epoch": 14.572815533980583, + "grad_norm": 0.10225080698728561, + "learning_rate": 9.999272932643134e-09, + "loss": 0.0013, + "step": 1501 + }, + { + "epoch": 14.58252427184466, + "grad_norm": 0.19144372642040253, + "learning_rate": 9.550211755199879e-09, + "loss": 0.0038, + "step": 1502 + }, + { + "epoch": 14.592233009708737, + "grad_norm": 0.08165448158979416, + "learning_rate": 9.111447807874374e-09, + "loss": 0.0009, + "step": 1503 + }, + { + "epoch": 14.601941747572816, + "grad_norm": 0.04350195825099945, + "learning_rate": 8.682982904817948e-09, + "loss": 0.0003, + "step": 1504 + }, + { + "epoch": 14.611650485436893, + "grad_norm": 0.3616931736469269, + "learning_rate": 8.264818817599052e-09, + "loss": 0.0147, + "step": 1505 + }, + { + "epoch": 14.62135922330097, + "grad_norm": 0.09430540353059769, + "learning_rate": 7.856957275194921e-09, + "loss": 0.0015, + "step": 1506 + }, + { + "epoch": 14.631067961165048, + "grad_norm": 0.09992361813783646, + "learning_rate": 7.459399963985758e-09, + "loss": 0.0014, + "step": 1507 + }, + { + "epoch": 14.640776699029127, + "grad_norm": 0.21097907423973083, + "learning_rate": 7.072148527746403e-09, + "loss": 0.0067, + "step": 1508 + }, + { + "epoch": 14.650485436893204, + "grad_norm": 0.19425953924655914, + "learning_rate": 6.6952045676405005e-09, + "loss": 0.0035, + "step": 1509 + }, + { + "epoch": 14.660194174757281, + "grad_norm": 0.12785759568214417, + "learning_rate": 6.328569642212734e-09, + "loss": 0.0021, + "step": 1510 + }, + { + "epoch": 14.669902912621358, + "grad_norm": 0.2563770115375519, + "learning_rate": 5.972245267384102e-09, + "loss": 0.0085, + "step": 1511 + }, + { + "epoch": 14.679611650485437, + "grad_norm": 0.12719255685806274, + "learning_rate": 5.62623291644443e-09, + "loss": 0.002, + "step": 1512 + }, + { + "epoch": 14.689320388349515, + "grad_norm": 0.05718519538640976, + "learning_rate": 5.290534020046256e-09, + "loss": 0.0008, + "step": 1513 + }, + { + "epoch": 14.699029126213592, + "grad_norm": 0.11268595606088638, + "learning_rate": 4.965149966199567e-09, + "loss": 0.0018, + "step": 1514 + }, + { + "epoch": 14.70873786407767, + "grad_norm": 0.1993200033903122, + "learning_rate": 4.6500821002654075e-09, + "loss": 0.0077, + "step": 1515 + }, + { + "epoch": 14.718446601941748, + "grad_norm": 0.1494385302066803, + "learning_rate": 4.345331724950885e-09, + "loss": 0.0035, + "step": 1516 + }, + { + "epoch": 14.728155339805825, + "grad_norm": 0.13596144318580627, + "learning_rate": 4.050900100303068e-09, + "loss": 0.003, + "step": 1517 + }, + { + "epoch": 14.737864077669903, + "grad_norm": 0.09926627576351166, + "learning_rate": 3.766788443705094e-09, + "loss": 0.0017, + "step": 1518 + }, + { + "epoch": 14.74757281553398, + "grad_norm": 0.10796511173248291, + "learning_rate": 3.492997929869235e-09, + "loss": 0.0021, + "step": 1519 + }, + { + "epoch": 14.757281553398059, + "grad_norm": 0.12152813374996185, + "learning_rate": 3.2295296908338437e-09, + "loss": 0.0014, + "step": 1520 + }, + { + "epoch": 14.766990291262136, + "grad_norm": 0.12858039140701294, + "learning_rate": 2.976384815957245e-09, + "loss": 0.0015, + "step": 1521 + }, + { + "epoch": 14.776699029126213, + "grad_norm": 0.12139680236577988, + "learning_rate": 2.7335643519144086e-09, + "loss": 0.0015, + "step": 1522 + }, + { + "epoch": 14.78640776699029, + "grad_norm": 0.22163856029510498, + "learning_rate": 2.5010693026922273e-09, + "loss": 0.0066, + "step": 1523 + }, + { + "epoch": 14.79611650485437, + "grad_norm": 0.04654043912887573, + "learning_rate": 2.278900629584524e-09, + "loss": 0.0005, + "step": 1524 + }, + { + "epoch": 14.805825242718447, + "grad_norm": 0.33974435925483704, + "learning_rate": 2.067059251189274e-09, + "loss": 0.0097, + "step": 1525 + }, + { + "epoch": 14.815533980582524, + "grad_norm": 0.04179006814956665, + "learning_rate": 1.8655460434044427e-09, + "loss": 0.0004, + "step": 1526 + }, + { + "epoch": 14.825242718446601, + "grad_norm": 0.22010868787765503, + "learning_rate": 1.6743618394238215e-09, + "loss": 0.0069, + "step": 1527 + }, + { + "epoch": 14.83495145631068, + "grad_norm": 1.0385078191757202, + "learning_rate": 1.493507429734531e-09, + "loss": 0.0043, + "step": 1528 + }, + { + "epoch": 14.844660194174757, + "grad_norm": 0.4172058403491974, + "learning_rate": 1.3229835621125786e-09, + "loss": 0.0129, + "step": 1529 + }, + { + "epoch": 14.854368932038835, + "grad_norm": 0.17302429676055908, + "learning_rate": 1.1627909416211947e-09, + "loss": 0.0037, + "step": 1530 + }, + { + "epoch": 14.864077669902912, + "grad_norm": 0.1551245003938675, + "learning_rate": 1.0129302306061128e-09, + "loss": 0.004, + "step": 1531 + }, + { + "epoch": 14.87378640776699, + "grad_norm": 0.12023022770881653, + "learning_rate": 8.734020486950157e-10, + "loss": 0.0019, + "step": 1532 + }, + { + "epoch": 14.883495145631068, + "grad_norm": 0.04651810601353645, + "learning_rate": 7.442069727930934e-10, + "loss": 0.0004, + "step": 1533 + }, + { + "epoch": 14.893203883495145, + "grad_norm": 0.09399501979351044, + "learning_rate": 6.253455370811012e-10, + "loss": 0.0013, + "step": 1534 + }, + { + "epoch": 14.902912621359224, + "grad_norm": 0.12654393911361694, + "learning_rate": 5.168182330145266e-10, + "loss": 0.0024, + "step": 1535 + }, + { + "epoch": 14.912621359223301, + "grad_norm": 0.21424488723278046, + "learning_rate": 4.186255093194258e-10, + "loss": 0.0069, + "step": 1536 + }, + { + "epoch": 14.922330097087379, + "grad_norm": 0.20085856318473816, + "learning_rate": 3.3076777199186894e-10, + "loss": 0.0023, + "step": 1537 + }, + { + "epoch": 14.932038834951456, + "grad_norm": 0.14510715007781982, + "learning_rate": 2.532453842965521e-10, + "loss": 0.0045, + "step": 1538 + }, + { + "epoch": 14.941747572815533, + "grad_norm": 0.10943800210952759, + "learning_rate": 1.8605866676374428e-10, + "loss": 0.0011, + "step": 1539 + }, + { + "epoch": 14.951456310679612, + "grad_norm": 0.3463171422481537, + "learning_rate": 1.292078971898425e-10, + "loss": 0.0215, + "step": 1540 + }, + { + "epoch": 14.96116504854369, + "grad_norm": 0.033019255846738815, + "learning_rate": 8.269331063459618e-11, + "loss": 0.0002, + "step": 1541 + }, + { + "epoch": 14.970873786407767, + "grad_norm": 0.12394634634256363, + "learning_rate": 4.651509942193988e-11, + "loss": 0.0021, + "step": 1542 + }, + { + "epoch": 14.980582524271846, + "grad_norm": 0.13520504534244537, + "learning_rate": 2.06734131366626e-11, + "loss": 0.0027, + "step": 1543 + }, + { + "epoch": 14.990291262135923, + "grad_norm": 0.11344866454601288, + "learning_rate": 5.168358626628234e-12, + "loss": 0.002, + "step": 1544 + }, + { + "epoch": 15.0, + "grad_norm": 0.062394145876169205, + "learning_rate": 0.0, + "loss": 0.0007, + "step": 1545 + } + ], + "logging_steps": 1, + "max_steps": 1545, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 515, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 432414568808448.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}