{ "best_global_step": 512, "best_metric": 0.22544851899147034, "best_model_checkpoint": "DQwen3-1.7B-uncensored/checkpoint-512", "epoch": 0.07895142636854278, "eval_steps": 128, "global_step": 512, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.9247070550918579, "epoch": 0.00015420200462606013, "grad_norm": 31.125, "learning_rate": 0.0, "loss": 2.181769847869873, "mean_token_accuracy": 0.6662116050720215, "num_tokens": 1473.0, "step": 1 }, { "entropy": 1.0056357383728027, "epoch": 0.00030840400925212025, "grad_norm": 43.0, "learning_rate": 1.6666666666666667e-06, "loss": 2.696769952774048, "mean_token_accuracy": 0.617977499961853, "num_tokens": 2460.0, "step": 2 }, { "entropy": 1.0802278518676758, "epoch": 0.0004626060138781804, "grad_norm": 41.5, "learning_rate": 3.3333333333333333e-06, "loss": 2.6830270290374756, "mean_token_accuracy": 0.6033275127410889, "num_tokens": 3610.0, "step": 3 }, { "entropy": 0.9580708742141724, "epoch": 0.0006168080185042405, "grad_norm": 33.75, "learning_rate": 5e-06, "loss": 2.234797716140747, "mean_token_accuracy": 0.6668869853019714, "num_tokens": 5131.0, "step": 4 }, { "entropy": 1.0076204538345337, "epoch": 0.0007710100231303007, "grad_norm": 34.75, "learning_rate": 6.666666666666667e-06, "loss": 2.374027729034424, "mean_token_accuracy": 0.6405493021011353, "num_tokens": 6377.0, "step": 5 }, { "entropy": 1.0595803260803223, "epoch": 0.0009252120277563608, "grad_norm": 34.25, "learning_rate": 8.333333333333334e-06, "loss": 2.216482400894165, "mean_token_accuracy": 0.6401006579399109, "num_tokens": 7577.0, "step": 6 }, { "entropy": 1.1813561916351318, "epoch": 0.001079414032382421, "grad_norm": 25.375, "learning_rate": 1e-05, "loss": 2.0130465030670166, "mean_token_accuracy": 0.6524437665939331, "num_tokens": 8874.0, "step": 7 }, { "entropy": 1.4208881855010986, "epoch": 0.001233616037008481, "grad_norm": 24.125, "learning_rate": 1.1666666666666668e-05, "loss": 2.3102152347564697, "mean_token_accuracy": 0.5884244441986084, "num_tokens": 9815.0, "step": 8 }, { "entropy": 0.8640963435173035, "epoch": 0.0013878180416345412, "grad_norm": 12.3125, "learning_rate": 1.3333333333333333e-05, "loss": 1.3446450233459473, "mean_token_accuracy": 0.7377659678459167, "num_tokens": 11703.0, "step": 9 }, { "entropy": 1.0817725658416748, "epoch": 0.0015420200462606013, "grad_norm": 14.125, "learning_rate": 1.5e-05, "loss": 1.6320915222167969, "mean_token_accuracy": 0.6808972358703613, "num_tokens": 13093.0, "step": 10 }, { "entropy": 0.9536824226379395, "epoch": 0.0016962220508866615, "grad_norm": 10.5, "learning_rate": 1.6666666666666667e-05, "loss": 1.2360026836395264, "mean_token_accuracy": 0.741475522518158, "num_tokens": 14714.0, "step": 11 }, { "entropy": 1.049913763999939, "epoch": 0.0018504240555127216, "grad_norm": 10.875, "learning_rate": 1.8333333333333333e-05, "loss": 1.2552467584609985, "mean_token_accuracy": 0.7327285408973694, "num_tokens": 16155.0, "step": 12 }, { "entropy": 0.7939231395721436, "epoch": 0.0020046260601387818, "grad_norm": 7.28125, "learning_rate": 2e-05, "loss": 0.8604422211647034, "mean_token_accuracy": 0.7944584488868713, "num_tokens": 18148.0, "step": 13 }, { "entropy": 0.9421704411506653, "epoch": 0.002158828064764842, "grad_norm": 9.375, "learning_rate": 2.1666666666666667e-05, "loss": 0.9789397716522217, "mean_token_accuracy": 0.7728531956672668, "num_tokens": 19600.0, "step": 14 }, { "entropy": 1.101209044456482, "epoch": 0.002313030069390902, "grad_norm": 9.5, "learning_rate": 2.3333333333333336e-05, "loss": 1.1167230606079102, "mean_token_accuracy": 0.759087085723877, "num_tokens": 20791.0, "step": 15 }, { "entropy": 0.8545694351196289, "epoch": 0.002467232074016962, "grad_norm": 8.875, "learning_rate": 2.5e-05, "loss": 0.8782606720924377, "mean_token_accuracy": 0.8138889074325562, "num_tokens": 22239.0, "step": 16 }, { "entropy": 0.5610961318016052, "epoch": 0.0026214340786430224, "grad_norm": 5.5, "learning_rate": 2.6666666666666667e-05, "loss": 0.5257444381713867, "mean_token_accuracy": 0.8684759736061096, "num_tokens": 24642.0, "step": 17 }, { "entropy": 0.9791864156723022, "epoch": 0.0027756360832690823, "grad_norm": 12.0, "learning_rate": 2.8333333333333335e-05, "loss": 1.0223743915557861, "mean_token_accuracy": 0.780053436756134, "num_tokens": 25773.0, "step": 18 }, { "entropy": 0.6505466103553772, "epoch": 0.0029298380878951427, "grad_norm": 7.34375, "learning_rate": 3e-05, "loss": 0.5793496370315552, "mean_token_accuracy": 0.8571428656578064, "num_tokens": 27258.0, "step": 19 }, { "entropy": 0.8408939242362976, "epoch": 0.0030840400925212026, "grad_norm": 8.1875, "learning_rate": 3.1666666666666666e-05, "loss": 0.7403033375740051, "mean_token_accuracy": 0.8192341923713684, "num_tokens": 28389.0, "step": 20 }, { "entropy": 0.6034911274909973, "epoch": 0.003238242097147263, "grad_norm": 9.125, "learning_rate": 3.3333333333333335e-05, "loss": 0.5590517520904541, "mean_token_accuracy": 0.8626410365104675, "num_tokens": 29904.0, "step": 21 }, { "entropy": 0.9795640110969543, "epoch": 0.003392444101773323, "grad_norm": 13.25, "learning_rate": 3.5e-05, "loss": 0.9148629307746887, "mean_token_accuracy": 0.7654135227203369, "num_tokens": 30577.0, "step": 22 }, { "entropy": 0.7439587712287903, "epoch": 0.0035466461063993833, "grad_norm": 7.25, "learning_rate": 3.6666666666666666e-05, "loss": 0.5953554511070251, "mean_token_accuracy": 0.8356807231903076, "num_tokens": 31863.0, "step": 23 }, { "entropy": 0.7064525485038757, "epoch": 0.0037008481110254433, "grad_norm": 12.3125, "learning_rate": 3.8333333333333334e-05, "loss": 0.6644932627677917, "mean_token_accuracy": 0.8366890549659729, "num_tokens": 32765.0, "step": 24 }, { "entropy": 0.671248733997345, "epoch": 0.0038550501156515036, "grad_norm": 10.6875, "learning_rate": 4e-05, "loss": 0.6164469718933105, "mean_token_accuracy": 0.826787531375885, "num_tokens": 33766.0, "step": 25 }, { "entropy": 0.5062084794044495, "epoch": 0.0040092521202775636, "grad_norm": 5.4375, "learning_rate": 4.166666666666667e-05, "loss": 0.40353134274482727, "mean_token_accuracy": 0.8768283128738403, "num_tokens": 35073.0, "step": 26 }, { "entropy": 0.5384562015533447, "epoch": 0.0041634541249036235, "grad_norm": 5.78125, "learning_rate": 4.3333333333333334e-05, "loss": 0.5431831479072571, "mean_token_accuracy": 0.8347339034080505, "num_tokens": 36152.0, "step": 27 }, { "entropy": 0.4401922821998596, "epoch": 0.004317656129529684, "grad_norm": 4.0, "learning_rate": 4.5e-05, "loss": 0.412110298871994, "mean_token_accuracy": 0.8691341876983643, "num_tokens": 37673.0, "step": 28 }, { "entropy": 0.3748137652873993, "epoch": 0.004471858134155744, "grad_norm": 3.484375, "learning_rate": 4.666666666666667e-05, "loss": 0.360552042722702, "mean_token_accuracy": 0.8896728754043579, "num_tokens": 39240.0, "step": 29 }, { "entropy": 0.45826223492622375, "epoch": 0.004626060138781804, "grad_norm": 4.53125, "learning_rate": 4.8333333333333334e-05, "loss": 0.455470472574234, "mean_token_accuracy": 0.8571428656578064, "num_tokens": 40256.0, "step": 30 }, { "entropy": 0.33488166332244873, "epoch": 0.004780262143407864, "grad_norm": 3.9375, "learning_rate": 5e-05, "loss": 0.3165741562843323, "mean_token_accuracy": 0.8937432765960693, "num_tokens": 42118.0, "step": 31 }, { "entropy": 0.3528728783130646, "epoch": 0.004934464148033924, "grad_norm": 7.6875, "learning_rate": 4.9999468976006514e-05, "loss": 0.38620343804359436, "mean_token_accuracy": 0.8836023807525635, "num_tokens": 43303.0, "step": 32 }, { "entropy": 0.36625728011131287, "epoch": 0.005088666152659985, "grad_norm": 4.84375, "learning_rate": 4.999787592658497e-05, "loss": 0.446200966835022, "mean_token_accuracy": 0.8777328133583069, "num_tokens": 44546.0, "step": 33 }, { "entropy": 0.26643213629722595, "epoch": 0.005242868157286045, "grad_norm": 4.40625, "learning_rate": 4.999522091941117e-05, "loss": 0.2737399637699127, "mean_token_accuracy": 0.8979820609092712, "num_tokens": 46338.0, "step": 34 }, { "entropy": 0.26414045691490173, "epoch": 0.005397070161912105, "grad_norm": 3.296875, "learning_rate": 4.999150406727491e-05, "loss": 0.2983474135398865, "mean_token_accuracy": 0.9013499617576599, "num_tokens": 48272.0, "step": 35 }, { "entropy": 0.370844304561615, "epoch": 0.005551272166538165, "grad_norm": 3.546875, "learning_rate": 4.9986725528075205e-05, "loss": 0.3454509377479553, "mean_token_accuracy": 0.8820555806159973, "num_tokens": 49467.0, "step": 36 }, { "entropy": 0.2591751515865326, "epoch": 0.0057054741711642255, "grad_norm": 2.8125, "learning_rate": 4.998088550481357e-05, "loss": 0.2637964189052582, "mean_token_accuracy": 0.9097625613212585, "num_tokens": 51370.0, "step": 37 }, { "entropy": 0.30300796031951904, "epoch": 0.005859676175790285, "grad_norm": 2.71875, "learning_rate": 4.997398424558541e-05, "loss": 0.30886220932006836, "mean_token_accuracy": 0.901874303817749, "num_tokens": 53192.0, "step": 38 }, { "entropy": 0.27792277932167053, "epoch": 0.006013878180416345, "grad_norm": 2.6875, "learning_rate": 4.996602204356945e-05, "loss": 0.2732873558998108, "mean_token_accuracy": 0.9054564833641052, "num_tokens": 55051.0, "step": 39 }, { "entropy": 0.2960520088672638, "epoch": 0.006168080185042405, "grad_norm": 2.390625, "learning_rate": 4.9956999237015336e-05, "loss": 0.28190645575523376, "mean_token_accuracy": 0.8957963585853577, "num_tokens": 56748.0, "step": 40 }, { "entropy": 0.27777737379074097, "epoch": 0.006322282189668466, "grad_norm": 2.390625, "learning_rate": 4.994691620922919e-05, "loss": 0.2448980063199997, "mean_token_accuracy": 0.918410062789917, "num_tokens": 58668.0, "step": 41 }, { "entropy": 0.3428345024585724, "epoch": 0.006476484194294526, "grad_norm": 5.15625, "learning_rate": 4.993577338855741e-05, "loss": 0.354027658700943, "mean_token_accuracy": 0.8893527984619141, "num_tokens": 60113.0, "step": 42 }, { "entropy": 0.30580252408981323, "epoch": 0.006630686198920586, "grad_norm": 2.75, "learning_rate": 4.9923571248368375e-05, "loss": 0.28440362215042114, "mean_token_accuracy": 0.9064558744430542, "num_tokens": 61639.0, "step": 43 }, { "entropy": 0.43973830342292786, "epoch": 0.006784888203546646, "grad_norm": 4.5625, "learning_rate": 4.991031030703244e-05, "loss": 0.4269028902053833, "mean_token_accuracy": 0.859668493270874, "num_tokens": 62552.0, "step": 44 }, { "entropy": 0.33187365531921387, "epoch": 0.006939090208172706, "grad_norm": 3.546875, "learning_rate": 4.989599112789984e-05, "loss": 0.34638962149620056, "mean_token_accuracy": 0.8819671869277954, "num_tokens": 63780.0, "step": 45 }, { "entropy": 0.31787073612213135, "epoch": 0.007093292212798767, "grad_norm": 3.703125, "learning_rate": 4.988061431927681e-05, "loss": 0.3301301598548889, "mean_token_accuracy": 0.8874788284301758, "num_tokens": 64970.0, "step": 46 }, { "entropy": 0.2509302496910095, "epoch": 0.0072474942174248266, "grad_norm": 3.375, "learning_rate": 4.9864180534399674e-05, "loss": 0.2752370238304138, "mean_token_accuracy": 0.9063180685043335, "num_tokens": 66814.0, "step": 47 }, { "entropy": 0.28000396490097046, "epoch": 0.0074016962220508865, "grad_norm": 3.46875, "learning_rate": 4.984669047140716e-05, "loss": 0.3101637363433838, "mean_token_accuracy": 0.8998599648475647, "num_tokens": 68250.0, "step": 48 }, { "entropy": 0.263468474149704, "epoch": 0.0075558982266769464, "grad_norm": 3.046875, "learning_rate": 4.982814487331072e-05, "loss": 0.29188624024391174, "mean_token_accuracy": 0.9051411747932434, "num_tokens": 69639.0, "step": 49 }, { "entropy": 0.30314338207244873, "epoch": 0.007710100231303007, "grad_norm": 3.296875, "learning_rate": 4.9808544527962964e-05, "loss": 0.3129803538322449, "mean_token_accuracy": 0.8961228728294373, "num_tokens": 71014.0, "step": 50 }, { "entropy": 0.299335241317749, "epoch": 0.007864302235929066, "grad_norm": 2.9375, "learning_rate": 4.978789026802419e-05, "loss": 0.3139159679412842, "mean_token_accuracy": 0.8981788158416748, "num_tokens": 72230.0, "step": 51 }, { "entropy": 0.33503401279449463, "epoch": 0.008018504240555127, "grad_norm": 3.265625, "learning_rate": 4.9766182970926975e-05, "loss": 0.3325449526309967, "mean_token_accuracy": 0.8837863206863403, "num_tokens": 73305.0, "step": 52 }, { "entropy": 0.3575332760810852, "epoch": 0.008172706245181188, "grad_norm": 3.21875, "learning_rate": 4.9743423558839e-05, "loss": 0.3531642258167267, "mean_token_accuracy": 0.8720735907554626, "num_tokens": 74509.0, "step": 53 }, { "entropy": 0.3179467022418976, "epoch": 0.008326908249807247, "grad_norm": 2.90625, "learning_rate": 4.971961299862376e-05, "loss": 0.32219475507736206, "mean_token_accuracy": 0.898162305355072, "num_tokens": 75823.0, "step": 54 }, { "entropy": 0.27880457043647766, "epoch": 0.008481110254433308, "grad_norm": 5.5625, "learning_rate": 4.9694752301799566e-05, "loss": 0.2741078734397888, "mean_token_accuracy": 0.9016189575195312, "num_tokens": 77437.0, "step": 55 }, { "entropy": 0.34488171339035034, "epoch": 0.008635312259059369, "grad_norm": 3.90625, "learning_rate": 4.9668842524496526e-05, "loss": 0.366953045129776, "mean_token_accuracy": 0.8817635178565979, "num_tokens": 78942.0, "step": 56 }, { "entropy": 0.2759552299976349, "epoch": 0.008789514263685428, "grad_norm": 2.390625, "learning_rate": 4.9641884767411714e-05, "loss": 0.23074716329574585, "mean_token_accuracy": 0.9223560690879822, "num_tokens": 80444.0, "step": 57 }, { "entropy": 0.2680215835571289, "epoch": 0.008943716268311488, "grad_norm": 2.375, "learning_rate": 4.9613880175762414e-05, "loss": 0.2504393458366394, "mean_token_accuracy": 0.9081172347068787, "num_tokens": 82226.0, "step": 58 }, { "entropy": 0.3482436537742615, "epoch": 0.009097918272937548, "grad_norm": 2.859375, "learning_rate": 4.958482993923742e-05, "loss": 0.3350726068019867, "mean_token_accuracy": 0.8843283653259277, "num_tokens": 83306.0, "step": 59 }, { "entropy": 0.3756292164325714, "epoch": 0.009252120277563608, "grad_norm": 2.75, "learning_rate": 4.955473529194654e-05, "loss": 0.3661136329174042, "mean_token_accuracy": 0.8703535795211792, "num_tokens": 84417.0, "step": 60 }, { "entropy": 0.30815669894218445, "epoch": 0.00940632228218967, "grad_norm": 2.546875, "learning_rate": 4.952359751236817e-05, "loss": 0.2678232789039612, "mean_token_accuracy": 0.9128242135047913, "num_tokens": 85813.0, "step": 61 }, { "entropy": 0.2885396480560303, "epoch": 0.009560524286815728, "grad_norm": 2.28125, "learning_rate": 4.9491417923294934e-05, "loss": 0.2961139976978302, "mean_token_accuracy": 0.897884726524353, "num_tokens": 87192.0, "step": 62 }, { "entropy": 0.3488570749759674, "epoch": 0.009714726291441789, "grad_norm": 4.09375, "learning_rate": 4.9458197891777556e-05, "loss": 0.41754454374313354, "mean_token_accuracy": 0.8623949289321899, "num_tokens": 88152.0, "step": 63 }, { "entropy": 0.30880776047706604, "epoch": 0.009868928296067848, "grad_norm": 3.59375, "learning_rate": 4.942393882906674e-05, "loss": 0.3697586953639984, "mean_token_accuracy": 0.8826446533203125, "num_tokens": 89370.0, "step": 64 }, { "entropy": 0.33239686489105225, "epoch": 0.010023130300693909, "grad_norm": 3.078125, "learning_rate": 4.9388642190553226e-05, "loss": 0.3398675322532654, "mean_token_accuracy": 0.8863636255264282, "num_tokens": 90478.0, "step": 65 }, { "entropy": 0.2123962938785553, "epoch": 0.01017733230531997, "grad_norm": 2.65625, "learning_rate": 4.935230947570597e-05, "loss": 0.24962244927883148, "mean_token_accuracy": 0.9207471013069153, "num_tokens": 92467.0, "step": 66 }, { "entropy": 0.2466113418340683, "epoch": 0.010331534309946029, "grad_norm": 2.984375, "learning_rate": 4.931494222800844e-05, "loss": 0.2969174385070801, "mean_token_accuracy": 0.8992460370063782, "num_tokens": 93934.0, "step": 67 }, { "entropy": 0.2843821942806244, "epoch": 0.01048573631457209, "grad_norm": 2.765625, "learning_rate": 4.9276542034893044e-05, "loss": 0.32256507873535156, "mean_token_accuracy": 0.8970125913619995, "num_tokens": 95214.0, "step": 68 }, { "entropy": 0.2835680842399597, "epoch": 0.01063993831919815, "grad_norm": 2.359375, "learning_rate": 4.923711052767369e-05, "loss": 0.25358864665031433, "mean_token_accuracy": 0.9075269103050232, "num_tokens": 96617.0, "step": 69 }, { "entropy": 0.24416741728782654, "epoch": 0.01079414032382421, "grad_norm": 2.453125, "learning_rate": 4.9196649381476504e-05, "loss": 0.2399137169122696, "mean_token_accuracy": 0.9191176295280457, "num_tokens": 98257.0, "step": 70 }, { "entropy": 0.29682713747024536, "epoch": 0.01094834232845027, "grad_norm": 3.109375, "learning_rate": 4.915516031516863e-05, "loss": 0.3011798858642578, "mean_token_accuracy": 0.9019886255264282, "num_tokens": 99673.0, "step": 71 }, { "entropy": 0.31920552253723145, "epoch": 0.01110254433307633, "grad_norm": 2.84375, "learning_rate": 4.911264509128524e-05, "loss": 0.3012612462043762, "mean_token_accuracy": 0.8963922262191772, "num_tokens": 100762.0, "step": 72 }, { "entropy": 0.30763909220695496, "epoch": 0.01125674633770239, "grad_norm": 2.5, "learning_rate": 4.906910551595466e-05, "loss": 0.2967263460159302, "mean_token_accuracy": 0.9000924825668335, "num_tokens": 101851.0, "step": 73 }, { "entropy": 0.3040732145309448, "epoch": 0.011410948342328451, "grad_norm": 3.0625, "learning_rate": 4.902454343882162e-05, "loss": 0.3297285735607147, "mean_token_accuracy": 0.8881889581680298, "num_tokens": 103129.0, "step": 74 }, { "entropy": 0.2837064564228058, "epoch": 0.01156515034695451, "grad_norm": 2.546875, "learning_rate": 4.8978960752968695e-05, "loss": 0.28480246663093567, "mean_token_accuracy": 0.8954154849052429, "num_tokens": 104533.0, "step": 75 }, { "entropy": 0.3194725811481476, "epoch": 0.01171935235158057, "grad_norm": 3.171875, "learning_rate": 4.893235939483587e-05, "loss": 0.3251062333583832, "mean_token_accuracy": 0.8847517967224121, "num_tokens": 105669.0, "step": 76 }, { "entropy": 0.2741483747959137, "epoch": 0.01187355435620663, "grad_norm": 2.34375, "learning_rate": 4.8884741344138294e-05, "loss": 0.2672386169433594, "mean_token_accuracy": 0.9074759483337402, "num_tokens": 107028.0, "step": 77 }, { "entropy": 0.2283252328634262, "epoch": 0.01202775636083269, "grad_norm": 2.265625, "learning_rate": 4.8836108623782154e-05, "loss": 0.23968154191970825, "mean_token_accuracy": 0.9172775149345398, "num_tokens": 108946.0, "step": 78 }, { "entropy": 0.3795450031757355, "epoch": 0.012181958365458751, "grad_norm": 2.921875, "learning_rate": 4.8786463299778773e-05, "loss": 0.4119304120540619, "mean_token_accuracy": 0.8639523386955261, "num_tokens": 109961.0, "step": 79 }, { "entropy": 0.3368295431137085, "epoch": 0.01233616037008481, "grad_norm": 2.984375, "learning_rate": 4.873580748115679e-05, "loss": 0.3614250719547272, "mean_token_accuracy": 0.8688679337501526, "num_tokens": 111029.0, "step": 80 }, { "entropy": 0.2857000231742859, "epoch": 0.012490362374710871, "grad_norm": 2.890625, "learning_rate": 4.8684143319872636e-05, "loss": 0.2805139422416687, "mean_token_accuracy": 0.8976377844810486, "num_tokens": 112307.0, "step": 81 }, { "entropy": 0.3508206605911255, "epoch": 0.012644564379336932, "grad_norm": 2.6875, "learning_rate": 4.863147301071903e-05, "loss": 0.3427751362323761, "mean_token_accuracy": 0.8861867785453796, "num_tokens": 113343.0, "step": 82 }, { "entropy": 0.33843186497688293, "epoch": 0.012798766383962991, "grad_norm": 2.796875, "learning_rate": 4.8577798791231815e-05, "loss": 0.32030197978019714, "mean_token_accuracy": 0.8884462118148804, "num_tokens": 114606.0, "step": 83 }, { "entropy": 0.2596885859966278, "epoch": 0.012952968388589052, "grad_norm": 2.15625, "learning_rate": 4.852312294159486e-05, "loss": 0.2479410469532013, "mean_token_accuracy": 0.916926920413971, "num_tokens": 116215.0, "step": 84 }, { "entropy": 0.3047274947166443, "epoch": 0.013107170393215111, "grad_norm": 3.03125, "learning_rate": 4.8467447784543205e-05, "loss": 0.30305323004722595, "mean_token_accuracy": 0.8943606019020081, "num_tokens": 117482.0, "step": 85 }, { "entropy": 0.18909737467765808, "epoch": 0.013261372397841172, "grad_norm": 1.984375, "learning_rate": 4.841077568526439e-05, "loss": 0.2026541382074356, "mean_token_accuracy": 0.9290540814399719, "num_tokens": 119858.0, "step": 86 }, { "entropy": 0.2918696403503418, "epoch": 0.013415574402467233, "grad_norm": 2.953125, "learning_rate": 4.8353109051297976e-05, "loss": 0.3184109330177307, "mean_token_accuracy": 0.8954508900642395, "num_tokens": 121119.0, "step": 87 }, { "entropy": 0.3284008800983429, "epoch": 0.013569776407093292, "grad_norm": 2.953125, "learning_rate": 4.829445033243326e-05, "loss": 0.3101221024990082, "mean_token_accuracy": 0.8810949325561523, "num_tokens": 122296.0, "step": 88 }, { "entropy": 0.2881852090358734, "epoch": 0.013723978411719353, "grad_norm": 2.375, "learning_rate": 4.823480202060521e-05, "loss": 0.28734254837036133, "mean_token_accuracy": 0.897292971611023, "num_tokens": 123560.0, "step": 89 }, { "entropy": 0.3923459053039551, "epoch": 0.013878180416345412, "grad_norm": 3.40625, "learning_rate": 4.817416664978861e-05, "loss": 0.4181320071220398, "mean_token_accuracy": 0.857782781124115, "num_tokens": 124461.0, "step": 90 }, { "entropy": 0.38047465682029724, "epoch": 0.014032382420971472, "grad_norm": 3.296875, "learning_rate": 4.81125467958904e-05, "loss": 0.4139612317085266, "mean_token_accuracy": 0.8700189590454102, "num_tokens": 125523.0, "step": 91 }, { "entropy": 0.32315686345100403, "epoch": 0.014186584425597533, "grad_norm": 3.125, "learning_rate": 4.804994507664026e-05, "loss": 0.29804831743240356, "mean_token_accuracy": 0.9056603908538818, "num_tokens": 126962.0, "step": 92 }, { "entropy": 0.3598167598247528, "epoch": 0.014340786430223592, "grad_norm": 4.375, "learning_rate": 4.798636415147938e-05, "loss": 0.33338406682014465, "mean_token_accuracy": 0.876142144203186, "num_tokens": 127955.0, "step": 93 }, { "entropy": 0.2664312422275543, "epoch": 0.014494988434849653, "grad_norm": 2.46875, "learning_rate": 4.7921806721447494e-05, "loss": 0.24038437008857727, "mean_token_accuracy": 0.9096692204475403, "num_tokens": 129535.0, "step": 94 }, { "entropy": 0.40390363335609436, "epoch": 0.014649190439475714, "grad_norm": 5.46875, "learning_rate": 4.785627552906816e-05, "loss": 0.39077234268188477, "mean_token_accuracy": 0.8630303144454956, "num_tokens": 130368.0, "step": 95 }, { "entropy": 0.25566768646240234, "epoch": 0.014803392444101773, "grad_norm": 2.359375, "learning_rate": 4.77897733582322e-05, "loss": 0.24936963617801666, "mean_token_accuracy": 0.9094488024711609, "num_tokens": 132154.0, "step": 96 }, { "entropy": 0.27786779403686523, "epoch": 0.014957594448727834, "grad_norm": 2.65625, "learning_rate": 4.77223030340795e-05, "loss": 0.26183679699897766, "mean_token_accuracy": 0.9076694250106812, "num_tokens": 133505.0, "step": 97 }, { "entropy": 0.2191360741853714, "epoch": 0.015111796453353893, "grad_norm": 1.7109375, "learning_rate": 4.7653867422878926e-05, "loss": 0.20657718181610107, "mean_token_accuracy": 0.9271235466003418, "num_tokens": 135585.0, "step": 98 }, { "entropy": 0.24974940717220306, "epoch": 0.015265998457979954, "grad_norm": 2.6875, "learning_rate": 4.758446943190661e-05, "loss": 0.2656131982803345, "mean_token_accuracy": 0.9067688584327698, "num_tokens": 137159.0, "step": 99 }, { "entropy": 0.2313736230134964, "epoch": 0.015420200462606014, "grad_norm": 2.046875, "learning_rate": 4.751411200932242e-05, "loss": 0.23317928612232208, "mean_token_accuracy": 0.9191856980323792, "num_tokens": 138788.0, "step": 100 }, { "entropy": 0.2997652590274811, "epoch": 0.015574402467232074, "grad_norm": 2.4375, "learning_rate": 4.7442798144044695e-05, "loss": 0.3120857775211334, "mean_token_accuracy": 0.8966366052627563, "num_tokens": 140015.0, "step": 101 }, { "entropy": 0.3081951141357422, "epoch": 0.015728604471858133, "grad_norm": 2.6875, "learning_rate": 4.7370530865623334e-05, "loss": 0.34071362018585205, "mean_token_accuracy": 0.8860557675361633, "num_tokens": 141278.0, "step": 102 }, { "entropy": 0.2699045240879059, "epoch": 0.015882806476484195, "grad_norm": 2.234375, "learning_rate": 4.729731324411104e-05, "loss": 0.27989721298217773, "mean_token_accuracy": 0.90031898021698, "num_tokens": 142540.0, "step": 103 }, { "entropy": 0.2760254144668579, "epoch": 0.016037008481110254, "grad_norm": 2.234375, "learning_rate": 4.722314838993291e-05, "loss": 0.3070385158061981, "mean_token_accuracy": 0.9090268015861511, "num_tokens": 143966.0, "step": 104 }, { "entropy": 0.29715025424957275, "epoch": 0.016191210485736313, "grad_norm": 2.703125, "learning_rate": 4.714803945375431e-05, "loss": 0.3124261796474457, "mean_token_accuracy": 0.8967213034629822, "num_tokens": 145194.0, "step": 105 }, { "entropy": 0.3180467486381531, "epoch": 0.016345412490362376, "grad_norm": 2.8125, "learning_rate": 4.707198962634701e-05, "loss": 0.3431381285190582, "mean_token_accuracy": 0.8840726017951965, "num_tokens": 146194.0, "step": 106 }, { "entropy": 0.25070613622665405, "epoch": 0.016499614494988435, "grad_norm": 2.3125, "learning_rate": 4.699500213845367e-05, "loss": 0.290202796459198, "mean_token_accuracy": 0.9046997427940369, "num_tokens": 147734.0, "step": 107 }, { "entropy": 0.22292165458202362, "epoch": 0.016653816499614494, "grad_norm": 1.7109375, "learning_rate": 4.691708026065055e-05, "loss": 0.2274986356496811, "mean_token_accuracy": 0.9138554334640503, "num_tokens": 149402.0, "step": 108 }, { "entropy": 0.3579561412334442, "epoch": 0.016808018504240557, "grad_norm": 2.78125, "learning_rate": 4.683822730320858e-05, "loss": 0.31315499544143677, "mean_token_accuracy": 0.889497697353363, "num_tokens": 150505.0, "step": 109 }, { "entropy": 0.23602542281150818, "epoch": 0.016962220508866616, "grad_norm": 1.609375, "learning_rate": 4.6758446615952746e-05, "loss": 0.20407229661941528, "mean_token_accuracy": 0.9303831458091736, "num_tokens": 152366.0, "step": 110 }, { "entropy": 0.3046983480453491, "epoch": 0.017116422513492675, "grad_norm": 3.25, "learning_rate": 4.6677741588119784e-05, "loss": 0.3156391382217407, "mean_token_accuracy": 0.8897196054458618, "num_tokens": 153444.0, "step": 111 }, { "entropy": 0.25662004947662354, "epoch": 0.017270624518118737, "grad_norm": 2.515625, "learning_rate": 4.6596115648214196e-05, "loss": 0.2515248656272888, "mean_token_accuracy": 0.907616376876831, "num_tokens": 154870.0, "step": 112 }, { "entropy": 0.28677132725715637, "epoch": 0.017424826522744796, "grad_norm": 3.265625, "learning_rate": 4.651357226386258e-05, "loss": 0.2942817211151123, "mean_token_accuracy": 0.8936970829963684, "num_tokens": 155941.0, "step": 113 }, { "entropy": 0.21182145178318024, "epoch": 0.017579028527370855, "grad_norm": 2.3125, "learning_rate": 4.6430114941666334e-05, "loss": 0.23567034304141998, "mean_token_accuracy": 0.9196969866752625, "num_tokens": 157269.0, "step": 114 }, { "entropy": 0.18196314573287964, "epoch": 0.017733230531996914, "grad_norm": 1.71875, "learning_rate": 4.6345747227052726e-05, "loss": 0.18516698479652405, "mean_token_accuracy": 0.9305768013000488, "num_tokens": 159236.0, "step": 115 }, { "entropy": 0.23556780815124512, "epoch": 0.017887432536622977, "grad_norm": 2.40625, "learning_rate": 4.626047270412419e-05, "loss": 0.22876134514808655, "mean_token_accuracy": 0.9182389974594116, "num_tokens": 160516.0, "step": 116 }, { "entropy": 0.24857133626937866, "epoch": 0.018041634541249036, "grad_norm": 3.0625, "learning_rate": 4.6174294995506154e-05, "loss": 0.2965892255306244, "mean_token_accuracy": 0.90025794506073, "num_tokens": 161687.0, "step": 117 }, { "entropy": 0.21330931782722473, "epoch": 0.018195836545875095, "grad_norm": 2.421875, "learning_rate": 4.6087217762193105e-05, "loss": 0.23048508167266846, "mean_token_accuracy": 0.9241044521331787, "num_tokens": 163342.0, "step": 118 }, { "entropy": 0.25938084721565247, "epoch": 0.018350038550501158, "grad_norm": 2.734375, "learning_rate": 4.599924470339303e-05, "loss": 0.27338430285453796, "mean_token_accuracy": 0.9029850959777832, "num_tokens": 164690.0, "step": 119 }, { "entropy": 0.3166216015815735, "epoch": 0.018504240555127217, "grad_norm": 3.609375, "learning_rate": 4.5910379556370355e-05, "loss": 0.3654600977897644, "mean_token_accuracy": 0.871026337146759, "num_tokens": 165799.0, "step": 120 }, { "entropy": 0.21709276735782623, "epoch": 0.018658442559753276, "grad_norm": 1.8359375, "learning_rate": 4.582062609628709e-05, "loss": 0.214874729514122, "mean_token_accuracy": 0.9245843291282654, "num_tokens": 167491.0, "step": 121 }, { "entropy": 0.24251380562782288, "epoch": 0.01881264456437934, "grad_norm": 1.9921875, "learning_rate": 4.57299881360425e-05, "loss": 0.26085519790649414, "mean_token_accuracy": 0.9065860509872437, "num_tokens": 168987.0, "step": 122 }, { "entropy": 0.2558088004589081, "epoch": 0.018966846569005397, "grad_norm": 2.359375, "learning_rate": 4.563846952611112e-05, "loss": 0.2583191692829132, "mean_token_accuracy": 0.9092382788658142, "num_tokens": 170229.0, "step": 123 }, { "entropy": 0.28851792216300964, "epoch": 0.019121048573631456, "grad_norm": 2.25, "learning_rate": 4.554607415437915e-05, "loss": 0.28650322556495667, "mean_token_accuracy": 0.8939759135246277, "num_tokens": 171482.0, "step": 124 }, { "entropy": 0.3131585419178009, "epoch": 0.01927525057825752, "grad_norm": 2.578125, "learning_rate": 4.545280594597935e-05, "loss": 0.2936202585697174, "mean_token_accuracy": 0.8922480344772339, "num_tokens": 172780.0, "step": 125 }, { "entropy": 0.24182380735874176, "epoch": 0.019429452582883578, "grad_norm": 2.234375, "learning_rate": 4.535866886312423e-05, "loss": 0.2440458983182907, "mean_token_accuracy": 0.9163833856582642, "num_tokens": 174259.0, "step": 126 }, { "entropy": 0.2646311819553375, "epoch": 0.019583654587509637, "grad_norm": 2.109375, "learning_rate": 4.526366690493777e-05, "loss": 0.2328074872493744, "mean_token_accuracy": 0.9140625, "num_tokens": 175675.0, "step": 127 }, { "entropy": 0.2266581654548645, "epoch": 0.019737856592135696, "grad_norm": 1.671875, "learning_rate": 4.5167804107285514e-05, "loss": 0.21153169870376587, "mean_token_accuracy": 0.922784149646759, "num_tokens": 177522.0, "step": 128 }, { "epoch": 0.019737856592135696, "eval_entropy": 0.27021819719097073, "eval_loss": 0.26394832134246826, "eval_mean_token_accuracy": 0.9077995745410696, "eval_num_tokens": 177522.0, "eval_runtime": 35.0787, "eval_samples_per_second": 77.854, "eval_steps_per_second": 9.75, "step": 128 }, { "entropy": 0.3175150752067566, "epoch": 0.01989205859676176, "grad_norm": 2.5, "learning_rate": 4.507108454260309e-05, "loss": 0.32345065474510193, "mean_token_accuracy": 0.895765483379364, "num_tokens": 178758.0, "step": 129 }, { "entropy": 0.26202577352523804, "epoch": 0.020046260601387818, "grad_norm": 2.59375, "learning_rate": 4.497351231972329e-05, "loss": 0.247625470161438, "mean_token_accuracy": 0.915336549282074, "num_tokens": 180207.0, "step": 130 }, { "entropy": 0.23124445974826813, "epoch": 0.020200462606013877, "grad_norm": 2.265625, "learning_rate": 4.487509158370139e-05, "loss": 0.221195787191391, "mean_token_accuracy": 0.9168797731399536, "num_tokens": 181779.0, "step": 131 }, { "entropy": 0.3099311590194702, "epoch": 0.02035466461063994, "grad_norm": 3.03125, "learning_rate": 4.4775826515639205e-05, "loss": 0.3427657186985016, "mean_token_accuracy": 0.8853210806846619, "num_tokens": 182877.0, "step": 132 }, { "entropy": 0.19146594405174255, "epoch": 0.020508866615266, "grad_norm": 1.9296875, "learning_rate": 4.4675721332507345e-05, "loss": 0.18723616003990173, "mean_token_accuracy": 0.9326805472373962, "num_tokens": 184519.0, "step": 133 }, { "entropy": 0.29960504174232483, "epoch": 0.020663068619892058, "grad_norm": 2.796875, "learning_rate": 4.4574780286966154e-05, "loss": 0.31267160177230835, "mean_token_accuracy": 0.890625, "num_tokens": 185423.0, "step": 134 }, { "entropy": 0.26278653740882874, "epoch": 0.02081727062451812, "grad_norm": 2.28125, "learning_rate": 4.4473007667184995e-05, "loss": 0.27267012000083923, "mean_token_accuracy": 0.9038869142532349, "num_tokens": 186846.0, "step": 135 }, { "entropy": 0.18965409696102142, "epoch": 0.02097147262914418, "grad_norm": 2.390625, "learning_rate": 4.43704077966601e-05, "loss": 0.21876873075962067, "mean_token_accuracy": 0.9245041608810425, "num_tokens": 188417.0, "step": 136 }, { "entropy": 0.20953340828418732, "epoch": 0.021125674633770238, "grad_norm": 2.109375, "learning_rate": 4.426698503403091e-05, "loss": 0.205082505941391, "mean_token_accuracy": 0.926571249961853, "num_tokens": 190032.0, "step": 137 }, { "entropy": 0.2490757405757904, "epoch": 0.0212798766383963, "grad_norm": 2.0, "learning_rate": 4.4162743772894905e-05, "loss": 0.23051951825618744, "mean_token_accuracy": 0.9111841917037964, "num_tokens": 191256.0, "step": 138 }, { "entropy": 0.3277740180492401, "epoch": 0.02143407864302236, "grad_norm": 5.03125, "learning_rate": 4.405768844162094e-05, "loss": 0.37247925996780396, "mean_token_accuracy": 0.8656716346740723, "num_tokens": 192202.0, "step": 139 }, { "entropy": 0.20335228741168976, "epoch": 0.02158828064764842, "grad_norm": 2.0625, "learning_rate": 4.395182350316115e-05, "loss": 0.20390284061431885, "mean_token_accuracy": 0.9318037033081055, "num_tokens": 193779.0, "step": 140 }, { "entropy": 0.222616046667099, "epoch": 0.021742482652274478, "grad_norm": 2.15625, "learning_rate": 4.384515345486131e-05, "loss": 0.22837010025978088, "mean_token_accuracy": 0.9107261896133423, "num_tokens": 195288.0, "step": 141 }, { "entropy": 0.2554439902305603, "epoch": 0.02189668465690054, "grad_norm": 2.65625, "learning_rate": 4.373768282826983e-05, "loss": 0.28548112511634827, "mean_token_accuracy": 0.905958354473114, "num_tokens": 196689.0, "step": 142 }, { "entropy": 0.23849214613437653, "epoch": 0.0220508866615266, "grad_norm": 2.21875, "learning_rate": 4.3629416188945224e-05, "loss": 0.25381097197532654, "mean_token_accuracy": 0.9149101972579956, "num_tokens": 197978.0, "step": 143 }, { "entropy": 0.26421603560447693, "epoch": 0.02220508866615266, "grad_norm": 3.5625, "learning_rate": 4.352035813626214e-05, "loss": 0.27579382061958313, "mean_token_accuracy": 0.8979591727256775, "num_tokens": 199260.0, "step": 144 }, { "entropy": 0.20953713357448578, "epoch": 0.02235929067077872, "grad_norm": 2.328125, "learning_rate": 4.3410513303215985e-05, "loss": 0.1990606188774109, "mean_token_accuracy": 0.9306029677391052, "num_tokens": 201026.0, "step": 145 }, { "entropy": 0.32288917899131775, "epoch": 0.02251349267540478, "grad_norm": 2.984375, "learning_rate": 4.329988635622611e-05, "loss": 0.3260837197303772, "mean_token_accuracy": 0.893796980381012, "num_tokens": 202098.0, "step": 146 }, { "entropy": 0.21132293343544006, "epoch": 0.02266769468003084, "grad_norm": 1.84375, "learning_rate": 4.318848199493758e-05, "loss": 0.19785253703594208, "mean_token_accuracy": 0.9298823475837708, "num_tokens": 204231.0, "step": 147 }, { "entropy": 0.3431147038936615, "epoch": 0.022821896684656902, "grad_norm": 2.84375, "learning_rate": 4.30763049520215e-05, "loss": 0.3377273380756378, "mean_token_accuracy": 0.8919667601585388, "num_tokens": 205322.0, "step": 148 }, { "entropy": 0.24553008377552032, "epoch": 0.02297609868928296, "grad_norm": 2.546875, "learning_rate": 4.296335999297397e-05, "loss": 0.23867689073085785, "mean_token_accuracy": 0.9165446758270264, "num_tokens": 206696.0, "step": 149 }, { "entropy": 0.27541691064834595, "epoch": 0.02313030069390902, "grad_norm": 2.03125, "learning_rate": 4.284965191591364e-05, "loss": 0.25213125348091125, "mean_token_accuracy": 0.914050817489624, "num_tokens": 208042.0, "step": 150 }, { "entropy": 0.23892685770988464, "epoch": 0.023284502698535083, "grad_norm": 2.03125, "learning_rate": 4.2735185551377895e-05, "loss": 0.20277726650238037, "mean_token_accuracy": 0.9304635524749756, "num_tokens": 209560.0, "step": 151 }, { "entropy": 0.2151283323764801, "epoch": 0.02343870470316114, "grad_norm": 2.015625, "learning_rate": 4.261996576211761e-05, "loss": 0.2226867973804474, "mean_token_accuracy": 0.9178715944290161, "num_tokens": 211297.0, "step": 152 }, { "entropy": 0.2410528063774109, "epoch": 0.0235929067077872, "grad_norm": 2.015625, "learning_rate": 4.25039974428906e-05, "loss": 0.22763265669345856, "mean_token_accuracy": 0.9149277806282043, "num_tokens": 212551.0, "step": 153 }, { "entropy": 0.2535974383354187, "epoch": 0.02374710871241326, "grad_norm": 2.328125, "learning_rate": 4.238728552025365e-05, "loss": 0.2421426922082901, "mean_token_accuracy": 0.9143372178077698, "num_tokens": 213668.0, "step": 154 }, { "entropy": 0.2121782749891281, "epoch": 0.023901310717039322, "grad_norm": 1.5, "learning_rate": 4.226983495235328e-05, "loss": 0.20025445520877838, "mean_token_accuracy": 0.9322981238365173, "num_tokens": 215286.0, "step": 155 }, { "entropy": 0.14580558240413666, "epoch": 0.02405551272166538, "grad_norm": 1.625, "learning_rate": 4.215165072871505e-05, "loss": 0.14826127886772156, "mean_token_accuracy": 0.9467787146568298, "num_tokens": 217436.0, "step": 156 }, { "entropy": 0.2315557599067688, "epoch": 0.02420971472629144, "grad_norm": 2.078125, "learning_rate": 4.203273787003162e-05, "loss": 0.2486051321029663, "mean_token_accuracy": 0.9164133667945862, "num_tokens": 218760.0, "step": 157 }, { "entropy": 0.25005754828453064, "epoch": 0.024363916730917503, "grad_norm": 2.390625, "learning_rate": 4.1913101427949505e-05, "loss": 0.2627011835575104, "mean_token_accuracy": 0.9080632925033569, "num_tokens": 220095.0, "step": 158 }, { "entropy": 0.2149634212255478, "epoch": 0.024518118735543562, "grad_norm": 2.28125, "learning_rate": 4.179274648485438e-05, "loss": 0.21630343794822693, "mean_token_accuracy": 0.9172714352607727, "num_tokens": 221481.0, "step": 159 }, { "entropy": 0.2316989302635193, "epoch": 0.02467232074016962, "grad_norm": 2.3125, "learning_rate": 4.1671678153655256e-05, "loss": 0.240981787443161, "mean_token_accuracy": 0.9135708808898926, "num_tokens": 222808.0, "step": 160 }, { "entropy": 0.29497963190078735, "epoch": 0.024826522744795684, "grad_norm": 2.40625, "learning_rate": 4.154990157756722e-05, "loss": 0.2961036264896393, "mean_token_accuracy": 0.9030969142913818, "num_tokens": 223817.0, "step": 161 }, { "entropy": 0.22725972533226013, "epoch": 0.024980724749421743, "grad_norm": 2.546875, "learning_rate": 4.142742192989299e-05, "loss": 0.22807390987873077, "mean_token_accuracy": 0.9114027619361877, "num_tokens": 225044.0, "step": 162 }, { "entropy": 0.2280416190624237, "epoch": 0.025134926754047802, "grad_norm": 2.421875, "learning_rate": 4.1304244413803076e-05, "loss": 0.24813513457775116, "mean_token_accuracy": 0.9090909361839294, "num_tokens": 226339.0, "step": 163 }, { "entropy": 0.20092645287513733, "epoch": 0.025289128758673864, "grad_norm": 2.015625, "learning_rate": 4.118037426211482e-05, "loss": 0.22428975999355316, "mean_token_accuracy": 0.9173313975334167, "num_tokens": 227726.0, "step": 164 }, { "entropy": 0.20079851150512695, "epoch": 0.025443330763299923, "grad_norm": 9.5625, "learning_rate": 4.105581673707002e-05, "loss": 0.21033848822116852, "mean_token_accuracy": 0.9232493042945862, "num_tokens": 229519.0, "step": 165 }, { "entropy": 0.25729137659072876, "epoch": 0.025597532767925982, "grad_norm": 2.3125, "learning_rate": 4.0930577130111424e-05, "loss": 0.2733251452445984, "mean_token_accuracy": 0.9045871496200562, "num_tokens": 230617.0, "step": 166 }, { "entropy": 0.20442764461040497, "epoch": 0.02575173477255204, "grad_norm": 1.890625, "learning_rate": 4.080466076165793e-05, "loss": 0.20845486223697662, "mean_token_accuracy": 0.9209572076797485, "num_tokens": 232004.0, "step": 167 }, { "entropy": 0.20175087451934814, "epoch": 0.025905936777178104, "grad_norm": 2.453125, "learning_rate": 4.067807298087857e-05, "loss": 0.21334150433540344, "mean_token_accuracy": 0.9243085980415344, "num_tokens": 233386.0, "step": 168 }, { "entropy": 0.26961395144462585, "epoch": 0.026060138781804163, "grad_norm": 2.125, "learning_rate": 4.055081916546525e-05, "loss": 0.24742326140403748, "mean_token_accuracy": 0.9157986044883728, "num_tokens": 234546.0, "step": 169 }, { "entropy": 0.20450648665428162, "epoch": 0.026214340786430222, "grad_norm": 1.6953125, "learning_rate": 4.042290472140431e-05, "loss": 0.20523257553577423, "mean_token_accuracy": 0.9297789335250854, "num_tokens": 236092.0, "step": 170 }, { "entropy": 0.2690446972846985, "epoch": 0.026368542791056285, "grad_norm": 2.15625, "learning_rate": 4.029433508274686e-05, "loss": 0.26763197779655457, "mean_token_accuracy": 0.9070660471916199, "num_tokens": 237402.0, "step": 171 }, { "entropy": 0.22288963198661804, "epoch": 0.026522744795682344, "grad_norm": 2.03125, "learning_rate": 4.0165115711377945e-05, "loss": 0.24567259848117828, "mean_token_accuracy": 0.9189382791519165, "num_tokens": 238804.0, "step": 172 }, { "entropy": 0.19029025733470917, "epoch": 0.026676946800308403, "grad_norm": 1.8671875, "learning_rate": 4.003525209678449e-05, "loss": 0.18879841268062592, "mean_token_accuracy": 0.9351808428764343, "num_tokens": 240941.0, "step": 173 }, { "entropy": 0.2573792338371277, "epoch": 0.026831148804934465, "grad_norm": 2.96875, "learning_rate": 3.9904749755822114e-05, "loss": 0.2607381045818329, "mean_token_accuracy": 0.906000018119812, "num_tokens": 242449.0, "step": 174 }, { "entropy": 0.2028045505285263, "epoch": 0.026985350809560524, "grad_norm": 1.3984375, "learning_rate": 3.977361423248075e-05, "loss": 0.1825239360332489, "mean_token_accuracy": 0.9339895844459534, "num_tokens": 244184.0, "step": 175 }, { "entropy": 0.27057698369026184, "epoch": 0.027139552814186584, "grad_norm": 3.140625, "learning_rate": 3.964185109764915e-05, "loss": 0.30133944749832153, "mean_token_accuracy": 0.8857142925262451, "num_tokens": 245347.0, "step": 176 }, { "entropy": 0.18647152185440063, "epoch": 0.027293754818812646, "grad_norm": 1.8046875, "learning_rate": 3.95094659488782e-05, "loss": 0.1798812299966812, "mean_token_accuracy": 0.9323040246963501, "num_tokens": 247039.0, "step": 177 }, { "entropy": 0.2583964765071869, "epoch": 0.027447956823438705, "grad_norm": 2.28125, "learning_rate": 3.9376464410143124e-05, "loss": 0.2609320878982544, "mean_token_accuracy": 0.9023405909538269, "num_tokens": 248286.0, "step": 178 }, { "entropy": 0.24908345937728882, "epoch": 0.027602158828064764, "grad_norm": 2.09375, "learning_rate": 3.9242852131604585e-05, "loss": 0.2381179928779602, "mean_token_accuracy": 0.9222641587257385, "num_tokens": 249619.0, "step": 179 }, { "entropy": 0.21503198146820068, "epoch": 0.027756360832690823, "grad_norm": 2.5, "learning_rate": 3.910863478936864e-05, "loss": 0.2604519724845886, "mean_token_accuracy": 0.9127399921417236, "num_tokens": 251346.0, "step": 180 }, { "entropy": 0.22753889858722687, "epoch": 0.027910562837316886, "grad_norm": 1.84375, "learning_rate": 3.897381808524562e-05, "loss": 0.23742565512657166, "mean_token_accuracy": 0.9219380617141724, "num_tokens": 252840.0, "step": 181 }, { "entropy": 0.25326159596443176, "epoch": 0.028064764841942945, "grad_norm": 2.203125, "learning_rate": 3.883840774650788e-05, "loss": 0.28680431842803955, "mean_token_accuracy": 0.9005083441734314, "num_tokens": 254225.0, "step": 182 }, { "entropy": 0.24126410484313965, "epoch": 0.028218966846569004, "grad_norm": 2.109375, "learning_rate": 3.870240952564653e-05, "loss": 0.2406134009361267, "mean_token_accuracy": 0.9119541645050049, "num_tokens": 255630.0, "step": 183 }, { "entropy": 0.2304130345582962, "epoch": 0.028373168851195067, "grad_norm": 1.6953125, "learning_rate": 3.856582920012706e-05, "loss": 0.22154204547405243, "mean_token_accuracy": 0.9195979833602905, "num_tokens": 257031.0, "step": 184 }, { "entropy": 0.16509661078453064, "epoch": 0.028527370855821126, "grad_norm": 1.3125, "learning_rate": 3.842867257214383e-05, "loss": 0.15430063009262085, "mean_token_accuracy": 0.940733790397644, "num_tokens": 259165.0, "step": 185 }, { "entropy": 0.24022063612937927, "epoch": 0.028681572860447185, "grad_norm": 1.7890625, "learning_rate": 3.8290945468373684e-05, "loss": 0.20412693917751312, "mean_token_accuracy": 0.9327940344810486, "num_tokens": 260780.0, "step": 186 }, { "entropy": 0.2785824239253998, "epoch": 0.028835774865073247, "grad_norm": 2.390625, "learning_rate": 3.8152653739728363e-05, "loss": 0.2689974308013916, "mean_token_accuracy": 0.9066666960716248, "num_tokens": 261988.0, "step": 187 }, { "entropy": 0.20374569296836853, "epoch": 0.028989976869699306, "grad_norm": 2.0, "learning_rate": 3.8013803261105916e-05, "loss": 0.21978892385959625, "mean_token_accuracy": 0.9233038425445557, "num_tokens": 263691.0, "step": 188 }, { "entropy": 0.2387579381465912, "epoch": 0.029144178874325365, "grad_norm": 1.984375, "learning_rate": 3.787439993114123e-05, "loss": 0.23546524345874786, "mean_token_accuracy": 0.9189907312393188, "num_tokens": 265205.0, "step": 189 }, { "entropy": 0.22492903470993042, "epoch": 0.029298380878951428, "grad_norm": 1.8671875, "learning_rate": 3.7734449671955326e-05, "loss": 0.21074332296848297, "mean_token_accuracy": 0.9219586849212646, "num_tokens": 266520.0, "step": 190 }, { "entropy": 0.19710952043533325, "epoch": 0.029452582883577487, "grad_norm": 1.9296875, "learning_rate": 3.759395842890384e-05, "loss": 0.1993340104818344, "mean_token_accuracy": 0.9277042150497437, "num_tokens": 268340.0, "step": 191 }, { "entropy": 0.24934346973896027, "epoch": 0.029606784888203546, "grad_norm": 1.890625, "learning_rate": 3.7452932170324464e-05, "loss": 0.24506257474422455, "mean_token_accuracy": 0.9209383130073547, "num_tokens": 269499.0, "step": 192 }, { "entropy": 0.2751508355140686, "epoch": 0.029760986892829605, "grad_norm": 2.4375, "learning_rate": 3.731137688728335e-05, "loss": 0.28203558921813965, "mean_token_accuracy": 0.9066317677497864, "num_tokens": 270653.0, "step": 193 }, { "entropy": 0.2998161017894745, "epoch": 0.029915188897455668, "grad_norm": 2.640625, "learning_rate": 3.716929859332063e-05, "loss": 0.2953347861766815, "mean_token_accuracy": 0.9018287062644958, "num_tokens": 271700.0, "step": 194 }, { "entropy": 0.2493629902601242, "epoch": 0.030069390902081727, "grad_norm": 2.1875, "learning_rate": 3.7026703324194966e-05, "loss": 0.26706650853157043, "mean_token_accuracy": 0.9076277017593384, "num_tokens": 273137.0, "step": 195 }, { "entropy": 0.20723779499530792, "epoch": 0.030223592906707786, "grad_norm": 2.140625, "learning_rate": 3.688359713762707e-05, "loss": 0.22939355671405792, "mean_token_accuracy": 0.9125827550888062, "num_tokens": 274655.0, "step": 196 }, { "entropy": 0.22990985214710236, "epoch": 0.03037779491133385, "grad_norm": 2.046875, "learning_rate": 3.673998611304246e-05, "loss": 0.2153758704662323, "mean_token_accuracy": 0.9279279112815857, "num_tokens": 275773.0, "step": 197 }, { "entropy": 0.29038283228874207, "epoch": 0.030531996915959907, "grad_norm": 2.71875, "learning_rate": 3.6595876351313116e-05, "loss": 0.304492324590683, "mean_token_accuracy": 0.9004576802253723, "num_tokens": 276655.0, "step": 198 }, { "entropy": 0.19836601614952087, "epoch": 0.030686198920585966, "grad_norm": 1.8359375, "learning_rate": 3.645127397449832e-05, "loss": 0.2065221518278122, "mean_token_accuracy": 0.9339622855186462, "num_tokens": 278359.0, "step": 199 }, { "entropy": 0.25179192423820496, "epoch": 0.03084040092521203, "grad_norm": 2.265625, "learning_rate": 3.6306185125584615e-05, "loss": 0.2616140842437744, "mean_token_accuracy": 0.9063336253166199, "num_tokens": 279488.0, "step": 200 }, { "entropy": 0.18242394924163818, "epoch": 0.030994602929838088, "grad_norm": 1.734375, "learning_rate": 3.616061596822478e-05, "loss": 0.17770832777023315, "mean_token_accuracy": 0.9277376532554626, "num_tokens": 281295.0, "step": 201 }, { "entropy": 0.24629506468772888, "epoch": 0.031148804934464147, "grad_norm": 2.4375, "learning_rate": 3.601457268647606e-05, "loss": 0.2535253167152405, "mean_token_accuracy": 0.9059450030326843, "num_tokens": 282430.0, "step": 202 }, { "entropy": 0.19920703768730164, "epoch": 0.03130300693909021, "grad_norm": 1.921875, "learning_rate": 3.586806148453736e-05, "loss": 0.20293940603733063, "mean_token_accuracy": 0.9283132553100586, "num_tokens": 284098.0, "step": 203 }, { "entropy": 0.1916186809539795, "epoch": 0.031457208943716265, "grad_norm": 1.578125, "learning_rate": 3.572108858648579e-05, "loss": 0.1925540268421173, "mean_token_accuracy": 0.9329091906547546, "num_tokens": 285835.0, "step": 204 }, { "entropy": 0.24154330790042877, "epoch": 0.03161141094834233, "grad_norm": 2.0625, "learning_rate": 3.557366023601216e-05, "loss": 0.2560335099697113, "mean_token_accuracy": 0.9222126007080078, "num_tokens": 287000.0, "step": 205 }, { "entropy": 0.24839094281196594, "epoch": 0.03176561295296839, "grad_norm": 2.09375, "learning_rate": 3.542578269615579e-05, "loss": 0.24170006811618805, "mean_token_accuracy": 0.9167927503585815, "num_tokens": 288330.0, "step": 206 }, { "entropy": 0.19456236064434052, "epoch": 0.031919814957594446, "grad_norm": 1.640625, "learning_rate": 3.527746224903842e-05, "loss": 0.18520742654800415, "mean_token_accuracy": 0.9366295337677002, "num_tokens": 289774.0, "step": 207 }, { "entropy": 0.24151258170604706, "epoch": 0.03207401696222051, "grad_norm": 1.8828125, "learning_rate": 3.512870519559733e-05, "loss": 0.22108638286590576, "mean_token_accuracy": 0.9167962670326233, "num_tokens": 291068.0, "step": 208 }, { "entropy": 0.3510158658027649, "epoch": 0.03222821896684657, "grad_norm": 3.71875, "learning_rate": 3.49795178553177e-05, "loss": 0.41906648874282837, "mean_token_accuracy": 0.8701754212379456, "num_tokens": 291931.0, "step": 209 }, { "entropy": 0.3286966383457184, "epoch": 0.03238242097147263, "grad_norm": 3.171875, "learning_rate": 3.48299065659641e-05, "loss": 0.343354731798172, "mean_token_accuracy": 0.8834951519966125, "num_tokens": 292866.0, "step": 210 }, { "entropy": 0.19397929310798645, "epoch": 0.03253662297609869, "grad_norm": 1.6875, "learning_rate": 3.467987768331127e-05, "loss": 0.1917928159236908, "mean_token_accuracy": 0.9349930882453918, "num_tokens": 294320.0, "step": 211 }, { "entropy": 0.2259572446346283, "epoch": 0.03269082498072475, "grad_norm": 2.203125, "learning_rate": 3.452943758087414e-05, "loss": 0.24537329375743866, "mean_token_accuracy": 0.9182724356651306, "num_tokens": 295833.0, "step": 212 }, { "entropy": 0.22965691983699799, "epoch": 0.03284502698535081, "grad_norm": 1.7890625, "learning_rate": 3.437859264963702e-05, "loss": 0.2151767462491989, "mean_token_accuracy": 0.9223232865333557, "num_tokens": 297270.0, "step": 213 }, { "entropy": 0.2611003518104553, "epoch": 0.03299922898997687, "grad_norm": 2.890625, "learning_rate": 3.422734929778213e-05, "loss": 0.2612400949001312, "mean_token_accuracy": 0.8977055549621582, "num_tokens": 298324.0, "step": 214 }, { "entropy": 0.1909189224243164, "epoch": 0.03315343099460293, "grad_norm": 1.8671875, "learning_rate": 3.407571395041736e-05, "loss": 0.20462700724601746, "mean_token_accuracy": 0.9242695569992065, "num_tokens": 300009.0, "step": 215 }, { "entropy": 0.2556368410587311, "epoch": 0.03330763299922899, "grad_norm": 2.03125, "learning_rate": 3.392369304930334e-05, "loss": 0.2566298246383667, "mean_token_accuracy": 0.9090163707733154, "num_tokens": 301237.0, "step": 216 }, { "entropy": 0.27811554074287415, "epoch": 0.03346183500385505, "grad_norm": 2.0625, "learning_rate": 3.377129305257975e-05, "loss": 0.2745239734649658, "mean_token_accuracy": 0.9044750332832336, "num_tokens": 302407.0, "step": 217 }, { "entropy": 0.21509166061878204, "epoch": 0.03361603700848111, "grad_norm": 1.84375, "learning_rate": 3.361852043449096e-05, "loss": 0.2006048709154129, "mean_token_accuracy": 0.9250646233558655, "num_tokens": 303963.0, "step": 218 }, { "entropy": 0.2612791359424591, "epoch": 0.03377023901310717, "grad_norm": 2.0, "learning_rate": 3.3465381685111054e-05, "loss": 0.27390342950820923, "mean_token_accuracy": 0.8982036113739014, "num_tokens": 305140.0, "step": 219 }, { "entropy": 0.2126745879650116, "epoch": 0.03392444101773323, "grad_norm": 1.609375, "learning_rate": 3.331188331006804e-05, "loss": 0.20790794491767883, "mean_token_accuracy": 0.9276844263076782, "num_tokens": 306517.0, "step": 220 }, { "entropy": 0.216102734208107, "epoch": 0.034078643022359294, "grad_norm": 1.53125, "learning_rate": 3.315803183026753e-05, "loss": 0.2031707614660263, "mean_token_accuracy": 0.9320327043533325, "num_tokens": 308114.0, "step": 221 }, { "entropy": 0.23003709316253662, "epoch": 0.03423284502698535, "grad_norm": 2.09375, "learning_rate": 3.30038337816157e-05, "loss": 0.24152696132659912, "mean_token_accuracy": 0.9172229766845703, "num_tokens": 309620.0, "step": 222 }, { "entropy": 0.25657832622528076, "epoch": 0.03438704703161141, "grad_norm": 1.9375, "learning_rate": 3.284929571474164e-05, "loss": 0.2669946551322937, "mean_token_accuracy": 0.9029045701026917, "num_tokens": 310833.0, "step": 223 }, { "entropy": 0.23583689332008362, "epoch": 0.034541249036237474, "grad_norm": 2.125, "learning_rate": 3.2694424194719046e-05, "loss": 0.24596942961215973, "mean_token_accuracy": 0.9083665609359741, "num_tokens": 312096.0, "step": 224 }, { "entropy": 0.197276309132576, "epoch": 0.03469545104086353, "grad_norm": 1.703125, "learning_rate": 3.2539225800787385e-05, "loss": 0.19344845414161682, "mean_token_accuracy": 0.93291836977005, "num_tokens": 313550.0, "step": 225 }, { "entropy": 0.3082696497440338, "epoch": 0.03484965304548959, "grad_norm": 3.484375, "learning_rate": 3.2383707126072315e-05, "loss": 0.3064239025115967, "mean_token_accuracy": 0.8925233483314514, "num_tokens": 314628.0, "step": 226 }, { "entropy": 0.19953380525112152, "epoch": 0.03500385505011565, "grad_norm": 1.734375, "learning_rate": 3.222787477730567e-05, "loss": 0.19340643286705017, "mean_token_accuracy": 0.9274017214775085, "num_tokens": 316468.0, "step": 227 }, { "entropy": 0.27000153064727783, "epoch": 0.03515805705474171, "grad_norm": 3.828125, "learning_rate": 3.207173537454472e-05, "loss": 0.2817123830318451, "mean_token_accuracy": 0.9068965315818787, "num_tokens": 317636.0, "step": 228 }, { "entropy": 0.22825853526592255, "epoch": 0.03531225905936777, "grad_norm": 2.125, "learning_rate": 3.191529555089102e-05, "loss": 0.22379839420318604, "mean_token_accuracy": 0.9244868159294128, "num_tokens": 319008.0, "step": 229 }, { "entropy": 0.2942773997783661, "epoch": 0.03546646106399383, "grad_norm": 2.6875, "learning_rate": 3.175856195220855e-05, "loss": 0.2916644215583801, "mean_token_accuracy": 0.8996211886405945, "num_tokens": 320072.0, "step": 230 }, { "entropy": 0.2531821131706238, "epoch": 0.03562066306861989, "grad_norm": 2.265625, "learning_rate": 3.160154123684143e-05, "loss": 0.2512527108192444, "mean_token_accuracy": 0.9058629274368286, "num_tokens": 321291.0, "step": 231 }, { "entropy": 0.234887957572937, "epoch": 0.035774865073245954, "grad_norm": 1.9140625, "learning_rate": 3.1444240075331054e-05, "loss": 0.2259407341480255, "mean_token_accuracy": 0.9231894612312317, "num_tokens": 322666.0, "step": 232 }, { "entropy": 0.23325884342193604, "epoch": 0.03592906707787201, "grad_norm": 1.96875, "learning_rate": 3.128666515013269e-05, "loss": 0.2157772332429886, "mean_token_accuracy": 0.9207017421722412, "num_tokens": 324099.0, "step": 233 }, { "entropy": 0.15830406546592712, "epoch": 0.03608326908249807, "grad_norm": 1.03125, "learning_rate": 3.112882315533163e-05, "loss": 0.1372249573469162, "mean_token_accuracy": 0.9470046162605286, "num_tokens": 326277.0, "step": 234 }, { "entropy": 0.25762706995010376, "epoch": 0.036237471087124135, "grad_norm": 1.828125, "learning_rate": 3.097072079635878e-05, "loss": 0.23957906663417816, "mean_token_accuracy": 0.915335476398468, "num_tokens": 327537.0, "step": 235 }, { "entropy": 0.21047890186309814, "epoch": 0.03639167309175019, "grad_norm": 1.7421875, "learning_rate": 3.081236478970583e-05, "loss": 0.22065354883670807, "mean_token_accuracy": 0.9236826300621033, "num_tokens": 329196.0, "step": 236 }, { "entropy": 0.22569093108177185, "epoch": 0.03654587509637625, "grad_norm": 1.90625, "learning_rate": 3.065376186263991e-05, "loss": 0.21428702771663666, "mean_token_accuracy": 0.9252577424049377, "num_tokens": 330368.0, "step": 237 }, { "entropy": 0.2325230836868286, "epoch": 0.036700077101002315, "grad_norm": 1.7578125, "learning_rate": 3.049491875291778e-05, "loss": 0.23734821379184723, "mean_token_accuracy": 0.9114202260971069, "num_tokens": 331742.0, "step": 238 }, { "entropy": 0.2122831493616104, "epoch": 0.03685427910562837, "grad_norm": 1.609375, "learning_rate": 3.0335842208499637e-05, "loss": 0.2174147367477417, "mean_token_accuracy": 0.9171270728111267, "num_tokens": 333198.0, "step": 239 }, { "entropy": 0.23024694621562958, "epoch": 0.03700848111025443, "grad_norm": 2.046875, "learning_rate": 3.0176538987262442e-05, "loss": 0.2907542288303375, "mean_token_accuracy": 0.9019264578819275, "num_tokens": 334348.0, "step": 240 }, { "entropy": 0.2648603022098541, "epoch": 0.037162683114880496, "grad_norm": 1.875, "learning_rate": 3.0017015856712814e-05, "loss": 0.2652634382247925, "mean_token_accuracy": 0.9065656661987305, "num_tokens": 335544.0, "step": 241 }, { "entropy": 0.2533347010612488, "epoch": 0.03731688511950655, "grad_norm": 1.96875, "learning_rate": 2.9857279593699544e-05, "loss": 0.2646684944629669, "mean_token_accuracy": 0.9075286388397217, "num_tokens": 336774.0, "step": 242 }, { "entropy": 0.22679953277111053, "epoch": 0.037471087124132614, "grad_norm": 2.078125, "learning_rate": 2.9697336984125683e-05, "loss": 0.22257877886295319, "mean_token_accuracy": 0.9175019264221191, "num_tokens": 338079.0, "step": 243 }, { "entropy": 0.19455574452877045, "epoch": 0.03762528912875868, "grad_norm": 1.5546875, "learning_rate": 2.9537194822660295e-05, "loss": 0.19329281151294708, "mean_token_accuracy": 0.9266055226325989, "num_tokens": 339722.0, "step": 244 }, { "entropy": 0.20773011445999146, "epoch": 0.03777949113338473, "grad_norm": 1.9453125, "learning_rate": 2.9376859912449794e-05, "loss": 0.20826096832752228, "mean_token_accuracy": 0.9232895374298096, "num_tokens": 341177.0, "step": 245 }, { "entropy": 0.2844797372817993, "epoch": 0.037933693138010795, "grad_norm": 1.796875, "learning_rate": 2.9216339064828914e-05, "loss": 0.2653990387916565, "mean_token_accuracy": 0.910646378993988, "num_tokens": 342237.0, "step": 246 }, { "entropy": 0.19197861850261688, "epoch": 0.03808789514263686, "grad_norm": 1.5859375, "learning_rate": 2.9055639099031386e-05, "loss": 0.191925048828125, "mean_token_accuracy": 0.9356250166893005, "num_tokens": 343845.0, "step": 247 }, { "entropy": 0.28776344656944275, "epoch": 0.03824209714726291, "grad_norm": 2.59375, "learning_rate": 2.8894766841900223e-05, "loss": 0.27679842710494995, "mean_token_accuracy": 0.9086069464683533, "num_tokens": 344980.0, "step": 248 }, { "entropy": 0.23193758726119995, "epoch": 0.038396299151888975, "grad_norm": 1.9765625, "learning_rate": 2.8733729127597692e-05, "loss": 0.2313500940799713, "mean_token_accuracy": 0.9189602732658386, "num_tokens": 346296.0, "step": 249 }, { "entropy": 0.19187554717063904, "epoch": 0.03855050115651504, "grad_norm": 1.4765625, "learning_rate": 2.8572532797315006e-05, "loss": 0.17860986292362213, "mean_token_accuracy": 0.9357484579086304, "num_tokens": 347767.0, "step": 250 }, { "entropy": 0.26534777879714966, "epoch": 0.038704703161141094, "grad_norm": 2.234375, "learning_rate": 2.8411184698981684e-05, "loss": 0.2811349630355835, "mean_token_accuracy": 0.9026548862457275, "num_tokens": 349131.0, "step": 251 }, { "entropy": 0.19166985154151917, "epoch": 0.038858905165767156, "grad_norm": 1.4375, "learning_rate": 2.824969168697466e-05, "loss": 0.1818903237581253, "mean_token_accuracy": 0.9364994764328003, "num_tokens": 351013.0, "step": 252 }, { "entropy": 0.2197422981262207, "epoch": 0.03901310717039321, "grad_norm": 2.0, "learning_rate": 2.808806062182705e-05, "loss": 0.24899303913116455, "mean_token_accuracy": 0.9060351252555847, "num_tokens": 352330.0, "step": 253 }, { "entropy": 0.24478891491889954, "epoch": 0.039167309175019274, "grad_norm": 2.046875, "learning_rate": 2.792629836993676e-05, "loss": 0.24458467960357666, "mean_token_accuracy": 0.914650559425354, "num_tokens": 353826.0, "step": 254 }, { "entropy": 0.17300452291965485, "epoch": 0.03932151117964534, "grad_norm": 1.453125, "learning_rate": 2.776441180327475e-05, "loss": 0.1748412549495697, "mean_token_accuracy": 0.9393326640129089, "num_tokens": 355812.0, "step": 255 }, { "entropy": 0.28217461705207825, "epoch": 0.03947571318427139, "grad_norm": 2.375, "learning_rate": 2.76024077990931e-05, "loss": 0.28308406472206116, "mean_token_accuracy": 0.908906877040863, "num_tokens": 356808.0, "step": 256 }, { "epoch": 0.03947571318427139, "eval_entropy": 0.2422610384068991, "eval_loss": 0.2376217544078827, "eval_mean_token_accuracy": 0.9154835451416105, "eval_num_tokens": 356808.0, "eval_runtime": 34.9417, "eval_samples_per_second": 78.159, "eval_steps_per_second": 9.788, "step": 256 }, { "entropy": 0.2056795060634613, "epoch": 0.039629915188897455, "grad_norm": 1.7265625, "learning_rate": 2.7440293239632885e-05, "loss": 0.1848773956298828, "mean_token_accuracy": 0.9414348602294922, "num_tokens": 358182.0, "step": 257 }, { "entropy": 0.21008774638175964, "epoch": 0.03978411719352352, "grad_norm": 2.125, "learning_rate": 2.7278075011831757e-05, "loss": 0.23831506073474884, "mean_token_accuracy": 0.9120956659317017, "num_tokens": 359612.0, "step": 258 }, { "entropy": 0.22274059057235718, "epoch": 0.03993831919814957, "grad_norm": 2.078125, "learning_rate": 2.711576000703141e-05, "loss": 0.22159968316555023, "mean_token_accuracy": 0.9259036183357239, "num_tokens": 361280.0, "step": 259 }, { "entropy": 0.24206753075122833, "epoch": 0.040092521202775636, "grad_norm": 2.21875, "learning_rate": 2.6953355120684802e-05, "loss": 0.2599974274635315, "mean_token_accuracy": 0.915960431098938, "num_tokens": 362704.0, "step": 260 }, { "entropy": 0.22195129096508026, "epoch": 0.0402467232074017, "grad_norm": 1.8203125, "learning_rate": 2.6790867252063247e-05, "loss": 0.22732976078987122, "mean_token_accuracy": 0.9146426320075989, "num_tokens": 364153.0, "step": 261 }, { "entropy": 0.19769293069839478, "epoch": 0.040400925212027754, "grad_norm": 1.5390625, "learning_rate": 2.6628303303963288e-05, "loss": 0.18025925755500793, "mean_token_accuracy": 0.9401107430458069, "num_tokens": 366148.0, "step": 262 }, { "entropy": 0.36093661189079285, "epoch": 0.040555127216653816, "grad_norm": 2.828125, "learning_rate": 2.646567018241349e-05, "loss": 0.36829474568367004, "mean_token_accuracy": 0.8780487775802612, "num_tokens": 367140.0, "step": 263 }, { "entropy": 0.28070077300071716, "epoch": 0.04070932922127988, "grad_norm": 2.171875, "learning_rate": 2.6302974796381015e-05, "loss": 0.27073192596435547, "mean_token_accuracy": 0.9048058986663818, "num_tokens": 368230.0, "step": 264 }, { "entropy": 0.28238415718078613, "epoch": 0.040863531225905934, "grad_norm": 2.078125, "learning_rate": 2.6140224057478158e-05, "loss": 0.2595861256122589, "mean_token_accuracy": 0.9181897044181824, "num_tokens": 369387.0, "step": 265 }, { "entropy": 0.24161042273044586, "epoch": 0.041017733230532, "grad_norm": 1.6328125, "learning_rate": 2.5977424879668705e-05, "loss": 0.22480149567127228, "mean_token_accuracy": 0.9269341230392456, "num_tokens": 370791.0, "step": 266 }, { "entropy": 0.1969321221113205, "epoch": 0.04117193523515806, "grad_norm": 1.53125, "learning_rate": 2.5814584178974218e-05, "loss": 0.1720927655696869, "mean_token_accuracy": 0.934974730014801, "num_tokens": 372383.0, "step": 267 }, { "entropy": 0.23700961470603943, "epoch": 0.041326137239784115, "grad_norm": 1.921875, "learning_rate": 2.5651708873180223e-05, "loss": 0.22749063372612, "mean_token_accuracy": 0.917475700378418, "num_tokens": 373627.0, "step": 268 }, { "entropy": 0.22176285088062286, "epoch": 0.04148033924441018, "grad_norm": 1.4375, "learning_rate": 2.5488805881542356e-05, "loss": 0.19518814980983734, "mean_token_accuracy": 0.922112226486206, "num_tokens": 375150.0, "step": 269 }, { "entropy": 0.19811592996120453, "epoch": 0.04163454124903624, "grad_norm": 1.65625, "learning_rate": 2.5325882124492395e-05, "loss": 0.2038094401359558, "mean_token_accuracy": 0.9243918657302856, "num_tokens": 376679.0, "step": 270 }, { "entropy": 0.16331960260868073, "epoch": 0.041788743253662296, "grad_norm": 1.296875, "learning_rate": 2.5162944523344256e-05, "loss": 0.15330754220485687, "mean_token_accuracy": 0.9463318586349487, "num_tokens": 378718.0, "step": 271 }, { "entropy": 0.2266637682914734, "epoch": 0.04194294525828836, "grad_norm": 1.7734375, "learning_rate": 2.5e-05, "loss": 0.20924291014671326, "mean_token_accuracy": 0.9225251078605652, "num_tokens": 380120.0, "step": 272 }, { "entropy": 0.27386748790740967, "epoch": 0.04209714726291442, "grad_norm": 2.296875, "learning_rate": 2.4837055476655746e-05, "loss": 0.28491681814193726, "mean_token_accuracy": 0.9068265557289124, "num_tokens": 381212.0, "step": 273 }, { "entropy": 0.2462942749261856, "epoch": 0.042251349267540476, "grad_norm": 1.9375, "learning_rate": 2.4674117875507615e-05, "loss": 0.23223665356636047, "mean_token_accuracy": 0.9165329337120056, "num_tokens": 382466.0, "step": 274 }, { "entropy": 0.2614425718784332, "epoch": 0.04240555127216654, "grad_norm": 2.265625, "learning_rate": 2.451119411845765e-05, "loss": 0.27489128708839417, "mean_token_accuracy": 0.9016948938369751, "num_tokens": 383654.0, "step": 275 }, { "entropy": 0.21999643743038177, "epoch": 0.0425597532767926, "grad_norm": 2.140625, "learning_rate": 2.4348291126819783e-05, "loss": 0.2654040455818176, "mean_token_accuracy": 0.9077669978141785, "num_tokens": 385104.0, "step": 276 }, { "entropy": 0.2447359710931778, "epoch": 0.04271395528141866, "grad_norm": 2.546875, "learning_rate": 2.4185415821025795e-05, "loss": 0.2940978705883026, "mean_token_accuracy": 0.8986432552337646, "num_tokens": 386365.0, "step": 277 }, { "entropy": 0.24432024359703064, "epoch": 0.04286815728604472, "grad_norm": 2.171875, "learning_rate": 2.4022575120331307e-05, "loss": 0.2683406174182892, "mean_token_accuracy": 0.9004524946212769, "num_tokens": 387478.0, "step": 278 }, { "entropy": 0.19444933533668518, "epoch": 0.04302235929067078, "grad_norm": 1.7265625, "learning_rate": 2.3859775942521854e-05, "loss": 0.18984566628932953, "mean_token_accuracy": 0.9271844625473022, "num_tokens": 388928.0, "step": 279 }, { "entropy": 0.25862905383110046, "epoch": 0.04317656129529684, "grad_norm": 2.359375, "learning_rate": 2.3697025203618987e-05, "loss": 0.2914562523365021, "mean_token_accuracy": 0.906593382358551, "num_tokens": 390210.0, "step": 280 }, { "entropy": 0.2573435604572296, "epoch": 0.0433307632999229, "grad_norm": 2.15625, "learning_rate": 2.3534329817586513e-05, "loss": 0.25994932651519775, "mean_token_accuracy": 0.9036144614219666, "num_tokens": 391214.0, "step": 281 }, { "entropy": 0.25984057784080505, "epoch": 0.043484965304548956, "grad_norm": 2.109375, "learning_rate": 2.3371696696036715e-05, "loss": 0.23992516100406647, "mean_token_accuracy": 0.9247743487358093, "num_tokens": 392219.0, "step": 282 }, { "entropy": 0.20528267323970795, "epoch": 0.04363916730917502, "grad_norm": 1.9140625, "learning_rate": 2.320913274793676e-05, "loss": 0.20434120297431946, "mean_token_accuracy": 0.9243749976158142, "num_tokens": 393827.0, "step": 283 }, { "entropy": 0.44059571623802185, "epoch": 0.04379336931380108, "grad_norm": 3.546875, "learning_rate": 2.30466448793152e-05, "loss": 0.49274563789367676, "mean_token_accuracy": 0.834419846534729, "num_tokens": 394602.0, "step": 284 }, { "entropy": 0.24022506177425385, "epoch": 0.04394757131842714, "grad_norm": 1.921875, "learning_rate": 2.28842399929686e-05, "loss": 0.23765617609024048, "mean_token_accuracy": 0.9164490699768066, "num_tokens": 395759.0, "step": 285 }, { "entropy": 0.23994681239128113, "epoch": 0.0441017733230532, "grad_norm": 1.84375, "learning_rate": 2.272192498816825e-05, "loss": 0.2343621850013733, "mean_token_accuracy": 0.9188445806503296, "num_tokens": 397221.0, "step": 286 }, { "entropy": 0.27961966395378113, "epoch": 0.04425597532767926, "grad_norm": 2.25, "learning_rate": 2.255970676036712e-05, "loss": 0.27381986379623413, "mean_token_accuracy": 0.8992950916290283, "num_tokens": 398222.0, "step": 287 }, { "entropy": 0.1786043792963028, "epoch": 0.04441017733230532, "grad_norm": 1.4921875, "learning_rate": 2.2397592200906906e-05, "loss": 0.17795482277870178, "mean_token_accuracy": 0.9386597871780396, "num_tokens": 400170.0, "step": 288 }, { "entropy": 0.1822587549686432, "epoch": 0.04456437933693138, "grad_norm": 1.375, "learning_rate": 2.223558819672526e-05, "loss": 0.1628590077161789, "mean_token_accuracy": 0.9355238676071167, "num_tokens": 401791.0, "step": 289 }, { "entropy": 0.22401201725006104, "epoch": 0.04471858134155744, "grad_norm": 1.9765625, "learning_rate": 2.2073701630063243e-05, "loss": 0.23397932946681976, "mean_token_accuracy": 0.9228187799453735, "num_tokens": 403289.0, "step": 290 }, { "entropy": 0.26227450370788574, "epoch": 0.0448727833461835, "grad_norm": 2.28125, "learning_rate": 2.1911939378172956e-05, "loss": 0.2669812738895416, "mean_token_accuracy": 0.9153226017951965, "num_tokens": 404537.0, "step": 291 }, { "entropy": 0.21649585664272308, "epoch": 0.04502698535080956, "grad_norm": 1.46875, "learning_rate": 2.175030831302535e-05, "loss": 0.18651390075683594, "mean_token_accuracy": 0.9295774698257446, "num_tokens": 405894.0, "step": 292 }, { "entropy": 0.2264479100704193, "epoch": 0.04518118735543562, "grad_norm": 2.046875, "learning_rate": 2.158881530101832e-05, "loss": 0.24527707695960999, "mean_token_accuracy": 0.9157626032829285, "num_tokens": 407469.0, "step": 293 }, { "entropy": 0.19007329642772675, "epoch": 0.04533538936006168, "grad_norm": 1.828125, "learning_rate": 2.1427467202685007e-05, "loss": 0.18996097147464752, "mean_token_accuracy": 0.9266110062599182, "num_tokens": 409153.0, "step": 294 }, { "entropy": 0.2581518888473511, "epoch": 0.04548959136468774, "grad_norm": 1.890625, "learning_rate": 2.126627087240231e-05, "loss": 0.2599462568759918, "mean_token_accuracy": 0.9158653616905212, "num_tokens": 410409.0, "step": 295 }, { "entropy": 0.22935496270656586, "epoch": 0.045643793369313804, "grad_norm": 2.09375, "learning_rate": 2.110523315809978e-05, "loss": 0.21854767203330994, "mean_token_accuracy": 0.9225852489471436, "num_tokens": 411825.0, "step": 296 }, { "entropy": 0.25962114334106445, "epoch": 0.04579799537393986, "grad_norm": 2.296875, "learning_rate": 2.0944360900968617e-05, "loss": 0.28228771686553955, "mean_token_accuracy": 0.8985915780067444, "num_tokens": 412898.0, "step": 297 }, { "entropy": 0.25601744651794434, "epoch": 0.04595219737856592, "grad_norm": 1.9765625, "learning_rate": 2.0783660935171092e-05, "loss": 0.26037973165512085, "mean_token_accuracy": 0.9110707640647888, "num_tokens": 414008.0, "step": 298 }, { "entropy": 0.2810611128807068, "epoch": 0.046106399383191984, "grad_norm": 2.328125, "learning_rate": 2.0623140087550215e-05, "loss": 0.29850900173187256, "mean_token_accuracy": 0.9104072451591492, "num_tokens": 415121.0, "step": 299 }, { "entropy": 0.22841358184814453, "epoch": 0.04626060138781804, "grad_norm": 1.84375, "learning_rate": 2.046280517733971e-05, "loss": 0.22839921712875366, "mean_token_accuracy": 0.923349916934967, "num_tokens": 416538.0, "step": 300 }, { "entropy": 0.2764427959918976, "epoch": 0.0464148033924441, "grad_norm": 2.34375, "learning_rate": 2.0302663015874322e-05, "loss": 0.2636858820915222, "mean_token_accuracy": 0.9106976985931396, "num_tokens": 417621.0, "step": 301 }, { "entropy": 0.18497152626514435, "epoch": 0.046569005397070165, "grad_norm": 1.5, "learning_rate": 2.0142720406300465e-05, "loss": 0.18430255353450775, "mean_token_accuracy": 0.929759681224823, "num_tokens": 419252.0, "step": 302 }, { "entropy": 0.2483554184436798, "epoch": 0.04672320740169622, "grad_norm": 1.9140625, "learning_rate": 1.9982984143287188e-05, "loss": 0.24268567562103271, "mean_token_accuracy": 0.9065420627593994, "num_tokens": 420437.0, "step": 303 }, { "entropy": 0.2957545518875122, "epoch": 0.04687740940632228, "grad_norm": 2.59375, "learning_rate": 1.9823461012737564e-05, "loss": 0.3344174325466156, "mean_token_accuracy": 0.8834766149520874, "num_tokens": 421492.0, "step": 304 }, { "entropy": 0.23411741852760315, "epoch": 0.047031611410948346, "grad_norm": 1.5703125, "learning_rate": 1.966415779150037e-05, "loss": 0.21458064019680023, "mean_token_accuracy": 0.9274131059646606, "num_tokens": 422795.0, "step": 305 }, { "entropy": 0.2103796899318695, "epoch": 0.0471858134155744, "grad_norm": 1.671875, "learning_rate": 1.9505081247082237e-05, "loss": 0.20959612727165222, "mean_token_accuracy": 0.9208722710609436, "num_tokens": 424408.0, "step": 306 }, { "entropy": 0.2197587639093399, "epoch": 0.047340015420200464, "grad_norm": 1.6796875, "learning_rate": 1.9346238137360106e-05, "loss": 0.20553667843341827, "mean_token_accuracy": 0.9193548560142517, "num_tokens": 425718.0, "step": 307 }, { "entropy": 0.24315893650054932, "epoch": 0.04749421742482652, "grad_norm": 1.6484375, "learning_rate": 1.918763521029418e-05, "loss": 0.22866766154766083, "mean_token_accuracy": 0.9147771596908569, "num_tokens": 427005.0, "step": 308 }, { "entropy": 0.2538098990917206, "epoch": 0.04764841942945258, "grad_norm": 2.078125, "learning_rate": 1.9029279203641232e-05, "loss": 0.2357470542192459, "mean_token_accuracy": 0.9233912229537964, "num_tokens": 427992.0, "step": 309 }, { "entropy": 0.3305405378341675, "epoch": 0.047802621434078645, "grad_norm": 2.875, "learning_rate": 1.8871176844668374e-05, "loss": 0.3201872408390045, "mean_token_accuracy": 0.8776978254318237, "num_tokens": 428834.0, "step": 310 }, { "entropy": 0.22924208641052246, "epoch": 0.0479568234387047, "grad_norm": 1.703125, "learning_rate": 1.8713334849867315e-05, "loss": 0.2193642556667328, "mean_token_accuracy": 0.9297805428504944, "num_tokens": 430437.0, "step": 311 }, { "entropy": 0.2438676506280899, "epoch": 0.04811102544333076, "grad_norm": 1.7578125, "learning_rate": 1.8555759924668952e-05, "loss": 0.2391282469034195, "mean_token_accuracy": 0.9204819202423096, "num_tokens": 431690.0, "step": 312 }, { "entropy": 0.30626124143600464, "epoch": 0.048265227447956825, "grad_norm": 2.484375, "learning_rate": 1.8398458763158578e-05, "loss": 0.31509530544281006, "mean_token_accuracy": 0.8954593539237976, "num_tokens": 432645.0, "step": 313 }, { "entropy": 0.26661908626556396, "epoch": 0.04841942945258288, "grad_norm": 1.9921875, "learning_rate": 1.8241438047791454e-05, "loss": 0.2524988651275635, "mean_token_accuracy": 0.9092437028884888, "num_tokens": 433843.0, "step": 314 }, { "entropy": 0.22748155891895294, "epoch": 0.04857363145720894, "grad_norm": 1.8515625, "learning_rate": 1.8084704449108985e-05, "loss": 0.2243906408548355, "mean_token_accuracy": 0.9239205121994019, "num_tokens": 435310.0, "step": 315 }, { "entropy": 0.17577649652957916, "epoch": 0.048727833461835006, "grad_norm": 1.671875, "learning_rate": 1.7928264625455282e-05, "loss": 0.1813218891620636, "mean_token_accuracy": 0.9322709441184998, "num_tokens": 437326.0, "step": 316 }, { "entropy": 0.27867627143859863, "epoch": 0.04888203546646106, "grad_norm": 2.390625, "learning_rate": 1.7772125222694337e-05, "loss": 0.28030475974082947, "mean_token_accuracy": 0.8948306441307068, "num_tokens": 438456.0, "step": 317 }, { "entropy": 0.23422475159168243, "epoch": 0.049036237471087124, "grad_norm": 1.65625, "learning_rate": 1.7616292873927688e-05, "loss": 0.2259235829114914, "mean_token_accuracy": 0.915672242641449, "num_tokens": 439721.0, "step": 318 }, { "entropy": 0.20051687955856323, "epoch": 0.04919043947571319, "grad_norm": 1.5625, "learning_rate": 1.7460774199212625e-05, "loss": 0.20561350882053375, "mean_token_accuracy": 0.9247232675552368, "num_tokens": 441084.0, "step": 319 }, { "entropy": 0.17916183173656464, "epoch": 0.04934464148033924, "grad_norm": 1.265625, "learning_rate": 1.7305575805280956e-05, "loss": 0.16743285953998566, "mean_token_accuracy": 0.9406779408454895, "num_tokens": 442862.0, "step": 320 }, { "entropy": 0.18751926720142365, "epoch": 0.049498843484965305, "grad_norm": 1.3671875, "learning_rate": 1.7150704285258375e-05, "loss": 0.16947750747203827, "mean_token_accuracy": 0.9436795711517334, "num_tokens": 444468.0, "step": 321 }, { "entropy": 0.17793025076389313, "epoch": 0.04965304548959137, "grad_norm": 1.28125, "learning_rate": 1.6996166218384307e-05, "loss": 0.16534742712974548, "mean_token_accuracy": 0.939068078994751, "num_tokens": 446150.0, "step": 322 }, { "entropy": 0.2475776970386505, "epoch": 0.04980724749421742, "grad_norm": 2.15625, "learning_rate": 1.684196816973248e-05, "loss": 0.2468724101781845, "mean_token_accuracy": 0.919457733631134, "num_tokens": 447412.0, "step": 323 }, { "entropy": 0.2225208878517151, "epoch": 0.049961449498843485, "grad_norm": 1.625, "learning_rate": 1.6688116689931972e-05, "loss": 0.20401687920093536, "mean_token_accuracy": 0.9311926364898682, "num_tokens": 448946.0, "step": 324 }, { "entropy": 0.2503822445869446, "epoch": 0.05011565150346955, "grad_norm": 1.96875, "learning_rate": 1.6534618314888945e-05, "loss": 0.22844718396663666, "mean_token_accuracy": 0.9175724387168884, "num_tokens": 450058.0, "step": 325 }, { "entropy": 0.25004157423973083, "epoch": 0.050269853508095604, "grad_norm": 2.203125, "learning_rate": 1.638147956550904e-05, "loss": 0.25791749358177185, "mean_token_accuracy": 0.9117646813392639, "num_tokens": 451324.0, "step": 326 }, { "entropy": 0.22011376917362213, "epoch": 0.050424055512721666, "grad_norm": 1.8515625, "learning_rate": 1.622870694742026e-05, "loss": 0.19179725646972656, "mean_token_accuracy": 0.9320175647735596, "num_tokens": 452700.0, "step": 327 }, { "entropy": 0.193440780043602, "epoch": 0.05057825751734773, "grad_norm": 1.625, "learning_rate": 1.6076306950696658e-05, "loss": 0.19295921921730042, "mean_token_accuracy": 0.9318463206291199, "num_tokens": 454322.0, "step": 328 }, { "entropy": 0.17849111557006836, "epoch": 0.050732459521973784, "grad_norm": 1.46875, "learning_rate": 1.592428604958264e-05, "loss": 0.16607390344142914, "mean_token_accuracy": 0.9433174133300781, "num_tokens": 456006.0, "step": 329 }, { "entropy": 0.2486262321472168, "epoch": 0.05088666152659985, "grad_norm": 1.953125, "learning_rate": 1.5772650702217878e-05, "loss": 0.2480083853006363, "mean_token_accuracy": 0.9057851433753967, "num_tokens": 457224.0, "step": 330 }, { "entropy": 0.27837270498275757, "epoch": 0.05104086353122591, "grad_norm": 2.59375, "learning_rate": 1.5621407350362986e-05, "loss": 0.2996099293231964, "mean_token_accuracy": 0.9042253494262695, "num_tokens": 458297.0, "step": 331 }, { "entropy": 0.20956268906593323, "epoch": 0.051195065535851965, "grad_norm": 1.5625, "learning_rate": 1.5470562419125868e-05, "loss": 0.18728220462799072, "mean_token_accuracy": 0.9295774698257446, "num_tokens": 459796.0, "step": 332 }, { "entropy": 0.29057589173316956, "epoch": 0.05134926754047803, "grad_norm": 2.40625, "learning_rate": 1.5320122316688735e-05, "loss": 0.29962292313575745, "mean_token_accuracy": 0.8858093023300171, "num_tokens": 460706.0, "step": 333 }, { "entropy": 0.1948358118534088, "epoch": 0.05150346954510408, "grad_norm": 1.578125, "learning_rate": 1.517009343403591e-05, "loss": 0.1801883429288864, "mean_token_accuracy": 0.93376624584198, "num_tokens": 462254.0, "step": 334 }, { "entropy": 0.22513329982757568, "epoch": 0.051657671549730146, "grad_norm": 2.046875, "learning_rate": 1.5020482144682308e-05, "loss": 0.22428080439567566, "mean_token_accuracy": 0.9161764979362488, "num_tokens": 463622.0, "step": 335 }, { "entropy": 0.2175763100385666, "epoch": 0.05181187355435621, "grad_norm": 2.15625, "learning_rate": 1.4871294804402675e-05, "loss": 0.21439555287361145, "mean_token_accuracy": 0.9237037301063538, "num_tokens": 464980.0, "step": 336 }, { "entropy": 0.1653544306755066, "epoch": 0.051966075558982264, "grad_norm": 1.796875, "learning_rate": 1.472253775096159e-05, "loss": 0.16475962102413177, "mean_token_accuracy": 0.9355390667915344, "num_tokens": 466741.0, "step": 337 }, { "entropy": 0.20776669681072235, "epoch": 0.052120277563608326, "grad_norm": 1.9453125, "learning_rate": 1.4574217303844211e-05, "loss": 0.19919782876968384, "mean_token_accuracy": 0.9283204674720764, "num_tokens": 468172.0, "step": 338 }, { "entropy": 0.18218226730823517, "epoch": 0.05227447956823439, "grad_norm": 1.6875, "learning_rate": 1.4426339763987844e-05, "loss": 0.1778276562690735, "mean_token_accuracy": 0.9303686618804932, "num_tokens": 469889.0, "step": 339 }, { "entropy": 0.25532829761505127, "epoch": 0.052428681572860444, "grad_norm": 1.9375, "learning_rate": 1.4278911413514204e-05, "loss": 0.26636841893196106, "mean_token_accuracy": 0.9083333611488342, "num_tokens": 471217.0, "step": 340 }, { "entropy": 0.19937695562839508, "epoch": 0.05258288357748651, "grad_norm": 1.6015625, "learning_rate": 1.4131938515462639e-05, "loss": 0.1952292025089264, "mean_token_accuracy": 0.9280303120613098, "num_tokens": 472809.0, "step": 341 }, { "entropy": 0.28071922063827515, "epoch": 0.05273708558211257, "grad_norm": 2.4375, "learning_rate": 1.3985427313523947e-05, "loss": 0.28267180919647217, "mean_token_accuracy": 0.885199248790741, "num_tokens": 473871.0, "step": 342 }, { "entropy": 0.1708391159772873, "epoch": 0.052891287586738625, "grad_norm": 1.40625, "learning_rate": 1.3839384031775226e-05, "loss": 0.1682218760251999, "mean_token_accuracy": 0.9421338438987732, "num_tokens": 475538.0, "step": 343 }, { "entropy": 0.17169421911239624, "epoch": 0.05304548959136469, "grad_norm": 1.671875, "learning_rate": 1.3693814874415389e-05, "loss": 0.1755795031785965, "mean_token_accuracy": 0.9377777576446533, "num_tokens": 477346.0, "step": 344 }, { "entropy": 0.2197735607624054, "epoch": 0.05319969159599075, "grad_norm": 1.8515625, "learning_rate": 1.3548726025501688e-05, "loss": 0.22578758001327515, "mean_token_accuracy": 0.9094029068946838, "num_tokens": 478811.0, "step": 345 }, { "entropy": 0.21483223140239716, "epoch": 0.053353893600616806, "grad_norm": 1.6484375, "learning_rate": 1.340412364868689e-05, "loss": 0.21270032227039337, "mean_token_accuracy": 0.9238030910491943, "num_tokens": 480302.0, "step": 346 }, { "entropy": 0.27951836585998535, "epoch": 0.05350809560524287, "grad_norm": 2.28125, "learning_rate": 1.3260013886957538e-05, "loss": 0.2666223645210266, "mean_token_accuracy": 0.9077869057655334, "num_tokens": 481286.0, "step": 347 }, { "entropy": 0.1917494833469391, "epoch": 0.05366229760986893, "grad_norm": 1.578125, "learning_rate": 1.3116402862372933e-05, "loss": 0.19692182540893555, "mean_token_accuracy": 0.9339783787727356, "num_tokens": 483051.0, "step": 348 }, { "entropy": 0.20676381886005402, "epoch": 0.053816499614494986, "grad_norm": 1.6328125, "learning_rate": 1.2973296675805041e-05, "loss": 0.20207884907722473, "mean_token_accuracy": 0.9374217987060547, "num_tokens": 484657.0, "step": 349 }, { "entropy": 0.19531835615634918, "epoch": 0.05397070161912105, "grad_norm": 1.7421875, "learning_rate": 1.2830701406679375e-05, "loss": 0.18931494653224945, "mean_token_accuracy": 0.9317750930786133, "num_tokens": 486248.0, "step": 350 }, { "entropy": 0.3396989405155182, "epoch": 0.05412490362374711, "grad_norm": 5.1875, "learning_rate": 1.2688623112716652e-05, "loss": 0.37070798873901367, "mean_token_accuracy": 0.869767427444458, "num_tokens": 487116.0, "step": 351 }, { "entropy": 0.17527468502521515, "epoch": 0.05427910562837317, "grad_norm": 1.859375, "learning_rate": 1.2547067829675535e-05, "loss": 0.17982880771160126, "mean_token_accuracy": 0.9339567422866821, "num_tokens": 488835.0, "step": 352 }, { "entropy": 0.2687583565711975, "epoch": 0.05443330763299923, "grad_norm": 2.03125, "learning_rate": 1.2406041571096164e-05, "loss": 0.2823106646537781, "mean_token_accuracy": 0.9135371446609497, "num_tokens": 489988.0, "step": 353 }, { "entropy": 0.1937769502401352, "epoch": 0.05458750963762529, "grad_norm": 1.8515625, "learning_rate": 1.2265550328044681e-05, "loss": 0.19238050282001495, "mean_token_accuracy": 0.9310998916625977, "num_tokens": 491578.0, "step": 354 }, { "entropy": 0.17158617079257965, "epoch": 0.05474171164225135, "grad_norm": 1.4765625, "learning_rate": 1.2125600068858772e-05, "loss": 0.16338223218917847, "mean_token_accuracy": 0.9456647634506226, "num_tokens": 493316.0, "step": 355 }, { "entropy": 0.19250212609767914, "epoch": 0.05489591364687741, "grad_norm": 1.671875, "learning_rate": 1.1986196738894078e-05, "loss": 0.17621511220932007, "mean_token_accuracy": 0.9345238208770752, "num_tokens": 494668.0, "step": 356 }, { "entropy": 0.19578416645526886, "epoch": 0.05505011565150347, "grad_norm": 1.8828125, "learning_rate": 1.1847346260271647e-05, "loss": 0.183770090341568, "mean_token_accuracy": 0.9346092343330383, "num_tokens": 495930.0, "step": 357 }, { "entropy": 0.22412899136543274, "epoch": 0.05520431765612953, "grad_norm": 1.828125, "learning_rate": 1.1709054531626313e-05, "loss": 0.2516805827617645, "mean_token_accuracy": 0.9137670397758484, "num_tokens": 497260.0, "step": 358 }, { "entropy": 0.2025316208600998, "epoch": 0.05535851966075559, "grad_norm": 1.3203125, "learning_rate": 1.1571327427856177e-05, "loss": 0.19299444556236267, "mean_token_accuracy": 0.9367007613182068, "num_tokens": 498832.0, "step": 359 }, { "entropy": 0.2235983908176422, "epoch": 0.05551272166538165, "grad_norm": 1.5859375, "learning_rate": 1.1434170799872947e-05, "loss": 0.200628861784935, "mean_token_accuracy": 0.929682195186615, "num_tokens": 500319.0, "step": 360 }, { "entropy": 0.28108713030815125, "epoch": 0.05566692367000771, "grad_norm": 2.40625, "learning_rate": 1.1297590474353464e-05, "loss": 0.2882252335548401, "mean_token_accuracy": 0.8986828923225403, "num_tokens": 501314.0, "step": 361 }, { "entropy": 0.21756984293460846, "epoch": 0.05582112567463377, "grad_norm": 2.125, "learning_rate": 1.116159225349213e-05, "loss": 0.23450873792171478, "mean_token_accuracy": 0.9163208603858948, "num_tokens": 502768.0, "step": 362 }, { "entropy": 0.2556920051574707, "epoch": 0.05597532767925983, "grad_norm": 2.453125, "learning_rate": 1.1026181914754388e-05, "loss": 0.2757260203361511, "mean_token_accuracy": 0.9049859046936035, "num_tokens": 503839.0, "step": 363 }, { "entropy": 0.21779917180538177, "epoch": 0.05612952968388589, "grad_norm": 1.953125, "learning_rate": 1.089136521063137e-05, "loss": 0.22174124419689178, "mean_token_accuracy": 0.9221984148025513, "num_tokens": 505248.0, "step": 364 }, { "entropy": 0.3109717071056366, "epoch": 0.05628373168851195, "grad_norm": 2.578125, "learning_rate": 1.075714786839542e-05, "loss": 0.2979055345058441, "mean_token_accuracy": 0.8831614851951599, "num_tokens": 506129.0, "step": 365 }, { "entropy": 0.22565557062625885, "epoch": 0.05643793369313801, "grad_norm": 1.859375, "learning_rate": 1.0623535589856887e-05, "loss": 0.23962406814098358, "mean_token_accuracy": 0.9183965921401978, "num_tokens": 507534.0, "step": 366 }, { "entropy": 0.16417403519153595, "epoch": 0.05659213569776407, "grad_norm": 2.25, "learning_rate": 1.0490534051121808e-05, "loss": 0.16284841299057007, "mean_token_accuracy": 0.937706708908081, "num_tokens": 509356.0, "step": 367 }, { "entropy": 0.18802893161773682, "epoch": 0.05674633770239013, "grad_norm": 1.6640625, "learning_rate": 1.0358148902350853e-05, "loss": 0.19001488387584686, "mean_token_accuracy": 0.930488646030426, "num_tokens": 510817.0, "step": 368 }, { "entropy": 0.22402897477149963, "epoch": 0.05690053970701619, "grad_norm": 2.125, "learning_rate": 1.0226385767519259e-05, "loss": 0.228716179728508, "mean_token_accuracy": 0.924344539642334, "num_tokens": 512160.0, "step": 369 }, { "entropy": 0.24438747763633728, "epoch": 0.05705474171164225, "grad_norm": 1.984375, "learning_rate": 1.0095250244177887e-05, "loss": 0.22704952955245972, "mean_token_accuracy": 0.918749988079071, "num_tokens": 513288.0, "step": 370 }, { "entropy": 0.23192906379699707, "epoch": 0.057208943716268314, "grad_norm": 1.96875, "learning_rate": 9.964747903215513e-06, "loss": 0.22084636986255646, "mean_token_accuracy": 0.929665744304657, "num_tokens": 514732.0, "step": 371 }, { "entropy": 0.1626010537147522, "epoch": 0.05736314572089437, "grad_norm": 1.3203125, "learning_rate": 9.834884288622054e-06, "loss": 0.15189611911773682, "mean_token_accuracy": 0.941209077835083, "num_tokens": 516543.0, "step": 372 }, { "entropy": 0.16602161526679993, "epoch": 0.05751734772552043, "grad_norm": 1.3828125, "learning_rate": 9.705664917253143e-06, "loss": 0.18036378920078278, "mean_token_accuracy": 0.9382113814353943, "num_tokens": 518396.0, "step": 373 }, { "entropy": 0.16473768651485443, "epoch": 0.057671549730146494, "grad_norm": 1.3046875, "learning_rate": 9.577095278595694e-06, "loss": 0.15197424590587616, "mean_token_accuracy": 0.9414084553718567, "num_tokens": 520179.0, "step": 374 }, { "entropy": 0.1879141479730606, "epoch": 0.05782575173477255, "grad_norm": 1.8046875, "learning_rate": 9.449180834534749e-06, "loss": 0.18156398832798004, "mean_token_accuracy": 0.9304715991020203, "num_tokens": 521841.0, "step": 375 }, { "entropy": 0.2549605369567871, "epoch": 0.05797995373939861, "grad_norm": 2.203125, "learning_rate": 9.321927019121435e-06, "loss": 0.257169634103775, "mean_token_accuracy": 0.9048386812210083, "num_tokens": 523089.0, "step": 376 }, { "entropy": 0.18407224118709564, "epoch": 0.058134155744024675, "grad_norm": 1.609375, "learning_rate": 9.195339238342071e-06, "loss": 0.18074241280555725, "mean_token_accuracy": 0.936096727848053, "num_tokens": 524834.0, "step": 377 }, { "entropy": 0.21801158785820007, "epoch": 0.05828835774865073, "grad_norm": 1.9609375, "learning_rate": 9.069422869888583e-06, "loss": 0.22194962203502655, "mean_token_accuracy": 0.923652708530426, "num_tokens": 526178.0, "step": 378 }, { "entropy": 0.18715234100818634, "epoch": 0.05844255975327679, "grad_norm": 1.34375, "learning_rate": 8.944183262929984e-06, "loss": 0.17807839810848236, "mean_token_accuracy": 0.9365825057029724, "num_tokens": 527889.0, "step": 379 }, { "entropy": 0.196278914809227, "epoch": 0.058596761757902856, "grad_norm": 1.6953125, "learning_rate": 8.819625737885187e-06, "loss": 0.20651084184646606, "mean_token_accuracy": 0.9256097674369812, "num_tokens": 529537.0, "step": 380 }, { "entropy": 0.35177287459373474, "epoch": 0.05875096376252891, "grad_norm": 3.296875, "learning_rate": 8.695755586196924e-06, "loss": 0.385383665561676, "mean_token_accuracy": 0.8580645322799683, "num_tokens": 530475.0, "step": 381 }, { "entropy": 0.25344812870025635, "epoch": 0.058905165767154974, "grad_norm": 2.078125, "learning_rate": 8.572578070107016e-06, "loss": 0.25393110513687134, "mean_token_accuracy": 0.917894721031189, "num_tokens": 531433.0, "step": 382 }, { "entropy": 0.3020297884941101, "epoch": 0.059059367771781036, "grad_norm": 2.359375, "learning_rate": 8.450098422432787e-06, "loss": 0.3018152415752411, "mean_token_accuracy": 0.9065510630607605, "num_tokens": 532479.0, "step": 383 }, { "entropy": 0.15192678570747375, "epoch": 0.05921356977640709, "grad_norm": 1.4296875, "learning_rate": 8.328321846344755e-06, "loss": 0.1450488418340683, "mean_token_accuracy": 0.9468623399734497, "num_tokens": 534463.0, "step": 384 }, { "epoch": 0.05921356977640709, "eval_entropy": 0.22466930076044206, "eval_loss": 0.22621265053749084, "eval_mean_token_accuracy": 0.9194652596760912, "eval_num_tokens": 534463.0, "eval_runtime": 34.9665, "eval_samples_per_second": 78.103, "eval_steps_per_second": 9.781, "step": 384 }, { "entropy": 0.18735887110233307, "epoch": 0.059367771781033155, "grad_norm": 1.375, "learning_rate": 8.207253515145625e-06, "loss": 0.18456675112247467, "mean_token_accuracy": 0.9276748299598694, "num_tokens": 536144.0, "step": 385 }, { "entropy": 0.2384348064661026, "epoch": 0.05952197378565921, "grad_norm": 1.9375, "learning_rate": 8.086898572050494e-06, "loss": 0.24932722747325897, "mean_token_accuracy": 0.9125475287437439, "num_tokens": 537467.0, "step": 386 }, { "entropy": 0.21620430052280426, "epoch": 0.05967617579028527, "grad_norm": 1.8515625, "learning_rate": 7.967262129968378e-06, "loss": 0.20638106763362885, "mean_token_accuracy": 0.9262917637825012, "num_tokens": 538791.0, "step": 387 }, { "entropy": 0.22282716631889343, "epoch": 0.059830377794911335, "grad_norm": 1.8203125, "learning_rate": 7.848349271284952e-06, "loss": 0.24068771302700043, "mean_token_accuracy": 0.911854088306427, "num_tokens": 540115.0, "step": 388 }, { "entropy": 0.19987352192401886, "epoch": 0.05998457979953739, "grad_norm": 1.7109375, "learning_rate": 7.730165047646723e-06, "loss": 0.19121116399765015, "mean_token_accuracy": 0.93138587474823, "num_tokens": 541595.0, "step": 389 }, { "entropy": 0.2530774772167206, "epoch": 0.06013878180416345, "grad_norm": 2.484375, "learning_rate": 7.612714479746347e-06, "loss": 0.250463604927063, "mean_token_accuracy": 0.9078303575515747, "num_tokens": 542829.0, "step": 390 }, { "entropy": 0.2623169720172882, "epoch": 0.060292983808789516, "grad_norm": 2.515625, "learning_rate": 7.4960025571094025e-06, "loss": 0.27675166726112366, "mean_token_accuracy": 0.9017013311386108, "num_tokens": 543895.0, "step": 391 }, { "entropy": 0.2155791074037552, "epoch": 0.06044718581341557, "grad_norm": 1.7890625, "learning_rate": 7.380034237882394e-06, "loss": 0.21280765533447266, "mean_token_accuracy": 0.9217687249183655, "num_tokens": 545373.0, "step": 392 }, { "entropy": 0.3150392770767212, "epoch": 0.060601387818041634, "grad_norm": 2.5, "learning_rate": 7.264814448622106e-06, "loss": 0.3080776035785675, "mean_token_accuracy": 0.898815929889679, "num_tokens": 546310.0, "step": 393 }, { "entropy": 0.19685329496860504, "epoch": 0.0607555898226677, "grad_norm": 2.125, "learning_rate": 7.150348084086367e-06, "loss": 0.22213543951511383, "mean_token_accuracy": 0.9212239384651184, "num_tokens": 547854.0, "step": 394 }, { "entropy": 0.1816016435623169, "epoch": 0.06090979182729375, "grad_norm": 1.4140625, "learning_rate": 7.036640007026038e-06, "loss": 0.17253060638904572, "mean_token_accuracy": 0.9350804090499878, "num_tokens": 549541.0, "step": 395 }, { "entropy": 0.19817869365215302, "epoch": 0.061063993831919815, "grad_norm": 1.7890625, "learning_rate": 6.923695047978502e-06, "loss": 0.191897913813591, "mean_token_accuracy": 0.9271523356437683, "num_tokens": 551059.0, "step": 396 }, { "entropy": 0.24792121350765228, "epoch": 0.06121819583654588, "grad_norm": 2.25, "learning_rate": 6.811518005062423e-06, "loss": 0.2625022828578949, "mean_token_accuracy": 0.9022988677024841, "num_tokens": 552111.0, "step": 397 }, { "entropy": 0.24607616662979126, "epoch": 0.06137239784117193, "grad_norm": 2.28125, "learning_rate": 6.700113643773892e-06, "loss": 0.22993192076683044, "mean_token_accuracy": 0.9271889328956604, "num_tokens": 553204.0, "step": 398 }, { "entropy": 0.25920623540878296, "epoch": 0.061526599845797995, "grad_norm": 2.453125, "learning_rate": 6.589486696784028e-06, "loss": 0.27900075912475586, "mean_token_accuracy": 0.9022931456565857, "num_tokens": 554215.0, "step": 399 }, { "entropy": 0.28530606627464294, "epoch": 0.06168080185042406, "grad_norm": 2.4375, "learning_rate": 6.47964186373787e-06, "loss": 0.2928396165370941, "mean_token_accuracy": 0.8845500946044922, "num_tokens": 555401.0, "step": 400 }, { "entropy": 0.2927665114402771, "epoch": 0.061835003855050114, "grad_norm": 2.25, "learning_rate": 6.370583811054778e-06, "loss": 0.2968969941139221, "mean_token_accuracy": 0.9039433598518372, "num_tokens": 556398.0, "step": 401 }, { "entropy": 0.23018132150173187, "epoch": 0.061989205859676176, "grad_norm": 1.96875, "learning_rate": 6.262317171730167e-06, "loss": 0.23996573686599731, "mean_token_accuracy": 0.9214015007019043, "num_tokens": 557462.0, "step": 402 }, { "entropy": 0.25166183710098267, "epoch": 0.06214340786430224, "grad_norm": 2.0, "learning_rate": 6.154846545138695e-06, "loss": 0.2649187445640564, "mean_token_accuracy": 0.9033687710762024, "num_tokens": 558598.0, "step": 403 }, { "entropy": 0.23649781942367554, "epoch": 0.062297609868928294, "grad_norm": 2.15625, "learning_rate": 6.048176496838856e-06, "loss": 0.21528743207454681, "mean_token_accuracy": 0.9269746541976929, "num_tokens": 559948.0, "step": 404 }, { "entropy": 0.22737731039524078, "epoch": 0.06245181187355436, "grad_norm": 1.796875, "learning_rate": 5.9423115583790604e-06, "loss": 0.21719223260879517, "mean_token_accuracy": 0.9225531816482544, "num_tokens": 561131.0, "step": 405 }, { "entropy": 0.21060694754123688, "epoch": 0.06260601387818042, "grad_norm": 1.4453125, "learning_rate": 5.8372562271051e-06, "loss": 0.19261834025382996, "mean_token_accuracy": 0.9304878115653992, "num_tokens": 562779.0, "step": 406 }, { "entropy": 0.24134337902069092, "epoch": 0.06276021588280647, "grad_norm": 1.8671875, "learning_rate": 5.733014965969091e-06, "loss": 0.2224052995443344, "mean_token_accuracy": 0.9310910701751709, "num_tokens": 564006.0, "step": 407 }, { "entropy": 0.19692017138004303, "epoch": 0.06291441788743253, "grad_norm": 1.6328125, "learning_rate": 5.629592203339909e-06, "loss": 0.18327265977859497, "mean_token_accuracy": 0.9346548914909363, "num_tokens": 565376.0, "step": 408 }, { "entropy": 0.2016250342130661, "epoch": 0.0630686198920586, "grad_norm": 1.4765625, "learning_rate": 5.526992332815012e-06, "loss": 0.20120908319950104, "mean_token_accuracy": 0.9263085126876831, "num_tokens": 566836.0, "step": 409 }, { "entropy": 0.14676110446453094, "epoch": 0.06322282189668466, "grad_norm": 1.3046875, "learning_rate": 5.4252197130338525e-06, "loss": 0.1583862602710724, "mean_token_accuracy": 0.9458128213882446, "num_tokens": 569280.0, "step": 410 }, { "entropy": 0.1877201646566391, "epoch": 0.06337702390131071, "grad_norm": 2.09375, "learning_rate": 5.3242786674926545e-06, "loss": 0.18557564914226532, "mean_token_accuracy": 0.9334638118743896, "num_tokens": 570821.0, "step": 411 }, { "entropy": 0.21993833780288696, "epoch": 0.06353122590593678, "grad_norm": 1.7421875, "learning_rate": 5.224173484360798e-06, "loss": 0.19618681073188782, "mean_token_accuracy": 0.9358024597167969, "num_tokens": 572044.0, "step": 412 }, { "entropy": 0.20039010047912598, "epoch": 0.06368542791056284, "grad_norm": 1.3671875, "learning_rate": 5.124908416298615e-06, "loss": 0.18724791705608368, "mean_token_accuracy": 0.9329929947853088, "num_tokens": 573619.0, "step": 413 }, { "entropy": 0.21013715863227844, "epoch": 0.06383962991518889, "grad_norm": 1.796875, "learning_rate": 5.026487680276723e-06, "loss": 0.21998311579227448, "mean_token_accuracy": 0.9184691905975342, "num_tokens": 574829.0, "step": 414 }, { "entropy": 0.26953125, "epoch": 0.06399383191981496, "grad_norm": 2.171875, "learning_rate": 4.928915457396913e-06, "loss": 0.26942914724349976, "mean_token_accuracy": 0.9191489219665527, "num_tokens": 576012.0, "step": 415 }, { "entropy": 0.23597829043865204, "epoch": 0.06414803392444102, "grad_norm": 1.84375, "learning_rate": 4.832195892714489e-06, "loss": 0.22428561747074127, "mean_token_accuracy": 0.9230215549468994, "num_tokens": 577410.0, "step": 416 }, { "entropy": 0.28713032603263855, "epoch": 0.06430223592906707, "grad_norm": 2.0625, "learning_rate": 4.736333095062228e-06, "loss": 0.2505059242248535, "mean_token_accuracy": 0.9073724150657654, "num_tokens": 578476.0, "step": 417 }, { "entropy": 0.2858028709888458, "epoch": 0.06445643793369314, "grad_norm": 2.015625, "learning_rate": 4.641331136875768e-06, "loss": 0.2911134958267212, "mean_token_accuracy": 0.9045093059539795, "num_tokens": 579615.0, "step": 418 }, { "entropy": 0.282069593667984, "epoch": 0.0646106399383192, "grad_norm": 2.09375, "learning_rate": 4.547194054020651e-06, "loss": 0.27553999423980713, "mean_token_accuracy": 0.90444016456604, "num_tokens": 580659.0, "step": 419 }, { "entropy": 0.22959555685520172, "epoch": 0.06476484194294525, "grad_norm": 1.9453125, "learning_rate": 4.453925845620854e-06, "loss": 0.22032871842384338, "mean_token_accuracy": 0.9136531352996826, "num_tokens": 582022.0, "step": 420 }, { "entropy": 0.2052592635154724, "epoch": 0.06491904394757132, "grad_norm": 1.7734375, "learning_rate": 4.361530473888889e-06, "loss": 0.20798712968826294, "mean_token_accuracy": 0.9232394099235535, "num_tokens": 583450.0, "step": 421 }, { "entropy": 0.32572290301322937, "epoch": 0.06507324595219738, "grad_norm": 2.578125, "learning_rate": 4.270011863957507e-06, "loss": 0.33982253074645996, "mean_token_accuracy": 0.8741418719291687, "num_tokens": 584332.0, "step": 422 }, { "entropy": 0.3089931607246399, "epoch": 0.06522744795682343, "grad_norm": 2.578125, "learning_rate": 4.179373903712913e-06, "loss": 0.30327266454696655, "mean_token_accuracy": 0.8930232524871826, "num_tokens": 585200.0, "step": 423 }, { "entropy": 0.19629529118537903, "epoch": 0.0653816499614495, "grad_norm": 1.703125, "learning_rate": 4.089620443629652e-06, "loss": 0.2054092288017273, "mean_token_accuracy": 0.9246435761451721, "num_tokens": 586681.0, "step": 424 }, { "entropy": 0.18628910183906555, "epoch": 0.06553585196607556, "grad_norm": 1.3359375, "learning_rate": 4.000755296606973e-06, "loss": 0.1760605424642563, "mean_token_accuracy": 0.9416413307189941, "num_tokens": 588334.0, "step": 425 }, { "entropy": 0.194645494222641, "epoch": 0.06569005397070161, "grad_norm": 1.90625, "learning_rate": 3.912782237806903e-06, "loss": 0.19329358637332916, "mean_token_accuracy": 0.9218025207519531, "num_tokens": 589851.0, "step": 426 }, { "entropy": 0.19448570907115936, "epoch": 0.06584425597532768, "grad_norm": 1.671875, "learning_rate": 3.825705004493849e-06, "loss": 0.18638762831687927, "mean_token_accuracy": 0.9315856695175171, "num_tokens": 591423.0, "step": 427 }, { "entropy": 0.26799967885017395, "epoch": 0.06599845797995374, "grad_norm": 2.125, "learning_rate": 3.739527295875811e-06, "loss": 0.2695932686328888, "mean_token_accuracy": 0.9055441617965698, "num_tokens": 592405.0, "step": 428 }, { "entropy": 0.20886771380901337, "epoch": 0.0661526599845798, "grad_norm": 1.875, "learning_rate": 3.6542527729472836e-06, "loss": 0.22071963548660278, "mean_token_accuracy": 0.9178168177604675, "num_tokens": 594007.0, "step": 429 }, { "entropy": 0.19780333340168, "epoch": 0.06630686198920586, "grad_norm": 1.4296875, "learning_rate": 3.5698850583336663e-06, "loss": 0.19298632442951202, "mean_token_accuracy": 0.9317794442176819, "num_tokens": 595774.0, "step": 430 }, { "entropy": 0.2335851490497589, "epoch": 0.06646106399383192, "grad_norm": 1.5859375, "learning_rate": 3.4864277361374264e-06, "loss": 0.21905845403671265, "mean_token_accuracy": 0.9286743402481079, "num_tokens": 597170.0, "step": 431 }, { "entropy": 0.17323604226112366, "epoch": 0.06661526599845798, "grad_norm": 1.640625, "learning_rate": 3.4038843517858075e-06, "loss": 0.17967088520526886, "mean_token_accuracy": 0.9360523819923401, "num_tokens": 599164.0, "step": 432 }, { "entropy": 0.2514375150203705, "epoch": 0.06676946800308405, "grad_norm": 1.8125, "learning_rate": 3.3222584118802192e-06, "loss": 0.2490684688091278, "mean_token_accuracy": 0.9187192320823669, "num_tokens": 600390.0, "step": 433 }, { "entropy": 0.22465308010578156, "epoch": 0.0669236700077101, "grad_norm": 2.515625, "learning_rate": 3.241553384047258e-06, "loss": 0.26371464133262634, "mean_token_accuracy": 0.9116766452789307, "num_tokens": 601734.0, "step": 434 }, { "entropy": 0.20948569476604462, "epoch": 0.06707787201233616, "grad_norm": 1.5703125, "learning_rate": 3.1617726967914235e-06, "loss": 0.21372012794017792, "mean_token_accuracy": 0.9316811561584473, "num_tokens": 603235.0, "step": 435 }, { "entropy": 0.20347538590431213, "epoch": 0.06723207401696223, "grad_norm": 1.6796875, "learning_rate": 3.0829197393494548e-06, "loss": 0.17981462180614471, "mean_token_accuracy": 0.9269624352455139, "num_tokens": 604708.0, "step": 436 }, { "entropy": 0.23263585567474365, "epoch": 0.06738627602158828, "grad_norm": 2.1875, "learning_rate": 3.004997861546327e-06, "loss": 0.23778997361660004, "mean_token_accuracy": 0.9214986562728882, "num_tokens": 605837.0, "step": 437 }, { "entropy": 0.23302724957466125, "epoch": 0.06754047802621434, "grad_norm": 2.203125, "learning_rate": 2.9280103736529896e-06, "loss": 0.23127038776874542, "mean_token_accuracy": 0.9103972911834717, "num_tokens": 607028.0, "step": 438 }, { "entropy": 0.18138211965560913, "epoch": 0.0676946800308404, "grad_norm": 1.4140625, "learning_rate": 2.8519605462456965e-06, "loss": 0.1681656837463379, "mean_token_accuracy": 0.9345430731773376, "num_tokens": 608579.0, "step": 439 }, { "entropy": 0.17149963974952698, "epoch": 0.06784888203546646, "grad_norm": 1.6171875, "learning_rate": 2.776851610067094e-06, "loss": 0.1811680942773819, "mean_token_accuracy": 0.932692289352417, "num_tokens": 610563.0, "step": 440 }, { "entropy": 0.15687499940395355, "epoch": 0.06800308404009252, "grad_norm": 1.34375, "learning_rate": 2.7026867558889694e-06, "loss": 0.15128004550933838, "mean_token_accuracy": 0.9400107264518738, "num_tokens": 612438.0, "step": 441 }, { "entropy": 0.22530966997146606, "epoch": 0.06815728604471859, "grad_norm": 2.046875, "learning_rate": 2.6294691343766718e-06, "loss": 0.22919264435768127, "mean_token_accuracy": 0.9237637519836426, "num_tokens": 613902.0, "step": 442 }, { "entropy": 0.21813379228115082, "epoch": 0.06831148804934464, "grad_norm": 1.7890625, "learning_rate": 2.557201855955316e-06, "loss": 0.20722565054893494, "mean_token_accuracy": 0.9286713004112244, "num_tokens": 615340.0, "step": 443 }, { "entropy": 0.22816047072410583, "epoch": 0.0684656900539707, "grad_norm": 1.7890625, "learning_rate": 2.4858879906775904e-06, "loss": 0.2418501079082489, "mean_token_accuracy": 0.9141337275505066, "num_tokens": 616664.0, "step": 444 }, { "entropy": 0.24174243211746216, "epoch": 0.06861989205859677, "grad_norm": 1.7421875, "learning_rate": 2.4155305680933938e-06, "loss": 0.24712735414505005, "mean_token_accuracy": 0.9127676486968994, "num_tokens": 617933.0, "step": 445 }, { "entropy": 0.23680631816387177, "epoch": 0.06877409406322282, "grad_norm": 2.15625, "learning_rate": 2.3461325771210683e-06, "loss": 0.24274389445781708, "mean_token_accuracy": 0.9137291312217712, "num_tokens": 619019.0, "step": 446 }, { "entropy": 0.21051788330078125, "epoch": 0.06892829606784888, "grad_norm": 1.5703125, "learning_rate": 2.2776969659205005e-06, "loss": 0.19205066561698914, "mean_token_accuracy": 0.9310897588729858, "num_tokens": 620275.0, "step": 447 }, { "entropy": 0.19069823622703552, "epoch": 0.06908249807247495, "grad_norm": 1.640625, "learning_rate": 2.2102266417677985e-06, "loss": 0.193171888589859, "mean_token_accuracy": 0.9300353527069092, "num_tokens": 621698.0, "step": 448 }, { "entropy": 0.26176121830940247, "epoch": 0.069236700077101, "grad_norm": 2.203125, "learning_rate": 2.143724470931846e-06, "loss": 0.2646713852882385, "mean_token_accuracy": 0.9019434452056885, "num_tokens": 622838.0, "step": 449 }, { "entropy": 0.37524735927581787, "epoch": 0.06939090208172706, "grad_norm": 3.421875, "learning_rate": 2.0781932785525122e-06, "loss": 0.3872081935405731, "mean_token_accuracy": 0.8746479153633118, "num_tokens": 623556.0, "step": 450 }, { "entropy": 0.20446714758872986, "epoch": 0.06954510408635313, "grad_norm": 1.984375, "learning_rate": 2.013635848520626e-06, "loss": 0.21962465345859528, "mean_token_accuracy": 0.9238095283508301, "num_tokens": 624824.0, "step": 451 }, { "entropy": 0.18340152502059937, "epoch": 0.06969930609097919, "grad_norm": 1.6796875, "learning_rate": 1.9500549233597453e-06, "loss": 0.1832038313150406, "mean_token_accuracy": 0.9371029138565063, "num_tokens": 626406.0, "step": 452 }, { "entropy": 0.3325141668319702, "epoch": 0.06985350809560524, "grad_norm": 3.046875, "learning_rate": 1.8874532041095989e-06, "loss": 0.34842032194137573, "mean_token_accuracy": 0.8773234486579895, "num_tokens": 627221.0, "step": 453 }, { "entropy": 0.20056799054145813, "epoch": 0.0700077101002313, "grad_norm": 1.59375, "learning_rate": 1.825833350211395e-06, "loss": 0.1930190622806549, "mean_token_accuracy": 0.9300291538238525, "num_tokens": 628944.0, "step": 454 }, { "entropy": 0.3074391484260559, "epoch": 0.07016191210485737, "grad_norm": 2.671875, "learning_rate": 1.7651979793947949e-06, "loss": 0.320962131023407, "mean_token_accuracy": 0.8794258236885071, "num_tokens": 629997.0, "step": 455 }, { "entropy": 0.2851220667362213, "epoch": 0.07031611410948342, "grad_norm": 2.4375, "learning_rate": 1.705549667566747e-06, "loss": 0.305853009223938, "mean_token_accuracy": 0.884324312210083, "num_tokens": 630930.0, "step": 456 }, { "entropy": 0.213734969496727, "epoch": 0.07047031611410948, "grad_norm": 1.875, "learning_rate": 1.6468909487020318e-06, "loss": 0.21344000101089478, "mean_token_accuracy": 0.9156540632247925, "num_tokens": 632337.0, "step": 457 }, { "entropy": 0.23210836946964264, "epoch": 0.07062451811873555, "grad_norm": 1.8984375, "learning_rate": 1.5892243147356128e-06, "loss": 0.22123272716999054, "mean_token_accuracy": 0.921897828578949, "num_tokens": 633715.0, "step": 458 }, { "entropy": 0.2013556957244873, "epoch": 0.0707787201233616, "grad_norm": 1.90625, "learning_rate": 1.5325522154568006e-06, "loss": 0.2120433896780014, "mean_token_accuracy": 0.9267473220825195, "num_tokens": 635211.0, "step": 459 }, { "entropy": 0.1748819798231125, "epoch": 0.07093292212798766, "grad_norm": 1.40625, "learning_rate": 1.4768770584051433e-06, "loss": 0.16574399173259735, "mean_token_accuracy": 0.9330986142158508, "num_tokens": 636923.0, "step": 460 }, { "entropy": 0.20135805010795593, "epoch": 0.07108712413261373, "grad_norm": 1.59375, "learning_rate": 1.422201208768187e-06, "loss": 0.20329774916172028, "mean_token_accuracy": 0.9288026094436646, "num_tokens": 638476.0, "step": 461 }, { "entropy": 0.19482704997062683, "epoch": 0.07124132613723978, "grad_norm": 1.5, "learning_rate": 1.3685269892809715e-06, "loss": 0.18484120070934296, "mean_token_accuracy": 0.9365351796150208, "num_tokens": 640233.0, "step": 462 }, { "entropy": 0.2483380138874054, "epoch": 0.07139552814186584, "grad_norm": 2.453125, "learning_rate": 1.315856680127367e-06, "loss": 0.2574044167995453, "mean_token_accuracy": 0.9011474251747131, "num_tokens": 641374.0, "step": 463 }, { "entropy": 0.25926902890205383, "epoch": 0.07154973014649191, "grad_norm": 2.03125, "learning_rate": 1.2641925188432102e-06, "loss": 0.2751407325267792, "mean_token_accuracy": 0.9096437692642212, "num_tokens": 642533.0, "step": 464 }, { "entropy": 0.19511115550994873, "epoch": 0.07170393215111796, "grad_norm": 1.7265625, "learning_rate": 1.2135367002212321e-06, "loss": 0.19707168638706207, "mean_token_accuracy": 0.9302915334701538, "num_tokens": 644119.0, "step": 465 }, { "entropy": 0.2082238495349884, "epoch": 0.07185813415574402, "grad_norm": 1.828125, "learning_rate": 1.1638913762178489e-06, "loss": 0.2105921357870102, "mean_token_accuracy": 0.9202454090118408, "num_tokens": 645431.0, "step": 466 }, { "entropy": 0.19069121778011322, "epoch": 0.07201233616037009, "grad_norm": 1.6796875, "learning_rate": 1.1152586558617118e-06, "loss": 0.17696255445480347, "mean_token_accuracy": 0.9442567825317383, "num_tokens": 647215.0, "step": 467 }, { "entropy": 0.22916826605796814, "epoch": 0.07216653816499614, "grad_norm": 1.7890625, "learning_rate": 1.0676406051641357e-06, "loss": 0.22586072981357574, "mean_token_accuracy": 0.9183526039123535, "num_tokens": 648607.0, "step": 468 }, { "entropy": 0.26740562915802, "epoch": 0.0723207401696222, "grad_norm": 2.125, "learning_rate": 1.0210392470313078e-06, "loss": 0.2589561343193054, "mean_token_accuracy": 0.9052631855010986, "num_tokens": 649660.0, "step": 469 }, { "entropy": 0.22609063982963562, "epoch": 0.07247494217424827, "grad_norm": 1.859375, "learning_rate": 9.754565611783812e-07, "loss": 0.23183754086494446, "mean_token_accuracy": 0.9105263352394104, "num_tokens": 650998.0, "step": 470 }, { "entropy": 0.2637474536895752, "epoch": 0.07262914417887432, "grad_norm": 2.15625, "learning_rate": 9.308944840453415e-07, "loss": 0.2506449520587921, "mean_token_accuracy": 0.9153633713722229, "num_tokens": 652093.0, "step": 471 }, { "entropy": 0.2541276514530182, "epoch": 0.07278334618350038, "grad_norm": 2.125, "learning_rate": 8.873549087147604e-07, "loss": 0.25114259123802185, "mean_token_accuracy": 0.9045345783233643, "num_tokens": 653358.0, "step": 472 }, { "entropy": 0.20104283094406128, "epoch": 0.07293754818812645, "grad_norm": 1.5234375, "learning_rate": 8.44839684831375e-07, "loss": 0.18859422206878662, "mean_token_accuracy": 0.9310559034347534, "num_tokens": 654976.0, "step": 473 }, { "entropy": 0.23548080027103424, "epoch": 0.0730917501927525, "grad_norm": 2.125, "learning_rate": 8.03350618523499e-07, "loss": 0.2508711516857147, "mean_token_accuracy": 0.9083601236343384, "num_tokens": 656228.0, "step": 474 }, { "entropy": 0.2388007789850235, "epoch": 0.07324595219737856, "grad_norm": 1.9609375, "learning_rate": 7.628894723263086e-07, "loss": 0.25423818826675415, "mean_token_accuracy": 0.9074475765228271, "num_tokens": 657619.0, "step": 475 }, { "entropy": 0.2098216712474823, "epoch": 0.07340015420200463, "grad_norm": 1.6640625, "learning_rate": 7.234579651069578e-07, "loss": 0.19636894762516022, "mean_token_accuracy": 0.9344852566719055, "num_tokens": 659016.0, "step": 476 }, { "entropy": 0.19445836544036865, "epoch": 0.07355435620663069, "grad_norm": 1.4296875, "learning_rate": 6.850577719915624e-07, "loss": 0.18777857720851898, "mean_token_accuracy": 0.9340922832489014, "num_tokens": 660693.0, "step": 477 }, { "entropy": 0.2021363377571106, "epoch": 0.07370855821125674, "grad_norm": 1.65625, "learning_rate": 6.47690524294034e-07, "loss": 0.1869696080684662, "mean_token_accuracy": 0.9333333373069763, "num_tokens": 662111.0, "step": 478 }, { "entropy": 0.19528843462467194, "epoch": 0.07386276021588281, "grad_norm": 2.03125, "learning_rate": 6.113578094467775e-07, "loss": 0.17778527736663818, "mean_token_accuracy": 0.9368270039558411, "num_tokens": 663512.0, "step": 479 }, { "entropy": 0.17402714490890503, "epoch": 0.07401696222050887, "grad_norm": 1.2734375, "learning_rate": 5.760611709332648e-07, "loss": 0.15594635903835297, "mean_token_accuracy": 0.9421712756156921, "num_tokens": 665353.0, "step": 480 }, { "entropy": 0.14156945049762726, "epoch": 0.07417116422513492, "grad_norm": 1.1640625, "learning_rate": 5.418021082224472e-07, "loss": 0.1273384541273117, "mean_token_accuracy": 0.9454138875007629, "num_tokens": 667596.0, "step": 481 }, { "entropy": 0.15703719854354858, "epoch": 0.07432536622976099, "grad_norm": 1.4140625, "learning_rate": 5.08582076705072e-07, "loss": 0.15257099270820618, "mean_token_accuracy": 0.9451599717140198, "num_tokens": 670011.0, "step": 482 }, { "entropy": 0.15173302590847015, "epoch": 0.07447956823438705, "grad_norm": 1.359375, "learning_rate": 4.764024876318357e-07, "loss": 0.14840558171272278, "mean_token_accuracy": 0.9457720518112183, "num_tokens": 672195.0, "step": 483 }, { "entropy": 0.251803457736969, "epoch": 0.0746337702390131, "grad_norm": 2.140625, "learning_rate": 4.4526470805345554e-07, "loss": 0.23033595085144043, "mean_token_accuracy": 0.9138405323028564, "num_tokens": 673294.0, "step": 484 }, { "entropy": 0.22149844467639923, "epoch": 0.07478797224363917, "grad_norm": 1.671875, "learning_rate": 4.1517006076257914e-07, "loss": 0.20876595377922058, "mean_token_accuracy": 0.920634925365448, "num_tokens": 674751.0, "step": 485 }, { "entropy": 0.19992657005786896, "epoch": 0.07494217424826523, "grad_norm": 1.609375, "learning_rate": 3.861198242375852e-07, "loss": 0.20208041369915009, "mean_token_accuracy": 0.9220055937767029, "num_tokens": 676195.0, "step": 486 }, { "entropy": 0.2647709846496582, "epoch": 0.07509637625289128, "grad_norm": 2.09375, "learning_rate": 3.581152325882825e-07, "loss": 0.26581087708473206, "mean_token_accuracy": 0.9089347124099731, "num_tokens": 677367.0, "step": 487 }, { "entropy": 0.18075726926326752, "epoch": 0.07525057825751735, "grad_norm": 1.7109375, "learning_rate": 3.311574755034796e-07, "loss": 0.19126133620738983, "mean_token_accuracy": 0.9318181872367859, "num_tokens": 678959.0, "step": 488 }, { "entropy": 0.23724618554115295, "epoch": 0.07540478026214341, "grad_norm": 2.25, "learning_rate": 3.0524769820044487e-07, "loss": 0.23674722015857697, "mean_token_accuracy": 0.9180327653884888, "num_tokens": 680248.0, "step": 489 }, { "entropy": 0.22051914036273956, "epoch": 0.07555898226676946, "grad_norm": 1.8828125, "learning_rate": 2.8038700137624495e-07, "loss": 0.2116030901670456, "mean_token_accuracy": 0.9300605058670044, "num_tokens": 681743.0, "step": 490 }, { "entropy": 0.1757911741733551, "epoch": 0.07571318427139553, "grad_norm": 1.6015625, "learning_rate": 2.5657644116100497e-07, "loss": 0.17098675668239594, "mean_token_accuracy": 0.9406231641769409, "num_tokens": 683452.0, "step": 491 }, { "entropy": 0.18268117308616638, "epoch": 0.07586738627602159, "grad_norm": 1.4140625, "learning_rate": 2.338170290730246e-07, "loss": 0.17703530192375183, "mean_token_accuracy": 0.9361584782600403, "num_tokens": 685277.0, "step": 492 }, { "entropy": 0.19099417328834534, "epoch": 0.07602158828064765, "grad_norm": 1.5, "learning_rate": 2.1210973197582085e-07, "loss": 0.19510860741138458, "mean_token_accuracy": 0.9316887855529785, "num_tokens": 686866.0, "step": 493 }, { "entropy": 0.21786467730998993, "epoch": 0.07617579028527371, "grad_norm": 2.078125, "learning_rate": 1.9145547203703597e-07, "loss": 0.2253967970609665, "mean_token_accuracy": 0.9227994084358215, "num_tokens": 688260.0, "step": 494 }, { "entropy": 0.22731785476207733, "epoch": 0.07632999228989977, "grad_norm": 1.8203125, "learning_rate": 1.7185512668927706e-07, "loss": 0.21878266334533691, "mean_token_accuracy": 0.9235293865203857, "num_tokens": 689628.0, "step": 495 }, { "entropy": 0.31587833166122437, "epoch": 0.07648419429452583, "grad_norm": 2.515625, "learning_rate": 1.533095285928432e-07, "loss": 0.31676945090293884, "mean_token_accuracy": 0.8903688788414001, "num_tokens": 690612.0, "step": 496 }, { "entropy": 0.22072257101535797, "epoch": 0.0766383962991519, "grad_norm": 1.875, "learning_rate": 1.3581946560033142e-07, "loss": 0.20424997806549072, "mean_token_accuracy": 0.9260969758033752, "num_tokens": 691919.0, "step": 497 }, { "entropy": 0.2378959059715271, "epoch": 0.07679259830377795, "grad_norm": 2.125, "learning_rate": 1.1938568072319412e-07, "loss": 0.23960573971271515, "mean_token_accuracy": 0.908172607421875, "num_tokens": 693016.0, "step": 498 }, { "entropy": 0.18599998950958252, "epoch": 0.076946800308404, "grad_norm": 1.4453125, "learning_rate": 1.0400887210015586e-07, "loss": 0.17737571895122528, "mean_token_accuracy": 0.9337517619132996, "num_tokens": 694458.0, "step": 499 }, { "entropy": 0.1896909922361374, "epoch": 0.07710100231303008, "grad_norm": 1.5546875, "learning_rate": 8.968969296756224e-08, "loss": 0.1934422105550766, "mean_token_accuracy": 0.9257456064224243, "num_tokens": 696109.0, "step": 500 }, { "entropy": 0.18347270786762238, "epoch": 0.07725520431765613, "grad_norm": 1.5, "learning_rate": 7.642875163162977e-08, "loss": 0.17866890132427216, "mean_token_accuracy": 0.9312201142311096, "num_tokens": 697789.0, "step": 501 }, { "entropy": 0.27496322989463806, "epoch": 0.07740940632228219, "grad_norm": 2.375, "learning_rate": 6.422661144259989e-08, "loss": 0.2631693482398987, "mean_token_accuracy": 0.9099326729774475, "num_tokens": 698985.0, "step": 502 }, { "entropy": 0.21727091073989868, "epoch": 0.07756360832690826, "grad_norm": 1.984375, "learning_rate": 5.308379077080816e-08, "loss": 0.22967125475406647, "mean_token_accuracy": 0.9191842675209045, "num_tokens": 700317.0, "step": 503 }, { "entropy": 0.21876828372478485, "epoch": 0.07771781033153431, "grad_norm": 1.8359375, "learning_rate": 4.300076298466571e-08, "loss": 0.22112873196601868, "mean_token_accuracy": 0.9281525015830994, "num_tokens": 701689.0, "step": 504 }, { "entropy": 0.1745622456073761, "epoch": 0.07787201233616037, "grad_norm": 1.140625, "learning_rate": 3.3977956430547576e-08, "loss": 0.1722312867641449, "mean_token_accuracy": 0.9407705664634705, "num_tokens": 703436.0, "step": 505 }, { "entropy": 0.22518332302570343, "epoch": 0.07802621434078642, "grad_norm": 1.7421875, "learning_rate": 2.6015754414593363e-08, "loss": 0.22960630059242249, "mean_token_accuracy": 0.9271809458732605, "num_tokens": 704831.0, "step": 506 }, { "entropy": 0.2924734055995941, "epoch": 0.07818041634541249, "grad_norm": 2.609375, "learning_rate": 1.911449518643138e-08, "loss": 0.28948456048965454, "mean_token_accuracy": 0.8898043036460876, "num_tokens": 705810.0, "step": 507 }, { "entropy": 0.29374387860298157, "epoch": 0.07833461835003855, "grad_norm": 2.53125, "learning_rate": 1.3274471924798471e-08, "loss": 0.2914823293685913, "mean_token_accuracy": 0.9072463512420654, "num_tokens": 706853.0, "step": 508 }, { "entropy": 0.18828892707824707, "epoch": 0.0784888203546646, "grad_norm": 1.5390625, "learning_rate": 8.495932725094414e-09, "loss": 0.19034327566623688, "mean_token_accuracy": 0.9341492056846619, "num_tokens": 708577.0, "step": 509 }, { "entropy": 0.3254898488521576, "epoch": 0.07864302235929067, "grad_norm": 2.6875, "learning_rate": 4.779080588834806e-09, "loss": 0.3536283075809479, "mean_token_accuracy": 0.8856015801429749, "num_tokens": 709599.0, "step": 510 }, { "entropy": 0.19601193070411682, "epoch": 0.07879722436391673, "grad_norm": 1.71875, "learning_rate": 2.124073415030181e-09, "loss": 0.19777625799179077, "mean_token_accuracy": 0.9288975596427917, "num_tokens": 711140.0, "step": 511 }, { "entropy": 0.22275681793689728, "epoch": 0.07895142636854278, "grad_norm": 2.234375, "learning_rate": 5.310239934885885e-10, "loss": 0.23580928146839142, "mean_token_accuracy": 0.9170305728912354, "num_tokens": 712522.0, "step": 512 }, { "epoch": 0.07895142636854278, "eval_entropy": 0.22568650308408236, "eval_loss": 0.22544851899147034, "eval_mean_token_accuracy": 0.919665330160431, "eval_num_tokens": 712522.0, "eval_runtime": 34.9909, "eval_samples_per_second": 78.049, "eval_steps_per_second": 9.774, "step": 512 } ], "logging_steps": 1, "max_steps": 512, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 128, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4536491036033024e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }