{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2000136808263219, "eval_steps": 500, "global_step": 2924, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.840413160954922e-05, "grad_norm": 19.005646318396554, "learning_rate": 2.2779043280182232e-08, "loss": 1.2551, "step": 1 }, { "epoch": 0.00013680826321909844, "grad_norm": 19.825164309346295, "learning_rate": 4.5558086560364464e-08, "loss": 0.8688, "step": 2 }, { "epoch": 0.00020521239482864764, "grad_norm": 28.98854364816896, "learning_rate": 6.83371298405467e-08, "loss": 1.2541, "step": 3 }, { "epoch": 0.00027361652643819687, "grad_norm": 12.726041910800626, "learning_rate": 9.111617312072893e-08, "loss": 1.3166, "step": 4 }, { "epoch": 0.0003420206580477461, "grad_norm": 32.47354504715627, "learning_rate": 1.1389521640091118e-07, "loss": 1.0256, "step": 5 }, { "epoch": 0.0004104247896572953, "grad_norm": 26.14034306202561, "learning_rate": 1.366742596810934e-07, "loss": 1.5129, "step": 6 }, { "epoch": 0.00047882892126684454, "grad_norm": 22.59197644966778, "learning_rate": 1.5945330296127564e-07, "loss": 1.2663, "step": 7 }, { "epoch": 0.0005472330528763937, "grad_norm": 25.5670147105785, "learning_rate": 1.8223234624145786e-07, "loss": 1.4021, "step": 8 }, { "epoch": 0.000615637184485943, "grad_norm": 19.27808738750586, "learning_rate": 2.0501138952164012e-07, "loss": 0.9687, "step": 9 }, { "epoch": 0.0006840413160954922, "grad_norm": 11.116110861484534, "learning_rate": 2.2779043280182236e-07, "loss": 1.166, "step": 10 }, { "epoch": 0.0007524454477050414, "grad_norm": 20.307456555365246, "learning_rate": 2.5056947608200455e-07, "loss": 1.1607, "step": 11 }, { "epoch": 0.0008208495793145906, "grad_norm": 12.058120090485795, "learning_rate": 2.733485193621868e-07, "loss": 1.1394, "step": 12 }, { "epoch": 0.0008892537109241398, "grad_norm": 29.53469411959909, "learning_rate": 2.961275626423691e-07, "loss": 0.9815, "step": 13 }, { "epoch": 0.0009576578425336891, "grad_norm": 22.863198826841863, "learning_rate": 3.189066059225513e-07, "loss": 1.0778, "step": 14 }, { "epoch": 0.0010260619741432383, "grad_norm": 17.711071469911264, "learning_rate": 3.416856492027335e-07, "loss": 1.2028, "step": 15 }, { "epoch": 0.0010944661057527875, "grad_norm": 23.553529524293335, "learning_rate": 3.644646924829157e-07, "loss": 0.9792, "step": 16 }, { "epoch": 0.0011628702373623367, "grad_norm": 14.014274948913696, "learning_rate": 3.8724373576309803e-07, "loss": 1.1274, "step": 17 }, { "epoch": 0.001231274368971886, "grad_norm": 15.060242796380724, "learning_rate": 4.1002277904328024e-07, "loss": 1.1117, "step": 18 }, { "epoch": 0.001299678500581435, "grad_norm": 18.10445634136811, "learning_rate": 4.3280182232346246e-07, "loss": 0.8063, "step": 19 }, { "epoch": 0.0013680826321909843, "grad_norm": 19.88694931792778, "learning_rate": 4.555808656036447e-07, "loss": 1.2076, "step": 20 }, { "epoch": 0.0014364867638005335, "grad_norm": 31.709343418336065, "learning_rate": 4.783599088838269e-07, "loss": 1.1243, "step": 21 }, { "epoch": 0.0015048908954100827, "grad_norm": 14.322136912072493, "learning_rate": 5.011389521640091e-07, "loss": 1.0473, "step": 22 }, { "epoch": 0.001573295027019632, "grad_norm": 15.407758596349403, "learning_rate": 5.239179954441914e-07, "loss": 1.1462, "step": 23 }, { "epoch": 0.0016416991586291811, "grad_norm": 17.653636853777154, "learning_rate": 5.466970387243736e-07, "loss": 0.7997, "step": 24 }, { "epoch": 0.0017101032902387303, "grad_norm": 12.819934604614861, "learning_rate": 5.694760820045558e-07, "loss": 0.7004, "step": 25 }, { "epoch": 0.0017785074218482795, "grad_norm": 16.404739513799814, "learning_rate": 5.922551252847382e-07, "loss": 1.1528, "step": 26 }, { "epoch": 0.001846911553457829, "grad_norm": 20.489541339944363, "learning_rate": 6.150341685649204e-07, "loss": 0.9264, "step": 27 }, { "epoch": 0.0019153156850673782, "grad_norm": 15.10654212249387, "learning_rate": 6.378132118451026e-07, "loss": 1.1305, "step": 28 }, { "epoch": 0.001983719816676927, "grad_norm": 11.814154776465195, "learning_rate": 6.605922551252848e-07, "loss": 0.9956, "step": 29 }, { "epoch": 0.0020521239482864766, "grad_norm": 13.07191952655166, "learning_rate": 6.83371298405467e-07, "loss": 0.6871, "step": 30 }, { "epoch": 0.0021205280798960256, "grad_norm": 10.183949325669186, "learning_rate": 7.061503416856492e-07, "loss": 1.1098, "step": 31 }, { "epoch": 0.002188932211505575, "grad_norm": 10.159981004141525, "learning_rate": 7.289293849658314e-07, "loss": 0.7531, "step": 32 }, { "epoch": 0.002257336343115124, "grad_norm": 11.160497033320771, "learning_rate": 7.517084282460136e-07, "loss": 0.7946, "step": 33 }, { "epoch": 0.0023257404747246734, "grad_norm": 11.595618249120902, "learning_rate": 7.744874715261961e-07, "loss": 0.9285, "step": 34 }, { "epoch": 0.0023941446063342224, "grad_norm": 14.112149224597388, "learning_rate": 7.972665148063783e-07, "loss": 0.6524, "step": 35 }, { "epoch": 0.002462548737943772, "grad_norm": 9.682423448956861, "learning_rate": 8.200455580865605e-07, "loss": 0.7987, "step": 36 }, { "epoch": 0.002530952869553321, "grad_norm": 8.09964246078515, "learning_rate": 8.428246013667427e-07, "loss": 1.0615, "step": 37 }, { "epoch": 0.00259935700116287, "grad_norm": 10.817647425345854, "learning_rate": 8.656036446469249e-07, "loss": 0.9523, "step": 38 }, { "epoch": 0.0026677611327724196, "grad_norm": 8.998123556132862, "learning_rate": 8.883826879271071e-07, "loss": 0.6709, "step": 39 }, { "epoch": 0.0027361652643819686, "grad_norm": 6.850653523922662, "learning_rate": 9.111617312072894e-07, "loss": 1.0962, "step": 40 }, { "epoch": 0.002804569395991518, "grad_norm": 12.190153566665263, "learning_rate": 9.339407744874717e-07, "loss": 0.9311, "step": 41 }, { "epoch": 0.002872973527601067, "grad_norm": 6.213900932761143, "learning_rate": 9.567198177676538e-07, "loss": 0.5645, "step": 42 }, { "epoch": 0.0029413776592106164, "grad_norm": 7.108373282149988, "learning_rate": 9.79498861047836e-07, "loss": 0.9261, "step": 43 }, { "epoch": 0.0030097817908201654, "grad_norm": 10.518919022143347, "learning_rate": 1.0022779043280182e-06, "loss": 0.5782, "step": 44 }, { "epoch": 0.003078185922429715, "grad_norm": 7.764676932291131, "learning_rate": 1.0250569476082005e-06, "loss": 0.6455, "step": 45 }, { "epoch": 0.003146590054039264, "grad_norm": 6.521873460229345, "learning_rate": 1.0478359908883828e-06, "loss": 0.3617, "step": 46 }, { "epoch": 0.0032149941856488133, "grad_norm": 18.063747653307693, "learning_rate": 1.0706150341685651e-06, "loss": 0.6493, "step": 47 }, { "epoch": 0.0032833983172583622, "grad_norm": 9.220471237618497, "learning_rate": 1.0933940774487472e-06, "loss": 0.9514, "step": 48 }, { "epoch": 0.0033518024488679117, "grad_norm": 7.445975018992441, "learning_rate": 1.1161731207289296e-06, "loss": 0.5799, "step": 49 }, { "epoch": 0.0034202065804774607, "grad_norm": 7.466263840089492, "learning_rate": 1.1389521640091117e-06, "loss": 1.4957, "step": 50 }, { "epoch": 0.00348861071208701, "grad_norm": 7.85539824512469, "learning_rate": 1.161731207289294e-06, "loss": 0.6961, "step": 51 }, { "epoch": 0.003557014843696559, "grad_norm": 7.915840333343226, "learning_rate": 1.1845102505694763e-06, "loss": 0.9127, "step": 52 }, { "epoch": 0.0036254189753061085, "grad_norm": 7.225193236251746, "learning_rate": 1.2072892938496584e-06, "loss": 0.6309, "step": 53 }, { "epoch": 0.003693823106915658, "grad_norm": 8.294950416467335, "learning_rate": 1.2300683371298407e-06, "loss": 0.5625, "step": 54 }, { "epoch": 0.003762227238525207, "grad_norm": 6.585241188174048, "learning_rate": 1.252847380410023e-06, "loss": 0.5428, "step": 55 }, { "epoch": 0.0038306313701347563, "grad_norm": 6.969663654186199, "learning_rate": 1.2756264236902052e-06, "loss": 0.5, "step": 56 }, { "epoch": 0.0038990355017443053, "grad_norm": 5.529118056995139, "learning_rate": 1.2984054669703875e-06, "loss": 0.3135, "step": 57 }, { "epoch": 0.003967439633353854, "grad_norm": 7.12123855217604, "learning_rate": 1.3211845102505696e-06, "loss": 0.6864, "step": 58 }, { "epoch": 0.004035843764963404, "grad_norm": 7.080102741067122, "learning_rate": 1.343963553530752e-06, "loss": 0.3519, "step": 59 }, { "epoch": 0.004104247896572953, "grad_norm": 9.304129044871978, "learning_rate": 1.366742596810934e-06, "loss": 0.9068, "step": 60 }, { "epoch": 0.004172652028182502, "grad_norm": 8.629972158057784, "learning_rate": 1.3895216400911163e-06, "loss": 0.6226, "step": 61 }, { "epoch": 0.004241056159792051, "grad_norm": 6.120841151942936, "learning_rate": 1.4123006833712984e-06, "loss": 0.9269, "step": 62 }, { "epoch": 0.004309460291401601, "grad_norm": 5.75185766480899, "learning_rate": 1.4350797266514807e-06, "loss": 0.3918, "step": 63 }, { "epoch": 0.00437786442301115, "grad_norm": 5.178097442019518, "learning_rate": 1.4578587699316629e-06, "loss": 0.1852, "step": 64 }, { "epoch": 0.004446268554620699, "grad_norm": 7.078527881196362, "learning_rate": 1.4806378132118452e-06, "loss": 0.3746, "step": 65 }, { "epoch": 0.004514672686230248, "grad_norm": 6.076177486072526, "learning_rate": 1.5034168564920273e-06, "loss": 0.4731, "step": 66 }, { "epoch": 0.004583076817839798, "grad_norm": 5.309470109272941, "learning_rate": 1.5261958997722096e-06, "loss": 0.3059, "step": 67 }, { "epoch": 0.004651480949449347, "grad_norm": 11.840534750100392, "learning_rate": 1.5489749430523921e-06, "loss": 0.6966, "step": 68 }, { "epoch": 0.004719885081058896, "grad_norm": 10.222125041426017, "learning_rate": 1.5717539863325742e-06, "loss": 0.55, "step": 69 }, { "epoch": 0.004788289212668445, "grad_norm": 7.131042260797559, "learning_rate": 1.5945330296127566e-06, "loss": 0.8744, "step": 70 }, { "epoch": 0.004856693344277995, "grad_norm": 6.28439698217826, "learning_rate": 1.6173120728929387e-06, "loss": 0.3941, "step": 71 }, { "epoch": 0.004925097475887544, "grad_norm": 7.881210723948495, "learning_rate": 1.640091116173121e-06, "loss": 0.4078, "step": 72 }, { "epoch": 0.004993501607497093, "grad_norm": 8.96401475052852, "learning_rate": 1.662870159453303e-06, "loss": 0.4546, "step": 73 }, { "epoch": 0.005061905739106642, "grad_norm": 5.662264076690713, "learning_rate": 1.6856492027334854e-06, "loss": 0.508, "step": 74 }, { "epoch": 0.005130309870716191, "grad_norm": 5.650398726445972, "learning_rate": 1.7084282460136675e-06, "loss": 0.565, "step": 75 }, { "epoch": 0.00519871400232574, "grad_norm": 5.064481983036394, "learning_rate": 1.7312072892938498e-06, "loss": 0.4604, "step": 76 }, { "epoch": 0.005267118133935289, "grad_norm": 5.656431500805364, "learning_rate": 1.753986332574032e-06, "loss": 0.6448, "step": 77 }, { "epoch": 0.005335522265544839, "grad_norm": 6.727741346392739, "learning_rate": 1.7767653758542143e-06, "loss": 0.5996, "step": 78 }, { "epoch": 0.005403926397154388, "grad_norm": 7.0610981115319404, "learning_rate": 1.7995444191343964e-06, "loss": 0.5485, "step": 79 }, { "epoch": 0.005472330528763937, "grad_norm": 10.515029300861103, "learning_rate": 1.8223234624145789e-06, "loss": 0.3882, "step": 80 }, { "epoch": 0.005540734660373486, "grad_norm": 6.913243614884951, "learning_rate": 1.845102505694761e-06, "loss": 0.7728, "step": 81 }, { "epoch": 0.005609138791983036, "grad_norm": 7.44139695855309, "learning_rate": 1.8678815489749433e-06, "loss": 0.7166, "step": 82 }, { "epoch": 0.005677542923592585, "grad_norm": 6.013853689222373, "learning_rate": 1.8906605922551254e-06, "loss": 0.5366, "step": 83 }, { "epoch": 0.005745947055202134, "grad_norm": 7.164864581519447, "learning_rate": 1.9134396355353075e-06, "loss": 0.3289, "step": 84 }, { "epoch": 0.005814351186811683, "grad_norm": 4.504062778899276, "learning_rate": 1.93621867881549e-06, "loss": 0.6583, "step": 85 }, { "epoch": 0.005882755318421233, "grad_norm": 6.336368362696905, "learning_rate": 1.958997722095672e-06, "loss": 0.4917, "step": 86 }, { "epoch": 0.005951159450030782, "grad_norm": 6.941150117440937, "learning_rate": 1.9817767653758545e-06, "loss": 0.3338, "step": 87 }, { "epoch": 0.006019563581640331, "grad_norm": 5.375141053234146, "learning_rate": 2.0045558086560364e-06, "loss": 0.543, "step": 88 }, { "epoch": 0.006087967713249881, "grad_norm": 4.387718813621196, "learning_rate": 2.0273348519362187e-06, "loss": 0.4088, "step": 89 }, { "epoch": 0.00615637184485943, "grad_norm": 7.167311169971348, "learning_rate": 2.050113895216401e-06, "loss": 0.4818, "step": 90 }, { "epoch": 0.006224775976468979, "grad_norm": 6.185913691772371, "learning_rate": 2.0728929384965833e-06, "loss": 0.4268, "step": 91 }, { "epoch": 0.006293180108078528, "grad_norm": 5.868832008511959, "learning_rate": 2.0956719817767656e-06, "loss": 0.9069, "step": 92 }, { "epoch": 0.0063615842396880775, "grad_norm": 6.800985421978269, "learning_rate": 2.118451025056948e-06, "loss": 0.5596, "step": 93 }, { "epoch": 0.0064299883712976265, "grad_norm": 5.190847350978537, "learning_rate": 2.1412300683371303e-06, "loss": 0.754, "step": 94 }, { "epoch": 0.0064983925029071755, "grad_norm": 5.612694886865947, "learning_rate": 2.164009111617312e-06, "loss": 0.2145, "step": 95 }, { "epoch": 0.0065667966345167245, "grad_norm": 6.941990850815697, "learning_rate": 2.1867881548974945e-06, "loss": 0.3697, "step": 96 }, { "epoch": 0.006635200766126274, "grad_norm": 7.2841488548206215, "learning_rate": 2.209567198177677e-06, "loss": 0.4138, "step": 97 }, { "epoch": 0.006703604897735823, "grad_norm": 4.951824944216801, "learning_rate": 2.232346241457859e-06, "loss": 0.2513, "step": 98 }, { "epoch": 0.006772009029345372, "grad_norm": 8.000708965835068, "learning_rate": 2.255125284738041e-06, "loss": 0.6468, "step": 99 }, { "epoch": 0.006840413160954921, "grad_norm": 5.7999871220939, "learning_rate": 2.2779043280182233e-06, "loss": 0.3713, "step": 100 }, { "epoch": 0.006908817292564471, "grad_norm": 6.147647385197564, "learning_rate": 2.3006833712984057e-06, "loss": 0.2845, "step": 101 }, { "epoch": 0.00697722142417402, "grad_norm": 5.608769631195229, "learning_rate": 2.323462414578588e-06, "loss": 0.317, "step": 102 }, { "epoch": 0.007045625555783569, "grad_norm": 6.103642928359892, "learning_rate": 2.34624145785877e-06, "loss": 0.4528, "step": 103 }, { "epoch": 0.007114029687393118, "grad_norm": 5.298617726137569, "learning_rate": 2.3690205011389526e-06, "loss": 0.4712, "step": 104 }, { "epoch": 0.007182433819002668, "grad_norm": 4.204705887622809, "learning_rate": 2.3917995444191345e-06, "loss": 0.2891, "step": 105 }, { "epoch": 0.007250837950612217, "grad_norm": 4.968420341640101, "learning_rate": 2.414578587699317e-06, "loss": 0.3758, "step": 106 }, { "epoch": 0.007319242082221766, "grad_norm": 5.563778976676936, "learning_rate": 2.437357630979499e-06, "loss": 0.5295, "step": 107 }, { "epoch": 0.007387646213831316, "grad_norm": 7.90042876999444, "learning_rate": 2.4601366742596815e-06, "loss": 0.4192, "step": 108 }, { "epoch": 0.007456050345440865, "grad_norm": 4.968652688279104, "learning_rate": 2.4829157175398634e-06, "loss": 0.2256, "step": 109 }, { "epoch": 0.007524454477050414, "grad_norm": 4.555290300056485, "learning_rate": 2.505694760820046e-06, "loss": 0.2949, "step": 110 }, { "epoch": 0.007592858608659963, "grad_norm": 8.712030795473783, "learning_rate": 2.5284738041002284e-06, "loss": 0.212, "step": 111 }, { "epoch": 0.007661262740269513, "grad_norm": 6.010760612056703, "learning_rate": 2.5512528473804103e-06, "loss": 0.5523, "step": 112 }, { "epoch": 0.007729666871879062, "grad_norm": 5.3342068324429, "learning_rate": 2.5740318906605926e-06, "loss": 0.6505, "step": 113 }, { "epoch": 0.007798071003488611, "grad_norm": 4.600779818176575, "learning_rate": 2.596810933940775e-06, "loss": 0.3948, "step": 114 }, { "epoch": 0.00786647513509816, "grad_norm": 5.298359688154018, "learning_rate": 2.6195899772209573e-06, "loss": 0.5686, "step": 115 }, { "epoch": 0.007934879266707709, "grad_norm": 6.036839538684912, "learning_rate": 2.642369020501139e-06, "loss": 0.5582, "step": 116 }, { "epoch": 0.008003283398317258, "grad_norm": 4.140851917886642, "learning_rate": 2.6651480637813215e-06, "loss": 0.6109, "step": 117 }, { "epoch": 0.008071687529926808, "grad_norm": 5.633881593110654, "learning_rate": 2.687927107061504e-06, "loss": 0.6218, "step": 118 }, { "epoch": 0.008140091661536357, "grad_norm": 4.289564746015437, "learning_rate": 2.710706150341686e-06, "loss": 0.3407, "step": 119 }, { "epoch": 0.008208495793145906, "grad_norm": 5.727674930453696, "learning_rate": 2.733485193621868e-06, "loss": 0.5627, "step": 120 }, { "epoch": 0.008276899924755455, "grad_norm": 6.303750328927804, "learning_rate": 2.7562642369020503e-06, "loss": 0.5755, "step": 121 }, { "epoch": 0.008345304056365004, "grad_norm": 4.919276783907435, "learning_rate": 2.7790432801822326e-06, "loss": 0.7892, "step": 122 }, { "epoch": 0.008413708187974553, "grad_norm": 5.745666819984284, "learning_rate": 2.801822323462415e-06, "loss": 0.4816, "step": 123 }, { "epoch": 0.008482112319584102, "grad_norm": 3.3021973049739892, "learning_rate": 2.824601366742597e-06, "loss": 0.2087, "step": 124 }, { "epoch": 0.008550516451193653, "grad_norm": 3.583496027217604, "learning_rate": 2.847380410022779e-06, "loss": 0.3031, "step": 125 }, { "epoch": 0.008618920582803202, "grad_norm": 5.084549815637344, "learning_rate": 2.8701594533029615e-06, "loss": 0.4115, "step": 126 }, { "epoch": 0.008687324714412751, "grad_norm": 5.169411939973952, "learning_rate": 2.892938496583144e-06, "loss": 0.73, "step": 127 }, { "epoch": 0.0087557288460223, "grad_norm": 5.500460475372184, "learning_rate": 2.9157175398633257e-06, "loss": 0.7081, "step": 128 }, { "epoch": 0.008824132977631849, "grad_norm": 4.785514338582568, "learning_rate": 2.938496583143508e-06, "loss": 0.293, "step": 129 }, { "epoch": 0.008892537109241398, "grad_norm": 4.006016780712426, "learning_rate": 2.9612756264236903e-06, "loss": 0.3647, "step": 130 }, { "epoch": 0.008960941240850947, "grad_norm": 4.057031560288478, "learning_rate": 2.9840546697038727e-06, "loss": 0.4324, "step": 131 }, { "epoch": 0.009029345372460496, "grad_norm": 4.238679577088611, "learning_rate": 3.0068337129840546e-06, "loss": 0.5028, "step": 132 }, { "epoch": 0.009097749504070047, "grad_norm": 8.267964080806344, "learning_rate": 3.029612756264237e-06, "loss": 0.742, "step": 133 }, { "epoch": 0.009166153635679596, "grad_norm": 3.9999206356281434, "learning_rate": 3.052391799544419e-06, "loss": 0.5112, "step": 134 }, { "epoch": 0.009234557767289145, "grad_norm": 5.47105228463376, "learning_rate": 3.075170842824602e-06, "loss": 0.5871, "step": 135 }, { "epoch": 0.009302961898898694, "grad_norm": 4.607785859391537, "learning_rate": 3.0979498861047843e-06, "loss": 0.2264, "step": 136 }, { "epoch": 0.009371366030508243, "grad_norm": 4.22974469160654, "learning_rate": 3.120728929384966e-06, "loss": 0.4083, "step": 137 }, { "epoch": 0.009439770162117792, "grad_norm": 3.9994916890941803, "learning_rate": 3.1435079726651485e-06, "loss": 0.3025, "step": 138 }, { "epoch": 0.00950817429372734, "grad_norm": 4.589387496023141, "learning_rate": 3.1662870159453308e-06, "loss": 0.4453, "step": 139 }, { "epoch": 0.00957657842533689, "grad_norm": 3.9600038349730293, "learning_rate": 3.189066059225513e-06, "loss": 0.3212, "step": 140 }, { "epoch": 0.00964498255694644, "grad_norm": 4.444664316235586, "learning_rate": 3.211845102505695e-06, "loss": 0.4255, "step": 141 }, { "epoch": 0.00971338668855599, "grad_norm": 7.234159709454101, "learning_rate": 3.2346241457858773e-06, "loss": 0.6359, "step": 142 }, { "epoch": 0.009781790820165538, "grad_norm": 3.9100144748346644, "learning_rate": 3.2574031890660596e-06, "loss": 0.3349, "step": 143 }, { "epoch": 0.009850194951775087, "grad_norm": 5.202414409697801, "learning_rate": 3.280182232346242e-06, "loss": 0.2867, "step": 144 }, { "epoch": 0.009918599083384636, "grad_norm": 3.6307094516345657, "learning_rate": 3.302961275626424e-06, "loss": 0.3751, "step": 145 }, { "epoch": 0.009987003214994185, "grad_norm": 3.9965867589550914, "learning_rate": 3.325740318906606e-06, "loss": 0.4858, "step": 146 }, { "epoch": 0.010055407346603734, "grad_norm": 2.951678784787292, "learning_rate": 3.3485193621867885e-06, "loss": 0.2077, "step": 147 }, { "epoch": 0.010123811478213285, "grad_norm": 3.4251001218097823, "learning_rate": 3.371298405466971e-06, "loss": 0.2961, "step": 148 }, { "epoch": 0.010192215609822834, "grad_norm": 3.287778735219955, "learning_rate": 3.3940774487471527e-06, "loss": 0.3616, "step": 149 }, { "epoch": 0.010260619741432383, "grad_norm": 10.991617129669507, "learning_rate": 3.416856492027335e-06, "loss": 0.3236, "step": 150 }, { "epoch": 0.010329023873041932, "grad_norm": 4.033355155831931, "learning_rate": 3.4396355353075173e-06, "loss": 0.4022, "step": 151 }, { "epoch": 0.01039742800465148, "grad_norm": 4.3844776220404205, "learning_rate": 3.4624145785876997e-06, "loss": 0.611, "step": 152 }, { "epoch": 0.01046583213626103, "grad_norm": 14.058672778349182, "learning_rate": 3.4851936218678815e-06, "loss": 0.5051, "step": 153 }, { "epoch": 0.010534236267870579, "grad_norm": 7.80365163594154, "learning_rate": 3.507972665148064e-06, "loss": 0.3451, "step": 154 }, { "epoch": 0.010602640399480128, "grad_norm": 4.125150331012009, "learning_rate": 3.530751708428246e-06, "loss": 0.4217, "step": 155 }, { "epoch": 0.010671044531089679, "grad_norm": 5.039843855020962, "learning_rate": 3.5535307517084285e-06, "loss": 0.2837, "step": 156 }, { "epoch": 0.010739448662699227, "grad_norm": 3.918100501267, "learning_rate": 3.5763097949886104e-06, "loss": 0.3118, "step": 157 }, { "epoch": 0.010807852794308776, "grad_norm": 4.056138324816646, "learning_rate": 3.5990888382687927e-06, "loss": 0.4352, "step": 158 }, { "epoch": 0.010876256925918325, "grad_norm": 4.966299904438864, "learning_rate": 3.6218678815489755e-06, "loss": 0.7583, "step": 159 }, { "epoch": 0.010944661057527874, "grad_norm": 4.442087595940305, "learning_rate": 3.6446469248291578e-06, "loss": 0.4503, "step": 160 }, { "epoch": 0.011013065189137423, "grad_norm": 7.13931382576582, "learning_rate": 3.66742596810934e-06, "loss": 0.3963, "step": 161 }, { "epoch": 0.011081469320746972, "grad_norm": 4.112154234754447, "learning_rate": 3.690205011389522e-06, "loss": 0.3504, "step": 162 }, { "epoch": 0.011149873452356523, "grad_norm": 3.555506165520702, "learning_rate": 3.7129840546697043e-06, "loss": 0.4485, "step": 163 }, { "epoch": 0.011218277583966072, "grad_norm": 5.516215284710705, "learning_rate": 3.7357630979498866e-06, "loss": 0.4529, "step": 164 }, { "epoch": 0.011286681715575621, "grad_norm": 4.261153144261622, "learning_rate": 3.758542141230069e-06, "loss": 0.4671, "step": 165 }, { "epoch": 0.01135508584718517, "grad_norm": 4.115496916287671, "learning_rate": 3.781321184510251e-06, "loss": 0.4641, "step": 166 }, { "epoch": 0.011423489978794719, "grad_norm": 5.17290816114968, "learning_rate": 3.804100227790433e-06, "loss": 0.6399, "step": 167 }, { "epoch": 0.011491894110404268, "grad_norm": 4.391104403893715, "learning_rate": 3.826879271070615e-06, "loss": 0.4279, "step": 168 }, { "epoch": 0.011560298242013817, "grad_norm": 8.335203901793443, "learning_rate": 3.849658314350798e-06, "loss": 0.4812, "step": 169 }, { "epoch": 0.011628702373623366, "grad_norm": 4.434292856629052, "learning_rate": 3.87243735763098e-06, "loss": 0.702, "step": 170 }, { "epoch": 0.011697106505232917, "grad_norm": 3.7322460953597947, "learning_rate": 3.8952164009111624e-06, "loss": 0.6439, "step": 171 }, { "epoch": 0.011765510636842466, "grad_norm": 4.772309864666896, "learning_rate": 3.917995444191344e-06, "loss": 0.5708, "step": 172 }, { "epoch": 0.011833914768452015, "grad_norm": 5.2641211040220535, "learning_rate": 3.940774487471526e-06, "loss": 0.678, "step": 173 }, { "epoch": 0.011902318900061564, "grad_norm": 3.2721992281019676, "learning_rate": 3.963553530751709e-06, "loss": 0.2714, "step": 174 }, { "epoch": 0.011970723031671113, "grad_norm": 6.668229440465964, "learning_rate": 3.986332574031891e-06, "loss": 1.0851, "step": 175 }, { "epoch": 0.012039127163280662, "grad_norm": 4.790181525519879, "learning_rate": 4.009111617312073e-06, "loss": 0.2452, "step": 176 }, { "epoch": 0.01210753129489021, "grad_norm": 4.55762579283424, "learning_rate": 4.0318906605922555e-06, "loss": 0.5185, "step": 177 }, { "epoch": 0.012175935426499761, "grad_norm": 11.767477793918758, "learning_rate": 4.054669703872437e-06, "loss": 0.2694, "step": 178 }, { "epoch": 0.01224433955810931, "grad_norm": 4.458236624955253, "learning_rate": 4.07744874715262e-06, "loss": 0.5203, "step": 179 }, { "epoch": 0.01231274368971886, "grad_norm": 3.0751982973871077, "learning_rate": 4.100227790432802e-06, "loss": 0.3245, "step": 180 }, { "epoch": 0.012381147821328408, "grad_norm": 4.7046434226199505, "learning_rate": 4.123006833712984e-06, "loss": 0.6176, "step": 181 }, { "epoch": 0.012449551952937957, "grad_norm": 4.80791599074275, "learning_rate": 4.145785876993167e-06, "loss": 0.2326, "step": 182 }, { "epoch": 0.012517956084547506, "grad_norm": 4.092231061018268, "learning_rate": 4.168564920273349e-06, "loss": 0.5002, "step": 183 }, { "epoch": 0.012586360216157055, "grad_norm": 7.6722773118204675, "learning_rate": 4.191343963553531e-06, "loss": 0.8132, "step": 184 }, { "epoch": 0.012654764347766604, "grad_norm": 3.296817715766381, "learning_rate": 4.214123006833713e-06, "loss": 0.254, "step": 185 }, { "epoch": 0.012723168479376155, "grad_norm": 4.902526900681102, "learning_rate": 4.236902050113896e-06, "loss": 0.3638, "step": 186 }, { "epoch": 0.012791572610985704, "grad_norm": 5.190085731801548, "learning_rate": 4.259681093394078e-06, "loss": 0.409, "step": 187 }, { "epoch": 0.012859976742595253, "grad_norm": 4.835850531515545, "learning_rate": 4.2824601366742606e-06, "loss": 0.5272, "step": 188 }, { "epoch": 0.012928380874204802, "grad_norm": 4.459400239484077, "learning_rate": 4.3052391799544425e-06, "loss": 0.2758, "step": 189 }, { "epoch": 0.012996785005814351, "grad_norm": 6.026480770570654, "learning_rate": 4.328018223234624e-06, "loss": 0.5509, "step": 190 }, { "epoch": 0.0130651891374239, "grad_norm": 3.8588851432502635, "learning_rate": 4.350797266514807e-06, "loss": 0.5614, "step": 191 }, { "epoch": 0.013133593269033449, "grad_norm": 2.97172581953081, "learning_rate": 4.373576309794989e-06, "loss": 0.4809, "step": 192 }, { "epoch": 0.013201997400642998, "grad_norm": 2.376027361871209, "learning_rate": 4.396355353075171e-06, "loss": 0.1223, "step": 193 }, { "epoch": 0.013270401532252549, "grad_norm": 4.446738244083037, "learning_rate": 4.419134396355354e-06, "loss": 0.2533, "step": 194 }, { "epoch": 0.013338805663862098, "grad_norm": 2.8521301540621264, "learning_rate": 4.4419134396355355e-06, "loss": 0.1751, "step": 195 }, { "epoch": 0.013407209795471647, "grad_norm": 4.941495510094261, "learning_rate": 4.464692482915718e-06, "loss": 0.5595, "step": 196 }, { "epoch": 0.013475613927081196, "grad_norm": 6.919960480533056, "learning_rate": 4.4874715261959e-06, "loss": 0.3075, "step": 197 }, { "epoch": 0.013544018058690745, "grad_norm": 4.670698674847405, "learning_rate": 4.510250569476082e-06, "loss": 0.5307, "step": 198 }, { "epoch": 0.013612422190300294, "grad_norm": 5.495798023024644, "learning_rate": 4.533029612756265e-06, "loss": 0.3068, "step": 199 }, { "epoch": 0.013680826321909843, "grad_norm": 4.21326225243941, "learning_rate": 4.555808656036447e-06, "loss": 0.3253, "step": 200 }, { "epoch": 0.013749230453519393, "grad_norm": 4.978391733345305, "learning_rate": 4.578587699316629e-06, "loss": 0.731, "step": 201 }, { "epoch": 0.013817634585128942, "grad_norm": 4.980550608242736, "learning_rate": 4.601366742596811e-06, "loss": 0.5036, "step": 202 }, { "epoch": 0.013886038716738491, "grad_norm": 4.650973870216513, "learning_rate": 4.624145785876993e-06, "loss": 0.8233, "step": 203 }, { "epoch": 0.01395444284834804, "grad_norm": 3.5786284046787817, "learning_rate": 4.646924829157176e-06, "loss": 0.6272, "step": 204 }, { "epoch": 0.01402284697995759, "grad_norm": 9.652753363531936, "learning_rate": 4.669703872437358e-06, "loss": 0.7241, "step": 205 }, { "epoch": 0.014091251111567138, "grad_norm": 5.218372879564819, "learning_rate": 4.69248291571754e-06, "loss": 0.3955, "step": 206 }, { "epoch": 0.014159655243176687, "grad_norm": 3.6903642263509435, "learning_rate": 4.7152619589977225e-06, "loss": 0.5157, "step": 207 }, { "epoch": 0.014228059374786236, "grad_norm": 4.300377212657347, "learning_rate": 4.738041002277905e-06, "loss": 0.4585, "step": 208 }, { "epoch": 0.014296463506395787, "grad_norm": 3.561884693079683, "learning_rate": 4.760820045558087e-06, "loss": 0.4034, "step": 209 }, { "epoch": 0.014364867638005336, "grad_norm": 5.072530005861202, "learning_rate": 4.783599088838269e-06, "loss": 0.5482, "step": 210 }, { "epoch": 0.014433271769614885, "grad_norm": 4.554559408123922, "learning_rate": 4.806378132118452e-06, "loss": 0.3215, "step": 211 }, { "epoch": 0.014501675901224434, "grad_norm": 3.9709731956313132, "learning_rate": 4.829157175398634e-06, "loss": 0.2835, "step": 212 }, { "epoch": 0.014570080032833983, "grad_norm": 3.7952870631553224, "learning_rate": 4.851936218678816e-06, "loss": 0.5455, "step": 213 }, { "epoch": 0.014638484164443532, "grad_norm": 5.715299638984723, "learning_rate": 4.874715261958998e-06, "loss": 0.6453, "step": 214 }, { "epoch": 0.014706888296053081, "grad_norm": 3.3834166570951445, "learning_rate": 4.89749430523918e-06, "loss": 0.2783, "step": 215 }, { "epoch": 0.014775292427662632, "grad_norm": 3.7328468931137797, "learning_rate": 4.920273348519363e-06, "loss": 0.4554, "step": 216 }, { "epoch": 0.01484369655927218, "grad_norm": 4.9791531728883545, "learning_rate": 4.943052391799545e-06, "loss": 0.8466, "step": 217 }, { "epoch": 0.01491210069088173, "grad_norm": 2.980251409323108, "learning_rate": 4.965831435079727e-06, "loss": 0.3549, "step": 218 }, { "epoch": 0.014980504822491279, "grad_norm": 4.395206409724005, "learning_rate": 4.9886104783599095e-06, "loss": 0.7749, "step": 219 }, { "epoch": 0.015048908954100828, "grad_norm": 3.6082187303282236, "learning_rate": 5.011389521640092e-06, "loss": 0.4069, "step": 220 }, { "epoch": 0.015117313085710377, "grad_norm": 3.522924973793695, "learning_rate": 5.034168564920274e-06, "loss": 0.4589, "step": 221 }, { "epoch": 0.015185717217319926, "grad_norm": 6.392357132142796, "learning_rate": 5.056947608200457e-06, "loss": 0.6428, "step": 222 }, { "epoch": 0.015254121348929475, "grad_norm": 3.1645410811562975, "learning_rate": 5.079726651480639e-06, "loss": 0.2201, "step": 223 }, { "epoch": 0.015322525480539025, "grad_norm": 4.72759393204318, "learning_rate": 5.102505694760821e-06, "loss": 0.4195, "step": 224 }, { "epoch": 0.015390929612148574, "grad_norm": 3.801655997033193, "learning_rate": 5.125284738041003e-06, "loss": 0.5512, "step": 225 }, { "epoch": 0.015459333743758123, "grad_norm": 3.621387062415292, "learning_rate": 5.148063781321185e-06, "loss": 0.4901, "step": 226 }, { "epoch": 0.015527737875367672, "grad_norm": 3.65722512829578, "learning_rate": 5.170842824601367e-06, "loss": 0.3665, "step": 227 }, { "epoch": 0.015596142006977221, "grad_norm": 3.837425642504662, "learning_rate": 5.19362186788155e-06, "loss": 0.5216, "step": 228 }, { "epoch": 0.015664546138586772, "grad_norm": 4.20028311081782, "learning_rate": 5.216400911161732e-06, "loss": 0.5288, "step": 229 }, { "epoch": 0.01573295027019632, "grad_norm": 4.306887699574032, "learning_rate": 5.2391799544419145e-06, "loss": 0.5792, "step": 230 }, { "epoch": 0.01580135440180587, "grad_norm": 4.66087557028672, "learning_rate": 5.2619589977220964e-06, "loss": 0.7369, "step": 231 }, { "epoch": 0.015869758533415417, "grad_norm": 4.164492100067285, "learning_rate": 5.284738041002278e-06, "loss": 0.5648, "step": 232 }, { "epoch": 0.015938162665024968, "grad_norm": 4.133466975233253, "learning_rate": 5.307517084282461e-06, "loss": 0.4188, "step": 233 }, { "epoch": 0.016006566796634515, "grad_norm": 3.478063897815463, "learning_rate": 5.330296127562643e-06, "loss": 0.3895, "step": 234 }, { "epoch": 0.016074970928244066, "grad_norm": 4.09787986007061, "learning_rate": 5.353075170842825e-06, "loss": 0.3413, "step": 235 }, { "epoch": 0.016143375059853617, "grad_norm": 4.2430086763681185, "learning_rate": 5.375854214123008e-06, "loss": 0.438, "step": 236 }, { "epoch": 0.016211779191463164, "grad_norm": 3.5746900904316177, "learning_rate": 5.3986332574031895e-06, "loss": 0.6505, "step": 237 }, { "epoch": 0.016280183323072715, "grad_norm": 4.926562205761369, "learning_rate": 5.421412300683372e-06, "loss": 0.3022, "step": 238 }, { "epoch": 0.016348587454682262, "grad_norm": 4.282850425800018, "learning_rate": 5.444191343963554e-06, "loss": 0.3774, "step": 239 }, { "epoch": 0.016416991586291813, "grad_norm": 3.2225261315173337, "learning_rate": 5.466970387243736e-06, "loss": 0.3282, "step": 240 }, { "epoch": 0.01648539571790136, "grad_norm": 4.37771862575557, "learning_rate": 5.489749430523919e-06, "loss": 0.5993, "step": 241 }, { "epoch": 0.01655379984951091, "grad_norm": 3.6034791360470493, "learning_rate": 5.512528473804101e-06, "loss": 0.3754, "step": 242 }, { "epoch": 0.01662220398112046, "grad_norm": 3.835251741042257, "learning_rate": 5.5353075170842826e-06, "loss": 0.6983, "step": 243 }, { "epoch": 0.01669060811273001, "grad_norm": 4.331035585587104, "learning_rate": 5.558086560364465e-06, "loss": 0.98, "step": 244 }, { "epoch": 0.01675901224433956, "grad_norm": 4.148519109293207, "learning_rate": 5.580865603644647e-06, "loss": 0.5085, "step": 245 }, { "epoch": 0.016827416375949106, "grad_norm": 4.239177387619685, "learning_rate": 5.60364464692483e-06, "loss": 0.5053, "step": 246 }, { "epoch": 0.016895820507558657, "grad_norm": 3.5281193029528324, "learning_rate": 5.626423690205012e-06, "loss": 0.4398, "step": 247 }, { "epoch": 0.016964224639168204, "grad_norm": 3.960072846248373, "learning_rate": 5.649202733485194e-06, "loss": 0.4558, "step": 248 }, { "epoch": 0.017032628770777755, "grad_norm": 3.0619879703272965, "learning_rate": 5.6719817767653765e-06, "loss": 0.2537, "step": 249 }, { "epoch": 0.017101032902387306, "grad_norm": 4.5594895503379, "learning_rate": 5.694760820045558e-06, "loss": 0.8217, "step": 250 }, { "epoch": 0.017169437033996853, "grad_norm": 4.251591426550482, "learning_rate": 5.71753986332574e-06, "loss": 0.5406, "step": 251 }, { "epoch": 0.017237841165606404, "grad_norm": 4.23090121523519, "learning_rate": 5.740318906605923e-06, "loss": 0.5963, "step": 252 }, { "epoch": 0.01730624529721595, "grad_norm": 3.7564978570138647, "learning_rate": 5.763097949886105e-06, "loss": 0.2759, "step": 253 }, { "epoch": 0.017374649428825502, "grad_norm": 4.284384018925384, "learning_rate": 5.785876993166288e-06, "loss": 0.5485, "step": 254 }, { "epoch": 0.01744305356043505, "grad_norm": 4.556332731084533, "learning_rate": 5.8086560364464695e-06, "loss": 0.7456, "step": 255 }, { "epoch": 0.0175114576920446, "grad_norm": 10.270258352861504, "learning_rate": 5.831435079726651e-06, "loss": 0.7201, "step": 256 }, { "epoch": 0.017579861823654147, "grad_norm": 3.2269714502715416, "learning_rate": 5.854214123006834e-06, "loss": 0.3734, "step": 257 }, { "epoch": 0.017648265955263698, "grad_norm": 3.8008107579213957, "learning_rate": 5.876993166287016e-06, "loss": 0.2433, "step": 258 }, { "epoch": 0.01771667008687325, "grad_norm": 3.978495774292149, "learning_rate": 5.899772209567198e-06, "loss": 0.5346, "step": 259 }, { "epoch": 0.017785074218482796, "grad_norm": 4.062700758155044, "learning_rate": 5.922551252847381e-06, "loss": 0.39, "step": 260 }, { "epoch": 0.017853478350092346, "grad_norm": 3.680067071147968, "learning_rate": 5.945330296127563e-06, "loss": 0.491, "step": 261 }, { "epoch": 0.017921882481701894, "grad_norm": 3.8246688422043995, "learning_rate": 5.968109339407745e-06, "loss": 0.5016, "step": 262 }, { "epoch": 0.017990286613311444, "grad_norm": 4.135148482510889, "learning_rate": 5.990888382687927e-06, "loss": 0.5702, "step": 263 }, { "epoch": 0.01805869074492099, "grad_norm": 3.593063910962475, "learning_rate": 6.013667425968109e-06, "loss": 0.2668, "step": 264 }, { "epoch": 0.018127094876530542, "grad_norm": 4.015810980738893, "learning_rate": 6.036446469248292e-06, "loss": 0.3187, "step": 265 }, { "epoch": 0.018195499008140093, "grad_norm": 5.011331544756438, "learning_rate": 6.059225512528474e-06, "loss": 0.8526, "step": 266 }, { "epoch": 0.01826390313974964, "grad_norm": 3.0209954623857835, "learning_rate": 6.082004555808656e-06, "loss": 0.2629, "step": 267 }, { "epoch": 0.01833230727135919, "grad_norm": 3.4900134870055433, "learning_rate": 6.104783599088838e-06, "loss": 0.5662, "step": 268 }, { "epoch": 0.01840071140296874, "grad_norm": 3.814086544862785, "learning_rate": 6.127562642369021e-06, "loss": 0.4613, "step": 269 }, { "epoch": 0.01846911553457829, "grad_norm": 3.8774699214995976, "learning_rate": 6.150341685649204e-06, "loss": 0.6258, "step": 270 }, { "epoch": 0.018537519666187836, "grad_norm": 3.3098242226974413, "learning_rate": 6.173120728929386e-06, "loss": 0.368, "step": 271 }, { "epoch": 0.018605923797797387, "grad_norm": 2.93969036171499, "learning_rate": 6.1958997722095685e-06, "loss": 0.4095, "step": 272 }, { "epoch": 0.018674327929406938, "grad_norm": 4.297064274173813, "learning_rate": 6.21867881548975e-06, "loss": 0.4979, "step": 273 }, { "epoch": 0.018742732061016485, "grad_norm": 2.726229781259932, "learning_rate": 6.241457858769932e-06, "loss": 0.2681, "step": 274 }, { "epoch": 0.018811136192626036, "grad_norm": 4.059936257341167, "learning_rate": 6.264236902050115e-06, "loss": 0.4854, "step": 275 }, { "epoch": 0.018879540324235583, "grad_norm": 2.8492062278913792, "learning_rate": 6.287015945330297e-06, "loss": 0.4001, "step": 276 }, { "epoch": 0.018947944455845134, "grad_norm": 3.7236105785411384, "learning_rate": 6.309794988610479e-06, "loss": 0.5337, "step": 277 }, { "epoch": 0.01901634858745468, "grad_norm": 2.6859520290973746, "learning_rate": 6.3325740318906616e-06, "loss": 0.2388, "step": 278 }, { "epoch": 0.019084752719064232, "grad_norm": 3.527352182033933, "learning_rate": 6.3553530751708435e-06, "loss": 0.3062, "step": 279 }, { "epoch": 0.01915315685067378, "grad_norm": 4.1922463662091225, "learning_rate": 6.378132118451026e-06, "loss": 0.4103, "step": 280 }, { "epoch": 0.01922156098228333, "grad_norm": 4.923881152184107, "learning_rate": 6.400911161731208e-06, "loss": 0.3328, "step": 281 }, { "epoch": 0.01928996511389288, "grad_norm": 4.0054823885425614, "learning_rate": 6.42369020501139e-06, "loss": 0.7239, "step": 282 }, { "epoch": 0.019358369245502428, "grad_norm": 3.72005271533306, "learning_rate": 6.446469248291573e-06, "loss": 0.283, "step": 283 }, { "epoch": 0.01942677337711198, "grad_norm": 4.229505927786465, "learning_rate": 6.469248291571755e-06, "loss": 0.5943, "step": 284 }, { "epoch": 0.019495177508721526, "grad_norm": 4.2738946464430425, "learning_rate": 6.4920273348519365e-06, "loss": 0.3508, "step": 285 }, { "epoch": 0.019563581640331076, "grad_norm": 3.770529493275998, "learning_rate": 6.514806378132119e-06, "loss": 0.3137, "step": 286 }, { "epoch": 0.019631985771940624, "grad_norm": 3.926920529067754, "learning_rate": 6.537585421412301e-06, "loss": 0.4062, "step": 287 }, { "epoch": 0.019700389903550174, "grad_norm": 3.9972456822065126, "learning_rate": 6.560364464692484e-06, "loss": 0.616, "step": 288 }, { "epoch": 0.019768794035159725, "grad_norm": 3.3365769856570715, "learning_rate": 6.583143507972666e-06, "loss": 0.5358, "step": 289 }, { "epoch": 0.019837198166769272, "grad_norm": 3.2491968336235266, "learning_rate": 6.605922551252848e-06, "loss": 0.4092, "step": 290 }, { "epoch": 0.019905602298378823, "grad_norm": 3.8847805845683165, "learning_rate": 6.6287015945330304e-06, "loss": 0.5143, "step": 291 }, { "epoch": 0.01997400642998837, "grad_norm": 3.3325976553342707, "learning_rate": 6.651480637813212e-06, "loss": 0.51, "step": 292 }, { "epoch": 0.02004241056159792, "grad_norm": 4.416943863801096, "learning_rate": 6.674259681093394e-06, "loss": 0.5608, "step": 293 }, { "epoch": 0.02011081469320747, "grad_norm": 3.6480731169511116, "learning_rate": 6.697038724373577e-06, "loss": 0.7121, "step": 294 }, { "epoch": 0.02017921882481702, "grad_norm": 3.8245682444643467, "learning_rate": 6.719817767653759e-06, "loss": 0.3721, "step": 295 }, { "epoch": 0.02024762295642657, "grad_norm": 3.6028651407634533, "learning_rate": 6.742596810933942e-06, "loss": 0.3006, "step": 296 }, { "epoch": 0.020316027088036117, "grad_norm": 3.8916437737869125, "learning_rate": 6.7653758542141235e-06, "loss": 0.534, "step": 297 }, { "epoch": 0.020384431219645668, "grad_norm": 2.9868643449544185, "learning_rate": 6.788154897494305e-06, "loss": 0.1772, "step": 298 }, { "epoch": 0.020452835351255215, "grad_norm": 3.9659200776492756, "learning_rate": 6.810933940774488e-06, "loss": 0.6089, "step": 299 }, { "epoch": 0.020521239482864766, "grad_norm": 3.691094004832039, "learning_rate": 6.83371298405467e-06, "loss": 0.5175, "step": 300 }, { "epoch": 0.020589643614474313, "grad_norm": 4.339155662781035, "learning_rate": 6.856492027334852e-06, "loss": 0.5534, "step": 301 }, { "epoch": 0.020658047746083864, "grad_norm": 2.848978109947804, "learning_rate": 6.879271070615035e-06, "loss": 0.3531, "step": 302 }, { "epoch": 0.020726451877693414, "grad_norm": 3.009061083558082, "learning_rate": 6.9020501138952166e-06, "loss": 0.4712, "step": 303 }, { "epoch": 0.02079485600930296, "grad_norm": 4.499013660205827, "learning_rate": 6.924829157175399e-06, "loss": 0.6252, "step": 304 }, { "epoch": 0.020863260140912512, "grad_norm": 3.6016508720462195, "learning_rate": 6.947608200455581e-06, "loss": 0.1842, "step": 305 }, { "epoch": 0.02093166427252206, "grad_norm": 2.7585034355908298, "learning_rate": 6.970387243735763e-06, "loss": 0.3378, "step": 306 }, { "epoch": 0.02100006840413161, "grad_norm": 3.1021671042180614, "learning_rate": 6.993166287015946e-06, "loss": 0.4481, "step": 307 }, { "epoch": 0.021068472535741158, "grad_norm": 3.1072465354416106, "learning_rate": 7.015945330296128e-06, "loss": 0.3292, "step": 308 }, { "epoch": 0.02113687666735071, "grad_norm": 3.831242580313289, "learning_rate": 7.03872437357631e-06, "loss": 0.5045, "step": 309 }, { "epoch": 0.021205280798960256, "grad_norm": 5.871208743087901, "learning_rate": 7.061503416856492e-06, "loss": 0.5975, "step": 310 }, { "epoch": 0.021273684930569806, "grad_norm": 3.8521955020239957, "learning_rate": 7.084282460136674e-06, "loss": 0.5092, "step": 311 }, { "epoch": 0.021342089062179357, "grad_norm": 4.571616092842075, "learning_rate": 7.107061503416857e-06, "loss": 0.3806, "step": 312 }, { "epoch": 0.021410493193788904, "grad_norm": 4.393180904187398, "learning_rate": 7.129840546697039e-06, "loss": 0.7288, "step": 313 }, { "epoch": 0.021478897325398455, "grad_norm": 3.8126424153103007, "learning_rate": 7.152619589977221e-06, "loss": 0.7039, "step": 314 }, { "epoch": 0.021547301457008002, "grad_norm": 3.4203134570723823, "learning_rate": 7.1753986332574035e-06, "loss": 0.6817, "step": 315 }, { "epoch": 0.021615705588617553, "grad_norm": 3.023977660195016, "learning_rate": 7.1981776765375854e-06, "loss": 0.2983, "step": 316 }, { "epoch": 0.0216841097202271, "grad_norm": 2.8267522662784503, "learning_rate": 7.220956719817767e-06, "loss": 0.2302, "step": 317 }, { "epoch": 0.02175251385183665, "grad_norm": 3.3154795939266677, "learning_rate": 7.243735763097951e-06, "loss": 0.5082, "step": 318 }, { "epoch": 0.0218209179834462, "grad_norm": 3.8429064639186055, "learning_rate": 7.266514806378133e-06, "loss": 0.3542, "step": 319 }, { "epoch": 0.02188932211505575, "grad_norm": 4.278921245766677, "learning_rate": 7.2892938496583155e-06, "loss": 0.771, "step": 320 }, { "epoch": 0.0219577262466653, "grad_norm": 3.106450425940979, "learning_rate": 7.3120728929384974e-06, "loss": 0.3979, "step": 321 }, { "epoch": 0.022026130378274847, "grad_norm": 3.33226752924825, "learning_rate": 7.33485193621868e-06, "loss": 0.228, "step": 322 }, { "epoch": 0.022094534509884398, "grad_norm": 5.085273980661899, "learning_rate": 7.357630979498862e-06, "loss": 0.3076, "step": 323 }, { "epoch": 0.022162938641493945, "grad_norm": 3.4393584602841405, "learning_rate": 7.380410022779044e-06, "loss": 0.5177, "step": 324 }, { "epoch": 0.022231342773103496, "grad_norm": 2.928862773524759, "learning_rate": 7.403189066059227e-06, "loss": 0.2724, "step": 325 }, { "epoch": 0.022299746904713046, "grad_norm": 5.24084960419797, "learning_rate": 7.425968109339409e-06, "loss": 0.2973, "step": 326 }, { "epoch": 0.022368151036322594, "grad_norm": 3.815850356974092, "learning_rate": 7.4487471526195905e-06, "loss": 0.3185, "step": 327 }, { "epoch": 0.022436555167932144, "grad_norm": 3.7971008924311764, "learning_rate": 7.471526195899773e-06, "loss": 0.2887, "step": 328 }, { "epoch": 0.02250495929954169, "grad_norm": 3.2605466461272963, "learning_rate": 7.494305239179955e-06, "loss": 0.3849, "step": 329 }, { "epoch": 0.022573363431151242, "grad_norm": 3.5691999166848714, "learning_rate": 7.517084282460138e-06, "loss": 0.418, "step": 330 }, { "epoch": 0.02264176756276079, "grad_norm": 11.506955893774379, "learning_rate": 7.53986332574032e-06, "loss": 0.5523, "step": 331 }, { "epoch": 0.02271017169437034, "grad_norm": 3.9977086030783493, "learning_rate": 7.562642369020502e-06, "loss": 0.5796, "step": 332 }, { "epoch": 0.022778575825979887, "grad_norm": 4.5493431995368905, "learning_rate": 7.585421412300684e-06, "loss": 0.6003, "step": 333 }, { "epoch": 0.022846979957589438, "grad_norm": 4.003264376665545, "learning_rate": 7.608200455580866e-06, "loss": 0.8686, "step": 334 }, { "epoch": 0.02291538408919899, "grad_norm": 3.1989387557361133, "learning_rate": 7.630979498861048e-06, "loss": 0.2725, "step": 335 }, { "epoch": 0.022983788220808536, "grad_norm": 2.858402253497997, "learning_rate": 7.65375854214123e-06, "loss": 0.4511, "step": 336 }, { "epoch": 0.023052192352418087, "grad_norm": 6.383144412054614, "learning_rate": 7.676537585421414e-06, "loss": 0.367, "step": 337 }, { "epoch": 0.023120596484027634, "grad_norm": 4.3710183553866635, "learning_rate": 7.699316628701596e-06, "loss": 0.6829, "step": 338 }, { "epoch": 0.023189000615637185, "grad_norm": 3.345858524096247, "learning_rate": 7.722095671981777e-06, "loss": 0.3344, "step": 339 }, { "epoch": 0.023257404747246732, "grad_norm": 4.01325195046353, "learning_rate": 7.74487471526196e-06, "loss": 0.3982, "step": 340 }, { "epoch": 0.023325808878856283, "grad_norm": 3.588061740306305, "learning_rate": 7.767653758542141e-06, "loss": 0.5188, "step": 341 }, { "epoch": 0.023394213010465834, "grad_norm": 3.7438591028915096, "learning_rate": 7.790432801822325e-06, "loss": 0.5339, "step": 342 }, { "epoch": 0.02346261714207538, "grad_norm": 5.065830559769252, "learning_rate": 7.813211845102507e-06, "loss": 0.7875, "step": 343 }, { "epoch": 0.02353102127368493, "grad_norm": 2.7845650589850495, "learning_rate": 7.835990888382689e-06, "loss": 0.401, "step": 344 }, { "epoch": 0.02359942540529448, "grad_norm": 2.6792730060898475, "learning_rate": 7.85876993166287e-06, "loss": 0.2864, "step": 345 }, { "epoch": 0.02366782953690403, "grad_norm": 3.750514181966684, "learning_rate": 7.881548974943052e-06, "loss": 0.5005, "step": 346 }, { "epoch": 0.023736233668513577, "grad_norm": 3.594812302524199, "learning_rate": 7.904328018223234e-06, "loss": 0.4001, "step": 347 }, { "epoch": 0.023804637800123127, "grad_norm": 3.574369958899689, "learning_rate": 7.927107061503418e-06, "loss": 0.3995, "step": 348 }, { "epoch": 0.023873041931732678, "grad_norm": 3.149186208599906, "learning_rate": 7.9498861047836e-06, "loss": 0.4498, "step": 349 }, { "epoch": 0.023941446063342225, "grad_norm": 3.083526838304888, "learning_rate": 7.972665148063782e-06, "loss": 0.4633, "step": 350 }, { "epoch": 0.024009850194951776, "grad_norm": 4.332822804760295, "learning_rate": 7.995444191343964e-06, "loss": 0.8546, "step": 351 }, { "epoch": 0.024078254326561323, "grad_norm": 3.619934612534559, "learning_rate": 8.018223234624145e-06, "loss": 0.584, "step": 352 }, { "epoch": 0.024146658458170874, "grad_norm": 3.7238167137836293, "learning_rate": 8.041002277904329e-06, "loss": 0.3757, "step": 353 }, { "epoch": 0.02421506258978042, "grad_norm": 4.112077904696136, "learning_rate": 8.063781321184511e-06, "loss": 0.761, "step": 354 }, { "epoch": 0.024283466721389972, "grad_norm": 4.4836903358448055, "learning_rate": 8.086560364464693e-06, "loss": 0.4796, "step": 355 }, { "epoch": 0.024351870852999523, "grad_norm": 3.7059288355350253, "learning_rate": 8.109339407744875e-06, "loss": 0.5471, "step": 356 }, { "epoch": 0.02442027498460907, "grad_norm": 3.0309375719911515, "learning_rate": 8.132118451025057e-06, "loss": 0.5134, "step": 357 }, { "epoch": 0.02448867911621862, "grad_norm": 4.902699783588555, "learning_rate": 8.15489749430524e-06, "loss": 0.5135, "step": 358 }, { "epoch": 0.024557083247828168, "grad_norm": 3.146452881686815, "learning_rate": 8.177676537585422e-06, "loss": 0.2271, "step": 359 }, { "epoch": 0.02462548737943772, "grad_norm": 3.4669189517596486, "learning_rate": 8.200455580865604e-06, "loss": 0.5314, "step": 360 }, { "epoch": 0.024693891511047266, "grad_norm": 3.4618266063667864, "learning_rate": 8.223234624145786e-06, "loss": 0.5446, "step": 361 }, { "epoch": 0.024762295642656817, "grad_norm": 2.9157086956281377, "learning_rate": 8.246013667425968e-06, "loss": 0.5212, "step": 362 }, { "epoch": 0.024830699774266364, "grad_norm": 4.102495941029923, "learning_rate": 8.26879271070615e-06, "loss": 0.4213, "step": 363 }, { "epoch": 0.024899103905875915, "grad_norm": 4.3980161556530035, "learning_rate": 8.291571753986333e-06, "loss": 0.4604, "step": 364 }, { "epoch": 0.024967508037485465, "grad_norm": 3.826347607972443, "learning_rate": 8.314350797266515e-06, "loss": 0.3827, "step": 365 }, { "epoch": 0.025035912169095013, "grad_norm": 3.6339663242161184, "learning_rate": 8.337129840546699e-06, "loss": 0.4315, "step": 366 }, { "epoch": 0.025104316300704563, "grad_norm": 3.8880355827867694, "learning_rate": 8.35990888382688e-06, "loss": 0.6917, "step": 367 }, { "epoch": 0.02517272043231411, "grad_norm": 3.2629966014419987, "learning_rate": 8.382687927107063e-06, "loss": 0.2207, "step": 368 }, { "epoch": 0.02524112456392366, "grad_norm": 3.0120840284042694, "learning_rate": 8.405466970387244e-06, "loss": 0.3117, "step": 369 }, { "epoch": 0.02530952869553321, "grad_norm": 3.270519148361868, "learning_rate": 8.428246013667426e-06, "loss": 0.3724, "step": 370 }, { "epoch": 0.02537793282714276, "grad_norm": 3.077599153769058, "learning_rate": 8.45102505694761e-06, "loss": 0.2692, "step": 371 }, { "epoch": 0.02544633695875231, "grad_norm": 3.760271691286583, "learning_rate": 8.473804100227792e-06, "loss": 0.38, "step": 372 }, { "epoch": 0.025514741090361857, "grad_norm": 3.3310427703241645, "learning_rate": 8.496583143507974e-06, "loss": 0.6353, "step": 373 }, { "epoch": 0.025583145221971408, "grad_norm": 2.7123103115012217, "learning_rate": 8.519362186788156e-06, "loss": 0.173, "step": 374 }, { "epoch": 0.025651549353580955, "grad_norm": 2.8690538282879334, "learning_rate": 8.542141230068338e-06, "loss": 0.2527, "step": 375 }, { "epoch": 0.025719953485190506, "grad_norm": 2.8193627710654345, "learning_rate": 8.564920273348521e-06, "loss": 0.2424, "step": 376 }, { "epoch": 0.025788357616800053, "grad_norm": 4.96406103607819, "learning_rate": 8.587699316628703e-06, "loss": 0.2991, "step": 377 }, { "epoch": 0.025856761748409604, "grad_norm": 6.931880565197731, "learning_rate": 8.610478359908885e-06, "loss": 0.5233, "step": 378 }, { "epoch": 0.025925165880019155, "grad_norm": 3.423776420555386, "learning_rate": 8.633257403189067e-06, "loss": 0.4978, "step": 379 }, { "epoch": 0.025993570011628702, "grad_norm": 3.5539393381728153, "learning_rate": 8.656036446469249e-06, "loss": 0.5667, "step": 380 }, { "epoch": 0.026061974143238253, "grad_norm": 8.00034344576688, "learning_rate": 8.67881548974943e-06, "loss": 0.4214, "step": 381 }, { "epoch": 0.0261303782748478, "grad_norm": 3.277894275062366, "learning_rate": 8.701594533029614e-06, "loss": 0.3367, "step": 382 }, { "epoch": 0.02619878240645735, "grad_norm": 2.321946990898113, "learning_rate": 8.724373576309796e-06, "loss": 0.2716, "step": 383 }, { "epoch": 0.026267186538066898, "grad_norm": 3.0666897102108694, "learning_rate": 8.747152619589978e-06, "loss": 0.2733, "step": 384 }, { "epoch": 0.02633559066967645, "grad_norm": 2.581751056536653, "learning_rate": 8.76993166287016e-06, "loss": 0.2582, "step": 385 }, { "epoch": 0.026403994801285996, "grad_norm": 3.69392026034428, "learning_rate": 8.792710706150342e-06, "loss": 0.4831, "step": 386 }, { "epoch": 0.026472398932895547, "grad_norm": 4.124971158475906, "learning_rate": 8.815489749430525e-06, "loss": 0.4475, "step": 387 }, { "epoch": 0.026540803064505097, "grad_norm": 4.1570477401271555, "learning_rate": 8.838268792710707e-06, "loss": 0.4264, "step": 388 }, { "epoch": 0.026609207196114645, "grad_norm": 3.761411446351577, "learning_rate": 8.861047835990889e-06, "loss": 0.4156, "step": 389 }, { "epoch": 0.026677611327724195, "grad_norm": 3.6986484752669266, "learning_rate": 8.883826879271071e-06, "loss": 0.5276, "step": 390 }, { "epoch": 0.026746015459333743, "grad_norm": 2.812726165367046, "learning_rate": 8.906605922551253e-06, "loss": 0.2916, "step": 391 }, { "epoch": 0.026814419590943293, "grad_norm": 3.625073004678058, "learning_rate": 8.929384965831437e-06, "loss": 0.3811, "step": 392 }, { "epoch": 0.02688282372255284, "grad_norm": 3.1087214391674864, "learning_rate": 8.952164009111618e-06, "loss": 0.5179, "step": 393 }, { "epoch": 0.02695122785416239, "grad_norm": 4.16027113348425, "learning_rate": 8.9749430523918e-06, "loss": 0.5896, "step": 394 }, { "epoch": 0.027019631985771942, "grad_norm": 3.483591924009424, "learning_rate": 8.997722095671982e-06, "loss": 0.5815, "step": 395 }, { "epoch": 0.02708803611738149, "grad_norm": 3.0386850863089965, "learning_rate": 9.020501138952164e-06, "loss": 0.6145, "step": 396 }, { "epoch": 0.02715644024899104, "grad_norm": 3.520198441243741, "learning_rate": 9.043280182232346e-06, "loss": 0.3, "step": 397 }, { "epoch": 0.027224844380600587, "grad_norm": 3.1042388687868154, "learning_rate": 9.06605922551253e-06, "loss": 0.4012, "step": 398 }, { "epoch": 0.027293248512210138, "grad_norm": 3.12746082689984, "learning_rate": 9.088838268792711e-06, "loss": 0.5832, "step": 399 }, { "epoch": 0.027361652643819685, "grad_norm": 3.1097115425588857, "learning_rate": 9.111617312072893e-06, "loss": 0.2733, "step": 400 }, { "epoch": 0.027430056775429236, "grad_norm": 2.888210422265504, "learning_rate": 9.134396355353075e-06, "loss": 0.3149, "step": 401 }, { "epoch": 0.027498460907038787, "grad_norm": 2.7500043782286037, "learning_rate": 9.157175398633257e-06, "loss": 0.3753, "step": 402 }, { "epoch": 0.027566865038648334, "grad_norm": 3.768223574233722, "learning_rate": 9.17995444191344e-06, "loss": 0.9369, "step": 403 }, { "epoch": 0.027635269170257885, "grad_norm": 3.425670678541133, "learning_rate": 9.202733485193623e-06, "loss": 0.635, "step": 404 }, { "epoch": 0.027703673301867432, "grad_norm": 3.4196473452073683, "learning_rate": 9.225512528473805e-06, "loss": 0.4468, "step": 405 }, { "epoch": 0.027772077433476983, "grad_norm": 3.2052857624049675, "learning_rate": 9.248291571753986e-06, "loss": 0.7293, "step": 406 }, { "epoch": 0.02784048156508653, "grad_norm": 3.8373972180049907, "learning_rate": 9.271070615034168e-06, "loss": 0.6246, "step": 407 }, { "epoch": 0.02790888569669608, "grad_norm": 3.6417563649238, "learning_rate": 9.293849658314352e-06, "loss": 0.4313, "step": 408 }, { "epoch": 0.02797728982830563, "grad_norm": 3.0761725822326818, "learning_rate": 9.316628701594534e-06, "loss": 0.6512, "step": 409 }, { "epoch": 0.02804569395991518, "grad_norm": 3.7079495497499773, "learning_rate": 9.339407744874716e-06, "loss": 0.5228, "step": 410 }, { "epoch": 0.02811409809152473, "grad_norm": 3.7035983302217588, "learning_rate": 9.362186788154898e-06, "loss": 0.5712, "step": 411 }, { "epoch": 0.028182502223134277, "grad_norm": 3.4143242288299445, "learning_rate": 9.38496583143508e-06, "loss": 0.5704, "step": 412 }, { "epoch": 0.028250906354743827, "grad_norm": 2.739781484339611, "learning_rate": 9.407744874715261e-06, "loss": 0.3394, "step": 413 }, { "epoch": 0.028319310486353375, "grad_norm": 3.6047337899875713, "learning_rate": 9.430523917995445e-06, "loss": 0.6448, "step": 414 }, { "epoch": 0.028387714617962925, "grad_norm": 3.0949549784798385, "learning_rate": 9.453302961275629e-06, "loss": 0.5478, "step": 415 }, { "epoch": 0.028456118749572473, "grad_norm": 3.7213636798172005, "learning_rate": 9.47608200455581e-06, "loss": 0.2971, "step": 416 }, { "epoch": 0.028524522881182023, "grad_norm": 3.2720712801437117, "learning_rate": 9.498861047835992e-06, "loss": 0.6113, "step": 417 }, { "epoch": 0.028592927012791574, "grad_norm": 3.6928802695911265, "learning_rate": 9.521640091116174e-06, "loss": 0.4674, "step": 418 }, { "epoch": 0.02866133114440112, "grad_norm": 2.1746544856741985, "learning_rate": 9.544419134396356e-06, "loss": 0.2738, "step": 419 }, { "epoch": 0.028729735276010672, "grad_norm": 2.7947841906272792, "learning_rate": 9.567198177676538e-06, "loss": 0.6011, "step": 420 }, { "epoch": 0.02879813940762022, "grad_norm": 2.879511791504836, "learning_rate": 9.589977220956722e-06, "loss": 0.3807, "step": 421 }, { "epoch": 0.02886654353922977, "grad_norm": 3.352271798578965, "learning_rate": 9.612756264236904e-06, "loss": 0.4886, "step": 422 }, { "epoch": 0.028934947670839317, "grad_norm": 3.0034583939740513, "learning_rate": 9.635535307517085e-06, "loss": 0.5573, "step": 423 }, { "epoch": 0.029003351802448868, "grad_norm": 3.1738056170325692, "learning_rate": 9.658314350797267e-06, "loss": 0.6096, "step": 424 }, { "epoch": 0.02907175593405842, "grad_norm": 2.4493024711353923, "learning_rate": 9.68109339407745e-06, "loss": 0.3639, "step": 425 }, { "epoch": 0.029140160065667966, "grad_norm": 2.8357672390651323, "learning_rate": 9.703872437357633e-06, "loss": 0.3013, "step": 426 }, { "epoch": 0.029208564197277517, "grad_norm": 2.86195582252171, "learning_rate": 9.726651480637815e-06, "loss": 0.4527, "step": 427 }, { "epoch": 0.029276968328887064, "grad_norm": 2.889336809305718, "learning_rate": 9.749430523917997e-06, "loss": 0.3614, "step": 428 }, { "epoch": 0.029345372460496615, "grad_norm": 1.765439040594671, "learning_rate": 9.772209567198178e-06, "loss": 0.1831, "step": 429 }, { "epoch": 0.029413776592106162, "grad_norm": 2.9041918265476543, "learning_rate": 9.79498861047836e-06, "loss": 0.4614, "step": 430 }, { "epoch": 0.029482180723715713, "grad_norm": 2.717476148996379, "learning_rate": 9.817767653758544e-06, "loss": 0.3071, "step": 431 }, { "epoch": 0.029550584855325263, "grad_norm": 1.496299117014328, "learning_rate": 9.840546697038726e-06, "loss": 0.1715, "step": 432 }, { "epoch": 0.02961898898693481, "grad_norm": 4.705775429759128, "learning_rate": 9.863325740318908e-06, "loss": 0.7501, "step": 433 }, { "epoch": 0.02968739311854436, "grad_norm": 2.3308916869943745, "learning_rate": 9.88610478359909e-06, "loss": 0.3037, "step": 434 }, { "epoch": 0.02975579725015391, "grad_norm": 2.2247337762011963, "learning_rate": 9.908883826879272e-06, "loss": 0.3008, "step": 435 }, { "epoch": 0.02982420138176346, "grad_norm": 3.89882902575919, "learning_rate": 9.931662870159453e-06, "loss": 0.454, "step": 436 }, { "epoch": 0.029892605513373006, "grad_norm": 3.0862241201447413, "learning_rate": 9.954441913439637e-06, "loss": 0.4607, "step": 437 }, { "epoch": 0.029961009644982557, "grad_norm": 4.786859998429217, "learning_rate": 9.977220956719819e-06, "loss": 0.6094, "step": 438 }, { "epoch": 0.030029413776592104, "grad_norm": 4.676960145592508, "learning_rate": 1e-05, "loss": 0.3554, "step": 439 }, { "epoch": 0.030097817908201655, "grad_norm": 1.7808507170637213, "learning_rate": 9.999999877287929e-06, "loss": 0.1795, "step": 440 }, { "epoch": 0.030166222039811206, "grad_norm": 3.032278437024821, "learning_rate": 9.999999509151716e-06, "loss": 0.5839, "step": 441 }, { "epoch": 0.030234626171420753, "grad_norm": 3.022281287651799, "learning_rate": 9.999998895591385e-06, "loss": 0.4707, "step": 442 }, { "epoch": 0.030303030303030304, "grad_norm": 3.5026035842989565, "learning_rate": 9.999998036606964e-06, "loss": 0.5391, "step": 443 }, { "epoch": 0.03037143443463985, "grad_norm": 3.2818082062159686, "learning_rate": 9.999996932198493e-06, "loss": 0.4377, "step": 444 }, { "epoch": 0.030439838566249402, "grad_norm": 2.9644564449760695, "learning_rate": 9.999995582366028e-06, "loss": 0.3288, "step": 445 }, { "epoch": 0.03050824269785895, "grad_norm": 2.8180429937969365, "learning_rate": 9.999993987109635e-06, "loss": 0.2473, "step": 446 }, { "epoch": 0.0305766468294685, "grad_norm": 3.3010059979912114, "learning_rate": 9.999992146429393e-06, "loss": 0.5265, "step": 447 }, { "epoch": 0.03064505096107805, "grad_norm": 2.9658384501556077, "learning_rate": 9.99999006032539e-06, "loss": 0.4664, "step": 448 }, { "epoch": 0.030713455092687598, "grad_norm": 1.9908315269972456, "learning_rate": 9.999987728797733e-06, "loss": 0.2147, "step": 449 }, { "epoch": 0.03078185922429715, "grad_norm": 2.580728603379656, "learning_rate": 9.999985151846532e-06, "loss": 0.2969, "step": 450 }, { "epoch": 0.030850263355906696, "grad_norm": 2.599969656932276, "learning_rate": 9.999982329471915e-06, "loss": 0.3298, "step": 451 }, { "epoch": 0.030918667487516246, "grad_norm": 3.5494568288337636, "learning_rate": 9.99997926167402e-06, "loss": 0.8638, "step": 452 }, { "epoch": 0.030987071619125794, "grad_norm": 2.93573446175713, "learning_rate": 9.999975948452999e-06, "loss": 0.5184, "step": 453 }, { "epoch": 0.031055475750735344, "grad_norm": 2.575045885445724, "learning_rate": 9.999972389809014e-06, "loss": 0.3596, "step": 454 }, { "epoch": 0.031123879882344895, "grad_norm": 3.435352547654266, "learning_rate": 9.99996858574224e-06, "loss": 0.5975, "step": 455 }, { "epoch": 0.031192284013954442, "grad_norm": 2.1019723520921962, "learning_rate": 9.999964536252861e-06, "loss": 0.3655, "step": 456 }, { "epoch": 0.03126068814556399, "grad_norm": 5.970973334719753, "learning_rate": 9.99996024134108e-06, "loss": 0.5215, "step": 457 }, { "epoch": 0.031329092277173544, "grad_norm": 3.311366715093286, "learning_rate": 9.999955701007104e-06, "loss": 0.7659, "step": 458 }, { "epoch": 0.03139749640878309, "grad_norm": 2.657825731875826, "learning_rate": 9.99995091525116e-06, "loss": 0.4734, "step": 459 }, { "epoch": 0.03146590054039264, "grad_norm": 3.594246150058569, "learning_rate": 9.999945884073478e-06, "loss": 0.3776, "step": 460 }, { "epoch": 0.03153430467200219, "grad_norm": 2.318633309998629, "learning_rate": 9.99994060747431e-06, "loss": 0.1815, "step": 461 }, { "epoch": 0.03160270880361174, "grad_norm": 2.9719758527071263, "learning_rate": 9.999935085453911e-06, "loss": 0.3784, "step": 462 }, { "epoch": 0.03167111293522129, "grad_norm": 2.43643607860207, "learning_rate": 9.999929318012555e-06, "loss": 0.2908, "step": 463 }, { "epoch": 0.031739517066830834, "grad_norm": 3.575861268901555, "learning_rate": 9.999923305150523e-06, "loss": 0.476, "step": 464 }, { "epoch": 0.031807921198440385, "grad_norm": 3.5346319747717305, "learning_rate": 9.99991704686811e-06, "loss": 0.5814, "step": 465 }, { "epoch": 0.031876325330049936, "grad_norm": 2.6997709578922766, "learning_rate": 9.999910543165623e-06, "loss": 0.3302, "step": 466 }, { "epoch": 0.031944729461659487, "grad_norm": 3.1771880690190586, "learning_rate": 9.999903794043385e-06, "loss": 0.259, "step": 467 }, { "epoch": 0.03201313359326903, "grad_norm": 3.1329381161284546, "learning_rate": 9.999896799501722e-06, "loss": 0.3114, "step": 468 }, { "epoch": 0.03208153772487858, "grad_norm": 2.709007092568346, "learning_rate": 9.999889559540984e-06, "loss": 0.3012, "step": 469 }, { "epoch": 0.03214994185648813, "grad_norm": 3.071754990306721, "learning_rate": 9.999882074161517e-06, "loss": 0.5553, "step": 470 }, { "epoch": 0.03221834598809768, "grad_norm": 3.60034241769772, "learning_rate": 9.999874343363695e-06, "loss": 0.5269, "step": 471 }, { "epoch": 0.03228675011970723, "grad_norm": 2.8272688976708356, "learning_rate": 9.999866367147897e-06, "loss": 0.2089, "step": 472 }, { "epoch": 0.03235515425131678, "grad_norm": 3.876798181776706, "learning_rate": 9.999858145514513e-06, "loss": 0.7591, "step": 473 }, { "epoch": 0.03242355838292633, "grad_norm": 3.661939302691454, "learning_rate": 9.999849678463949e-06, "loss": 0.542, "step": 474 }, { "epoch": 0.03249196251453588, "grad_norm": 3.443931821949108, "learning_rate": 9.999840965996617e-06, "loss": 0.5828, "step": 475 }, { "epoch": 0.03256036664614543, "grad_norm": 2.663331599947143, "learning_rate": 9.999832008112947e-06, "loss": 0.284, "step": 476 }, { "epoch": 0.03262877077775498, "grad_norm": 3.094973640052321, "learning_rate": 9.999822804813378e-06, "loss": 0.379, "step": 477 }, { "epoch": 0.032697174909364524, "grad_norm": 2.9061791888193444, "learning_rate": 9.999813356098363e-06, "loss": 0.3004, "step": 478 }, { "epoch": 0.032765579040974074, "grad_norm": 2.7008084640919456, "learning_rate": 9.999803661968361e-06, "loss": 0.3416, "step": 479 }, { "epoch": 0.032833983172583625, "grad_norm": 3.1501273454931615, "learning_rate": 9.999793722423855e-06, "loss": 0.4789, "step": 480 }, { "epoch": 0.032902387304193176, "grad_norm": 2.4121181781314722, "learning_rate": 9.999783537465328e-06, "loss": 0.3347, "step": 481 }, { "epoch": 0.03297079143580272, "grad_norm": 4.221073697480436, "learning_rate": 9.999773107093282e-06, "loss": 0.4664, "step": 482 }, { "epoch": 0.03303919556741227, "grad_norm": 3.732556830201586, "learning_rate": 9.999762431308228e-06, "loss": 0.6438, "step": 483 }, { "epoch": 0.03310759969902182, "grad_norm": 3.4097579164097755, "learning_rate": 9.99975151011069e-06, "loss": 0.4665, "step": 484 }, { "epoch": 0.03317600383063137, "grad_norm": 3.472983242984133, "learning_rate": 9.999740343501203e-06, "loss": 0.7539, "step": 485 }, { "epoch": 0.03324440796224092, "grad_norm": 3.7602202063866947, "learning_rate": 9.999728931480318e-06, "loss": 0.3613, "step": 486 }, { "epoch": 0.033312812093850466, "grad_norm": 2.636213531128844, "learning_rate": 9.999717274048594e-06, "loss": 0.3593, "step": 487 }, { "epoch": 0.03338121622546002, "grad_norm": 3.362759823315215, "learning_rate": 9.999705371206601e-06, "loss": 0.5417, "step": 488 }, { "epoch": 0.03344962035706957, "grad_norm": 2.9222648788877486, "learning_rate": 9.999693222954928e-06, "loss": 0.5347, "step": 489 }, { "epoch": 0.03351802448867912, "grad_norm": 2.1015761084275515, "learning_rate": 9.999680829294165e-06, "loss": 0.2959, "step": 490 }, { "epoch": 0.03358642862028866, "grad_norm": 2.8531063551361457, "learning_rate": 9.999668190224926e-06, "loss": 0.3278, "step": 491 }, { "epoch": 0.03365483275189821, "grad_norm": 4.055530897710398, "learning_rate": 9.999655305747829e-06, "loss": 0.6176, "step": 492 }, { "epoch": 0.033723236883507764, "grad_norm": 2.712243304324089, "learning_rate": 9.999642175863504e-06, "loss": 0.298, "step": 493 }, { "epoch": 0.033791641015117314, "grad_norm": 3.3034662104755457, "learning_rate": 9.9996288005726e-06, "loss": 0.7193, "step": 494 }, { "epoch": 0.033860045146726865, "grad_norm": 3.3013404688487125, "learning_rate": 9.999615179875772e-06, "loss": 0.9162, "step": 495 }, { "epoch": 0.03392844927833641, "grad_norm": 2.816846381939879, "learning_rate": 9.999601313773687e-06, "loss": 0.5054, "step": 496 }, { "epoch": 0.03399685340994596, "grad_norm": 2.509384328176374, "learning_rate": 9.999587202267027e-06, "loss": 0.3333, "step": 497 }, { "epoch": 0.03406525754155551, "grad_norm": 3.1580517602245797, "learning_rate": 9.999572845356484e-06, "loss": 0.7081, "step": 498 }, { "epoch": 0.03413366167316506, "grad_norm": 3.174894042387177, "learning_rate": 9.999558243042763e-06, "loss": 0.5182, "step": 499 }, { "epoch": 0.03420206580477461, "grad_norm": 3.037093654972128, "learning_rate": 9.999543395326583e-06, "loss": 0.6484, "step": 500 }, { "epoch": 0.034270469936384156, "grad_norm": 2.110055230980543, "learning_rate": 9.999528302208668e-06, "loss": 0.3962, "step": 501 }, { "epoch": 0.034338874067993706, "grad_norm": 2.5062612388244694, "learning_rate": 9.999512963689763e-06, "loss": 0.5279, "step": 502 }, { "epoch": 0.03440727819960326, "grad_norm": 2.7988892233782203, "learning_rate": 9.999497379770617e-06, "loss": 0.3023, "step": 503 }, { "epoch": 0.03447568233121281, "grad_norm": 2.2315382856704087, "learning_rate": 9.999481550451999e-06, "loss": 0.3795, "step": 504 }, { "epoch": 0.03454408646282235, "grad_norm": 2.7850692649836706, "learning_rate": 9.999465475734685e-06, "loss": 0.5132, "step": 505 }, { "epoch": 0.0346124905944319, "grad_norm": 2.5851427164931096, "learning_rate": 9.999449155619463e-06, "loss": 0.3753, "step": 506 }, { "epoch": 0.03468089472604145, "grad_norm": 3.4393924532320455, "learning_rate": 9.999432590107133e-06, "loss": 0.8243, "step": 507 }, { "epoch": 0.034749298857651004, "grad_norm": 2.992353310236355, "learning_rate": 9.99941577919851e-06, "loss": 0.5794, "step": 508 }, { "epoch": 0.034817702989260554, "grad_norm": 2.302296079900006, "learning_rate": 9.99939872289442e-06, "loss": 0.3954, "step": 509 }, { "epoch": 0.0348861071208701, "grad_norm": 2.0260105857982094, "learning_rate": 9.999381421195697e-06, "loss": 0.2983, "step": 510 }, { "epoch": 0.03495451125247965, "grad_norm": 2.03826300341275, "learning_rate": 9.999363874103194e-06, "loss": 0.277, "step": 511 }, { "epoch": 0.0350229153840892, "grad_norm": 2.3748629179092577, "learning_rate": 9.999346081617769e-06, "loss": 0.3168, "step": 512 }, { "epoch": 0.03509131951569875, "grad_norm": 3.2327303183182665, "learning_rate": 9.999328043740296e-06, "loss": 0.4241, "step": 513 }, { "epoch": 0.035159723647308294, "grad_norm": 2.646355657508899, "learning_rate": 9.999309760471663e-06, "loss": 0.2505, "step": 514 }, { "epoch": 0.035228127778917845, "grad_norm": 3.4596515703476225, "learning_rate": 9.999291231812765e-06, "loss": 0.2641, "step": 515 }, { "epoch": 0.035296531910527396, "grad_norm": 3.3552837831544315, "learning_rate": 9.99927245776451e-06, "loss": 0.5704, "step": 516 }, { "epoch": 0.035364936042136946, "grad_norm": 3.2245231261060674, "learning_rate": 9.999253438327823e-06, "loss": 0.8433, "step": 517 }, { "epoch": 0.0354333401737465, "grad_norm": 2.2686388293265214, "learning_rate": 9.999234173503636e-06, "loss": 0.2878, "step": 518 }, { "epoch": 0.03550174430535604, "grad_norm": 2.986830006172932, "learning_rate": 9.999214663292896e-06, "loss": 0.5959, "step": 519 }, { "epoch": 0.03557014843696559, "grad_norm": 2.763118448549523, "learning_rate": 9.999194907696559e-06, "loss": 0.5187, "step": 520 }, { "epoch": 0.03563855256857514, "grad_norm": 3.5112666432036144, "learning_rate": 9.999174906715593e-06, "loss": 0.3753, "step": 521 }, { "epoch": 0.03570695670018469, "grad_norm": 3.1531185724743, "learning_rate": 9.999154660350982e-06, "loss": 0.4181, "step": 522 }, { "epoch": 0.035775360831794244, "grad_norm": 3.283017853927983, "learning_rate": 9.999134168603721e-06, "loss": 0.8353, "step": 523 }, { "epoch": 0.03584376496340379, "grad_norm": 2.757256568009885, "learning_rate": 9.999113431474815e-06, "loss": 0.488, "step": 524 }, { "epoch": 0.03591216909501334, "grad_norm": 2.414352035442871, "learning_rate": 9.99909244896528e-06, "loss": 0.4905, "step": 525 }, { "epoch": 0.03598057322662289, "grad_norm": 4.13071850114862, "learning_rate": 9.99907122107615e-06, "loss": 0.821, "step": 526 }, { "epoch": 0.03604897735823244, "grad_norm": 2.7526150645771432, "learning_rate": 9.99904974780846e-06, "loss": 0.4349, "step": 527 }, { "epoch": 0.03611738148984198, "grad_norm": 3.259450450400641, "learning_rate": 9.99902802916327e-06, "loss": 0.8564, "step": 528 }, { "epoch": 0.036185785621451534, "grad_norm": 2.619671636199153, "learning_rate": 9.999006065141645e-06, "loss": 0.2213, "step": 529 }, { "epoch": 0.036254189753061085, "grad_norm": 3.0937577401651986, "learning_rate": 9.998983855744661e-06, "loss": 0.6947, "step": 530 }, { "epoch": 0.036322593884670636, "grad_norm": 2.580064754754653, "learning_rate": 9.99896140097341e-06, "loss": 0.4946, "step": 531 }, { "epoch": 0.036390998016280186, "grad_norm": 2.647360287240753, "learning_rate": 9.998938700828995e-06, "loss": 0.5355, "step": 532 }, { "epoch": 0.03645940214788973, "grad_norm": 1.9956000738442015, "learning_rate": 9.998915755312528e-06, "loss": 0.2757, "step": 533 }, { "epoch": 0.03652780627949928, "grad_norm": 1.9908920189843855, "learning_rate": 9.998892564425136e-06, "loss": 0.3738, "step": 534 }, { "epoch": 0.03659621041110883, "grad_norm": 2.2733361133109105, "learning_rate": 9.998869128167957e-06, "loss": 0.4844, "step": 535 }, { "epoch": 0.03666461454271838, "grad_norm": 2.307397484057122, "learning_rate": 9.998845446542142e-06, "loss": 0.48, "step": 536 }, { "epoch": 0.036733018674327926, "grad_norm": 2.151295546881275, "learning_rate": 9.998821519548855e-06, "loss": 0.283, "step": 537 }, { "epoch": 0.03680142280593748, "grad_norm": 2.272447429464378, "learning_rate": 9.998797347189267e-06, "loss": 0.3994, "step": 538 }, { "epoch": 0.03686982693754703, "grad_norm": 2.0659569538489015, "learning_rate": 9.998772929464567e-06, "loss": 0.315, "step": 539 }, { "epoch": 0.03693823106915658, "grad_norm": 2.7324035185823776, "learning_rate": 9.998748266375952e-06, "loss": 0.6205, "step": 540 }, { "epoch": 0.03700663520076613, "grad_norm": 2.4751243597393633, "learning_rate": 9.998723357924635e-06, "loss": 0.2799, "step": 541 }, { "epoch": 0.03707503933237567, "grad_norm": 3.4702842257815103, "learning_rate": 9.998698204111836e-06, "loss": 0.7575, "step": 542 }, { "epoch": 0.03714344346398522, "grad_norm": 2.57121249503979, "learning_rate": 9.998672804938791e-06, "loss": 0.4561, "step": 543 }, { "epoch": 0.037211847595594774, "grad_norm": 3.0349831942952403, "learning_rate": 9.998647160406745e-06, "loss": 0.6722, "step": 544 }, { "epoch": 0.037280251727204325, "grad_norm": 3.374075683542556, "learning_rate": 9.99862127051696e-06, "loss": 0.4214, "step": 545 }, { "epoch": 0.037348655858813876, "grad_norm": 2.2828094626007656, "learning_rate": 9.998595135270705e-06, "loss": 0.3502, "step": 546 }, { "epoch": 0.03741705999042342, "grad_norm": 2.5184155855078583, "learning_rate": 9.998568754669261e-06, "loss": 0.4058, "step": 547 }, { "epoch": 0.03748546412203297, "grad_norm": 2.8893244214574754, "learning_rate": 9.998542128713926e-06, "loss": 0.5691, "step": 548 }, { "epoch": 0.03755386825364252, "grad_norm": 2.8385683603810627, "learning_rate": 9.998515257406005e-06, "loss": 0.3873, "step": 549 }, { "epoch": 0.03762227238525207, "grad_norm": 2.5095880170980367, "learning_rate": 9.998488140746818e-06, "loss": 0.4893, "step": 550 }, { "epoch": 0.037690676516861615, "grad_norm": 2.6982966502792407, "learning_rate": 9.998460778737695e-06, "loss": 0.534, "step": 551 }, { "epoch": 0.037759080648471166, "grad_norm": 2.7100450217459175, "learning_rate": 9.998433171379979e-06, "loss": 0.4553, "step": 552 }, { "epoch": 0.03782748478008072, "grad_norm": 3.2383800809018397, "learning_rate": 9.99840531867503e-06, "loss": 0.6666, "step": 553 }, { "epoch": 0.03789588891169027, "grad_norm": 3.3036823953674874, "learning_rate": 9.998377220624206e-06, "loss": 0.6522, "step": 554 }, { "epoch": 0.03796429304329982, "grad_norm": 2.976066052345043, "learning_rate": 9.998348877228894e-06, "loss": 0.4082, "step": 555 }, { "epoch": 0.03803269717490936, "grad_norm": 2.6571736581085856, "learning_rate": 9.998320288490479e-06, "loss": 0.4197, "step": 556 }, { "epoch": 0.03810110130651891, "grad_norm": 2.961781036881187, "learning_rate": 9.998291454410372e-06, "loss": 0.4245, "step": 557 }, { "epoch": 0.038169505438128463, "grad_norm": 2.620200873484101, "learning_rate": 9.998262374989981e-06, "loss": 0.2994, "step": 558 }, { "epoch": 0.038237909569738014, "grad_norm": 2.3525478288448385, "learning_rate": 9.998233050230737e-06, "loss": 0.4274, "step": 559 }, { "epoch": 0.03830631370134756, "grad_norm": 4.274524222774414, "learning_rate": 9.998203480134079e-06, "loss": 0.4827, "step": 560 }, { "epoch": 0.03837471783295711, "grad_norm": 2.0130062057503215, "learning_rate": 9.998173664701456e-06, "loss": 0.3116, "step": 561 }, { "epoch": 0.03844312196456666, "grad_norm": 3.2904788884168337, "learning_rate": 9.998143603934337e-06, "loss": 0.3079, "step": 562 }, { "epoch": 0.03851152609617621, "grad_norm": 2.863684257675164, "learning_rate": 9.998113297834191e-06, "loss": 0.5328, "step": 563 }, { "epoch": 0.03857993022778576, "grad_norm": 2.414545220811264, "learning_rate": 9.998082746402512e-06, "loss": 0.3098, "step": 564 }, { "epoch": 0.038648334359395305, "grad_norm": 2.6388396711667506, "learning_rate": 9.998051949640795e-06, "loss": 0.2236, "step": 565 }, { "epoch": 0.038716738491004855, "grad_norm": 2.7786709368119067, "learning_rate": 9.998020907550552e-06, "loss": 0.6232, "step": 566 }, { "epoch": 0.038785142622614406, "grad_norm": 3.0323022215613746, "learning_rate": 9.997989620133308e-06, "loss": 0.4689, "step": 567 }, { "epoch": 0.03885354675422396, "grad_norm": 2.9002164948978426, "learning_rate": 9.997958087390598e-06, "loss": 0.3252, "step": 568 }, { "epoch": 0.03892195088583351, "grad_norm": 4.140328738572101, "learning_rate": 9.997926309323972e-06, "loss": 1.0776, "step": 569 }, { "epoch": 0.03899035501744305, "grad_norm": 2.133246171618605, "learning_rate": 9.997894285934985e-06, "loss": 0.3407, "step": 570 }, { "epoch": 0.0390587591490526, "grad_norm": 3.3400596504252804, "learning_rate": 9.997862017225214e-06, "loss": 0.3322, "step": 571 }, { "epoch": 0.03912716328066215, "grad_norm": 2.6280921026794934, "learning_rate": 9.99782950319624e-06, "loss": 0.4054, "step": 572 }, { "epoch": 0.039195567412271703, "grad_norm": 2.8167003314039825, "learning_rate": 9.99779674384966e-06, "loss": 0.5913, "step": 573 }, { "epoch": 0.03926397154388125, "grad_norm": 3.599242035890027, "learning_rate": 9.997763739187083e-06, "loss": 0.6009, "step": 574 }, { "epoch": 0.0393323756754908, "grad_norm": 3.024154755435828, "learning_rate": 9.997730489210126e-06, "loss": 0.5536, "step": 575 }, { "epoch": 0.03940077980710035, "grad_norm": 2.9119806395538177, "learning_rate": 9.997696993920422e-06, "loss": 0.3893, "step": 576 }, { "epoch": 0.0394691839387099, "grad_norm": 3.2738778448613557, "learning_rate": 9.997663253319619e-06, "loss": 0.6714, "step": 577 }, { "epoch": 0.03953758807031945, "grad_norm": 2.942956553880227, "learning_rate": 9.997629267409367e-06, "loss": 0.4654, "step": 578 }, { "epoch": 0.039605992201928994, "grad_norm": 2.7063062723665654, "learning_rate": 9.997595036191338e-06, "loss": 0.2547, "step": 579 }, { "epoch": 0.039674396333538545, "grad_norm": 2.7554789388028587, "learning_rate": 9.997560559667212e-06, "loss": 0.5287, "step": 580 }, { "epoch": 0.039742800465148095, "grad_norm": 3.6030479065587087, "learning_rate": 9.997525837838681e-06, "loss": 0.8492, "step": 581 }, { "epoch": 0.039811204596757646, "grad_norm": 2.543487624806257, "learning_rate": 9.997490870707448e-06, "loss": 0.5553, "step": 582 }, { "epoch": 0.0398796087283672, "grad_norm": 2.673079503590658, "learning_rate": 9.997455658275232e-06, "loss": 0.5063, "step": 583 }, { "epoch": 0.03994801285997674, "grad_norm": 3.3773921153911437, "learning_rate": 9.99742020054376e-06, "loss": 0.4997, "step": 584 }, { "epoch": 0.04001641699158629, "grad_norm": 2.3222469268123067, "learning_rate": 9.997384497514771e-06, "loss": 0.2615, "step": 585 }, { "epoch": 0.04008482112319584, "grad_norm": 2.5452690531779347, "learning_rate": 9.99734854919002e-06, "loss": 0.4745, "step": 586 }, { "epoch": 0.04015322525480539, "grad_norm": 2.4936130536272403, "learning_rate": 9.997312355571268e-06, "loss": 0.2558, "step": 587 }, { "epoch": 0.04022162938641494, "grad_norm": 2.556537121920867, "learning_rate": 9.997275916660295e-06, "loss": 0.5697, "step": 588 }, { "epoch": 0.04029003351802449, "grad_norm": 2.7911751489985512, "learning_rate": 9.997239232458888e-06, "loss": 0.472, "step": 589 }, { "epoch": 0.04035843764963404, "grad_norm": 3.979641132960794, "learning_rate": 9.997202302968849e-06, "loss": 0.6806, "step": 590 }, { "epoch": 0.04042684178124359, "grad_norm": 2.986236031947915, "learning_rate": 9.99716512819199e-06, "loss": 0.5448, "step": 591 }, { "epoch": 0.04049524591285314, "grad_norm": 2.776223355061929, "learning_rate": 9.997127708130134e-06, "loss": 0.6911, "step": 592 }, { "epoch": 0.04056365004446268, "grad_norm": 2.40648412494369, "learning_rate": 9.99709004278512e-06, "loss": 0.4274, "step": 593 }, { "epoch": 0.040632054176072234, "grad_norm": 3.048839033890537, "learning_rate": 9.997052132158797e-06, "loss": 0.3227, "step": 594 }, { "epoch": 0.040700458307681785, "grad_norm": 3.56946444299656, "learning_rate": 9.997013976253022e-06, "loss": 0.538, "step": 595 }, { "epoch": 0.040768862439291335, "grad_norm": 2.94813325623227, "learning_rate": 9.996975575069675e-06, "loss": 0.5113, "step": 596 }, { "epoch": 0.04083726657090088, "grad_norm": 2.264498094944943, "learning_rate": 9.996936928610632e-06, "loss": 0.4166, "step": 597 }, { "epoch": 0.04090567070251043, "grad_norm": 2.73507415552398, "learning_rate": 9.996898036877797e-06, "loss": 0.606, "step": 598 }, { "epoch": 0.04097407483411998, "grad_norm": 3.2205909075599113, "learning_rate": 9.996858899873076e-06, "loss": 0.626, "step": 599 }, { "epoch": 0.04104247896572953, "grad_norm": 2.9534275969631008, "learning_rate": 9.996819517598392e-06, "loss": 0.5126, "step": 600 }, { "epoch": 0.04111088309733908, "grad_norm": 3.2629405856888702, "learning_rate": 9.996779890055675e-06, "loss": 0.5215, "step": 601 }, { "epoch": 0.041179287228948626, "grad_norm": 2.383158911867164, "learning_rate": 9.996740017246873e-06, "loss": 0.4916, "step": 602 }, { "epoch": 0.04124769136055818, "grad_norm": 2.5124486808849764, "learning_rate": 9.99669989917394e-06, "loss": 0.5541, "step": 603 }, { "epoch": 0.04131609549216773, "grad_norm": 2.194798976256575, "learning_rate": 9.99665953583885e-06, "loss": 0.3672, "step": 604 }, { "epoch": 0.04138449962377728, "grad_norm": 2.325852254437846, "learning_rate": 9.99661892724358e-06, "loss": 0.238, "step": 605 }, { "epoch": 0.04145290375538683, "grad_norm": 2.2719384229168664, "learning_rate": 9.996578073390124e-06, "loss": 0.4306, "step": 606 }, { "epoch": 0.04152130788699637, "grad_norm": 2.301042720011005, "learning_rate": 9.996536974280488e-06, "loss": 0.3472, "step": 607 }, { "epoch": 0.04158971201860592, "grad_norm": 3.612586586650408, "learning_rate": 9.99649562991669e-06, "loss": 0.6133, "step": 608 }, { "epoch": 0.041658116150215474, "grad_norm": 2.7651720376498177, "learning_rate": 9.996454040300758e-06, "loss": 0.4095, "step": 609 }, { "epoch": 0.041726520281825025, "grad_norm": 2.631246366974051, "learning_rate": 9.996412205434734e-06, "loss": 0.5595, "step": 610 }, { "epoch": 0.04179492441343457, "grad_norm": 3.418192288309614, "learning_rate": 9.996370125320672e-06, "loss": 0.378, "step": 611 }, { "epoch": 0.04186332854504412, "grad_norm": 2.7696900692101134, "learning_rate": 9.996327799960636e-06, "loss": 0.5094, "step": 612 }, { "epoch": 0.04193173267665367, "grad_norm": 3.0955040419430007, "learning_rate": 9.996285229356705e-06, "loss": 0.8477, "step": 613 }, { "epoch": 0.04200013680826322, "grad_norm": 2.818183914612734, "learning_rate": 9.99624241351097e-06, "loss": 0.5499, "step": 614 }, { "epoch": 0.04206854093987277, "grad_norm": 2.3383336233078587, "learning_rate": 9.996199352425528e-06, "loss": 0.4062, "step": 615 }, { "epoch": 0.042136945071482315, "grad_norm": 3.3528389978512823, "learning_rate": 9.996156046102498e-06, "loss": 0.8964, "step": 616 }, { "epoch": 0.042205349203091866, "grad_norm": 2.8863824279360495, "learning_rate": 9.996112494544002e-06, "loss": 0.564, "step": 617 }, { "epoch": 0.04227375333470142, "grad_norm": 2.8796617300833396, "learning_rate": 9.996068697752179e-06, "loss": 0.7033, "step": 618 }, { "epoch": 0.04234215746631097, "grad_norm": 2.395218543631772, "learning_rate": 9.996024655729177e-06, "loss": 0.4949, "step": 619 }, { "epoch": 0.04241056159792051, "grad_norm": 2.8317573204417394, "learning_rate": 9.995980368477161e-06, "loss": 0.5001, "step": 620 }, { "epoch": 0.04247896572953006, "grad_norm": 2.218526224483877, "learning_rate": 9.995935835998303e-06, "loss": 0.3228, "step": 621 }, { "epoch": 0.04254736986113961, "grad_norm": 2.159186968366268, "learning_rate": 9.995891058294788e-06, "loss": 0.4118, "step": 622 }, { "epoch": 0.04261577399274916, "grad_norm": 2.3670233080987124, "learning_rate": 9.995846035368815e-06, "loss": 0.2063, "step": 623 }, { "epoch": 0.042684178124358714, "grad_norm": 2.8945008506671512, "learning_rate": 9.995800767222596e-06, "loss": 0.4867, "step": 624 }, { "epoch": 0.04275258225596826, "grad_norm": 3.0508162414654123, "learning_rate": 9.995755253858348e-06, "loss": 0.8304, "step": 625 }, { "epoch": 0.04282098638757781, "grad_norm": 2.8033134526809538, "learning_rate": 9.995709495278309e-06, "loss": 0.2748, "step": 626 }, { "epoch": 0.04288939051918736, "grad_norm": 3.097725423687516, "learning_rate": 9.995663491484725e-06, "loss": 0.5737, "step": 627 }, { "epoch": 0.04295779465079691, "grad_norm": 3.2344539006141693, "learning_rate": 9.995617242479852e-06, "loss": 0.4775, "step": 628 }, { "epoch": 0.04302619878240646, "grad_norm": 3.035171420379785, "learning_rate": 9.995570748265962e-06, "loss": 0.8025, "step": 629 }, { "epoch": 0.043094602914016004, "grad_norm": 2.4669068675161054, "learning_rate": 9.995524008845335e-06, "loss": 0.5538, "step": 630 }, { "epoch": 0.043163007045625555, "grad_norm": 2.4630473462911797, "learning_rate": 9.995477024220268e-06, "loss": 0.3589, "step": 631 }, { "epoch": 0.043231411177235106, "grad_norm": 2.3204551036439365, "learning_rate": 9.995429794393063e-06, "loss": 0.3302, "step": 632 }, { "epoch": 0.04329981530884466, "grad_norm": 3.2084931214080354, "learning_rate": 9.995382319366044e-06, "loss": 0.3595, "step": 633 }, { "epoch": 0.0433682194404542, "grad_norm": 2.3007884778360976, "learning_rate": 9.995334599141537e-06, "loss": 0.4638, "step": 634 }, { "epoch": 0.04343662357206375, "grad_norm": 2.180980032525098, "learning_rate": 9.995286633721886e-06, "loss": 0.329, "step": 635 }, { "epoch": 0.0435050277036733, "grad_norm": 3.5162182455734374, "learning_rate": 9.995238423109445e-06, "loss": 0.5359, "step": 636 }, { "epoch": 0.04357343183528285, "grad_norm": 2.928198509759213, "learning_rate": 9.99518996730658e-06, "loss": 0.6009, "step": 637 }, { "epoch": 0.0436418359668924, "grad_norm": 3.0296130627148563, "learning_rate": 9.99514126631567e-06, "loss": 0.7303, "step": 638 }, { "epoch": 0.04371024009850195, "grad_norm": 2.555734207071128, "learning_rate": 9.995092320139106e-06, "loss": 0.3533, "step": 639 }, { "epoch": 0.0437786442301115, "grad_norm": 2.1888398290704445, "learning_rate": 9.995043128779289e-06, "loss": 0.3008, "step": 640 }, { "epoch": 0.04384704836172105, "grad_norm": 3.9124688503743825, "learning_rate": 9.994993692238634e-06, "loss": 0.5021, "step": 641 }, { "epoch": 0.0439154524933306, "grad_norm": 3.5969110270004214, "learning_rate": 9.99494401051957e-06, "loss": 0.8237, "step": 642 }, { "epoch": 0.04398385662494014, "grad_norm": 2.506234144698145, "learning_rate": 9.994894083624534e-06, "loss": 0.4956, "step": 643 }, { "epoch": 0.044052260756549694, "grad_norm": 2.773741503965097, "learning_rate": 9.994843911555974e-06, "loss": 0.3327, "step": 644 }, { "epoch": 0.044120664888159244, "grad_norm": 2.394587343439484, "learning_rate": 9.994793494316355e-06, "loss": 0.3665, "step": 645 }, { "epoch": 0.044189069019768795, "grad_norm": 2.5265861743674196, "learning_rate": 9.994742831908153e-06, "loss": 0.354, "step": 646 }, { "epoch": 0.044257473151378346, "grad_norm": 2.9795803880968146, "learning_rate": 9.994691924333853e-06, "loss": 0.3726, "step": 647 }, { "epoch": 0.04432587728298789, "grad_norm": 77.439673047676, "learning_rate": 9.994640771595954e-06, "loss": 0.7492, "step": 648 }, { "epoch": 0.04439428141459744, "grad_norm": 2.6141084864208857, "learning_rate": 9.994589373696966e-06, "loss": 0.4597, "step": 649 }, { "epoch": 0.04446268554620699, "grad_norm": 2.16582042340796, "learning_rate": 9.994537730639416e-06, "loss": 0.3088, "step": 650 }, { "epoch": 0.04453108967781654, "grad_norm": 2.4609602215641377, "learning_rate": 9.994485842425833e-06, "loss": 0.3357, "step": 651 }, { "epoch": 0.04459949380942609, "grad_norm": 2.1098056141750883, "learning_rate": 9.994433709058767e-06, "loss": 0.4539, "step": 652 }, { "epoch": 0.044667897941035636, "grad_norm": 2.380594640945342, "learning_rate": 9.994381330540777e-06, "loss": 0.1808, "step": 653 }, { "epoch": 0.04473630207264519, "grad_norm": 2.621319230979525, "learning_rate": 9.994328706874434e-06, "loss": 0.4131, "step": 654 }, { "epoch": 0.04480470620425474, "grad_norm": 2.2953718482876018, "learning_rate": 9.994275838062321e-06, "loss": 0.4972, "step": 655 }, { "epoch": 0.04487311033586429, "grad_norm": 2.2364972046481357, "learning_rate": 9.994222724107032e-06, "loss": 0.4803, "step": 656 }, { "epoch": 0.04494151446747383, "grad_norm": 2.671900152344576, "learning_rate": 9.994169365011176e-06, "loss": 0.3094, "step": 657 }, { "epoch": 0.04500991859908338, "grad_norm": 2.761624416914664, "learning_rate": 9.99411576077737e-06, "loss": 0.5746, "step": 658 }, { "epoch": 0.045078322730692934, "grad_norm": 1.9179726579241747, "learning_rate": 9.994061911408245e-06, "loss": 0.3432, "step": 659 }, { "epoch": 0.045146726862302484, "grad_norm": 2.343648647660058, "learning_rate": 9.994007816906449e-06, "loss": 0.3504, "step": 660 }, { "epoch": 0.045215130993912035, "grad_norm": 2.6259365227690763, "learning_rate": 9.99395347727463e-06, "loss": 0.5878, "step": 661 }, { "epoch": 0.04528353512552158, "grad_norm": 1.987978205791407, "learning_rate": 9.99389889251546e-06, "loss": 0.248, "step": 662 }, { "epoch": 0.04535193925713113, "grad_norm": 2.3602937652620137, "learning_rate": 9.993844062631616e-06, "loss": 0.3261, "step": 663 }, { "epoch": 0.04542034338874068, "grad_norm": 2.8301695025048508, "learning_rate": 9.993788987625793e-06, "loss": 0.359, "step": 664 }, { "epoch": 0.04548874752035023, "grad_norm": 2.242038196816417, "learning_rate": 9.99373366750069e-06, "loss": 0.146, "step": 665 }, { "epoch": 0.045557151651959775, "grad_norm": 3.1478773428082616, "learning_rate": 9.993678102259024e-06, "loss": 0.642, "step": 666 }, { "epoch": 0.045625555783569326, "grad_norm": 2.236559193763139, "learning_rate": 9.993622291903522e-06, "loss": 0.2971, "step": 667 }, { "epoch": 0.045693959915178876, "grad_norm": 3.1090123717514953, "learning_rate": 9.993566236436925e-06, "loss": 0.6911, "step": 668 }, { "epoch": 0.04576236404678843, "grad_norm": 2.460662426788662, "learning_rate": 9.993509935861983e-06, "loss": 0.3352, "step": 669 }, { "epoch": 0.04583076817839798, "grad_norm": 3.5008031996362585, "learning_rate": 9.99345339018146e-06, "loss": 0.9674, "step": 670 }, { "epoch": 0.04589917231000752, "grad_norm": 2.7057948846457514, "learning_rate": 9.993396599398133e-06, "loss": 0.3806, "step": 671 }, { "epoch": 0.04596757644161707, "grad_norm": 2.178913628533892, "learning_rate": 9.993339563514785e-06, "loss": 0.4046, "step": 672 }, { "epoch": 0.04603598057322662, "grad_norm": 2.020270465021151, "learning_rate": 9.993282282534222e-06, "loss": 0.327, "step": 673 }, { "epoch": 0.046104384704836174, "grad_norm": 3.6861103882453925, "learning_rate": 9.99322475645925e-06, "loss": 0.7445, "step": 674 }, { "epoch": 0.046172788836445725, "grad_norm": 3.0652691330293536, "learning_rate": 9.993166985292695e-06, "loss": 0.6326, "step": 675 }, { "epoch": 0.04624119296805527, "grad_norm": 2.3381368814924643, "learning_rate": 9.993108969037392e-06, "loss": 0.3128, "step": 676 }, { "epoch": 0.04630959709966482, "grad_norm": 2.937616579299129, "learning_rate": 9.993050707696191e-06, "loss": 0.7199, "step": 677 }, { "epoch": 0.04637800123127437, "grad_norm": 2.7850757977822287, "learning_rate": 9.992992201271949e-06, "loss": 0.406, "step": 678 }, { "epoch": 0.04644640536288392, "grad_norm": 2.4071830946767965, "learning_rate": 9.992933449767538e-06, "loss": 0.6542, "step": 679 }, { "epoch": 0.046514809494493464, "grad_norm": 2.0712216666199685, "learning_rate": 9.992874453185845e-06, "loss": 0.3555, "step": 680 }, { "epoch": 0.046583213626103015, "grad_norm": 2.885336972362876, "learning_rate": 9.992815211529763e-06, "loss": 0.5226, "step": 681 }, { "epoch": 0.046651617757712566, "grad_norm": 2.561879748781842, "learning_rate": 9.9927557248022e-06, "loss": 0.3612, "step": 682 }, { "epoch": 0.046720021889322116, "grad_norm": 2.6925583464995855, "learning_rate": 9.992695993006077e-06, "loss": 0.4052, "step": 683 }, { "epoch": 0.04678842602093167, "grad_norm": 2.20561965091021, "learning_rate": 9.992636016144324e-06, "loss": 0.4369, "step": 684 }, { "epoch": 0.04685683015254121, "grad_norm": 2.4699892543161295, "learning_rate": 9.992575794219885e-06, "loss": 0.4462, "step": 685 }, { "epoch": 0.04692523428415076, "grad_norm": 2.39179667264597, "learning_rate": 9.992515327235718e-06, "loss": 0.4878, "step": 686 }, { "epoch": 0.04699363841576031, "grad_norm": 2.0006735904183026, "learning_rate": 9.99245461519479e-06, "loss": 0.2515, "step": 687 }, { "epoch": 0.04706204254736986, "grad_norm": 2.703452658226581, "learning_rate": 9.992393658100084e-06, "loss": 0.5579, "step": 688 }, { "epoch": 0.047130446678979414, "grad_norm": 2.627639034305805, "learning_rate": 9.992332455954585e-06, "loss": 0.5142, "step": 689 }, { "epoch": 0.04719885081058896, "grad_norm": 2.2462994555939466, "learning_rate": 9.992271008761304e-06, "loss": 0.5269, "step": 690 }, { "epoch": 0.04726725494219851, "grad_norm": 2.089011477587009, "learning_rate": 9.992209316523253e-06, "loss": 0.3665, "step": 691 }, { "epoch": 0.04733565907380806, "grad_norm": 2.247532368900006, "learning_rate": 9.992147379243463e-06, "loss": 0.4374, "step": 692 }, { "epoch": 0.04740406320541761, "grad_norm": 3.111987463755167, "learning_rate": 9.992085196924972e-06, "loss": 0.4248, "step": 693 }, { "epoch": 0.047472467337027154, "grad_norm": 1.872581241077805, "learning_rate": 9.992022769570834e-06, "loss": 0.3093, "step": 694 }, { "epoch": 0.047540871468636704, "grad_norm": 2.4891549676935876, "learning_rate": 9.991960097184111e-06, "loss": 0.4321, "step": 695 }, { "epoch": 0.047609275600246255, "grad_norm": 2.2152716477295007, "learning_rate": 9.991897179767881e-06, "loss": 0.4684, "step": 696 }, { "epoch": 0.047677679731855806, "grad_norm": 2.4562729118579245, "learning_rate": 9.991834017325232e-06, "loss": 0.4613, "step": 697 }, { "epoch": 0.047746083863465356, "grad_norm": 2.8343595207826313, "learning_rate": 9.991770609859264e-06, "loss": 0.4355, "step": 698 }, { "epoch": 0.0478144879950749, "grad_norm": 2.2997091220835033, "learning_rate": 9.991706957373088e-06, "loss": 0.5773, "step": 699 }, { "epoch": 0.04788289212668445, "grad_norm": 2.4341599186992346, "learning_rate": 9.99164305986983e-06, "loss": 0.4633, "step": 700 }, { "epoch": 0.047951296258294, "grad_norm": 2.451043916683134, "learning_rate": 9.991578917352629e-06, "loss": 0.6135, "step": 701 }, { "epoch": 0.04801970038990355, "grad_norm": 1.8987344325151325, "learning_rate": 9.991514529824629e-06, "loss": 0.2861, "step": 702 }, { "epoch": 0.048088104521513096, "grad_norm": 3.1873669596564325, "learning_rate": 9.991449897288992e-06, "loss": 0.8021, "step": 703 }, { "epoch": 0.04815650865312265, "grad_norm": 1.41403324073216, "learning_rate": 9.991385019748891e-06, "loss": 0.1861, "step": 704 }, { "epoch": 0.0482249127847322, "grad_norm": 2.6940323152876635, "learning_rate": 9.99131989720751e-06, "loss": 0.4305, "step": 705 }, { "epoch": 0.04829331691634175, "grad_norm": 2.6421511207119934, "learning_rate": 9.991254529668045e-06, "loss": 0.383, "step": 706 }, { "epoch": 0.0483617210479513, "grad_norm": 2.578921622687247, "learning_rate": 9.991188917133705e-06, "loss": 0.5983, "step": 707 }, { "epoch": 0.04843012517956084, "grad_norm": 2.7748531972871557, "learning_rate": 9.991123059607712e-06, "loss": 0.5918, "step": 708 }, { "epoch": 0.048498529311170394, "grad_norm": 1.8303020181562535, "learning_rate": 9.991056957093296e-06, "loss": 0.2221, "step": 709 }, { "epoch": 0.048566933442779944, "grad_norm": 1.5931613854286044, "learning_rate": 9.990990609593702e-06, "loss": 0.3211, "step": 710 }, { "epoch": 0.048635337574389495, "grad_norm": 1.3722136778952645, "learning_rate": 9.990924017112192e-06, "loss": 0.2018, "step": 711 }, { "epoch": 0.048703741705999046, "grad_norm": 2.7610090495814066, "learning_rate": 9.990857179652026e-06, "loss": 0.5378, "step": 712 }, { "epoch": 0.04877214583760859, "grad_norm": 2.3910887960185585, "learning_rate": 9.99079009721649e-06, "loss": 0.305, "step": 713 }, { "epoch": 0.04884054996921814, "grad_norm": 2.540549303642118, "learning_rate": 9.990722769808876e-06, "loss": 0.6376, "step": 714 }, { "epoch": 0.04890895410082769, "grad_norm": 2.5770040907481317, "learning_rate": 9.99065519743249e-06, "loss": 0.4524, "step": 715 }, { "epoch": 0.04897735823243724, "grad_norm": 2.192526016146244, "learning_rate": 9.990587380090646e-06, "loss": 0.3743, "step": 716 }, { "epoch": 0.049045762364046785, "grad_norm": 2.8858177910984826, "learning_rate": 9.990519317786673e-06, "loss": 0.387, "step": 717 }, { "epoch": 0.049114166495656336, "grad_norm": 2.714700373973337, "learning_rate": 9.990451010523913e-06, "loss": 0.5199, "step": 718 }, { "epoch": 0.04918257062726589, "grad_norm": 2.3115113729042016, "learning_rate": 9.99038245830572e-06, "loss": 0.2825, "step": 719 }, { "epoch": 0.04925097475887544, "grad_norm": 2.4959692409068954, "learning_rate": 9.990313661135459e-06, "loss": 0.3625, "step": 720 }, { "epoch": 0.04931937889048499, "grad_norm": 2.5614930884439095, "learning_rate": 9.990244619016504e-06, "loss": 0.5881, "step": 721 }, { "epoch": 0.04938778302209453, "grad_norm": 2.396809258237056, "learning_rate": 9.990175331952246e-06, "loss": 0.3711, "step": 722 }, { "epoch": 0.04945618715370408, "grad_norm": 2.8918421888994317, "learning_rate": 9.990105799946084e-06, "loss": 0.629, "step": 723 }, { "epoch": 0.049524591285313634, "grad_norm": 2.6349744098984242, "learning_rate": 9.990036023001433e-06, "loss": 0.3662, "step": 724 }, { "epoch": 0.049592995416923184, "grad_norm": 1.8653704249111125, "learning_rate": 9.989966001121719e-06, "loss": 0.2978, "step": 725 }, { "epoch": 0.04966139954853273, "grad_norm": 2.3294231296773713, "learning_rate": 9.989895734310376e-06, "loss": 0.3409, "step": 726 }, { "epoch": 0.04972980368014228, "grad_norm": 2.695817286450022, "learning_rate": 9.989825222570852e-06, "loss": 0.6138, "step": 727 }, { "epoch": 0.04979820781175183, "grad_norm": 1.957626386482116, "learning_rate": 9.989754465906613e-06, "loss": 0.2175, "step": 728 }, { "epoch": 0.04986661194336138, "grad_norm": 2.5346130764917785, "learning_rate": 9.98968346432113e-06, "loss": 0.4453, "step": 729 }, { "epoch": 0.04993501607497093, "grad_norm": 1.7770963224120413, "learning_rate": 9.989612217817886e-06, "loss": 0.3539, "step": 730 }, { "epoch": 0.050003420206580475, "grad_norm": 3.0927144542739304, "learning_rate": 9.989540726400383e-06, "loss": 0.4257, "step": 731 }, { "epoch": 0.050071824338190025, "grad_norm": 3.0179322266448567, "learning_rate": 9.989468990072123e-06, "loss": 0.518, "step": 732 }, { "epoch": 0.050140228469799576, "grad_norm": 2.4238084028426825, "learning_rate": 9.989397008836632e-06, "loss": 0.3067, "step": 733 }, { "epoch": 0.05020863260140913, "grad_norm": 2.465734366250469, "learning_rate": 9.989324782697444e-06, "loss": 0.429, "step": 734 }, { "epoch": 0.05027703673301868, "grad_norm": 2.827155141871908, "learning_rate": 9.9892523116581e-06, "loss": 0.5432, "step": 735 }, { "epoch": 0.05034544086462822, "grad_norm": 2.7819262389685977, "learning_rate": 9.989179595722163e-06, "loss": 0.5961, "step": 736 }, { "epoch": 0.05041384499623777, "grad_norm": 2.3817949342345943, "learning_rate": 9.989106634893195e-06, "loss": 0.5276, "step": 737 }, { "epoch": 0.05048224912784732, "grad_norm": 1.939319970975725, "learning_rate": 9.989033429174783e-06, "loss": 0.3514, "step": 738 }, { "epoch": 0.050550653259456874, "grad_norm": 2.9038731279349483, "learning_rate": 9.98895997857052e-06, "loss": 0.7627, "step": 739 }, { "epoch": 0.05061905739106642, "grad_norm": 1.8759461479362605, "learning_rate": 9.988886283084008e-06, "loss": 0.3578, "step": 740 }, { "epoch": 0.05068746152267597, "grad_norm": 2.901276711896071, "learning_rate": 9.988812342718868e-06, "loss": 0.903, "step": 741 }, { "epoch": 0.05075586565428552, "grad_norm": 2.4568655507020747, "learning_rate": 9.988738157478726e-06, "loss": 0.39, "step": 742 }, { "epoch": 0.05082426978589507, "grad_norm": 2.520447509706398, "learning_rate": 9.988663727367224e-06, "loss": 0.4957, "step": 743 }, { "epoch": 0.05089267391750462, "grad_norm": 1.4480306595644827, "learning_rate": 9.988589052388018e-06, "loss": 0.1147, "step": 744 }, { "epoch": 0.050961078049114164, "grad_norm": 1.8227452815189278, "learning_rate": 9.98851413254477e-06, "loss": 0.2682, "step": 745 }, { "epoch": 0.051029482180723715, "grad_norm": 3.243068368586012, "learning_rate": 9.988438967841162e-06, "loss": 0.7346, "step": 746 }, { "epoch": 0.051097886312333266, "grad_norm": 2.48420353063812, "learning_rate": 9.988363558280877e-06, "loss": 0.481, "step": 747 }, { "epoch": 0.051166290443942816, "grad_norm": 2.737312020201155, "learning_rate": 9.988287903867621e-06, "loss": 0.6253, "step": 748 }, { "epoch": 0.05123469457555236, "grad_norm": 2.2559065248260164, "learning_rate": 9.988212004605106e-06, "loss": 0.3267, "step": 749 }, { "epoch": 0.05130309870716191, "grad_norm": 2.4478752673821544, "learning_rate": 9.98813586049706e-06, "loss": 0.4306, "step": 750 }, { "epoch": 0.05137150283877146, "grad_norm": 2.72511732500692, "learning_rate": 9.988059471547217e-06, "loss": 0.5038, "step": 751 }, { "epoch": 0.05143990697038101, "grad_norm": 2.0394130566220463, "learning_rate": 9.98798283775933e-06, "loss": 0.423, "step": 752 }, { "epoch": 0.05150831110199056, "grad_norm": 2.409584323780169, "learning_rate": 9.987905959137156e-06, "loss": 0.4477, "step": 753 }, { "epoch": 0.05157671523360011, "grad_norm": 2.6349878691080972, "learning_rate": 9.987828835684472e-06, "loss": 0.7607, "step": 754 }, { "epoch": 0.05164511936520966, "grad_norm": 2.1071925140099443, "learning_rate": 9.987751467405063e-06, "loss": 0.3467, "step": 755 }, { "epoch": 0.05171352349681921, "grad_norm": 2.85542160600604, "learning_rate": 9.987673854302728e-06, "loss": 0.6174, "step": 756 }, { "epoch": 0.05178192762842876, "grad_norm": 2.9881257733557667, "learning_rate": 9.987595996381272e-06, "loss": 0.3889, "step": 757 }, { "epoch": 0.05185033176003831, "grad_norm": 3.291177056312309, "learning_rate": 9.987517893644522e-06, "loss": 0.5497, "step": 758 }, { "epoch": 0.05191873589164785, "grad_norm": 2.06907615453158, "learning_rate": 9.987439546096309e-06, "loss": 0.3001, "step": 759 }, { "epoch": 0.051987140023257404, "grad_norm": 1.9762565768070317, "learning_rate": 9.987360953740477e-06, "loss": 0.2643, "step": 760 }, { "epoch": 0.052055544154866955, "grad_norm": 2.6735072587775988, "learning_rate": 9.987282116580888e-06, "loss": 0.488, "step": 761 }, { "epoch": 0.052123948286476506, "grad_norm": 2.293378764054159, "learning_rate": 9.987203034621408e-06, "loss": 0.4537, "step": 762 }, { "epoch": 0.05219235241808605, "grad_norm": 2.047060555863476, "learning_rate": 9.98712370786592e-06, "loss": 0.2842, "step": 763 }, { "epoch": 0.0522607565496956, "grad_norm": 1.9600886183102078, "learning_rate": 9.987044136318318e-06, "loss": 0.2664, "step": 764 }, { "epoch": 0.05232916068130515, "grad_norm": 2.182754259691024, "learning_rate": 9.98696431998251e-06, "loss": 0.2499, "step": 765 }, { "epoch": 0.0523975648129147, "grad_norm": 2.634885006201394, "learning_rate": 9.986884258862409e-06, "loss": 0.5682, "step": 766 }, { "epoch": 0.05246596894452425, "grad_norm": 2.9250584360137935, "learning_rate": 9.986803952961946e-06, "loss": 0.6357, "step": 767 }, { "epoch": 0.052534373076133796, "grad_norm": 2.0964572150608936, "learning_rate": 9.986723402285066e-06, "loss": 0.227, "step": 768 }, { "epoch": 0.05260277720774335, "grad_norm": 2.0156558869826804, "learning_rate": 9.98664260683572e-06, "loss": 0.2949, "step": 769 }, { "epoch": 0.0526711813393529, "grad_norm": 1.4929194752153676, "learning_rate": 9.986561566617876e-06, "loss": 0.2149, "step": 770 }, { "epoch": 0.05273958547096245, "grad_norm": 2.2379553525982168, "learning_rate": 9.98648028163551e-06, "loss": 0.2732, "step": 771 }, { "epoch": 0.05280798960257199, "grad_norm": 2.8390571102916256, "learning_rate": 9.98639875189261e-06, "loss": 0.5031, "step": 772 }, { "epoch": 0.05287639373418154, "grad_norm": 162.95901353083352, "learning_rate": 9.986316977393183e-06, "loss": 0.6771, "step": 773 }, { "epoch": 0.05294479786579109, "grad_norm": 2.585853281407233, "learning_rate": 9.986234958141236e-06, "loss": 0.4519, "step": 774 }, { "epoch": 0.053013201997400644, "grad_norm": 2.272055130262403, "learning_rate": 9.986152694140803e-06, "loss": 0.4857, "step": 775 }, { "epoch": 0.053081606129010195, "grad_norm": 3.1188697195677664, "learning_rate": 9.986070185395917e-06, "loss": 0.3322, "step": 776 }, { "epoch": 0.05315001026061974, "grad_norm": 14.273416921081909, "learning_rate": 9.98598743191063e-06, "loss": 0.6058, "step": 777 }, { "epoch": 0.05321841439222929, "grad_norm": 90.32843831537028, "learning_rate": 9.985904433689e-06, "loss": 0.6931, "step": 778 }, { "epoch": 0.05328681852383884, "grad_norm": 2.9038720605873194, "learning_rate": 9.985821190735104e-06, "loss": 0.7252, "step": 779 }, { "epoch": 0.05335522265544839, "grad_norm": 2.8529855808986078, "learning_rate": 9.98573770305303e-06, "loss": 0.3676, "step": 780 }, { "epoch": 0.05342362678705794, "grad_norm": 9.161885945351129, "learning_rate": 9.985653970646872e-06, "loss": 0.3736, "step": 781 }, { "epoch": 0.053492030918667485, "grad_norm": 2.194454833316623, "learning_rate": 9.985569993520741e-06, "loss": 0.411, "step": 782 }, { "epoch": 0.053560435050277036, "grad_norm": 2.457699052698779, "learning_rate": 9.985485771678762e-06, "loss": 0.6232, "step": 783 }, { "epoch": 0.05362883918188659, "grad_norm": 3.189070389554228, "learning_rate": 9.985401305125065e-06, "loss": 0.6159, "step": 784 }, { "epoch": 0.05369724331349614, "grad_norm": 2.2224152472891996, "learning_rate": 9.985316593863796e-06, "loss": 0.4199, "step": 785 }, { "epoch": 0.05376564744510568, "grad_norm": 2.4284306185336084, "learning_rate": 9.985231637899117e-06, "loss": 0.4189, "step": 786 }, { "epoch": 0.05383405157671523, "grad_norm": 2.197259487567383, "learning_rate": 9.985146437235197e-06, "loss": 0.539, "step": 787 }, { "epoch": 0.05390245570832478, "grad_norm": 2.463506090795793, "learning_rate": 9.985060991876214e-06, "loss": 0.408, "step": 788 }, { "epoch": 0.05397085983993433, "grad_norm": 2.0385982525716497, "learning_rate": 9.984975301826367e-06, "loss": 0.4012, "step": 789 }, { "epoch": 0.054039263971543884, "grad_norm": 3.136121776205271, "learning_rate": 9.98488936708986e-06, "loss": 0.6939, "step": 790 }, { "epoch": 0.05410766810315343, "grad_norm": 2.0265186593307067, "learning_rate": 9.984803187670908e-06, "loss": 0.3778, "step": 791 }, { "epoch": 0.05417607223476298, "grad_norm": 2.7865705550700772, "learning_rate": 9.984716763573747e-06, "loss": 0.6609, "step": 792 }, { "epoch": 0.05424447636637253, "grad_norm": 2.0149837405327324, "learning_rate": 9.984630094802617e-06, "loss": 0.3446, "step": 793 }, { "epoch": 0.05431288049798208, "grad_norm": 2.6307614654469527, "learning_rate": 9.98454318136177e-06, "loss": 0.6273, "step": 794 }, { "epoch": 0.05438128462959163, "grad_norm": 3.3954050719434226, "learning_rate": 9.984456023255473e-06, "loss": 0.5424, "step": 795 }, { "epoch": 0.054449688761201175, "grad_norm": 2.7899467605207766, "learning_rate": 9.984368620488005e-06, "loss": 0.6975, "step": 796 }, { "epoch": 0.054518092892810725, "grad_norm": 2.7340147217048627, "learning_rate": 9.984280973063656e-06, "loss": 0.4965, "step": 797 }, { "epoch": 0.054586497024420276, "grad_norm": 2.159237819874233, "learning_rate": 9.984193080986729e-06, "loss": 0.3692, "step": 798 }, { "epoch": 0.05465490115602983, "grad_norm": 1.8709534690051823, "learning_rate": 9.984104944261536e-06, "loss": 0.3756, "step": 799 }, { "epoch": 0.05472330528763937, "grad_norm": 2.725538586185041, "learning_rate": 9.984016562892404e-06, "loss": 0.3718, "step": 800 }, { "epoch": 0.05479170941924892, "grad_norm": 1.9853872248143476, "learning_rate": 9.983927936883673e-06, "loss": 0.4354, "step": 801 }, { "epoch": 0.05486011355085847, "grad_norm": 2.7453207498946446, "learning_rate": 9.983839066239692e-06, "loss": 0.6639, "step": 802 }, { "epoch": 0.05492851768246802, "grad_norm": 2.8104338924769867, "learning_rate": 9.983749950964821e-06, "loss": 0.5508, "step": 803 }, { "epoch": 0.05499692181407757, "grad_norm": 2.5538394582001507, "learning_rate": 9.983660591063437e-06, "loss": 0.391, "step": 804 }, { "epoch": 0.05506532594568712, "grad_norm": 2.6075046711895395, "learning_rate": 9.983570986539924e-06, "loss": 0.478, "step": 805 }, { "epoch": 0.05513373007729667, "grad_norm": 3.346116324951345, "learning_rate": 9.983481137398683e-06, "loss": 0.2269, "step": 806 }, { "epoch": 0.05520213420890622, "grad_norm": 2.674969827178773, "learning_rate": 9.983391043644123e-06, "loss": 0.6751, "step": 807 }, { "epoch": 0.05527053834051577, "grad_norm": 6.366397663578314, "learning_rate": 9.983300705280665e-06, "loss": 0.3584, "step": 808 }, { "epoch": 0.05533894247212531, "grad_norm": 2.339915412989942, "learning_rate": 9.983210122312745e-06, "loss": 0.373, "step": 809 }, { "epoch": 0.055407346603734864, "grad_norm": 2.4422770907808715, "learning_rate": 9.983119294744809e-06, "loss": 0.4731, "step": 810 }, { "epoch": 0.055475750735344415, "grad_norm": 2.496590214221062, "learning_rate": 9.983028222581315e-06, "loss": 0.4656, "step": 811 }, { "epoch": 0.055544154866953965, "grad_norm": 2.852359514028327, "learning_rate": 9.982936905826733e-06, "loss": 0.7777, "step": 812 }, { "epoch": 0.055612558998563516, "grad_norm": 4.243843556245071, "learning_rate": 9.982845344485544e-06, "loss": 0.4172, "step": 813 }, { "epoch": 0.05568096313017306, "grad_norm": 2.426962121234296, "learning_rate": 9.982753538562244e-06, "loss": 0.2729, "step": 814 }, { "epoch": 0.05574936726178261, "grad_norm": 2.0760990248465463, "learning_rate": 9.982661488061338e-06, "loss": 0.3998, "step": 815 }, { "epoch": 0.05581777139339216, "grad_norm": 2.651739206085146, "learning_rate": 9.982569192987346e-06, "loss": 0.6, "step": 816 }, { "epoch": 0.05588617552500171, "grad_norm": 3.5017900657668455, "learning_rate": 9.982476653344799e-06, "loss": 0.6264, "step": 817 }, { "epoch": 0.05595457965661126, "grad_norm": 2.4172610670865207, "learning_rate": 9.982383869138235e-06, "loss": 0.1731, "step": 818 }, { "epoch": 0.056022983788220806, "grad_norm": 2.828606411757475, "learning_rate": 9.982290840372212e-06, "loss": 0.4789, "step": 819 }, { "epoch": 0.05609138791983036, "grad_norm": 2.9248378643516912, "learning_rate": 9.982197567051296e-06, "loss": 0.6796, "step": 820 }, { "epoch": 0.05615979205143991, "grad_norm": 2.523782771437659, "learning_rate": 9.982104049180064e-06, "loss": 0.5955, "step": 821 }, { "epoch": 0.05622819618304946, "grad_norm": 2.394269421748327, "learning_rate": 9.982010286763109e-06, "loss": 0.3263, "step": 822 }, { "epoch": 0.056296600314659, "grad_norm": 2.143070439100002, "learning_rate": 9.981916279805029e-06, "loss": 0.418, "step": 823 }, { "epoch": 0.05636500444626855, "grad_norm": 3.2371897727397854, "learning_rate": 9.98182202831044e-06, "loss": 0.74, "step": 824 }, { "epoch": 0.056433408577878104, "grad_norm": 2.113788452204745, "learning_rate": 9.98172753228397e-06, "loss": 0.4705, "step": 825 }, { "epoch": 0.056501812709487655, "grad_norm": 1.8164161435749964, "learning_rate": 9.981632791730256e-06, "loss": 0.3659, "step": 826 }, { "epoch": 0.056570216841097205, "grad_norm": 2.8742623730496275, "learning_rate": 9.981537806653949e-06, "loss": 0.6062, "step": 827 }, { "epoch": 0.05663862097270675, "grad_norm": 2.4141550385846156, "learning_rate": 9.98144257705971e-06, "loss": 0.48, "step": 828 }, { "epoch": 0.0567070251043163, "grad_norm": 2.1336083516780966, "learning_rate": 9.981347102952215e-06, "loss": 0.3841, "step": 829 }, { "epoch": 0.05677542923592585, "grad_norm": 2.0026231937637218, "learning_rate": 9.98125138433615e-06, "loss": 0.3058, "step": 830 }, { "epoch": 0.0568438333675354, "grad_norm": 2.2972223966332703, "learning_rate": 9.98115542121621e-06, "loss": 0.412, "step": 831 }, { "epoch": 0.056912237499144945, "grad_norm": 2.010161135712434, "learning_rate": 9.981059213597111e-06, "loss": 0.357, "step": 832 }, { "epoch": 0.056980641630754496, "grad_norm": 2.347010036398286, "learning_rate": 9.980962761483573e-06, "loss": 0.4568, "step": 833 }, { "epoch": 0.057049045762364047, "grad_norm": 2.935307760138367, "learning_rate": 9.980866064880328e-06, "loss": 0.8245, "step": 834 }, { "epoch": 0.0571174498939736, "grad_norm": 2.701310910655135, "learning_rate": 9.980769123792125e-06, "loss": 0.6482, "step": 835 }, { "epoch": 0.05718585402558315, "grad_norm": 2.8914515576527267, "learning_rate": 9.98067193822372e-06, "loss": 0.4066, "step": 836 }, { "epoch": 0.05725425815719269, "grad_norm": 1.599369035674673, "learning_rate": 9.980574508179886e-06, "loss": 0.1768, "step": 837 }, { "epoch": 0.05732266228880224, "grad_norm": 2.1809426388978577, "learning_rate": 9.980476833665405e-06, "loss": 0.4907, "step": 838 }, { "epoch": 0.05739106642041179, "grad_norm": 1.9199460630552194, "learning_rate": 9.980378914685069e-06, "loss": 0.3885, "step": 839 }, { "epoch": 0.057459470552021344, "grad_norm": 2.7803658944699303, "learning_rate": 9.980280751243687e-06, "loss": 0.594, "step": 840 }, { "epoch": 0.057527874683630895, "grad_norm": 2.3866013749350423, "learning_rate": 9.980182343346075e-06, "loss": 0.4699, "step": 841 }, { "epoch": 0.05759627881524044, "grad_norm": 2.037734874691204, "learning_rate": 9.980083690997066e-06, "loss": 0.1492, "step": 842 }, { "epoch": 0.05766468294684999, "grad_norm": 2.7033672417160584, "learning_rate": 9.9799847942015e-06, "loss": 0.4978, "step": 843 }, { "epoch": 0.05773308707845954, "grad_norm": 2.5758861068010015, "learning_rate": 9.979885652964233e-06, "loss": 0.5276, "step": 844 }, { "epoch": 0.05780149121006909, "grad_norm": 2.5809462701699513, "learning_rate": 9.979786267290129e-06, "loss": 0.3193, "step": 845 }, { "epoch": 0.057869895341678634, "grad_norm": 2.7403183514322333, "learning_rate": 9.979686637184069e-06, "loss": 0.616, "step": 846 }, { "epoch": 0.057938299473288185, "grad_norm": 2.920597164185486, "learning_rate": 9.97958676265094e-06, "loss": 0.6216, "step": 847 }, { "epoch": 0.058006703604897736, "grad_norm": 2.600529222088412, "learning_rate": 9.979486643695651e-06, "loss": 0.6244, "step": 848 }, { "epoch": 0.058075107736507287, "grad_norm": 2.8128668439887825, "learning_rate": 9.979386280323109e-06, "loss": 0.8038, "step": 849 }, { "epoch": 0.05814351186811684, "grad_norm": 2.9082251061812268, "learning_rate": 9.979285672538243e-06, "loss": 0.4786, "step": 850 }, { "epoch": 0.05821191599972638, "grad_norm": 2.518236717809791, "learning_rate": 9.979184820345992e-06, "loss": 0.514, "step": 851 }, { "epoch": 0.05828032013133593, "grad_norm": 1.9239412545571917, "learning_rate": 9.979083723751306e-06, "loss": 0.2318, "step": 852 }, { "epoch": 0.05834872426294548, "grad_norm": 2.8216691175447544, "learning_rate": 9.978982382759147e-06, "loss": 0.7058, "step": 853 }, { "epoch": 0.05841712839455503, "grad_norm": 2.615041815439836, "learning_rate": 9.978880797374488e-06, "loss": 0.5574, "step": 854 }, { "epoch": 0.05848553252616458, "grad_norm": 2.854483250418998, "learning_rate": 9.978778967602319e-06, "loss": 0.5944, "step": 855 }, { "epoch": 0.05855393665777413, "grad_norm": 2.4382138399084705, "learning_rate": 9.978676893447635e-06, "loss": 0.2368, "step": 856 }, { "epoch": 0.05862234078938368, "grad_norm": 3.0137050921030437, "learning_rate": 9.978574574915448e-06, "loss": 0.6996, "step": 857 }, { "epoch": 0.05869074492099323, "grad_norm": 2.677444019337918, "learning_rate": 9.978472012010778e-06, "loss": 0.7086, "step": 858 }, { "epoch": 0.05875914905260278, "grad_norm": 2.834336283292121, "learning_rate": 9.97836920473866e-06, "loss": 0.5982, "step": 859 }, { "epoch": 0.058827553184212324, "grad_norm": 2.839807880075379, "learning_rate": 9.978266153104144e-06, "loss": 0.3241, "step": 860 }, { "epoch": 0.058895957315821874, "grad_norm": 2.2422394563722845, "learning_rate": 9.978162857112283e-06, "loss": 0.5662, "step": 861 }, { "epoch": 0.058964361447431425, "grad_norm": 2.6370935555143067, "learning_rate": 9.978059316768152e-06, "loss": 0.2694, "step": 862 }, { "epoch": 0.059032765579040976, "grad_norm": 2.8843940903681786, "learning_rate": 9.97795553207683e-06, "loss": 0.5115, "step": 863 }, { "epoch": 0.05910116971065053, "grad_norm": 2.2912083355309463, "learning_rate": 9.977851503043412e-06, "loss": 0.4323, "step": 864 }, { "epoch": 0.05916957384226007, "grad_norm": 2.5422310967860002, "learning_rate": 9.977747229673004e-06, "loss": 0.3959, "step": 865 }, { "epoch": 0.05923797797386962, "grad_norm": 2.606952502119045, "learning_rate": 9.977642711970726e-06, "loss": 0.5744, "step": 866 }, { "epoch": 0.05930638210547917, "grad_norm": 2.4721633636835567, "learning_rate": 9.977537949941704e-06, "loss": 0.4233, "step": 867 }, { "epoch": 0.05937478623708872, "grad_norm": 2.113839729678478, "learning_rate": 9.977432943591086e-06, "loss": 0.2799, "step": 868 }, { "epoch": 0.059443190368698266, "grad_norm": 2.532085834069017, "learning_rate": 9.977327692924021e-06, "loss": 0.6615, "step": 869 }, { "epoch": 0.05951159450030782, "grad_norm": 2.2524329052394014, "learning_rate": 9.977222197945679e-06, "loss": 0.5819, "step": 870 }, { "epoch": 0.05957999863191737, "grad_norm": 3.083735634144449, "learning_rate": 9.977116458661238e-06, "loss": 0.5505, "step": 871 }, { "epoch": 0.05964840276352692, "grad_norm": 2.506197091207657, "learning_rate": 9.977010475075886e-06, "loss": 0.4009, "step": 872 }, { "epoch": 0.05971680689513647, "grad_norm": 3.246917363056583, "learning_rate": 9.976904247194826e-06, "loss": 0.3479, "step": 873 }, { "epoch": 0.05978521102674601, "grad_norm": 2.617148418988763, "learning_rate": 9.976797775023272e-06, "loss": 0.5503, "step": 874 }, { "epoch": 0.059853615158355564, "grad_norm": 2.720839963211659, "learning_rate": 9.97669105856645e-06, "loss": 0.4437, "step": 875 }, { "epoch": 0.059922019289965114, "grad_norm": 2.615224540526721, "learning_rate": 9.9765840978296e-06, "loss": 0.4218, "step": 876 }, { "epoch": 0.059990423421574665, "grad_norm": 3.160299224825634, "learning_rate": 9.97647689281797e-06, "loss": 0.653, "step": 877 }, { "epoch": 0.06005882755318421, "grad_norm": 2.87779873956826, "learning_rate": 9.976369443536822e-06, "loss": 0.6902, "step": 878 }, { "epoch": 0.06012723168479376, "grad_norm": 4.425432748700984, "learning_rate": 9.976261749991433e-06, "loss": 0.569, "step": 879 }, { "epoch": 0.06019563581640331, "grad_norm": 3.392013085483725, "learning_rate": 9.976153812187087e-06, "loss": 0.8461, "step": 880 }, { "epoch": 0.06026403994801286, "grad_norm": 2.6792650028729543, "learning_rate": 9.97604563012908e-06, "loss": 0.5172, "step": 881 }, { "epoch": 0.06033244407962241, "grad_norm": 3.0190501255405278, "learning_rate": 9.975937203822726e-06, "loss": 0.5326, "step": 882 }, { "epoch": 0.060400848211231956, "grad_norm": 2.659570457921847, "learning_rate": 9.975828533273346e-06, "loss": 0.2099, "step": 883 }, { "epoch": 0.060469252342841506, "grad_norm": 2.422890579419945, "learning_rate": 9.975719618486272e-06, "loss": 0.4013, "step": 884 }, { "epoch": 0.06053765647445106, "grad_norm": 2.87984430566128, "learning_rate": 9.975610459466853e-06, "loss": 0.8378, "step": 885 }, { "epoch": 0.06060606060606061, "grad_norm": 2.6413678303823858, "learning_rate": 9.975501056220445e-06, "loss": 0.5809, "step": 886 }, { "epoch": 0.06067446473767016, "grad_norm": 2.4731835018601, "learning_rate": 9.97539140875242e-06, "loss": 0.4113, "step": 887 }, { "epoch": 0.0607428688692797, "grad_norm": 2.7005001785876495, "learning_rate": 9.975281517068158e-06, "loss": 0.2535, "step": 888 }, { "epoch": 0.06081127300088925, "grad_norm": 2.8545666847073883, "learning_rate": 9.975171381173054e-06, "loss": 0.3207, "step": 889 }, { "epoch": 0.060879677132498804, "grad_norm": 2.092682010933842, "learning_rate": 9.975061001072512e-06, "loss": 0.4148, "step": 890 }, { "epoch": 0.060948081264108354, "grad_norm": 3.0444640769135987, "learning_rate": 9.974950376771954e-06, "loss": 0.3724, "step": 891 }, { "epoch": 0.0610164853957179, "grad_norm": 2.260906014160596, "learning_rate": 9.974839508276806e-06, "loss": 0.373, "step": 892 }, { "epoch": 0.06108488952732745, "grad_norm": 2.2006818298644246, "learning_rate": 9.974728395592512e-06, "loss": 0.3318, "step": 893 }, { "epoch": 0.061153293658937, "grad_norm": 2.227229777952215, "learning_rate": 9.974617038724526e-06, "loss": 0.4155, "step": 894 }, { "epoch": 0.06122169779054655, "grad_norm": 3.1734148337429517, "learning_rate": 9.974505437678315e-06, "loss": 0.3618, "step": 895 }, { "epoch": 0.0612901019221561, "grad_norm": 2.6458504793908144, "learning_rate": 9.974393592459353e-06, "loss": 0.5052, "step": 896 }, { "epoch": 0.061358506053765645, "grad_norm": 1.5936560649864613, "learning_rate": 9.974281503073134e-06, "loss": 0.2588, "step": 897 }, { "epoch": 0.061426910185375196, "grad_norm": 2.195629244655586, "learning_rate": 9.974169169525157e-06, "loss": 0.2119, "step": 898 }, { "epoch": 0.061495314316984746, "grad_norm": 2.4977393330109634, "learning_rate": 9.974056591820937e-06, "loss": 0.5096, "step": 899 }, { "epoch": 0.0615637184485943, "grad_norm": 3.039358021702255, "learning_rate": 9.973943769966e-06, "loss": 0.536, "step": 900 }, { "epoch": 0.06163212258020385, "grad_norm": 2.479564640147992, "learning_rate": 9.973830703965887e-06, "loss": 0.5054, "step": 901 }, { "epoch": 0.06170052671181339, "grad_norm": 7.672584889309925, "learning_rate": 9.973717393826142e-06, "loss": 0.7662, "step": 902 }, { "epoch": 0.06176893084342294, "grad_norm": 3.5469020724838836, "learning_rate": 9.97360383955233e-06, "loss": 0.3153, "step": 903 }, { "epoch": 0.06183733497503249, "grad_norm": 2.8295275861524805, "learning_rate": 9.973490041150024e-06, "loss": 0.463, "step": 904 }, { "epoch": 0.061905739106642044, "grad_norm": 3.5169794313427505, "learning_rate": 9.973375998624811e-06, "loss": 0.7422, "step": 905 }, { "epoch": 0.06197414323825159, "grad_norm": 2.0305928340981363, "learning_rate": 9.973261711982287e-06, "loss": 0.3628, "step": 906 }, { "epoch": 0.06204254736986114, "grad_norm": 1.9382609172955425, "learning_rate": 9.973147181228062e-06, "loss": 0.3576, "step": 907 }, { "epoch": 0.06211095150147069, "grad_norm": 2.715048901607533, "learning_rate": 9.973032406367758e-06, "loss": 0.7963, "step": 908 }, { "epoch": 0.06217935563308024, "grad_norm": 2.5128756953939964, "learning_rate": 9.97291738740701e-06, "loss": 0.5051, "step": 909 }, { "epoch": 0.06224775976468979, "grad_norm": 2.7498763013242127, "learning_rate": 9.972802124351461e-06, "loss": 0.7066, "step": 910 }, { "epoch": 0.062316163896299334, "grad_norm": 2.511064073307623, "learning_rate": 9.972686617206773e-06, "loss": 0.5362, "step": 911 }, { "epoch": 0.062384568027908885, "grad_norm": 2.6836682049724327, "learning_rate": 9.972570865978612e-06, "loss": 0.5801, "step": 912 }, { "epoch": 0.062452972159518436, "grad_norm": 1.7320070597981576, "learning_rate": 9.97245487067266e-06, "loss": 0.1939, "step": 913 }, { "epoch": 0.06252137629112799, "grad_norm": 2.069980767501643, "learning_rate": 9.972338631294611e-06, "loss": 0.4395, "step": 914 }, { "epoch": 0.06258978042273754, "grad_norm": 2.08477922491963, "learning_rate": 9.972222147850173e-06, "loss": 0.425, "step": 915 }, { "epoch": 0.06265818455434709, "grad_norm": 2.2976105349187947, "learning_rate": 9.972105420345058e-06, "loss": 0.5508, "step": 916 }, { "epoch": 0.06272658868595664, "grad_norm": 2.2376307096342947, "learning_rate": 9.971988448785002e-06, "loss": 0.4197, "step": 917 }, { "epoch": 0.06279499281756618, "grad_norm": 1.785020860675655, "learning_rate": 9.97187123317574e-06, "loss": 0.1627, "step": 918 }, { "epoch": 0.06286339694917573, "grad_norm": 2.547862616179099, "learning_rate": 9.971753773523032e-06, "loss": 0.5211, "step": 919 }, { "epoch": 0.06293180108078528, "grad_norm": 2.2846397207380402, "learning_rate": 9.97163606983264e-06, "loss": 0.4611, "step": 920 }, { "epoch": 0.06300020521239483, "grad_norm": 2.280033982833032, "learning_rate": 9.97151812211034e-06, "loss": 0.5492, "step": 921 }, { "epoch": 0.06306860934400438, "grad_norm": 2.8694729700149804, "learning_rate": 9.971399930361924e-06, "loss": 0.4057, "step": 922 }, { "epoch": 0.06313701347561393, "grad_norm": 2.09159545569829, "learning_rate": 9.971281494593194e-06, "loss": 0.3997, "step": 923 }, { "epoch": 0.06320541760722348, "grad_norm": 1.6017822591690358, "learning_rate": 9.971162814809962e-06, "loss": 0.2121, "step": 924 }, { "epoch": 0.06327382173883303, "grad_norm": 1.5821487006399673, "learning_rate": 9.971043891018053e-06, "loss": 0.251, "step": 925 }, { "epoch": 0.06334222587044258, "grad_norm": 4.267253816271588, "learning_rate": 9.970924723223306e-06, "loss": 0.3508, "step": 926 }, { "epoch": 0.06341063000205212, "grad_norm": 12.948371926260517, "learning_rate": 9.970805311431566e-06, "loss": 0.4605, "step": 927 }, { "epoch": 0.06347903413366167, "grad_norm": 2.251327394630374, "learning_rate": 9.970685655648702e-06, "loss": 0.4338, "step": 928 }, { "epoch": 0.06354743826527122, "grad_norm": 3.2704055601568, "learning_rate": 9.970565755880579e-06, "loss": 0.5031, "step": 929 }, { "epoch": 0.06361584239688077, "grad_norm": 2.787958920553477, "learning_rate": 9.970445612133088e-06, "loss": 0.2409, "step": 930 }, { "epoch": 0.06368424652849032, "grad_norm": 3.012501477601255, "learning_rate": 9.970325224412125e-06, "loss": 0.5846, "step": 931 }, { "epoch": 0.06375265066009987, "grad_norm": 2.5459011123227615, "learning_rate": 9.970204592723598e-06, "loss": 0.4442, "step": 932 }, { "epoch": 0.06382105479170942, "grad_norm": 7.836329137536698, "learning_rate": 9.970083717073427e-06, "loss": 0.2004, "step": 933 }, { "epoch": 0.06388945892331897, "grad_norm": 2.741210366722295, "learning_rate": 9.969962597467548e-06, "loss": 0.2851, "step": 934 }, { "epoch": 0.06395786305492852, "grad_norm": 2.4373830559382834, "learning_rate": 9.969841233911905e-06, "loss": 0.3466, "step": 935 }, { "epoch": 0.06402626718653806, "grad_norm": 2.863561796050765, "learning_rate": 9.969719626412456e-06, "loss": 0.744, "step": 936 }, { "epoch": 0.06409467131814761, "grad_norm": 2.6016031554198573, "learning_rate": 9.969597774975169e-06, "loss": 0.5556, "step": 937 }, { "epoch": 0.06416307544975716, "grad_norm": 2.4142189715886957, "learning_rate": 9.969475679606025e-06, "loss": 0.639, "step": 938 }, { "epoch": 0.06423147958136671, "grad_norm": 2.493724062313865, "learning_rate": 9.969353340311017e-06, "loss": 0.5196, "step": 939 }, { "epoch": 0.06429988371297626, "grad_norm": 2.259362375189832, "learning_rate": 9.96923075709615e-06, "loss": 0.2889, "step": 940 }, { "epoch": 0.06436828784458581, "grad_norm": 2.4593037101757016, "learning_rate": 9.96910792996744e-06, "loss": 0.4073, "step": 941 }, { "epoch": 0.06443669197619536, "grad_norm": 2.638357904759169, "learning_rate": 9.96898485893092e-06, "loss": 0.5279, "step": 942 }, { "epoch": 0.06450509610780492, "grad_norm": 2.471612242852739, "learning_rate": 9.968861543992628e-06, "loss": 0.3464, "step": 943 }, { "epoch": 0.06457350023941447, "grad_norm": 2.5985116138314326, "learning_rate": 9.968737985158615e-06, "loss": 0.4986, "step": 944 }, { "epoch": 0.064641904371024, "grad_norm": 2.88906585297047, "learning_rate": 9.968614182434949e-06, "loss": 0.3231, "step": 945 }, { "epoch": 0.06471030850263355, "grad_norm": 2.824241564224556, "learning_rate": 9.968490135827707e-06, "loss": 0.6046, "step": 946 }, { "epoch": 0.0647787126342431, "grad_norm": 2.788936451860544, "learning_rate": 9.968365845342973e-06, "loss": 0.8125, "step": 947 }, { "epoch": 0.06484711676585266, "grad_norm": 2.231387428517909, "learning_rate": 9.968241310986855e-06, "loss": 0.3335, "step": 948 }, { "epoch": 0.0649155208974622, "grad_norm": 2.648629550103403, "learning_rate": 9.96811653276546e-06, "loss": 0.5245, "step": 949 }, { "epoch": 0.06498392502907176, "grad_norm": 2.1254031906780084, "learning_rate": 9.967991510684916e-06, "loss": 0.3916, "step": 950 }, { "epoch": 0.06505232916068131, "grad_norm": 2.7888986234845423, "learning_rate": 9.967866244751356e-06, "loss": 0.5935, "step": 951 }, { "epoch": 0.06512073329229086, "grad_norm": 2.53549216797209, "learning_rate": 9.967740734970934e-06, "loss": 0.5389, "step": 952 }, { "epoch": 0.06518913742390041, "grad_norm": 2.0505460622441922, "learning_rate": 9.967614981349807e-06, "loss": 0.3343, "step": 953 }, { "epoch": 0.06525754155550996, "grad_norm": 2.033231614102896, "learning_rate": 9.967488983894147e-06, "loss": 0.2591, "step": 954 }, { "epoch": 0.0653259456871195, "grad_norm": 2.047461272864372, "learning_rate": 9.967362742610141e-06, "loss": 0.4399, "step": 955 }, { "epoch": 0.06539434981872905, "grad_norm": 2.4575445309191095, "learning_rate": 9.967236257503984e-06, "loss": 0.5772, "step": 956 }, { "epoch": 0.0654627539503386, "grad_norm": 2.365222836378887, "learning_rate": 9.967109528581886e-06, "loss": 0.4493, "step": 957 }, { "epoch": 0.06553115808194815, "grad_norm": 2.673612609604145, "learning_rate": 9.966982555850066e-06, "loss": 0.6708, "step": 958 }, { "epoch": 0.0655995622135577, "grad_norm": 2.3283003350613534, "learning_rate": 9.966855339314756e-06, "loss": 0.317, "step": 959 }, { "epoch": 0.06566796634516725, "grad_norm": 2.7970693563349527, "learning_rate": 9.966727878982203e-06, "loss": 0.3967, "step": 960 }, { "epoch": 0.0657363704767768, "grad_norm": 2.5598827987809907, "learning_rate": 9.96660017485866e-06, "loss": 0.6765, "step": 961 }, { "epoch": 0.06580477460838635, "grad_norm": 2.1771196806457684, "learning_rate": 9.966472226950396e-06, "loss": 0.3974, "step": 962 }, { "epoch": 0.0658731787399959, "grad_norm": 1.6718957745963696, "learning_rate": 9.966344035263696e-06, "loss": 0.2621, "step": 963 }, { "epoch": 0.06594158287160544, "grad_norm": 8.51010910717314, "learning_rate": 9.966215599804846e-06, "loss": 0.3781, "step": 964 }, { "epoch": 0.06600998700321499, "grad_norm": 2.450196869394683, "learning_rate": 9.966086920580152e-06, "loss": 0.2412, "step": 965 }, { "epoch": 0.06607839113482454, "grad_norm": 3.412403550374459, "learning_rate": 9.965957997595932e-06, "loss": 0.5763, "step": 966 }, { "epoch": 0.06614679526643409, "grad_norm": 2.6786944770933063, "learning_rate": 9.965828830858513e-06, "loss": 0.4666, "step": 967 }, { "epoch": 0.06621519939804364, "grad_norm": 4.206406684213161, "learning_rate": 9.965699420374235e-06, "loss": 0.4244, "step": 968 }, { "epoch": 0.06628360352965319, "grad_norm": 2.885017130947985, "learning_rate": 9.96556976614945e-06, "loss": 0.4521, "step": 969 }, { "epoch": 0.06635200766126274, "grad_norm": 2.7045157774166877, "learning_rate": 9.965439868190524e-06, "loss": 0.5924, "step": 970 }, { "epoch": 0.0664204117928723, "grad_norm": 2.2816320435803936, "learning_rate": 9.96530972650383e-06, "loss": 0.2791, "step": 971 }, { "epoch": 0.06648881592448184, "grad_norm": 2.8717078142618466, "learning_rate": 9.965179341095758e-06, "loss": 0.5337, "step": 972 }, { "epoch": 0.06655722005609138, "grad_norm": 5.665693040492871, "learning_rate": 9.965048711972707e-06, "loss": 0.5172, "step": 973 }, { "epoch": 0.06662562418770093, "grad_norm": 2.5150796464988847, "learning_rate": 9.964917839141089e-06, "loss": 0.4313, "step": 974 }, { "epoch": 0.06669402831931048, "grad_norm": 1.8042985406154783, "learning_rate": 9.96478672260733e-06, "loss": 0.181, "step": 975 }, { "epoch": 0.06676243245092003, "grad_norm": 3.081157671723186, "learning_rate": 9.96465536237786e-06, "loss": 0.5547, "step": 976 }, { "epoch": 0.06683083658252958, "grad_norm": 3.916482782443657, "learning_rate": 9.964523758459134e-06, "loss": 0.399, "step": 977 }, { "epoch": 0.06689924071413914, "grad_norm": 1.8922081655607064, "learning_rate": 9.964391910857608e-06, "loss": 0.3603, "step": 978 }, { "epoch": 0.06696764484574869, "grad_norm": 1.8340830750967925, "learning_rate": 9.964259819579754e-06, "loss": 0.3779, "step": 979 }, { "epoch": 0.06703604897735824, "grad_norm": 2.678309272471299, "learning_rate": 9.964127484632054e-06, "loss": 0.7094, "step": 980 }, { "epoch": 0.06710445310896779, "grad_norm": 2.1789935179666933, "learning_rate": 9.963994906021008e-06, "loss": 0.3597, "step": 981 }, { "epoch": 0.06717285724057732, "grad_norm": 2.763689123308795, "learning_rate": 9.96386208375312e-06, "loss": 0.5027, "step": 982 }, { "epoch": 0.06724126137218688, "grad_norm": 2.949107197047589, "learning_rate": 9.96372901783491e-06, "loss": 0.4787, "step": 983 }, { "epoch": 0.06730966550379643, "grad_norm": 2.173838602539477, "learning_rate": 9.963595708272913e-06, "loss": 0.2186, "step": 984 }, { "epoch": 0.06737806963540598, "grad_norm": 1.9422003889078168, "learning_rate": 9.963462155073667e-06, "loss": 0.4371, "step": 985 }, { "epoch": 0.06744647376701553, "grad_norm": 2.3113350769320085, "learning_rate": 9.963328358243732e-06, "loss": 0.4089, "step": 986 }, { "epoch": 0.06751487789862508, "grad_norm": 2.4222693306855043, "learning_rate": 9.963194317789672e-06, "loss": 0.3948, "step": 987 }, { "epoch": 0.06758328203023463, "grad_norm": 2.343704305839156, "learning_rate": 9.963060033718068e-06, "loss": 0.3073, "step": 988 }, { "epoch": 0.06765168616184418, "grad_norm": 2.962808073499399, "learning_rate": 9.962925506035512e-06, "loss": 0.242, "step": 989 }, { "epoch": 0.06772009029345373, "grad_norm": 3.6694872701834464, "learning_rate": 9.962790734748606e-06, "loss": 1.0186, "step": 990 }, { "epoch": 0.06778849442506327, "grad_norm": 2.6994021038764564, "learning_rate": 9.962655719863965e-06, "loss": 0.2553, "step": 991 }, { "epoch": 0.06785689855667282, "grad_norm": 2.4496465002211254, "learning_rate": 9.96252046138822e-06, "loss": 0.3159, "step": 992 }, { "epoch": 0.06792530268828237, "grad_norm": 2.604339148849341, "learning_rate": 9.962384959328002e-06, "loss": 0.541, "step": 993 }, { "epoch": 0.06799370681989192, "grad_norm": 2.571094243325737, "learning_rate": 9.96224921368997e-06, "loss": 0.4937, "step": 994 }, { "epoch": 0.06806211095150147, "grad_norm": 3.0423169594890385, "learning_rate": 9.962113224480783e-06, "loss": 0.5872, "step": 995 }, { "epoch": 0.06813051508311102, "grad_norm": 2.4621359858647622, "learning_rate": 9.961976991707118e-06, "loss": 0.4698, "step": 996 }, { "epoch": 0.06819891921472057, "grad_norm": 2.6407397092925495, "learning_rate": 9.961840515375663e-06, "loss": 0.5666, "step": 997 }, { "epoch": 0.06826732334633012, "grad_norm": 2.783615863379243, "learning_rate": 9.961703795493111e-06, "loss": 0.8431, "step": 998 }, { "epoch": 0.06833572747793967, "grad_norm": 2.1074493554898246, "learning_rate": 9.96156683206618e-06, "loss": 0.3992, "step": 999 }, { "epoch": 0.06840413160954922, "grad_norm": 2.4335955746859694, "learning_rate": 9.961429625101589e-06, "loss": 0.4933, "step": 1000 }, { "epoch": 0.06847253574115876, "grad_norm": 2.4003982357590457, "learning_rate": 9.961292174606073e-06, "loss": 0.5583, "step": 1001 }, { "epoch": 0.06854093987276831, "grad_norm": 8.03069175828441, "learning_rate": 9.961154480586379e-06, "loss": 0.3758, "step": 1002 }, { "epoch": 0.06860934400437786, "grad_norm": 2.024991727299923, "learning_rate": 9.961016543049267e-06, "loss": 0.3516, "step": 1003 }, { "epoch": 0.06867774813598741, "grad_norm": 2.98784312109352, "learning_rate": 9.960878362001505e-06, "loss": 0.5376, "step": 1004 }, { "epoch": 0.06874615226759696, "grad_norm": 3.263873803439799, "learning_rate": 9.960739937449877e-06, "loss": 0.8229, "step": 1005 }, { "epoch": 0.06881455639920651, "grad_norm": 4.904585319006385, "learning_rate": 9.96060126940118e-06, "loss": 0.3604, "step": 1006 }, { "epoch": 0.06888296053081606, "grad_norm": 2.316986493776772, "learning_rate": 9.960462357862216e-06, "loss": 0.2947, "step": 1007 }, { "epoch": 0.06895136466242562, "grad_norm": 1.917835611667519, "learning_rate": 9.960323202839806e-06, "loss": 0.2573, "step": 1008 }, { "epoch": 0.06901976879403517, "grad_norm": 2.505825773922189, "learning_rate": 9.960183804340781e-06, "loss": 0.3942, "step": 1009 }, { "epoch": 0.0690881729256447, "grad_norm": 2.1819883187315248, "learning_rate": 9.960044162371982e-06, "loss": 0.3733, "step": 1010 }, { "epoch": 0.06915657705725425, "grad_norm": 2.826285327672023, "learning_rate": 9.959904276940265e-06, "loss": 0.3789, "step": 1011 }, { "epoch": 0.0692249811888638, "grad_norm": 2.1301830260321, "learning_rate": 9.959764148052493e-06, "loss": 0.2425, "step": 1012 }, { "epoch": 0.06929338532047336, "grad_norm": 2.5464661509924853, "learning_rate": 9.959623775715546e-06, "loss": 0.2416, "step": 1013 }, { "epoch": 0.0693617894520829, "grad_norm": 2.439948637140326, "learning_rate": 9.959483159936316e-06, "loss": 0.5277, "step": 1014 }, { "epoch": 0.06943019358369246, "grad_norm": 2.602979669257779, "learning_rate": 9.959342300721702e-06, "loss": 0.6769, "step": 1015 }, { "epoch": 0.06949859771530201, "grad_norm": 2.1303887341381227, "learning_rate": 9.959201198078621e-06, "loss": 0.4614, "step": 1016 }, { "epoch": 0.06956700184691156, "grad_norm": 2.0838843717597166, "learning_rate": 9.959059852013994e-06, "loss": 0.4435, "step": 1017 }, { "epoch": 0.06963540597852111, "grad_norm": 2.736639730721296, "learning_rate": 9.958918262534767e-06, "loss": 0.6352, "step": 1018 }, { "epoch": 0.06970381011013065, "grad_norm": 1.6325679805864135, "learning_rate": 9.958776429647882e-06, "loss": 0.3318, "step": 1019 }, { "epoch": 0.0697722142417402, "grad_norm": 2.938476075679172, "learning_rate": 9.958634353360305e-06, "loss": 0.7283, "step": 1020 }, { "epoch": 0.06984061837334975, "grad_norm": 2.714180482486242, "learning_rate": 9.95849203367901e-06, "loss": 0.6226, "step": 1021 }, { "epoch": 0.0699090225049593, "grad_norm": 2.7502208913606885, "learning_rate": 9.95834947061098e-06, "loss": 0.6839, "step": 1022 }, { "epoch": 0.06997742663656885, "grad_norm": 2.1840569920920023, "learning_rate": 9.958206664163215e-06, "loss": 0.4763, "step": 1023 }, { "epoch": 0.0700458307681784, "grad_norm": 2.9430433380825884, "learning_rate": 9.958063614342726e-06, "loss": 0.7998, "step": 1024 }, { "epoch": 0.07011423489978795, "grad_norm": 2.7031278940968098, "learning_rate": 9.95792032115653e-06, "loss": 0.6107, "step": 1025 }, { "epoch": 0.0701826390313975, "grad_norm": 2.245311209320568, "learning_rate": 9.957776784611663e-06, "loss": 0.6327, "step": 1026 }, { "epoch": 0.07025104316300705, "grad_norm": 1.813649824058935, "learning_rate": 9.957633004715173e-06, "loss": 0.3409, "step": 1027 }, { "epoch": 0.07031944729461659, "grad_norm": 6.212961234273394, "learning_rate": 9.957488981474113e-06, "loss": 0.2331, "step": 1028 }, { "epoch": 0.07038785142622614, "grad_norm": 2.282731692002078, "learning_rate": 9.957344714895554e-06, "loss": 0.3352, "step": 1029 }, { "epoch": 0.07045625555783569, "grad_norm": 2.3870921927576387, "learning_rate": 9.95720020498658e-06, "loss": 0.5065, "step": 1030 }, { "epoch": 0.07052465968944524, "grad_norm": 2.07707728598992, "learning_rate": 9.957055451754279e-06, "loss": 0.2718, "step": 1031 }, { "epoch": 0.07059306382105479, "grad_norm": 2.174911845821369, "learning_rate": 9.95691045520576e-06, "loss": 0.4036, "step": 1032 }, { "epoch": 0.07066146795266434, "grad_norm": 2.7400428014391616, "learning_rate": 9.95676521534814e-06, "loss": 0.577, "step": 1033 }, { "epoch": 0.07072987208427389, "grad_norm": 1.9644052383979276, "learning_rate": 9.956619732188547e-06, "loss": 0.4244, "step": 1034 }, { "epoch": 0.07079827621588344, "grad_norm": 2.243348787878243, "learning_rate": 9.95647400573412e-06, "loss": 0.1936, "step": 1035 }, { "epoch": 0.070866680347493, "grad_norm": 2.820697314355854, "learning_rate": 9.956328035992017e-06, "loss": 0.5918, "step": 1036 }, { "epoch": 0.07093508447910253, "grad_norm": 2.5232409232371387, "learning_rate": 9.9561818229694e-06, "loss": 0.3069, "step": 1037 }, { "epoch": 0.07100348861071208, "grad_norm": 2.5483300902689257, "learning_rate": 9.956035366673444e-06, "loss": 0.5191, "step": 1038 }, { "epoch": 0.07107189274232163, "grad_norm": 1.9759432459646187, "learning_rate": 9.955888667111341e-06, "loss": 0.3643, "step": 1039 }, { "epoch": 0.07114029687393118, "grad_norm": 2.734980826702942, "learning_rate": 9.955741724290291e-06, "loss": 0.6579, "step": 1040 }, { "epoch": 0.07120870100554073, "grad_norm": 2.2807755107458547, "learning_rate": 9.955594538217505e-06, "loss": 0.3344, "step": 1041 }, { "epoch": 0.07127710513715028, "grad_norm": 2.2859704058945534, "learning_rate": 9.955447108900212e-06, "loss": 0.3504, "step": 1042 }, { "epoch": 0.07134550926875984, "grad_norm": 2.592662382172054, "learning_rate": 9.955299436345641e-06, "loss": 0.5526, "step": 1043 }, { "epoch": 0.07141391340036939, "grad_norm": 2.1806541336728698, "learning_rate": 9.955151520561045e-06, "loss": 0.415, "step": 1044 }, { "epoch": 0.07148231753197894, "grad_norm": 2.0974863168232605, "learning_rate": 9.955003361553688e-06, "loss": 0.2659, "step": 1045 }, { "epoch": 0.07155072166358849, "grad_norm": 2.4965238961684943, "learning_rate": 9.954854959330835e-06, "loss": 0.5773, "step": 1046 }, { "epoch": 0.07161912579519802, "grad_norm": 2.4924343279879095, "learning_rate": 9.954706313899776e-06, "loss": 0.2072, "step": 1047 }, { "epoch": 0.07168752992680757, "grad_norm": 1.7107451866446768, "learning_rate": 9.954557425267804e-06, "loss": 0.2268, "step": 1048 }, { "epoch": 0.07175593405841713, "grad_norm": 1.8427143825333239, "learning_rate": 9.954408293442228e-06, "loss": 0.311, "step": 1049 }, { "epoch": 0.07182433819002668, "grad_norm": 3.0714400067861094, "learning_rate": 9.954258918430368e-06, "loss": 0.5399, "step": 1050 }, { "epoch": 0.07189274232163623, "grad_norm": 2.4892377706673443, "learning_rate": 9.954109300239559e-06, "loss": 0.5529, "step": 1051 }, { "epoch": 0.07196114645324578, "grad_norm": 2.458524895871505, "learning_rate": 9.95395943887714e-06, "loss": 0.6119, "step": 1052 }, { "epoch": 0.07202955058485533, "grad_norm": 3.601897482695106, "learning_rate": 9.95380933435047e-06, "loss": 0.7938, "step": 1053 }, { "epoch": 0.07209795471646488, "grad_norm": 2.863765436219965, "learning_rate": 9.953658986666916e-06, "loss": 0.4471, "step": 1054 }, { "epoch": 0.07216635884807443, "grad_norm": 2.1140092452531896, "learning_rate": 9.953508395833857e-06, "loss": 0.3621, "step": 1055 }, { "epoch": 0.07223476297968397, "grad_norm": 3.507976366731207, "learning_rate": 9.953357561858688e-06, "loss": 0.3872, "step": 1056 }, { "epoch": 0.07230316711129352, "grad_norm": 12.269420255288304, "learning_rate": 9.953206484748807e-06, "loss": 0.5578, "step": 1057 }, { "epoch": 0.07237157124290307, "grad_norm": 1.955268900096957, "learning_rate": 9.953055164511635e-06, "loss": 0.2682, "step": 1058 }, { "epoch": 0.07243997537451262, "grad_norm": 2.6371850343184837, "learning_rate": 9.952903601154598e-06, "loss": 0.4443, "step": 1059 }, { "epoch": 0.07250837950612217, "grad_norm": 2.674310951303961, "learning_rate": 9.952751794685135e-06, "loss": 0.5505, "step": 1060 }, { "epoch": 0.07257678363773172, "grad_norm": 2.184814350820009, "learning_rate": 9.952599745110697e-06, "loss": 0.321, "step": 1061 }, { "epoch": 0.07264518776934127, "grad_norm": 2.4112677955465154, "learning_rate": 9.952447452438747e-06, "loss": 0.6462, "step": 1062 }, { "epoch": 0.07271359190095082, "grad_norm": 2.7291597658657354, "learning_rate": 9.952294916676762e-06, "loss": 0.3574, "step": 1063 }, { "epoch": 0.07278199603256037, "grad_norm": 2.7133733431155154, "learning_rate": 9.952142137832226e-06, "loss": 0.511, "step": 1064 }, { "epoch": 0.07285040016416991, "grad_norm": 2.2399771358928167, "learning_rate": 9.951989115912641e-06, "loss": 0.5471, "step": 1065 }, { "epoch": 0.07291880429577946, "grad_norm": 2.3908808141232716, "learning_rate": 9.951835850925519e-06, "loss": 0.3687, "step": 1066 }, { "epoch": 0.07298720842738901, "grad_norm": 2.4220736637466045, "learning_rate": 9.951682342878379e-06, "loss": 0.4914, "step": 1067 }, { "epoch": 0.07305561255899856, "grad_norm": 2.501763794500433, "learning_rate": 9.95152859177876e-06, "loss": 0.3954, "step": 1068 }, { "epoch": 0.07312401669060811, "grad_norm": 2.130433030313395, "learning_rate": 9.951374597634208e-06, "loss": 0.3349, "step": 1069 }, { "epoch": 0.07319242082221766, "grad_norm": 27.39849931825789, "learning_rate": 9.951220360452277e-06, "loss": 0.2009, "step": 1070 }, { "epoch": 0.07326082495382721, "grad_norm": 2.3277777278401177, "learning_rate": 9.951065880240545e-06, "loss": 0.2969, "step": 1071 }, { "epoch": 0.07332922908543676, "grad_norm": 4.239793421564506, "learning_rate": 9.950911157006588e-06, "loss": 0.3955, "step": 1072 }, { "epoch": 0.07339763321704632, "grad_norm": 2.2500974845960666, "learning_rate": 9.950756190758006e-06, "loss": 0.4414, "step": 1073 }, { "epoch": 0.07346603734865585, "grad_norm": 2.3382791500674784, "learning_rate": 9.950600981502403e-06, "loss": 0.3794, "step": 1074 }, { "epoch": 0.0735344414802654, "grad_norm": 2.149605149367718, "learning_rate": 9.950445529247397e-06, "loss": 0.2963, "step": 1075 }, { "epoch": 0.07360284561187495, "grad_norm": 2.5783069488592876, "learning_rate": 9.950289834000619e-06, "loss": 0.4034, "step": 1076 }, { "epoch": 0.0736712497434845, "grad_norm": 3.9158428353790145, "learning_rate": 9.950133895769712e-06, "loss": 0.3434, "step": 1077 }, { "epoch": 0.07373965387509406, "grad_norm": 3.2613222532211483, "learning_rate": 9.949977714562329e-06, "loss": 0.8255, "step": 1078 }, { "epoch": 0.0738080580067036, "grad_norm": 2.6011985031577507, "learning_rate": 9.949821290386137e-06, "loss": 0.3436, "step": 1079 }, { "epoch": 0.07387646213831316, "grad_norm": 2.568121112171609, "learning_rate": 9.949664623248813e-06, "loss": 0.5479, "step": 1080 }, { "epoch": 0.07394486626992271, "grad_norm": 2.083469384439942, "learning_rate": 9.949507713158048e-06, "loss": 0.362, "step": 1081 }, { "epoch": 0.07401327040153226, "grad_norm": 3.0667755730656485, "learning_rate": 9.949350560121543e-06, "loss": 0.531, "step": 1082 }, { "epoch": 0.07408167453314181, "grad_norm": 2.2359452898594907, "learning_rate": 9.949193164147012e-06, "loss": 0.4589, "step": 1083 }, { "epoch": 0.07415007866475135, "grad_norm": 2.754907666742986, "learning_rate": 9.949035525242181e-06, "loss": 0.5378, "step": 1084 }, { "epoch": 0.0742184827963609, "grad_norm": 2.7109988092970756, "learning_rate": 9.94887764341479e-06, "loss": 0.5811, "step": 1085 }, { "epoch": 0.07428688692797045, "grad_norm": 1.8869144188174087, "learning_rate": 9.948719518672584e-06, "loss": 0.2918, "step": 1086 }, { "epoch": 0.07435529105958, "grad_norm": 2.7516804892410183, "learning_rate": 9.948561151023328e-06, "loss": 0.6971, "step": 1087 }, { "epoch": 0.07442369519118955, "grad_norm": 2.689578749438299, "learning_rate": 9.948402540474793e-06, "loss": 0.6808, "step": 1088 }, { "epoch": 0.0744920993227991, "grad_norm": 2.0837339306173415, "learning_rate": 9.948243687034768e-06, "loss": 0.4334, "step": 1089 }, { "epoch": 0.07456050345440865, "grad_norm": 2.958319359509056, "learning_rate": 9.948084590711047e-06, "loss": 0.7501, "step": 1090 }, { "epoch": 0.0746289075860182, "grad_norm": 3.26161155296106, "learning_rate": 9.94792525151144e-06, "loss": 0.5213, "step": 1091 }, { "epoch": 0.07469731171762775, "grad_norm": 2.811260692089301, "learning_rate": 9.947765669443768e-06, "loss": 0.3806, "step": 1092 }, { "epoch": 0.07476571584923729, "grad_norm": 2.0494170801548948, "learning_rate": 9.947605844515863e-06, "loss": 0.3226, "step": 1093 }, { "epoch": 0.07483411998084684, "grad_norm": 2.31212117984347, "learning_rate": 9.947445776735574e-06, "loss": 0.349, "step": 1094 }, { "epoch": 0.07490252411245639, "grad_norm": 8.48779733667929, "learning_rate": 9.947285466110755e-06, "loss": 0.42, "step": 1095 }, { "epoch": 0.07497092824406594, "grad_norm": 2.286329845099989, "learning_rate": 9.947124912649275e-06, "loss": 0.3795, "step": 1096 }, { "epoch": 0.07503933237567549, "grad_norm": 2.706773170400206, "learning_rate": 9.946964116359015e-06, "loss": 0.6394, "step": 1097 }, { "epoch": 0.07510773650728504, "grad_norm": 2.5888915346800183, "learning_rate": 9.946803077247866e-06, "loss": 0.4358, "step": 1098 }, { "epoch": 0.07517614063889459, "grad_norm": 3.4481266837490185, "learning_rate": 9.946641795323737e-06, "loss": 0.5366, "step": 1099 }, { "epoch": 0.07524454477050414, "grad_norm": 1.3880381190945505, "learning_rate": 9.946480270594539e-06, "loss": 0.1792, "step": 1100 }, { "epoch": 0.0753129489021137, "grad_norm": 1.388793035981185, "learning_rate": 9.946318503068205e-06, "loss": 0.195, "step": 1101 }, { "epoch": 0.07538135303372323, "grad_norm": 2.5190241695204647, "learning_rate": 9.94615649275267e-06, "loss": 0.5724, "step": 1102 }, { "epoch": 0.07544975716533278, "grad_norm": 2.635088878848173, "learning_rate": 9.945994239655892e-06, "loss": 0.649, "step": 1103 }, { "epoch": 0.07551816129694233, "grad_norm": 2.514263329506905, "learning_rate": 9.945831743785832e-06, "loss": 0.6236, "step": 1104 }, { "epoch": 0.07558656542855188, "grad_norm": 2.2233146000858826, "learning_rate": 9.945669005150467e-06, "loss": 0.45, "step": 1105 }, { "epoch": 0.07565496956016143, "grad_norm": 2.9574775510179423, "learning_rate": 9.945506023757786e-06, "loss": 0.1938, "step": 1106 }, { "epoch": 0.07572337369177098, "grad_norm": 2.773254549681466, "learning_rate": 9.945342799615786e-06, "loss": 0.5639, "step": 1107 }, { "epoch": 0.07579177782338054, "grad_norm": 2.6933960887904616, "learning_rate": 9.945179332732482e-06, "loss": 0.7536, "step": 1108 }, { "epoch": 0.07586018195499009, "grad_norm": 1.8153213214183834, "learning_rate": 9.945015623115896e-06, "loss": 0.1637, "step": 1109 }, { "epoch": 0.07592858608659964, "grad_norm": 2.596127741565586, "learning_rate": 9.944851670774064e-06, "loss": 0.5732, "step": 1110 }, { "epoch": 0.07599699021820917, "grad_norm": 2.298214589894453, "learning_rate": 9.944687475715034e-06, "loss": 0.4043, "step": 1111 }, { "epoch": 0.07606539434981872, "grad_norm": 2.5908068999149263, "learning_rate": 9.944523037946863e-06, "loss": 0.6742, "step": 1112 }, { "epoch": 0.07613379848142827, "grad_norm": 3.6503811219215687, "learning_rate": 9.944358357477625e-06, "loss": 0.437, "step": 1113 }, { "epoch": 0.07620220261303783, "grad_norm": 1.7422212588884096, "learning_rate": 9.944193434315406e-06, "loss": 0.2249, "step": 1114 }, { "epoch": 0.07627060674464738, "grad_norm": 2.121067742456988, "learning_rate": 9.944028268468294e-06, "loss": 0.3245, "step": 1115 }, { "epoch": 0.07633901087625693, "grad_norm": 1.8364369950665915, "learning_rate": 9.943862859944402e-06, "loss": 0.3279, "step": 1116 }, { "epoch": 0.07640741500786648, "grad_norm": 2.50753928862092, "learning_rate": 9.943697208751847e-06, "loss": 0.6031, "step": 1117 }, { "epoch": 0.07647581913947603, "grad_norm": 2.5395109514201977, "learning_rate": 9.94353131489876e-06, "loss": 0.6491, "step": 1118 }, { "epoch": 0.07654422327108558, "grad_norm": 1.6592630204889256, "learning_rate": 9.943365178393283e-06, "loss": 0.1443, "step": 1119 }, { "epoch": 0.07661262740269512, "grad_norm": 2.974407908078639, "learning_rate": 9.943198799243575e-06, "loss": 0.2536, "step": 1120 }, { "epoch": 0.07668103153430467, "grad_norm": 2.2385296176167797, "learning_rate": 9.943032177457796e-06, "loss": 0.5557, "step": 1121 }, { "epoch": 0.07674943566591422, "grad_norm": 2.5435955136600756, "learning_rate": 9.94286531304413e-06, "loss": 0.5373, "step": 1122 }, { "epoch": 0.07681783979752377, "grad_norm": 2.4961924466244785, "learning_rate": 9.942698206010764e-06, "loss": 0.3286, "step": 1123 }, { "epoch": 0.07688624392913332, "grad_norm": 2.0812113151162603, "learning_rate": 9.942530856365902e-06, "loss": 0.478, "step": 1124 }, { "epoch": 0.07695464806074287, "grad_norm": 2.114569590356325, "learning_rate": 9.94236326411776e-06, "loss": 0.3924, "step": 1125 }, { "epoch": 0.07702305219235242, "grad_norm": 2.628700554383146, "learning_rate": 9.942195429274562e-06, "loss": 0.4322, "step": 1126 }, { "epoch": 0.07709145632396197, "grad_norm": 2.180790386307906, "learning_rate": 9.942027351844545e-06, "loss": 0.5081, "step": 1127 }, { "epoch": 0.07715986045557152, "grad_norm": 3.0540891485711485, "learning_rate": 9.941859031835964e-06, "loss": 0.4666, "step": 1128 }, { "epoch": 0.07722826458718107, "grad_norm": 2.4452764733923837, "learning_rate": 9.941690469257075e-06, "loss": 0.5147, "step": 1129 }, { "epoch": 0.07729666871879061, "grad_norm": 3.618004197927457, "learning_rate": 9.941521664116156e-06, "loss": 0.3539, "step": 1130 }, { "epoch": 0.07736507285040016, "grad_norm": 2.2260470580067113, "learning_rate": 9.94135261642149e-06, "loss": 0.4078, "step": 1131 }, { "epoch": 0.07743347698200971, "grad_norm": 2.5456373682993036, "learning_rate": 9.941183326181377e-06, "loss": 0.5591, "step": 1132 }, { "epoch": 0.07750188111361926, "grad_norm": 1.7454834069744825, "learning_rate": 9.941013793404126e-06, "loss": 0.2913, "step": 1133 }, { "epoch": 0.07757028524522881, "grad_norm": 2.465971662956434, "learning_rate": 9.940844018098057e-06, "loss": 0.5041, "step": 1134 }, { "epoch": 0.07763868937683836, "grad_norm": 2.2909086743925693, "learning_rate": 9.940674000271505e-06, "loss": 0.5478, "step": 1135 }, { "epoch": 0.07770709350844791, "grad_norm": 1.8096186450776313, "learning_rate": 9.940503739932815e-06, "loss": 0.2513, "step": 1136 }, { "epoch": 0.07777549764005746, "grad_norm": 2.4581696703683384, "learning_rate": 9.940333237090345e-06, "loss": 0.3763, "step": 1137 }, { "epoch": 0.07784390177166702, "grad_norm": 2.9147272860188993, "learning_rate": 9.940162491752461e-06, "loss": 0.5572, "step": 1138 }, { "epoch": 0.07791230590327655, "grad_norm": 1.891122843085123, "learning_rate": 9.939991503927548e-06, "loss": 0.287, "step": 1139 }, { "epoch": 0.0779807100348861, "grad_norm": 2.484433419362462, "learning_rate": 9.939820273623996e-06, "loss": 0.5293, "step": 1140 }, { "epoch": 0.07804911416649565, "grad_norm": 2.270294730739799, "learning_rate": 9.93964880085021e-06, "loss": 0.4939, "step": 1141 }, { "epoch": 0.0781175182981052, "grad_norm": 2.8729565033745255, "learning_rate": 9.939477085614608e-06, "loss": 0.4051, "step": 1142 }, { "epoch": 0.07818592242971475, "grad_norm": 2.3275850681202273, "learning_rate": 9.939305127925621e-06, "loss": 0.5406, "step": 1143 }, { "epoch": 0.0782543265613243, "grad_norm": 2.0758393237540407, "learning_rate": 9.939132927791684e-06, "loss": 0.4152, "step": 1144 }, { "epoch": 0.07832273069293386, "grad_norm": 2.177385256029136, "learning_rate": 9.938960485221252e-06, "loss": 0.4319, "step": 1145 }, { "epoch": 0.07839113482454341, "grad_norm": 2.684058825132479, "learning_rate": 9.93878780022279e-06, "loss": 0.4993, "step": 1146 }, { "epoch": 0.07845953895615296, "grad_norm": 3.8437912481312253, "learning_rate": 9.938614872804773e-06, "loss": 0.5068, "step": 1147 }, { "epoch": 0.0785279430877625, "grad_norm": 2.342023951423456, "learning_rate": 9.938441702975689e-06, "loss": 0.5398, "step": 1148 }, { "epoch": 0.07859634721937205, "grad_norm": 2.4354689462487533, "learning_rate": 9.938268290744039e-06, "loss": 0.5742, "step": 1149 }, { "epoch": 0.0786647513509816, "grad_norm": 2.3135216621199324, "learning_rate": 9.938094636118336e-06, "loss": 0.492, "step": 1150 }, { "epoch": 0.07873315548259115, "grad_norm": 2.743440434191089, "learning_rate": 9.937920739107102e-06, "loss": 0.7444, "step": 1151 }, { "epoch": 0.0788015596142007, "grad_norm": 2.7621622228604146, "learning_rate": 9.937746599718873e-06, "loss": 0.6435, "step": 1152 }, { "epoch": 0.07886996374581025, "grad_norm": 2.5624796354833728, "learning_rate": 9.937572217962197e-06, "loss": 0.2986, "step": 1153 }, { "epoch": 0.0789383678774198, "grad_norm": 2.328004437083509, "learning_rate": 9.937397593845634e-06, "loss": 0.4305, "step": 1154 }, { "epoch": 0.07900677200902935, "grad_norm": 1.7041421355306077, "learning_rate": 9.937222727377753e-06, "loss": 0.2749, "step": 1155 }, { "epoch": 0.0790751761406389, "grad_norm": 1.8449907169263573, "learning_rate": 9.93704761856714e-06, "loss": 0.3106, "step": 1156 }, { "epoch": 0.07914358027224844, "grad_norm": 2.5131739647152576, "learning_rate": 9.936872267422389e-06, "loss": 0.4545, "step": 1157 }, { "epoch": 0.07921198440385799, "grad_norm": 2.3696901669965786, "learning_rate": 9.936696673952107e-06, "loss": 0.471, "step": 1158 }, { "epoch": 0.07928038853546754, "grad_norm": 2.4846723186675392, "learning_rate": 9.936520838164912e-06, "loss": 0.5902, "step": 1159 }, { "epoch": 0.07934879266707709, "grad_norm": 1.9369280263111415, "learning_rate": 9.936344760069437e-06, "loss": 0.3738, "step": 1160 }, { "epoch": 0.07941719679868664, "grad_norm": 1.9991798954624456, "learning_rate": 9.936168439674326e-06, "loss": 0.2936, "step": 1161 }, { "epoch": 0.07948560093029619, "grad_norm": 1.891088030996455, "learning_rate": 9.935991876988228e-06, "loss": 0.3636, "step": 1162 }, { "epoch": 0.07955400506190574, "grad_norm": 2.5316459852296376, "learning_rate": 9.935815072019815e-06, "loss": 0.5082, "step": 1163 }, { "epoch": 0.07962240919351529, "grad_norm": 1.969236941896376, "learning_rate": 9.935638024777763e-06, "loss": 0.3074, "step": 1164 }, { "epoch": 0.07969081332512484, "grad_norm": 2.20447743616131, "learning_rate": 9.935460735270762e-06, "loss": 0.2146, "step": 1165 }, { "epoch": 0.0797592174567344, "grad_norm": 2.0573897052695713, "learning_rate": 9.935283203507516e-06, "loss": 0.3431, "step": 1166 }, { "epoch": 0.07982762158834393, "grad_norm": 2.3085920136988998, "learning_rate": 9.935105429496737e-06, "loss": 0.2898, "step": 1167 }, { "epoch": 0.07989602571995348, "grad_norm": 2.049938028550261, "learning_rate": 9.934927413247152e-06, "loss": 0.433, "step": 1168 }, { "epoch": 0.07996442985156303, "grad_norm": 2.3120828909629205, "learning_rate": 9.934749154767503e-06, "loss": 0.6091, "step": 1169 }, { "epoch": 0.08003283398317258, "grad_norm": 3.1100037288950064, "learning_rate": 9.93457065406653e-06, "loss": 0.7005, "step": 1170 }, { "epoch": 0.08010123811478213, "grad_norm": 2.256315931548306, "learning_rate": 9.934391911153006e-06, "loss": 0.3124, "step": 1171 }, { "epoch": 0.08016964224639168, "grad_norm": 1.4971727509111505, "learning_rate": 9.934212926035695e-06, "loss": 0.2245, "step": 1172 }, { "epoch": 0.08023804637800123, "grad_norm": 2.499054926815104, "learning_rate": 9.93403369872339e-06, "loss": 0.6353, "step": 1173 }, { "epoch": 0.08030645050961079, "grad_norm": 2.2198163277449408, "learning_rate": 9.93385422922488e-06, "loss": 0.4164, "step": 1174 }, { "epoch": 0.08037485464122034, "grad_norm": 1.6845318599126964, "learning_rate": 9.933674517548984e-06, "loss": 0.3459, "step": 1175 }, { "epoch": 0.08044325877282987, "grad_norm": 1.2994357645231358, "learning_rate": 9.933494563704515e-06, "loss": 0.212, "step": 1176 }, { "epoch": 0.08051166290443942, "grad_norm": 2.511728627851412, "learning_rate": 9.933314367700311e-06, "loss": 0.4755, "step": 1177 }, { "epoch": 0.08058006703604897, "grad_norm": 2.682785900963887, "learning_rate": 9.933133929545213e-06, "loss": 0.645, "step": 1178 }, { "epoch": 0.08064847116765853, "grad_norm": 1.9793492751622508, "learning_rate": 9.932953249248082e-06, "loss": 0.2986, "step": 1179 }, { "epoch": 0.08071687529926808, "grad_norm": 2.425069375864637, "learning_rate": 9.932772326817784e-06, "loss": 0.5512, "step": 1180 }, { "epoch": 0.08078527943087763, "grad_norm": 2.037467754948447, "learning_rate": 9.932591162263199e-06, "loss": 0.3237, "step": 1181 }, { "epoch": 0.08085368356248718, "grad_norm": 3.5078248361485302, "learning_rate": 9.932409755593223e-06, "loss": 0.8034, "step": 1182 }, { "epoch": 0.08092208769409673, "grad_norm": 2.3764149598089888, "learning_rate": 9.932228106816755e-06, "loss": 0.4298, "step": 1183 }, { "epoch": 0.08099049182570628, "grad_norm": 2.9670193094692983, "learning_rate": 9.932046215942715e-06, "loss": 0.5693, "step": 1184 }, { "epoch": 0.08105889595731582, "grad_norm": 2.245805871178687, "learning_rate": 9.93186408298003e-06, "loss": 0.4186, "step": 1185 }, { "epoch": 0.08112730008892537, "grad_norm": 2.3706973277090397, "learning_rate": 9.931681707937642e-06, "loss": 0.3128, "step": 1186 }, { "epoch": 0.08119570422053492, "grad_norm": 2.0274892765801105, "learning_rate": 9.931499090824499e-06, "loss": 0.4016, "step": 1187 }, { "epoch": 0.08126410835214447, "grad_norm": 2.7615910545082403, "learning_rate": 9.931316231649565e-06, "loss": 0.7062, "step": 1188 }, { "epoch": 0.08133251248375402, "grad_norm": 2.2111081101333543, "learning_rate": 9.931133130421821e-06, "loss": 0.5111, "step": 1189 }, { "epoch": 0.08140091661536357, "grad_norm": 2.912089295960864, "learning_rate": 9.93094978715025e-06, "loss": 0.4021, "step": 1190 }, { "epoch": 0.08146932074697312, "grad_norm": 2.0671120071178115, "learning_rate": 9.930766201843852e-06, "loss": 0.4596, "step": 1191 }, { "epoch": 0.08153772487858267, "grad_norm": 2.896782117010925, "learning_rate": 9.930582374511635e-06, "loss": 0.5599, "step": 1192 }, { "epoch": 0.08160612901019222, "grad_norm": 2.6253631147644056, "learning_rate": 9.93039830516263e-06, "loss": 0.6979, "step": 1193 }, { "epoch": 0.08167453314180176, "grad_norm": 2.7746273228697085, "learning_rate": 9.930213993805863e-06, "loss": 0.4984, "step": 1194 }, { "epoch": 0.08174293727341131, "grad_norm": 2.3765101588176747, "learning_rate": 9.930029440450391e-06, "loss": 0.4718, "step": 1195 }, { "epoch": 0.08181134140502086, "grad_norm": 2.9412957888831444, "learning_rate": 9.929844645105263e-06, "loss": 0.7151, "step": 1196 }, { "epoch": 0.08187974553663041, "grad_norm": 3.0822157520529805, "learning_rate": 9.929659607779555e-06, "loss": 0.7752, "step": 1197 }, { "epoch": 0.08194814966823996, "grad_norm": 2.3074188502343382, "learning_rate": 9.92947432848235e-06, "loss": 0.3132, "step": 1198 }, { "epoch": 0.08201655379984951, "grad_norm": 2.032577150782066, "learning_rate": 9.929288807222738e-06, "loss": 0.4465, "step": 1199 }, { "epoch": 0.08208495793145906, "grad_norm": 2.847540425272367, "learning_rate": 9.929103044009829e-06, "loss": 0.7622, "step": 1200 }, { "epoch": 0.08215336206306861, "grad_norm": 2.785384865643226, "learning_rate": 9.928917038852741e-06, "loss": 0.4816, "step": 1201 }, { "epoch": 0.08222176619467816, "grad_norm": 2.356525133648848, "learning_rate": 9.928730791760602e-06, "loss": 0.23, "step": 1202 }, { "epoch": 0.0822901703262877, "grad_norm": 2.885556447720842, "learning_rate": 9.928544302742556e-06, "loss": 0.4198, "step": 1203 }, { "epoch": 0.08235857445789725, "grad_norm": 2.231306376399473, "learning_rate": 9.928357571807757e-06, "loss": 0.3524, "step": 1204 }, { "epoch": 0.0824269785895068, "grad_norm": 1.9000675989347786, "learning_rate": 9.928170598965368e-06, "loss": 0.3559, "step": 1205 }, { "epoch": 0.08249538272111635, "grad_norm": 2.7173924619423433, "learning_rate": 9.927983384224567e-06, "loss": 0.4564, "step": 1206 }, { "epoch": 0.0825637868527259, "grad_norm": 2.3977217926632926, "learning_rate": 9.927795927594547e-06, "loss": 0.5139, "step": 1207 }, { "epoch": 0.08263219098433545, "grad_norm": 2.479264473121129, "learning_rate": 9.927608229084506e-06, "loss": 0.5508, "step": 1208 }, { "epoch": 0.082700595115945, "grad_norm": 2.3820052483610534, "learning_rate": 9.927420288703657e-06, "loss": 0.3908, "step": 1209 }, { "epoch": 0.08276899924755456, "grad_norm": 2.1720227527445335, "learning_rate": 9.927232106461226e-06, "loss": 0.516, "step": 1210 }, { "epoch": 0.0828374033791641, "grad_norm": 2.182789649379566, "learning_rate": 9.927043682366452e-06, "loss": 0.5835, "step": 1211 }, { "epoch": 0.08290580751077366, "grad_norm": 2.1291937537688073, "learning_rate": 9.92685501642858e-06, "loss": 0.3252, "step": 1212 }, { "epoch": 0.0829742116423832, "grad_norm": 2.1271078720757175, "learning_rate": 9.926666108656873e-06, "loss": 0.3884, "step": 1213 }, { "epoch": 0.08304261577399275, "grad_norm": 2.382339042778516, "learning_rate": 9.926476959060602e-06, "loss": 0.601, "step": 1214 }, { "epoch": 0.0831110199056023, "grad_norm": 3.357852537574129, "learning_rate": 9.926287567649051e-06, "loss": 0.4132, "step": 1215 }, { "epoch": 0.08317942403721185, "grad_norm": 2.2704269952644704, "learning_rate": 9.92609793443152e-06, "loss": 0.5033, "step": 1216 }, { "epoch": 0.0832478281688214, "grad_norm": 1.6078726297558432, "learning_rate": 9.925908059417313e-06, "loss": 0.2466, "step": 1217 }, { "epoch": 0.08331623230043095, "grad_norm": 1.73668458881835, "learning_rate": 9.925717942615752e-06, "loss": 0.3716, "step": 1218 }, { "epoch": 0.0833846364320405, "grad_norm": 2.1369033852259816, "learning_rate": 9.925527584036167e-06, "loss": 0.5188, "step": 1219 }, { "epoch": 0.08345304056365005, "grad_norm": 2.1321121015975417, "learning_rate": 9.925336983687905e-06, "loss": 0.4737, "step": 1220 }, { "epoch": 0.0835214446952596, "grad_norm": 2.5964791606760613, "learning_rate": 9.925146141580317e-06, "loss": 0.5737, "step": 1221 }, { "epoch": 0.08358984882686914, "grad_norm": 2.0857230872779895, "learning_rate": 9.924955057722776e-06, "loss": 0.1974, "step": 1222 }, { "epoch": 0.08365825295847869, "grad_norm": 3.4476777173516497, "learning_rate": 9.924763732124657e-06, "loss": 0.4962, "step": 1223 }, { "epoch": 0.08372665709008824, "grad_norm": 2.292440051290243, "learning_rate": 9.924572164795352e-06, "loss": 0.701, "step": 1224 }, { "epoch": 0.08379506122169779, "grad_norm": 2.463595876977694, "learning_rate": 9.924380355744266e-06, "loss": 0.6776, "step": 1225 }, { "epoch": 0.08386346535330734, "grad_norm": 1.5062981484560714, "learning_rate": 9.924188304980812e-06, "loss": 0.1355, "step": 1226 }, { "epoch": 0.08393186948491689, "grad_norm": 2.4242742406092046, "learning_rate": 9.923996012514416e-06, "loss": 0.4888, "step": 1227 }, { "epoch": 0.08400027361652644, "grad_norm": 2.2892694054270906, "learning_rate": 9.92380347835452e-06, "loss": 0.2165, "step": 1228 }, { "epoch": 0.08406867774813599, "grad_norm": 2.2873782676605896, "learning_rate": 9.923610702510571e-06, "loss": 0.4236, "step": 1229 }, { "epoch": 0.08413708187974554, "grad_norm": 2.8465178402781026, "learning_rate": 9.923417684992033e-06, "loss": 0.7389, "step": 1230 }, { "epoch": 0.08420548601135508, "grad_norm": 2.4612023831668264, "learning_rate": 9.923224425808381e-06, "loss": 0.5255, "step": 1231 }, { "epoch": 0.08427389014296463, "grad_norm": 2.4401566809530317, "learning_rate": 9.923030924969101e-06, "loss": 0.4386, "step": 1232 }, { "epoch": 0.08434229427457418, "grad_norm": 1.3130130588155557, "learning_rate": 9.92283718248369e-06, "loss": 0.1456, "step": 1233 }, { "epoch": 0.08441069840618373, "grad_norm": 2.451556913370705, "learning_rate": 9.922643198361655e-06, "loss": 0.5747, "step": 1234 }, { "epoch": 0.08447910253779328, "grad_norm": 2.509890779573401, "learning_rate": 9.922448972612524e-06, "loss": 0.3913, "step": 1235 }, { "epoch": 0.08454750666940283, "grad_norm": 1.7773275269826443, "learning_rate": 9.922254505245825e-06, "loss": 0.2326, "step": 1236 }, { "epoch": 0.08461591080101238, "grad_norm": 2.308197942841891, "learning_rate": 9.922059796271107e-06, "loss": 0.5138, "step": 1237 }, { "epoch": 0.08468431493262193, "grad_norm": 1.0403646684757177, "learning_rate": 9.921864845697924e-06, "loss": 0.2023, "step": 1238 }, { "epoch": 0.08475271906423149, "grad_norm": 2.8221619794880963, "learning_rate": 9.921669653535848e-06, "loss": 0.6408, "step": 1239 }, { "epoch": 0.08482112319584102, "grad_norm": 2.3098647012659033, "learning_rate": 9.921474219794459e-06, "loss": 0.4226, "step": 1240 }, { "epoch": 0.08488952732745057, "grad_norm": 2.3641779372829923, "learning_rate": 9.921278544483349e-06, "loss": 0.2653, "step": 1241 }, { "epoch": 0.08495793145906012, "grad_norm": 3.1148827907468353, "learning_rate": 9.921082627612125e-06, "loss": 0.9064, "step": 1242 }, { "epoch": 0.08502633559066967, "grad_norm": 2.410126636453032, "learning_rate": 9.920886469190401e-06, "loss": 0.2768, "step": 1243 }, { "epoch": 0.08509473972227923, "grad_norm": 2.1607114585381693, "learning_rate": 9.920690069227805e-06, "loss": 0.3975, "step": 1244 }, { "epoch": 0.08516314385388878, "grad_norm": 2.4252998132303825, "learning_rate": 9.920493427733981e-06, "loss": 0.4116, "step": 1245 }, { "epoch": 0.08523154798549833, "grad_norm": 1.9140296777519055, "learning_rate": 9.920296544718577e-06, "loss": 0.2258, "step": 1246 }, { "epoch": 0.08529995211710788, "grad_norm": 2.715995796621719, "learning_rate": 9.92009942019126e-06, "loss": 0.5232, "step": 1247 }, { "epoch": 0.08536835624871743, "grad_norm": 1.601810605905163, "learning_rate": 9.919902054161704e-06, "loss": 0.2684, "step": 1248 }, { "epoch": 0.08543676038032696, "grad_norm": 2.866745044132113, "learning_rate": 9.919704446639597e-06, "loss": 0.7081, "step": 1249 }, { "epoch": 0.08550516451193652, "grad_norm": 2.3306326258323105, "learning_rate": 9.919506597634638e-06, "loss": 0.3879, "step": 1250 }, { "epoch": 0.08557356864354607, "grad_norm": 2.4299796931474003, "learning_rate": 9.919308507156542e-06, "loss": 0.4378, "step": 1251 }, { "epoch": 0.08564197277515562, "grad_norm": 2.5959552903608745, "learning_rate": 9.919110175215027e-06, "loss": 0.6486, "step": 1252 }, { "epoch": 0.08571037690676517, "grad_norm": 1.7353128998871183, "learning_rate": 9.91891160181983e-06, "loss": 0.2408, "step": 1253 }, { "epoch": 0.08577878103837472, "grad_norm": 2.3309134611155202, "learning_rate": 9.9187127869807e-06, "loss": 0.3765, "step": 1254 }, { "epoch": 0.08584718516998427, "grad_norm": 2.0657124355122645, "learning_rate": 9.918513730707392e-06, "loss": 0.4397, "step": 1255 }, { "epoch": 0.08591558930159382, "grad_norm": 2.033041853284862, "learning_rate": 9.918314433009681e-06, "loss": 0.4264, "step": 1256 }, { "epoch": 0.08598399343320337, "grad_norm": 2.1899111674420806, "learning_rate": 9.918114893897348e-06, "loss": 0.4381, "step": 1257 }, { "epoch": 0.08605239756481292, "grad_norm": 2.890762449553603, "learning_rate": 9.917915113380186e-06, "loss": 0.6049, "step": 1258 }, { "epoch": 0.08612080169642246, "grad_norm": 2.4769211698912406, "learning_rate": 9.917715091467999e-06, "loss": 0.7902, "step": 1259 }, { "epoch": 0.08618920582803201, "grad_norm": 2.3827851841095664, "learning_rate": 9.91751482817061e-06, "loss": 0.4084, "step": 1260 }, { "epoch": 0.08625760995964156, "grad_norm": 3.000357412862898, "learning_rate": 9.917314323497848e-06, "loss": 0.4498, "step": 1261 }, { "epoch": 0.08632601409125111, "grad_norm": 2.0098191041939626, "learning_rate": 9.917113577459553e-06, "loss": 0.3717, "step": 1262 }, { "epoch": 0.08639441822286066, "grad_norm": 2.8234454160662725, "learning_rate": 9.916912590065578e-06, "loss": 0.616, "step": 1263 }, { "epoch": 0.08646282235447021, "grad_norm": 2.0882720705880247, "learning_rate": 9.916711361325791e-06, "loss": 0.2943, "step": 1264 }, { "epoch": 0.08653122648607976, "grad_norm": 2.2037292845521055, "learning_rate": 9.916509891250066e-06, "loss": 0.3675, "step": 1265 }, { "epoch": 0.08659963061768931, "grad_norm": 2.424494587937646, "learning_rate": 9.916308179848295e-06, "loss": 0.3401, "step": 1266 }, { "epoch": 0.08666803474929886, "grad_norm": 2.3309909602492755, "learning_rate": 9.916106227130378e-06, "loss": 0.6571, "step": 1267 }, { "epoch": 0.0867364388809084, "grad_norm": 1.9745226915540333, "learning_rate": 9.915904033106227e-06, "loss": 0.4967, "step": 1268 }, { "epoch": 0.08680484301251795, "grad_norm": 2.527029775605986, "learning_rate": 9.915701597785769e-06, "loss": 0.3122, "step": 1269 }, { "epoch": 0.0868732471441275, "grad_norm": 2.276098617623925, "learning_rate": 9.915498921178937e-06, "loss": 0.4438, "step": 1270 }, { "epoch": 0.08694165127573705, "grad_norm": 2.1380344015628725, "learning_rate": 9.915296003295682e-06, "loss": 0.2944, "step": 1271 }, { "epoch": 0.0870100554073466, "grad_norm": 2.044012266357404, "learning_rate": 9.915092844145963e-06, "loss": 0.2948, "step": 1272 }, { "epoch": 0.08707845953895615, "grad_norm": 2.2790167799056142, "learning_rate": 9.914889443739755e-06, "loss": 0.4051, "step": 1273 }, { "epoch": 0.0871468636705657, "grad_norm": 1.7923895039412203, "learning_rate": 9.914685802087037e-06, "loss": 0.3446, "step": 1274 }, { "epoch": 0.08721526780217526, "grad_norm": 1.6026432717128722, "learning_rate": 9.914481919197807e-06, "loss": 0.2855, "step": 1275 }, { "epoch": 0.0872836719337848, "grad_norm": 2.64503052814656, "learning_rate": 9.914277795082072e-06, "loss": 0.4811, "step": 1276 }, { "epoch": 0.08735207606539434, "grad_norm": 2.4401725154221507, "learning_rate": 9.914073429749853e-06, "loss": 0.5482, "step": 1277 }, { "epoch": 0.0874204801970039, "grad_norm": 2.1378326964375796, "learning_rate": 9.913868823211182e-06, "loss": 0.3952, "step": 1278 }, { "epoch": 0.08748888432861344, "grad_norm": 2.578506181454309, "learning_rate": 9.913663975476099e-06, "loss": 0.5322, "step": 1279 }, { "epoch": 0.087557288460223, "grad_norm": 2.5426626328832604, "learning_rate": 9.913458886554659e-06, "loss": 0.432, "step": 1280 }, { "epoch": 0.08762569259183255, "grad_norm": 2.6392363672253976, "learning_rate": 9.91325355645693e-06, "loss": 0.6995, "step": 1281 }, { "epoch": 0.0876940967234421, "grad_norm": 2.518423123600879, "learning_rate": 9.913047985192991e-06, "loss": 0.4143, "step": 1282 }, { "epoch": 0.08776250085505165, "grad_norm": 2.320930616975427, "learning_rate": 9.912842172772935e-06, "loss": 0.3663, "step": 1283 }, { "epoch": 0.0878309049866612, "grad_norm": 1.7606301507410682, "learning_rate": 9.912636119206857e-06, "loss": 0.3513, "step": 1284 }, { "epoch": 0.08789930911827075, "grad_norm": 2.352000807155419, "learning_rate": 9.912429824504877e-06, "loss": 0.4475, "step": 1285 }, { "epoch": 0.08796771324988029, "grad_norm": 1.8527176026420162, "learning_rate": 9.91222328867712e-06, "loss": 0.385, "step": 1286 }, { "epoch": 0.08803611738148984, "grad_norm": 2.1003351030749666, "learning_rate": 9.912016511733722e-06, "loss": 0.4993, "step": 1287 }, { "epoch": 0.08810452151309939, "grad_norm": 2.2478175506153213, "learning_rate": 9.911809493684834e-06, "loss": 0.4184, "step": 1288 }, { "epoch": 0.08817292564470894, "grad_norm": 2.380086652927344, "learning_rate": 9.911602234540618e-06, "loss": 0.5502, "step": 1289 }, { "epoch": 0.08824132977631849, "grad_norm": 1.288646185723821, "learning_rate": 9.911394734311246e-06, "loss": 0.1993, "step": 1290 }, { "epoch": 0.08830973390792804, "grad_norm": 3.1583295005374152, "learning_rate": 9.911186993006903e-06, "loss": 0.6863, "step": 1291 }, { "epoch": 0.08837813803953759, "grad_norm": 2.282709543150597, "learning_rate": 9.910979010637787e-06, "loss": 0.5198, "step": 1292 }, { "epoch": 0.08844654217114714, "grad_norm": 2.3445086936030872, "learning_rate": 9.910770787214106e-06, "loss": 0.3424, "step": 1293 }, { "epoch": 0.08851494630275669, "grad_norm": 2.550500559296179, "learning_rate": 9.910562322746083e-06, "loss": 0.5032, "step": 1294 }, { "epoch": 0.08858335043436624, "grad_norm": 2.451789565356712, "learning_rate": 9.910353617243944e-06, "loss": 0.6054, "step": 1295 }, { "epoch": 0.08865175456597578, "grad_norm": 1.56020691454614, "learning_rate": 9.910144670717942e-06, "loss": 0.1841, "step": 1296 }, { "epoch": 0.08872015869758533, "grad_norm": 2.9921403487750844, "learning_rate": 9.909935483178326e-06, "loss": 0.7376, "step": 1297 }, { "epoch": 0.08878856282919488, "grad_norm": 3.053345680656066, "learning_rate": 9.909726054635369e-06, "loss": 0.7177, "step": 1298 }, { "epoch": 0.08885696696080443, "grad_norm": 2.3000197777727043, "learning_rate": 9.909516385099346e-06, "loss": 0.5793, "step": 1299 }, { "epoch": 0.08892537109241398, "grad_norm": 2.1552811533034038, "learning_rate": 9.909306474580551e-06, "loss": 0.4253, "step": 1300 }, { "epoch": 0.08899377522402353, "grad_norm": 2.218106948991913, "learning_rate": 9.909096323089288e-06, "loss": 0.3621, "step": 1301 }, { "epoch": 0.08906217935563308, "grad_norm": 2.8055134622298112, "learning_rate": 9.908885930635872e-06, "loss": 0.5759, "step": 1302 }, { "epoch": 0.08913058348724263, "grad_norm": 1.8458644836777451, "learning_rate": 9.90867529723063e-06, "loss": 0.2672, "step": 1303 }, { "epoch": 0.08919898761885219, "grad_norm": 2.5636649972595835, "learning_rate": 9.908464422883901e-06, "loss": 0.6844, "step": 1304 }, { "epoch": 0.08926739175046172, "grad_norm": 2.3566916218858993, "learning_rate": 9.908253307606035e-06, "loss": 0.5097, "step": 1305 }, { "epoch": 0.08933579588207127, "grad_norm": 2.1653574639297193, "learning_rate": 9.908041951407394e-06, "loss": 0.4159, "step": 1306 }, { "epoch": 0.08940420001368082, "grad_norm": 1.8298672622800063, "learning_rate": 9.907830354298355e-06, "loss": 0.3097, "step": 1307 }, { "epoch": 0.08947260414529037, "grad_norm": 2.2774044604576362, "learning_rate": 9.907618516289301e-06, "loss": 0.3361, "step": 1308 }, { "epoch": 0.08954100827689992, "grad_norm": 2.4647810454939516, "learning_rate": 9.907406437390632e-06, "loss": 0.6572, "step": 1309 }, { "epoch": 0.08960941240850948, "grad_norm": 2.439012608332497, "learning_rate": 9.907194117612757e-06, "loss": 0.2679, "step": 1310 }, { "epoch": 0.08967781654011903, "grad_norm": 2.25757856378057, "learning_rate": 9.9069815569661e-06, "loss": 0.2744, "step": 1311 }, { "epoch": 0.08974622067172858, "grad_norm": 2.7387758425978044, "learning_rate": 9.906768755461092e-06, "loss": 0.7606, "step": 1312 }, { "epoch": 0.08981462480333813, "grad_norm": 2.015129882416384, "learning_rate": 9.906555713108176e-06, "loss": 0.3065, "step": 1313 }, { "epoch": 0.08988302893494766, "grad_norm": 1.6721628623099793, "learning_rate": 9.906342429917815e-06, "loss": 0.2603, "step": 1314 }, { "epoch": 0.08995143306655722, "grad_norm": 1.9142606360199912, "learning_rate": 9.906128905900475e-06, "loss": 0.1653, "step": 1315 }, { "epoch": 0.09001983719816677, "grad_norm": 3.6653392007151027, "learning_rate": 9.905915141066636e-06, "loss": 0.2638, "step": 1316 }, { "epoch": 0.09008824132977632, "grad_norm": 2.307658223785356, "learning_rate": 9.905701135426792e-06, "loss": 0.2956, "step": 1317 }, { "epoch": 0.09015664546138587, "grad_norm": 1.923352425721254, "learning_rate": 9.905486888991446e-06, "loss": 0.2678, "step": 1318 }, { "epoch": 0.09022504959299542, "grad_norm": 2.359593747589941, "learning_rate": 9.905272401771115e-06, "loss": 0.4605, "step": 1319 }, { "epoch": 0.09029345372460497, "grad_norm": 2.2182309658463915, "learning_rate": 9.905057673776329e-06, "loss": 0.3914, "step": 1320 }, { "epoch": 0.09036185785621452, "grad_norm": 2.5099586264175393, "learning_rate": 9.904842705017625e-06, "loss": 0.5, "step": 1321 }, { "epoch": 0.09043026198782407, "grad_norm": 2.2938992173722585, "learning_rate": 9.904627495505557e-06, "loss": 0.4293, "step": 1322 }, { "epoch": 0.09049866611943361, "grad_norm": 2.3230813296649315, "learning_rate": 9.904412045250686e-06, "loss": 0.3574, "step": 1323 }, { "epoch": 0.09056707025104316, "grad_norm": 2.232738729370104, "learning_rate": 9.904196354263588e-06, "loss": 0.372, "step": 1324 }, { "epoch": 0.09063547438265271, "grad_norm": 2.4477371041744544, "learning_rate": 9.903980422554851e-06, "loss": 0.4428, "step": 1325 }, { "epoch": 0.09070387851426226, "grad_norm": 3.199474557377205, "learning_rate": 9.903764250135075e-06, "loss": 0.6211, "step": 1326 }, { "epoch": 0.09077228264587181, "grad_norm": 2.166368564094966, "learning_rate": 9.903547837014868e-06, "loss": 0.3557, "step": 1327 }, { "epoch": 0.09084068677748136, "grad_norm": 4.335576429024888, "learning_rate": 9.903331183204856e-06, "loss": 0.3968, "step": 1328 }, { "epoch": 0.09090909090909091, "grad_norm": 2.1703483924616687, "learning_rate": 9.903114288715671e-06, "loss": 0.3874, "step": 1329 }, { "epoch": 0.09097749504070046, "grad_norm": 1.9481936494139673, "learning_rate": 9.902897153557959e-06, "loss": 0.3615, "step": 1330 }, { "epoch": 0.09104589917231001, "grad_norm": 2.0647675302779533, "learning_rate": 9.90267977774238e-06, "loss": 0.3043, "step": 1331 }, { "epoch": 0.09111430330391955, "grad_norm": 2.6350716539698364, "learning_rate": 9.902462161279601e-06, "loss": 0.5025, "step": 1332 }, { "epoch": 0.0911827074355291, "grad_norm": 3.738807673046127, "learning_rate": 9.902244304180306e-06, "loss": 0.5365, "step": 1333 }, { "epoch": 0.09125111156713865, "grad_norm": 2.2616950249621732, "learning_rate": 9.90202620645519e-06, "loss": 0.5189, "step": 1334 }, { "epoch": 0.0913195156987482, "grad_norm": 1.6386319202194326, "learning_rate": 9.901807868114955e-06, "loss": 0.2507, "step": 1335 }, { "epoch": 0.09138791983035775, "grad_norm": 2.4742760557976813, "learning_rate": 9.901589289170316e-06, "loss": 0.3606, "step": 1336 }, { "epoch": 0.0914563239619673, "grad_norm": 1.4728610670397417, "learning_rate": 9.90137046963201e-06, "loss": 0.2756, "step": 1337 }, { "epoch": 0.09152472809357685, "grad_norm": 2.73158406194022, "learning_rate": 9.901151409510769e-06, "loss": 0.5913, "step": 1338 }, { "epoch": 0.0915931322251864, "grad_norm": 2.27181330076284, "learning_rate": 9.900932108817352e-06, "loss": 0.3542, "step": 1339 }, { "epoch": 0.09166153635679596, "grad_norm": 2.1018312204425613, "learning_rate": 9.90071256756252e-06, "loss": 0.5466, "step": 1340 }, { "epoch": 0.0917299404884055, "grad_norm": 2.4421588714200504, "learning_rate": 9.90049278575705e-06, "loss": 0.5123, "step": 1341 }, { "epoch": 0.09179834462001504, "grad_norm": 2.3501190114829384, "learning_rate": 9.90027276341173e-06, "loss": 0.3098, "step": 1342 }, { "epoch": 0.0918667487516246, "grad_norm": 4.192025984056581, "learning_rate": 9.900052500537358e-06, "loss": 0.4786, "step": 1343 }, { "epoch": 0.09193515288323414, "grad_norm": 2.5088999045517992, "learning_rate": 9.899831997144749e-06, "loss": 0.6745, "step": 1344 }, { "epoch": 0.0920035570148437, "grad_norm": 1.821937834112541, "learning_rate": 9.899611253244722e-06, "loss": 0.2788, "step": 1345 }, { "epoch": 0.09207196114645325, "grad_norm": 2.0612109086058386, "learning_rate": 9.899390268848118e-06, "loss": 0.3823, "step": 1346 }, { "epoch": 0.0921403652780628, "grad_norm": 2.2311689410191535, "learning_rate": 9.899169043965779e-06, "loss": 0.5482, "step": 1347 }, { "epoch": 0.09220876940967235, "grad_norm": 2.1503028817080203, "learning_rate": 9.898947578608567e-06, "loss": 0.5428, "step": 1348 }, { "epoch": 0.0922771735412819, "grad_norm": 1.8751873479069472, "learning_rate": 9.898725872787349e-06, "loss": 0.3721, "step": 1349 }, { "epoch": 0.09234557767289145, "grad_norm": 2.793416287505617, "learning_rate": 9.89850392651301e-06, "loss": 0.4974, "step": 1350 }, { "epoch": 0.09241398180450099, "grad_norm": 3.1683075861224985, "learning_rate": 9.898281739796444e-06, "loss": 0.6832, "step": 1351 }, { "epoch": 0.09248238593611054, "grad_norm": 1.6571871966944933, "learning_rate": 9.898059312648557e-06, "loss": 0.2803, "step": 1352 }, { "epoch": 0.09255079006772009, "grad_norm": 1.9949742367283172, "learning_rate": 9.897836645080266e-06, "loss": 0.2813, "step": 1353 }, { "epoch": 0.09261919419932964, "grad_norm": 2.563502859798151, "learning_rate": 9.897613737102501e-06, "loss": 0.5297, "step": 1354 }, { "epoch": 0.09268759833093919, "grad_norm": 2.6071111715410664, "learning_rate": 9.897390588726203e-06, "loss": 0.7275, "step": 1355 }, { "epoch": 0.09275600246254874, "grad_norm": 2.1503570754979395, "learning_rate": 9.897167199962326e-06, "loss": 0.298, "step": 1356 }, { "epoch": 0.09282440659415829, "grad_norm": 1.9607871252511941, "learning_rate": 9.896943570821835e-06, "loss": 0.3875, "step": 1357 }, { "epoch": 0.09289281072576784, "grad_norm": 2.0881229517600395, "learning_rate": 9.896719701315708e-06, "loss": 0.4681, "step": 1358 }, { "epoch": 0.09296121485737739, "grad_norm": 2.557255378874864, "learning_rate": 9.896495591454929e-06, "loss": 0.5841, "step": 1359 }, { "epoch": 0.09302961898898693, "grad_norm": 2.470334287109426, "learning_rate": 9.896271241250502e-06, "loss": 0.5539, "step": 1360 }, { "epoch": 0.09309802312059648, "grad_norm": 2.3737928685741245, "learning_rate": 9.89604665071344e-06, "loss": 0.4936, "step": 1361 }, { "epoch": 0.09316642725220603, "grad_norm": 1.6020962756067796, "learning_rate": 9.895821819854765e-06, "loss": 0.2783, "step": 1362 }, { "epoch": 0.09323483138381558, "grad_norm": 2.0172261995626326, "learning_rate": 9.895596748685513e-06, "loss": 0.2171, "step": 1363 }, { "epoch": 0.09330323551542513, "grad_norm": 1.8782829273807584, "learning_rate": 9.895371437216733e-06, "loss": 0.1697, "step": 1364 }, { "epoch": 0.09337163964703468, "grad_norm": 2.085996334089165, "learning_rate": 9.895145885459482e-06, "loss": 0.3198, "step": 1365 }, { "epoch": 0.09344004377864423, "grad_norm": 2.07455160412145, "learning_rate": 9.894920093424834e-06, "loss": 0.3331, "step": 1366 }, { "epoch": 0.09350844791025378, "grad_norm": 2.3232370819895167, "learning_rate": 9.894694061123868e-06, "loss": 0.4321, "step": 1367 }, { "epoch": 0.09357685204186333, "grad_norm": 2.791268472717152, "learning_rate": 9.894467788567684e-06, "loss": 0.508, "step": 1368 }, { "epoch": 0.09364525617347287, "grad_norm": 2.0117452953031654, "learning_rate": 9.894241275767385e-06, "loss": 0.3367, "step": 1369 }, { "epoch": 0.09371366030508242, "grad_norm": 1.9932229660877372, "learning_rate": 9.89401452273409e-06, "loss": 0.384, "step": 1370 }, { "epoch": 0.09378206443669197, "grad_norm": 2.1056660611017817, "learning_rate": 9.89378752947893e-06, "loss": 0.3717, "step": 1371 }, { "epoch": 0.09385046856830152, "grad_norm": 2.0505531256815863, "learning_rate": 9.893560296013045e-06, "loss": 0.5726, "step": 1372 }, { "epoch": 0.09391887269991107, "grad_norm": 2.263799533749674, "learning_rate": 9.89333282234759e-06, "loss": 0.4061, "step": 1373 }, { "epoch": 0.09398727683152062, "grad_norm": 2.286928216941061, "learning_rate": 9.89310510849373e-06, "loss": 0.4437, "step": 1374 }, { "epoch": 0.09405568096313018, "grad_norm": 2.245037082406216, "learning_rate": 9.892877154462643e-06, "loss": 0.4217, "step": 1375 }, { "epoch": 0.09412408509473973, "grad_norm": 2.1087190809170897, "learning_rate": 9.892648960265518e-06, "loss": 0.4713, "step": 1376 }, { "epoch": 0.09419248922634928, "grad_norm": 2.183330984120403, "learning_rate": 9.892420525913557e-06, "loss": 0.5301, "step": 1377 }, { "epoch": 0.09426089335795883, "grad_norm": 1.7784116261734508, "learning_rate": 9.89219185141797e-06, "loss": 0.2165, "step": 1378 }, { "epoch": 0.09432929748956836, "grad_norm": 2.338540862379948, "learning_rate": 9.891962936789983e-06, "loss": 0.35, "step": 1379 }, { "epoch": 0.09439770162117792, "grad_norm": 3.22002456947395, "learning_rate": 9.891733782040832e-06, "loss": 0.5321, "step": 1380 }, { "epoch": 0.09446610575278747, "grad_norm": 2.14327029033867, "learning_rate": 9.891504387181765e-06, "loss": 0.2843, "step": 1381 }, { "epoch": 0.09453450988439702, "grad_norm": 2.5299616790543755, "learning_rate": 9.891274752224041e-06, "loss": 0.4522, "step": 1382 }, { "epoch": 0.09460291401600657, "grad_norm": 2.053620543692243, "learning_rate": 9.891044877178934e-06, "loss": 0.3935, "step": 1383 }, { "epoch": 0.09467131814761612, "grad_norm": 2.28561150344231, "learning_rate": 9.890814762057725e-06, "loss": 0.5753, "step": 1384 }, { "epoch": 0.09473972227922567, "grad_norm": 2.5584457587619855, "learning_rate": 9.89058440687171e-06, "loss": 0.7715, "step": 1385 }, { "epoch": 0.09480812641083522, "grad_norm": 2.769382607327539, "learning_rate": 9.890353811632195e-06, "loss": 0.6922, "step": 1386 }, { "epoch": 0.09487653054244477, "grad_norm": 2.0688895175598034, "learning_rate": 9.890122976350501e-06, "loss": 0.4702, "step": 1387 }, { "epoch": 0.09494493467405431, "grad_norm": 2.077134729115061, "learning_rate": 9.889891901037957e-06, "loss": 0.1961, "step": 1388 }, { "epoch": 0.09501333880566386, "grad_norm": 2.2881193105662847, "learning_rate": 9.889660585705905e-06, "loss": 0.2391, "step": 1389 }, { "epoch": 0.09508174293727341, "grad_norm": 1.9254030123169168, "learning_rate": 9.8894290303657e-06, "loss": 0.4332, "step": 1390 }, { "epoch": 0.09515014706888296, "grad_norm": 2.7069980129386844, "learning_rate": 9.889197235028707e-06, "loss": 0.5862, "step": 1391 }, { "epoch": 0.09521855120049251, "grad_norm": 1.7929503163572342, "learning_rate": 9.888965199706304e-06, "loss": 0.3413, "step": 1392 }, { "epoch": 0.09528695533210206, "grad_norm": 2.4751257233524955, "learning_rate": 9.888732924409881e-06, "loss": 0.7001, "step": 1393 }, { "epoch": 0.09535535946371161, "grad_norm": 2.449768394110744, "learning_rate": 9.88850040915084e-06, "loss": 0.5703, "step": 1394 }, { "epoch": 0.09542376359532116, "grad_norm": 2.137603973904948, "learning_rate": 9.888267653940589e-06, "loss": 0.3145, "step": 1395 }, { "epoch": 0.09549216772693071, "grad_norm": 2.3826287433248035, "learning_rate": 9.888034658790558e-06, "loss": 0.4986, "step": 1396 }, { "epoch": 0.09556057185854025, "grad_norm": 2.3909039116513084, "learning_rate": 9.887801423712182e-06, "loss": 0.4985, "step": 1397 }, { "epoch": 0.0956289759901498, "grad_norm": 2.3344532579431085, "learning_rate": 9.88756794871691e-06, "loss": 0.4693, "step": 1398 }, { "epoch": 0.09569738012175935, "grad_norm": 1.9864608210198187, "learning_rate": 9.887334233816199e-06, "loss": 0.3817, "step": 1399 }, { "epoch": 0.0957657842533689, "grad_norm": 1.782483142516261, "learning_rate": 9.887100279021525e-06, "loss": 0.2023, "step": 1400 }, { "epoch": 0.09583418838497845, "grad_norm": 2.374454021439254, "learning_rate": 9.88686608434437e-06, "loss": 0.4458, "step": 1401 }, { "epoch": 0.095902592516588, "grad_norm": 2.2180264596126187, "learning_rate": 9.886631649796227e-06, "loss": 0.4697, "step": 1402 }, { "epoch": 0.09597099664819755, "grad_norm": 2.3892548817329953, "learning_rate": 9.886396975388608e-06, "loss": 0.3303, "step": 1403 }, { "epoch": 0.0960394007798071, "grad_norm": 2.2817361653287724, "learning_rate": 9.886162061133027e-06, "loss": 0.3705, "step": 1404 }, { "epoch": 0.09610780491141666, "grad_norm": 2.1252003603330927, "learning_rate": 9.88592690704102e-06, "loss": 0.4748, "step": 1405 }, { "epoch": 0.09617620904302619, "grad_norm": 2.3190153587041618, "learning_rate": 9.885691513124123e-06, "loss": 0.4763, "step": 1406 }, { "epoch": 0.09624461317463574, "grad_norm": 2.098329450249835, "learning_rate": 9.885455879393896e-06, "loss": 0.3347, "step": 1407 }, { "epoch": 0.0963130173062453, "grad_norm": 2.2966958319921122, "learning_rate": 9.885220005861901e-06, "loss": 0.492, "step": 1408 }, { "epoch": 0.09638142143785484, "grad_norm": 2.648408343849497, "learning_rate": 9.884983892539719e-06, "loss": 0.5811, "step": 1409 }, { "epoch": 0.0964498255694644, "grad_norm": 1.8474800235509696, "learning_rate": 9.884747539438939e-06, "loss": 0.3621, "step": 1410 }, { "epoch": 0.09651822970107395, "grad_norm": 2.5041456420532087, "learning_rate": 9.884510946571159e-06, "loss": 0.5254, "step": 1411 }, { "epoch": 0.0965866338326835, "grad_norm": 1.7498699889254017, "learning_rate": 9.884274113947997e-06, "loss": 0.3544, "step": 1412 }, { "epoch": 0.09665503796429305, "grad_norm": 2.2106687564418164, "learning_rate": 9.884037041581074e-06, "loss": 0.4624, "step": 1413 }, { "epoch": 0.0967234420959026, "grad_norm": 2.3066077557062195, "learning_rate": 9.883799729482029e-06, "loss": 0.595, "step": 1414 }, { "epoch": 0.09679184622751213, "grad_norm": 1.919557934685412, "learning_rate": 9.883562177662512e-06, "loss": 0.2066, "step": 1415 }, { "epoch": 0.09686025035912169, "grad_norm": 2.427854011761132, "learning_rate": 9.883324386134177e-06, "loss": 0.2919, "step": 1416 }, { "epoch": 0.09692865449073124, "grad_norm": 2.3661746511538473, "learning_rate": 9.883086354908703e-06, "loss": 0.3589, "step": 1417 }, { "epoch": 0.09699705862234079, "grad_norm": 2.039778996421973, "learning_rate": 9.882848083997766e-06, "loss": 0.5176, "step": 1418 }, { "epoch": 0.09706546275395034, "grad_norm": 3.1419896392138487, "learning_rate": 9.88260957341307e-06, "loss": 0.6523, "step": 1419 }, { "epoch": 0.09713386688555989, "grad_norm": 1.7700366378483698, "learning_rate": 9.882370823166317e-06, "loss": 0.3068, "step": 1420 }, { "epoch": 0.09720227101716944, "grad_norm": 1.7318911164083504, "learning_rate": 9.882131833269227e-06, "loss": 0.3188, "step": 1421 }, { "epoch": 0.09727067514877899, "grad_norm": 2.197635221974013, "learning_rate": 9.881892603733532e-06, "loss": 0.2715, "step": 1422 }, { "epoch": 0.09733907928038854, "grad_norm": 2.56292599533133, "learning_rate": 9.881653134570972e-06, "loss": 0.2877, "step": 1423 }, { "epoch": 0.09740748341199809, "grad_norm": 2.0892981949079448, "learning_rate": 9.881413425793305e-06, "loss": 0.5293, "step": 1424 }, { "epoch": 0.09747588754360763, "grad_norm": 2.3304941424713803, "learning_rate": 9.881173477412293e-06, "loss": 0.6877, "step": 1425 }, { "epoch": 0.09754429167521718, "grad_norm": 2.2711029322871292, "learning_rate": 9.880933289439717e-06, "loss": 0.3724, "step": 1426 }, { "epoch": 0.09761269580682673, "grad_norm": 2.1026440854205073, "learning_rate": 9.880692861887364e-06, "loss": 0.4124, "step": 1427 }, { "epoch": 0.09768109993843628, "grad_norm": 2.159219911391296, "learning_rate": 9.880452194767039e-06, "loss": 0.304, "step": 1428 }, { "epoch": 0.09774950407004583, "grad_norm": 2.649426278502004, "learning_rate": 9.880211288090551e-06, "loss": 0.6371, "step": 1429 }, { "epoch": 0.09781790820165538, "grad_norm": 2.188143860202586, "learning_rate": 9.879970141869726e-06, "loss": 0.4189, "step": 1430 }, { "epoch": 0.09788631233326493, "grad_norm": 1.4768101116233592, "learning_rate": 9.879728756116403e-06, "loss": 0.2027, "step": 1431 }, { "epoch": 0.09795471646487448, "grad_norm": 2.059262993911311, "learning_rate": 9.879487130842428e-06, "loss": 0.3849, "step": 1432 }, { "epoch": 0.09802312059648403, "grad_norm": 1.9961369877441608, "learning_rate": 9.879245266059662e-06, "loss": 0.5059, "step": 1433 }, { "epoch": 0.09809152472809357, "grad_norm": 2.3385311514455895, "learning_rate": 9.879003161779976e-06, "loss": 0.4525, "step": 1434 }, { "epoch": 0.09815992885970312, "grad_norm": 2.3002372124395993, "learning_rate": 9.878760818015257e-06, "loss": 0.4463, "step": 1435 }, { "epoch": 0.09822833299131267, "grad_norm": 2.1420107310154974, "learning_rate": 9.878518234777395e-06, "loss": 0.2475, "step": 1436 }, { "epoch": 0.09829673712292222, "grad_norm": 2.2946317708509403, "learning_rate": 9.8782754120783e-06, "loss": 0.5311, "step": 1437 }, { "epoch": 0.09836514125453177, "grad_norm": 1.5844192895586515, "learning_rate": 9.878032349929892e-06, "loss": 0.3524, "step": 1438 }, { "epoch": 0.09843354538614132, "grad_norm": 3.0439087538471212, "learning_rate": 9.8777890483441e-06, "loss": 0.6501, "step": 1439 }, { "epoch": 0.09850194951775088, "grad_norm": 2.0527226768036915, "learning_rate": 9.877545507332867e-06, "loss": 0.3911, "step": 1440 }, { "epoch": 0.09857035364936043, "grad_norm": 4.718384179088707, "learning_rate": 9.877301726908146e-06, "loss": 0.413, "step": 1441 }, { "epoch": 0.09863875778096998, "grad_norm": 1.6838746776879254, "learning_rate": 9.877057707081905e-06, "loss": 0.4392, "step": 1442 }, { "epoch": 0.09870716191257951, "grad_norm": 2.2789844668215964, "learning_rate": 9.876813447866122e-06, "loss": 0.2252, "step": 1443 }, { "epoch": 0.09877556604418906, "grad_norm": 2.4278264139517893, "learning_rate": 9.876568949272784e-06, "loss": 0.4149, "step": 1444 }, { "epoch": 0.09884397017579861, "grad_norm": 2.795938958538268, "learning_rate": 9.876324211313893e-06, "loss": 0.4561, "step": 1445 }, { "epoch": 0.09891237430740817, "grad_norm": 2.0418663186786965, "learning_rate": 9.87607923400146e-06, "loss": 0.4302, "step": 1446 }, { "epoch": 0.09898077843901772, "grad_norm": 2.551044047489408, "learning_rate": 9.875834017347515e-06, "loss": 0.6083, "step": 1447 }, { "epoch": 0.09904918257062727, "grad_norm": 1.8510740134747994, "learning_rate": 9.87558856136409e-06, "loss": 0.4429, "step": 1448 }, { "epoch": 0.09911758670223682, "grad_norm": 2.127196701716963, "learning_rate": 9.875342866063235e-06, "loss": 0.2942, "step": 1449 }, { "epoch": 0.09918599083384637, "grad_norm": 2.3576169885327647, "learning_rate": 9.87509693145701e-06, "loss": 0.5223, "step": 1450 }, { "epoch": 0.09925439496545592, "grad_norm": 3.734870909639304, "learning_rate": 9.874850757557483e-06, "loss": 0.4812, "step": 1451 }, { "epoch": 0.09932279909706546, "grad_norm": 2.2790177998983787, "learning_rate": 9.874604344376743e-06, "loss": 0.5075, "step": 1452 }, { "epoch": 0.099391203228675, "grad_norm": 2.5195437231420454, "learning_rate": 9.874357691926883e-06, "loss": 0.6485, "step": 1453 }, { "epoch": 0.09945960736028456, "grad_norm": 2.439184328270941, "learning_rate": 9.874110800220008e-06, "loss": 0.5913, "step": 1454 }, { "epoch": 0.09952801149189411, "grad_norm": 2.0330309762916854, "learning_rate": 9.873863669268239e-06, "loss": 0.3461, "step": 1455 }, { "epoch": 0.09959641562350366, "grad_norm": 1.674432376150482, "learning_rate": 9.873616299083705e-06, "loss": 0.2457, "step": 1456 }, { "epoch": 0.09966481975511321, "grad_norm": 1.633517893229935, "learning_rate": 9.87336868967855e-06, "loss": 0.3234, "step": 1457 }, { "epoch": 0.09973322388672276, "grad_norm": 4.823930615920926, "learning_rate": 9.873120841064924e-06, "loss": 0.3307, "step": 1458 }, { "epoch": 0.09980162801833231, "grad_norm": 2.2280510828616555, "learning_rate": 9.872872753254996e-06, "loss": 0.4589, "step": 1459 }, { "epoch": 0.09987003214994186, "grad_norm": 2.3998207458718843, "learning_rate": 9.872624426260942e-06, "loss": 0.4665, "step": 1460 }, { "epoch": 0.0999384362815514, "grad_norm": 1.9555848157685283, "learning_rate": 9.872375860094951e-06, "loss": 0.3268, "step": 1461 }, { "epoch": 0.10000684041316095, "grad_norm": 2.5956397927898593, "learning_rate": 9.872127054769227e-06, "loss": 0.5275, "step": 1462 }, { "epoch": 0.1000752445447705, "grad_norm": 2.029738094618409, "learning_rate": 9.871878010295978e-06, "loss": 0.3171, "step": 1463 }, { "epoch": 0.10014364867638005, "grad_norm": 3.133158771660856, "learning_rate": 9.87162872668743e-06, "loss": 0.8681, "step": 1464 }, { "epoch": 0.1002120528079896, "grad_norm": 2.402182820209993, "learning_rate": 9.87137920395582e-06, "loss": 0.5139, "step": 1465 }, { "epoch": 0.10028045693959915, "grad_norm": 2.2205357954047273, "learning_rate": 9.871129442113394e-06, "loss": 0.5019, "step": 1466 }, { "epoch": 0.1003488610712087, "grad_norm": 2.26197678406798, "learning_rate": 9.870879441172414e-06, "loss": 0.6009, "step": 1467 }, { "epoch": 0.10041726520281825, "grad_norm": 1.9419147355158268, "learning_rate": 9.870629201145149e-06, "loss": 0.3085, "step": 1468 }, { "epoch": 0.1004856693344278, "grad_norm": 3.9191658724124943, "learning_rate": 9.870378722043882e-06, "loss": 0.2614, "step": 1469 }, { "epoch": 0.10055407346603736, "grad_norm": 1.689324906615917, "learning_rate": 9.87012800388091e-06, "loss": 0.3248, "step": 1470 }, { "epoch": 0.10062247759764689, "grad_norm": 1.9273391558125035, "learning_rate": 9.869877046668536e-06, "loss": 0.4139, "step": 1471 }, { "epoch": 0.10069088172925644, "grad_norm": 2.002638737578967, "learning_rate": 9.869625850419083e-06, "loss": 0.3316, "step": 1472 }, { "epoch": 0.100759285860866, "grad_norm": 1.6024487346179612, "learning_rate": 9.869374415144876e-06, "loss": 0.2615, "step": 1473 }, { "epoch": 0.10082768999247554, "grad_norm": 2.310727316355756, "learning_rate": 9.869122740858259e-06, "loss": 0.3379, "step": 1474 }, { "epoch": 0.1008960941240851, "grad_norm": 2.394373193101393, "learning_rate": 9.868870827571585e-06, "loss": 0.4258, "step": 1475 }, { "epoch": 0.10096449825569465, "grad_norm": 1.891604339329735, "learning_rate": 9.868618675297219e-06, "loss": 0.3564, "step": 1476 }, { "epoch": 0.1010329023873042, "grad_norm": 2.1729381415444933, "learning_rate": 9.86836628404754e-06, "loss": 0.3913, "step": 1477 }, { "epoch": 0.10110130651891375, "grad_norm": 2.225832361000684, "learning_rate": 9.868113653834934e-06, "loss": 0.563, "step": 1478 }, { "epoch": 0.1011697106505233, "grad_norm": 3.857386319926131, "learning_rate": 9.8678607846718e-06, "loss": 0.4169, "step": 1479 }, { "epoch": 0.10123811478213283, "grad_norm": 2.177718185569903, "learning_rate": 9.867607676570554e-06, "loss": 0.4746, "step": 1480 }, { "epoch": 0.10130651891374239, "grad_norm": 2.1612335850405016, "learning_rate": 9.867354329543617e-06, "loss": 0.4657, "step": 1481 }, { "epoch": 0.10137492304535194, "grad_norm": 2.679750135929771, "learning_rate": 9.867100743603425e-06, "loss": 0.7071, "step": 1482 }, { "epoch": 0.10144332717696149, "grad_norm": 2.152305699616163, "learning_rate": 9.866846918762426e-06, "loss": 0.3735, "step": 1483 }, { "epoch": 0.10151173130857104, "grad_norm": 2.4057632145877967, "learning_rate": 9.86659285503308e-06, "loss": 0.2301, "step": 1484 }, { "epoch": 0.10158013544018059, "grad_norm": 2.0627471096664447, "learning_rate": 9.866338552427852e-06, "loss": 0.2513, "step": 1485 }, { "epoch": 0.10164853957179014, "grad_norm": 2.2625099866867955, "learning_rate": 9.866084010959232e-06, "loss": 0.4654, "step": 1486 }, { "epoch": 0.10171694370339969, "grad_norm": 2.51169853142985, "learning_rate": 9.86582923063971e-06, "loss": 0.6371, "step": 1487 }, { "epoch": 0.10178534783500924, "grad_norm": 1.6279114949308828, "learning_rate": 9.865574211481792e-06, "loss": 0.3199, "step": 1488 }, { "epoch": 0.10185375196661878, "grad_norm": 1.7902938236934396, "learning_rate": 9.865318953497997e-06, "loss": 0.2681, "step": 1489 }, { "epoch": 0.10192215609822833, "grad_norm": 1.538821274108402, "learning_rate": 9.865063456700851e-06, "loss": 0.2271, "step": 1490 }, { "epoch": 0.10199056022983788, "grad_norm": 2.2297536552121016, "learning_rate": 9.8648077211029e-06, "loss": 0.3975, "step": 1491 }, { "epoch": 0.10205896436144743, "grad_norm": 2.3887129005783008, "learning_rate": 9.864551746716694e-06, "loss": 0.2127, "step": 1492 }, { "epoch": 0.10212736849305698, "grad_norm": 2.656975663861877, "learning_rate": 9.864295533554798e-06, "loss": 0.6215, "step": 1493 }, { "epoch": 0.10219577262466653, "grad_norm": 2.127116929989824, "learning_rate": 9.864039081629786e-06, "loss": 0.3034, "step": 1494 }, { "epoch": 0.10226417675627608, "grad_norm": 2.3206288927961514, "learning_rate": 9.863782390954248e-06, "loss": 0.5487, "step": 1495 }, { "epoch": 0.10233258088788563, "grad_norm": 1.9611045530676792, "learning_rate": 9.863525461540784e-06, "loss": 0.3678, "step": 1496 }, { "epoch": 0.10240098501949518, "grad_norm": 1.8890158686155625, "learning_rate": 9.863268293402005e-06, "loss": 0.4011, "step": 1497 }, { "epoch": 0.10246938915110472, "grad_norm": 2.4053385853898606, "learning_rate": 9.863010886550532e-06, "loss": 0.4334, "step": 1498 }, { "epoch": 0.10253779328271427, "grad_norm": 2.19606970221219, "learning_rate": 9.862753240999001e-06, "loss": 0.3785, "step": 1499 }, { "epoch": 0.10260619741432382, "grad_norm": 2.5034898837262083, "learning_rate": 9.86249535676006e-06, "loss": 0.4143, "step": 1500 }, { "epoch": 0.10267460154593337, "grad_norm": 2.0738638594022754, "learning_rate": 9.862237233846367e-06, "loss": 0.4562, "step": 1501 }, { "epoch": 0.10274300567754292, "grad_norm": 5.056913354502133, "learning_rate": 9.86197887227059e-06, "loss": 0.8505, "step": 1502 }, { "epoch": 0.10281140980915247, "grad_norm": 1.3471274305869285, "learning_rate": 9.861720272045413e-06, "loss": 0.1232, "step": 1503 }, { "epoch": 0.10287981394076202, "grad_norm": 2.23049084974788, "learning_rate": 9.861461433183527e-06, "loss": 0.2925, "step": 1504 }, { "epoch": 0.10294821807237158, "grad_norm": 1.6391914121518798, "learning_rate": 9.861202355697639e-06, "loss": 0.1774, "step": 1505 }, { "epoch": 0.10301662220398113, "grad_norm": 2.5034620393877227, "learning_rate": 9.860943039600464e-06, "loss": 0.4814, "step": 1506 }, { "epoch": 0.10308502633559068, "grad_norm": 2.857418985945518, "learning_rate": 9.860683484904731e-06, "loss": 0.4242, "step": 1507 }, { "epoch": 0.10315343046720021, "grad_norm": 1.75090669236185, "learning_rate": 9.860423691623182e-06, "loss": 0.2069, "step": 1508 }, { "epoch": 0.10322183459880976, "grad_norm": 1.4828688760537903, "learning_rate": 9.860163659768565e-06, "loss": 0.2011, "step": 1509 }, { "epoch": 0.10329023873041931, "grad_norm": 1.985051584168061, "learning_rate": 9.85990338935365e-06, "loss": 0.321, "step": 1510 }, { "epoch": 0.10335864286202887, "grad_norm": 2.510788206778915, "learning_rate": 9.859642880391205e-06, "loss": 0.5876, "step": 1511 }, { "epoch": 0.10342704699363842, "grad_norm": 2.1428070954455904, "learning_rate": 9.859382132894021e-06, "loss": 0.4522, "step": 1512 }, { "epoch": 0.10349545112524797, "grad_norm": 2.235619285219997, "learning_rate": 9.859121146874897e-06, "loss": 0.464, "step": 1513 }, { "epoch": 0.10356385525685752, "grad_norm": 2.033927920213521, "learning_rate": 9.858859922346642e-06, "loss": 0.3481, "step": 1514 }, { "epoch": 0.10363225938846707, "grad_norm": 2.8218869149895407, "learning_rate": 9.85859845932208e-06, "loss": 0.5252, "step": 1515 }, { "epoch": 0.10370066352007662, "grad_norm": 1.9979625396978058, "learning_rate": 9.858336757814042e-06, "loss": 0.4973, "step": 1516 }, { "epoch": 0.10376906765168616, "grad_norm": 2.1501045625787714, "learning_rate": 9.858074817835376e-06, "loss": 0.4196, "step": 1517 }, { "epoch": 0.1038374717832957, "grad_norm": 2.469420130835498, "learning_rate": 9.857812639398938e-06, "loss": 0.5456, "step": 1518 }, { "epoch": 0.10390587591490526, "grad_norm": 1.653202983300422, "learning_rate": 9.857550222517598e-06, "loss": 0.234, "step": 1519 }, { "epoch": 0.10397428004651481, "grad_norm": 2.608457666750786, "learning_rate": 9.857287567204237e-06, "loss": 0.6565, "step": 1520 }, { "epoch": 0.10404268417812436, "grad_norm": 2.8595819867369143, "learning_rate": 9.857024673471747e-06, "loss": 0.758, "step": 1521 }, { "epoch": 0.10411108830973391, "grad_norm": 2.144872554864362, "learning_rate": 9.856761541333031e-06, "loss": 0.3834, "step": 1522 }, { "epoch": 0.10417949244134346, "grad_norm": 2.189787669829777, "learning_rate": 9.856498170801005e-06, "loss": 0.4126, "step": 1523 }, { "epoch": 0.10424789657295301, "grad_norm": 2.5421918291618937, "learning_rate": 9.856234561888596e-06, "loss": 0.2994, "step": 1524 }, { "epoch": 0.10431630070456256, "grad_norm": 2.2481527295283557, "learning_rate": 9.855970714608746e-06, "loss": 0.4185, "step": 1525 }, { "epoch": 0.1043847048361721, "grad_norm": 2.792616313759118, "learning_rate": 9.855706628974402e-06, "loss": 0.6226, "step": 1526 }, { "epoch": 0.10445310896778165, "grad_norm": 2.612182805435557, "learning_rate": 9.855442304998531e-06, "loss": 0.2935, "step": 1527 }, { "epoch": 0.1045215130993912, "grad_norm": 2.7053751317657704, "learning_rate": 9.855177742694105e-06, "loss": 0.6862, "step": 1528 }, { "epoch": 0.10458991723100075, "grad_norm": 3.8119667024178887, "learning_rate": 9.854912942074109e-06, "loss": 0.7868, "step": 1529 }, { "epoch": 0.1046583213626103, "grad_norm": 1.8273511781252043, "learning_rate": 9.854647903151542e-06, "loss": 0.3353, "step": 1530 }, { "epoch": 0.10472672549421985, "grad_norm": 3.152494008620496, "learning_rate": 9.854382625939412e-06, "loss": 0.3686, "step": 1531 }, { "epoch": 0.1047951296258294, "grad_norm": 2.6048700825864337, "learning_rate": 9.854117110450742e-06, "loss": 0.5224, "step": 1532 }, { "epoch": 0.10486353375743895, "grad_norm": 2.6917630195841333, "learning_rate": 9.853851356698564e-06, "loss": 0.5859, "step": 1533 }, { "epoch": 0.1049319378890485, "grad_norm": 2.3022319643592306, "learning_rate": 9.853585364695921e-06, "loss": 0.3941, "step": 1534 }, { "epoch": 0.10500034202065804, "grad_norm": 2.452250737920388, "learning_rate": 9.853319134455872e-06, "loss": 0.1923, "step": 1535 }, { "epoch": 0.10506874615226759, "grad_norm": 2.142912310928951, "learning_rate": 9.853052665991485e-06, "loss": 0.4999, "step": 1536 }, { "epoch": 0.10513715028387714, "grad_norm": 2.4458066544602195, "learning_rate": 9.852785959315835e-06, "loss": 0.5952, "step": 1537 }, { "epoch": 0.1052055544154867, "grad_norm": 1.8980675796098652, "learning_rate": 9.852519014442018e-06, "loss": 0.3314, "step": 1538 }, { "epoch": 0.10527395854709624, "grad_norm": 2.0829172736198376, "learning_rate": 9.852251831383136e-06, "loss": 0.4319, "step": 1539 }, { "epoch": 0.1053423626787058, "grad_norm": 2.4621929296750062, "learning_rate": 9.8519844101523e-06, "loss": 0.5704, "step": 1540 }, { "epoch": 0.10541076681031535, "grad_norm": 1.8854673136886395, "learning_rate": 9.851716750762641e-06, "loss": 0.2299, "step": 1541 }, { "epoch": 0.1054791709419249, "grad_norm": 5.145969499750406, "learning_rate": 9.851448853227294e-06, "loss": 0.5829, "step": 1542 }, { "epoch": 0.10554757507353445, "grad_norm": 2.667450811286529, "learning_rate": 9.851180717559413e-06, "loss": 0.5084, "step": 1543 }, { "epoch": 0.10561597920514398, "grad_norm": 1.9717379935014994, "learning_rate": 9.850912343772152e-06, "loss": 0.3049, "step": 1544 }, { "epoch": 0.10568438333675353, "grad_norm": 2.0994043507388183, "learning_rate": 9.85064373187869e-06, "loss": 0.4056, "step": 1545 }, { "epoch": 0.10575278746836309, "grad_norm": 1.6276153462343919, "learning_rate": 9.850374881892212e-06, "loss": 0.3052, "step": 1546 }, { "epoch": 0.10582119159997264, "grad_norm": 2.4907787371276315, "learning_rate": 9.85010579382591e-06, "loss": 0.3961, "step": 1547 }, { "epoch": 0.10588959573158219, "grad_norm": 1.8528934163986406, "learning_rate": 9.849836467692996e-06, "loss": 0.2542, "step": 1548 }, { "epoch": 0.10595799986319174, "grad_norm": 2.727752731809943, "learning_rate": 9.849566903506689e-06, "loss": 0.913, "step": 1549 }, { "epoch": 0.10602640399480129, "grad_norm": 1.768355663271228, "learning_rate": 9.84929710128022e-06, "loss": 0.2668, "step": 1550 }, { "epoch": 0.10609480812641084, "grad_norm": 2.2098125891763285, "learning_rate": 9.84902706102683e-06, "loss": 0.4574, "step": 1551 }, { "epoch": 0.10616321225802039, "grad_norm": 2.145677887083918, "learning_rate": 9.848756782759778e-06, "loss": 0.3362, "step": 1552 }, { "epoch": 0.10623161638962994, "grad_norm": 1.7951899007730654, "learning_rate": 9.84848626649233e-06, "loss": 0.2828, "step": 1553 }, { "epoch": 0.10630002052123948, "grad_norm": 2.457152310589652, "learning_rate": 9.84821551223776e-06, "loss": 0.3987, "step": 1554 }, { "epoch": 0.10636842465284903, "grad_norm": 2.445457015474612, "learning_rate": 9.847944520009362e-06, "loss": 0.2452, "step": 1555 }, { "epoch": 0.10643682878445858, "grad_norm": 2.0275658869826914, "learning_rate": 9.847673289820438e-06, "loss": 0.2917, "step": 1556 }, { "epoch": 0.10650523291606813, "grad_norm": 2.0386263793850827, "learning_rate": 9.847401821684298e-06, "loss": 0.3179, "step": 1557 }, { "epoch": 0.10657363704767768, "grad_norm": 1.544515170398112, "learning_rate": 9.84713011561427e-06, "loss": 0.2191, "step": 1558 }, { "epoch": 0.10664204117928723, "grad_norm": 1.3209064941972783, "learning_rate": 9.846858171623687e-06, "loss": 0.2143, "step": 1559 }, { "epoch": 0.10671044531089678, "grad_norm": 1.7043294125794644, "learning_rate": 9.846585989725902e-06, "loss": 0.2134, "step": 1560 }, { "epoch": 0.10677884944250633, "grad_norm": 2.5334976601530554, "learning_rate": 9.846313569934272e-06, "loss": 0.4718, "step": 1561 }, { "epoch": 0.10684725357411588, "grad_norm": 2.307010488549124, "learning_rate": 9.84604091226217e-06, "loss": 0.3506, "step": 1562 }, { "epoch": 0.10691565770572542, "grad_norm": 1.978210237363039, "learning_rate": 9.845768016722979e-06, "loss": 0.2587, "step": 1563 }, { "epoch": 0.10698406183733497, "grad_norm": 2.100552504099181, "learning_rate": 9.84549488333009e-06, "loss": 0.3838, "step": 1564 }, { "epoch": 0.10705246596894452, "grad_norm": 2.4221235393849123, "learning_rate": 9.845221512096919e-06, "loss": 0.4369, "step": 1565 }, { "epoch": 0.10712087010055407, "grad_norm": 1.958981349482534, "learning_rate": 9.844947903036875e-06, "loss": 0.2085, "step": 1566 }, { "epoch": 0.10718927423216362, "grad_norm": 2.581363360753668, "learning_rate": 9.844674056163393e-06, "loss": 0.4856, "step": 1567 }, { "epoch": 0.10725767836377317, "grad_norm": 2.3731299491752353, "learning_rate": 9.844399971489915e-06, "loss": 0.4187, "step": 1568 }, { "epoch": 0.10732608249538272, "grad_norm": 2.962026602106582, "learning_rate": 9.844125649029891e-06, "loss": 0.803, "step": 1569 }, { "epoch": 0.10739448662699227, "grad_norm": 2.6734635889685507, "learning_rate": 9.843851088796788e-06, "loss": 0.3767, "step": 1570 }, { "epoch": 0.10746289075860183, "grad_norm": 2.1575439551197855, "learning_rate": 9.843576290804084e-06, "loss": 0.1759, "step": 1571 }, { "epoch": 0.10753129489021136, "grad_norm": 2.1479170637071787, "learning_rate": 9.843301255065267e-06, "loss": 0.3494, "step": 1572 }, { "epoch": 0.10759969902182091, "grad_norm": 2.094377409462039, "learning_rate": 9.843025981593835e-06, "loss": 0.4827, "step": 1573 }, { "epoch": 0.10766810315343046, "grad_norm": 1.7743239560969963, "learning_rate": 9.842750470403301e-06, "loss": 0.3575, "step": 1574 }, { "epoch": 0.10773650728504001, "grad_norm": 3.4330438818064644, "learning_rate": 9.84247472150719e-06, "loss": 0.3866, "step": 1575 }, { "epoch": 0.10780491141664957, "grad_norm": 2.8110372872317186, "learning_rate": 9.842198734919033e-06, "loss": 0.533, "step": 1576 }, { "epoch": 0.10787331554825912, "grad_norm": 2.0057541430541206, "learning_rate": 9.841922510652383e-06, "loss": 0.4185, "step": 1577 }, { "epoch": 0.10794171967986867, "grad_norm": 2.373541434286981, "learning_rate": 9.841646048720791e-06, "loss": 0.5401, "step": 1578 }, { "epoch": 0.10801012381147822, "grad_norm": 2.0204056446173126, "learning_rate": 9.841369349137832e-06, "loss": 0.2818, "step": 1579 }, { "epoch": 0.10807852794308777, "grad_norm": 1.5221570600521466, "learning_rate": 9.841092411917089e-06, "loss": 0.2908, "step": 1580 }, { "epoch": 0.1081469320746973, "grad_norm": 2.5083077556683198, "learning_rate": 9.84081523707215e-06, "loss": 0.614, "step": 1581 }, { "epoch": 0.10821533620630686, "grad_norm": 2.071900589958949, "learning_rate": 9.840537824616623e-06, "loss": 0.3442, "step": 1582 }, { "epoch": 0.1082837403379164, "grad_norm": 2.6459180613090147, "learning_rate": 9.840260174564128e-06, "loss": 0.7893, "step": 1583 }, { "epoch": 0.10835214446952596, "grad_norm": 2.534134428175068, "learning_rate": 9.839982286928287e-06, "loss": 0.4244, "step": 1584 }, { "epoch": 0.10842054860113551, "grad_norm": 2.169970747897757, "learning_rate": 9.839704161722745e-06, "loss": 0.4631, "step": 1585 }, { "epoch": 0.10848895273274506, "grad_norm": 2.789877150139894, "learning_rate": 9.839425798961152e-06, "loss": 0.1902, "step": 1586 }, { "epoch": 0.10855735686435461, "grad_norm": 2.453387197829872, "learning_rate": 9.83914719865717e-06, "loss": 0.5626, "step": 1587 }, { "epoch": 0.10862576099596416, "grad_norm": 1.9562835787977744, "learning_rate": 9.838868360824478e-06, "loss": 0.1898, "step": 1588 }, { "epoch": 0.10869416512757371, "grad_norm": 1.9245444731069536, "learning_rate": 9.838589285476759e-06, "loss": 0.2971, "step": 1589 }, { "epoch": 0.10876256925918326, "grad_norm": 2.145212419652392, "learning_rate": 9.838309972627711e-06, "loss": 0.3154, "step": 1590 }, { "epoch": 0.1088309733907928, "grad_norm": 3.586784146493143, "learning_rate": 9.838030422291047e-06, "loss": 0.8206, "step": 1591 }, { "epoch": 0.10889937752240235, "grad_norm": 2.6283983133832964, "learning_rate": 9.837750634480487e-06, "loss": 0.6194, "step": 1592 }, { "epoch": 0.1089677816540119, "grad_norm": 2.086446228569926, "learning_rate": 9.837470609209765e-06, "loss": 0.4056, "step": 1593 }, { "epoch": 0.10903618578562145, "grad_norm": 2.4267964626742096, "learning_rate": 9.837190346492626e-06, "loss": 0.5023, "step": 1594 }, { "epoch": 0.109104589917231, "grad_norm": 2.180582259369662, "learning_rate": 9.836909846342826e-06, "loss": 0.3814, "step": 1595 }, { "epoch": 0.10917299404884055, "grad_norm": 2.913455189069457, "learning_rate": 9.836629108774132e-06, "loss": 0.5542, "step": 1596 }, { "epoch": 0.1092413981804501, "grad_norm": 2.401473286174761, "learning_rate": 9.836348133800327e-06, "loss": 0.2284, "step": 1597 }, { "epoch": 0.10930980231205965, "grad_norm": 3.2336257859251614, "learning_rate": 9.8360669214352e-06, "loss": 0.3631, "step": 1598 }, { "epoch": 0.1093782064436692, "grad_norm": 1.951005953434572, "learning_rate": 9.835785471692559e-06, "loss": 0.244, "step": 1599 }, { "epoch": 0.10944661057527874, "grad_norm": 1.9546677066218585, "learning_rate": 9.835503784586211e-06, "loss": 0.3818, "step": 1600 }, { "epoch": 0.10951501470688829, "grad_norm": 2.6321140857138183, "learning_rate": 9.835221860129989e-06, "loss": 0.849, "step": 1601 }, { "epoch": 0.10958341883849784, "grad_norm": 2.806099950415127, "learning_rate": 9.834939698337728e-06, "loss": 0.1882, "step": 1602 }, { "epoch": 0.1096518229701074, "grad_norm": 2.121259482637339, "learning_rate": 9.834657299223279e-06, "loss": 0.3059, "step": 1603 }, { "epoch": 0.10972022710171694, "grad_norm": 1.873378575124528, "learning_rate": 9.834374662800504e-06, "loss": 0.3548, "step": 1604 }, { "epoch": 0.1097886312333265, "grad_norm": 2.4630275388774128, "learning_rate": 9.834091789083276e-06, "loss": 0.6196, "step": 1605 }, { "epoch": 0.10985703536493605, "grad_norm": 1.9081966513267197, "learning_rate": 9.833808678085478e-06, "loss": 0.2863, "step": 1606 }, { "epoch": 0.1099254394965456, "grad_norm": 2.539489894988604, "learning_rate": 9.833525329821008e-06, "loss": 0.5827, "step": 1607 }, { "epoch": 0.10999384362815515, "grad_norm": 2.1749111538312444, "learning_rate": 9.833241744303775e-06, "loss": 0.3605, "step": 1608 }, { "epoch": 0.11006224775976468, "grad_norm": 2.0765956614812264, "learning_rate": 9.832957921547697e-06, "loss": 0.4388, "step": 1609 }, { "epoch": 0.11013065189137423, "grad_norm": 2.7678804468017675, "learning_rate": 9.832673861566706e-06, "loss": 0.8241, "step": 1610 }, { "epoch": 0.11019905602298379, "grad_norm": 2.528471348783112, "learning_rate": 9.832389564374745e-06, "loss": 0.4448, "step": 1611 }, { "epoch": 0.11026746015459334, "grad_norm": 1.3655549465691437, "learning_rate": 9.83210502998577e-06, "loss": 0.205, "step": 1612 }, { "epoch": 0.11033586428620289, "grad_norm": 2.102857910161409, "learning_rate": 9.831820258413744e-06, "loss": 0.2416, "step": 1613 }, { "epoch": 0.11040426841781244, "grad_norm": 2.093392701051883, "learning_rate": 9.831535249672651e-06, "loss": 0.52, "step": 1614 }, { "epoch": 0.11047267254942199, "grad_norm": 2.1037084584942702, "learning_rate": 9.831250003776473e-06, "loss": 0.4265, "step": 1615 }, { "epoch": 0.11054107668103154, "grad_norm": 2.255203191268263, "learning_rate": 9.830964520739216e-06, "loss": 0.4355, "step": 1616 }, { "epoch": 0.11060948081264109, "grad_norm": 2.309419248773874, "learning_rate": 9.830678800574891e-06, "loss": 0.5338, "step": 1617 }, { "epoch": 0.11067788494425063, "grad_norm": 1.658276820258206, "learning_rate": 9.830392843297525e-06, "loss": 0.3878, "step": 1618 }, { "epoch": 0.11074628907586018, "grad_norm": 2.0822365947610884, "learning_rate": 9.830106648921152e-06, "loss": 0.3906, "step": 1619 }, { "epoch": 0.11081469320746973, "grad_norm": 2.1945547547376862, "learning_rate": 9.82982021745982e-06, "loss": 0.2675, "step": 1620 }, { "epoch": 0.11088309733907928, "grad_norm": 2.267679151441602, "learning_rate": 9.82953354892759e-06, "loss": 0.6074, "step": 1621 }, { "epoch": 0.11095150147068883, "grad_norm": 2.030566439786606, "learning_rate": 9.82924664333853e-06, "loss": 0.3783, "step": 1622 }, { "epoch": 0.11101990560229838, "grad_norm": 1.7114296187984959, "learning_rate": 9.828959500706725e-06, "loss": 0.2884, "step": 1623 }, { "epoch": 0.11108830973390793, "grad_norm": 2.527280461788037, "learning_rate": 9.82867212104627e-06, "loss": 0.6023, "step": 1624 }, { "epoch": 0.11115671386551748, "grad_norm": 3.0032661311542714, "learning_rate": 9.82838450437127e-06, "loss": 0.3831, "step": 1625 }, { "epoch": 0.11122511799712703, "grad_norm": 2.4512675840378355, "learning_rate": 9.82809665069584e-06, "loss": 0.3929, "step": 1626 }, { "epoch": 0.11129352212873657, "grad_norm": 1.9783954828657422, "learning_rate": 9.827808560034113e-06, "loss": 0.3396, "step": 1627 }, { "epoch": 0.11136192626034612, "grad_norm": 2.4477850445742058, "learning_rate": 9.82752023240023e-06, "loss": 0.5224, "step": 1628 }, { "epoch": 0.11143033039195567, "grad_norm": 2.315474198845485, "learning_rate": 9.82723166780834e-06, "loss": 0.533, "step": 1629 }, { "epoch": 0.11149873452356522, "grad_norm": 2.3347678046801104, "learning_rate": 9.82694286627261e-06, "loss": 0.3988, "step": 1630 }, { "epoch": 0.11156713865517477, "grad_norm": 1.913443414646884, "learning_rate": 9.826653827807215e-06, "loss": 0.4146, "step": 1631 }, { "epoch": 0.11163554278678432, "grad_norm": 2.13630517447499, "learning_rate": 9.826364552426345e-06, "loss": 0.5623, "step": 1632 }, { "epoch": 0.11170394691839387, "grad_norm": 2.2440771852561796, "learning_rate": 9.826075040144194e-06, "loss": 0.578, "step": 1633 }, { "epoch": 0.11177235105000342, "grad_norm": 2.340041647361128, "learning_rate": 9.825785290974975e-06, "loss": 0.568, "step": 1634 }, { "epoch": 0.11184075518161297, "grad_norm": 2.0620074261948793, "learning_rate": 9.82549530493291e-06, "loss": 0.4915, "step": 1635 }, { "epoch": 0.11190915931322253, "grad_norm": 2.797958185584247, "learning_rate": 9.825205082032235e-06, "loss": 0.7703, "step": 1636 }, { "epoch": 0.11197756344483206, "grad_norm": 3.8979122518979628, "learning_rate": 9.824914622287195e-06, "loss": 0.2177, "step": 1637 }, { "epoch": 0.11204596757644161, "grad_norm": 2.6592752563472852, "learning_rate": 9.824623925712042e-06, "loss": 0.3804, "step": 1638 }, { "epoch": 0.11211437170805116, "grad_norm": 1.7092489680774894, "learning_rate": 9.824332992321052e-06, "loss": 0.2083, "step": 1639 }, { "epoch": 0.11218277583966071, "grad_norm": 2.2255393906764525, "learning_rate": 9.8240418221285e-06, "loss": 0.5422, "step": 1640 }, { "epoch": 0.11225117997127027, "grad_norm": 1.830266566228542, "learning_rate": 9.823750415148683e-06, "loss": 0.2469, "step": 1641 }, { "epoch": 0.11231958410287982, "grad_norm": 2.4081317306293806, "learning_rate": 9.8234587713959e-06, "loss": 0.6026, "step": 1642 }, { "epoch": 0.11238798823448937, "grad_norm": 2.9697902864533026, "learning_rate": 9.82316689088447e-06, "loss": 0.932, "step": 1643 }, { "epoch": 0.11245639236609892, "grad_norm": 2.345277406174529, "learning_rate": 9.822874773628717e-06, "loss": 0.5104, "step": 1644 }, { "epoch": 0.11252479649770847, "grad_norm": 2.043501166410559, "learning_rate": 9.82258241964298e-06, "loss": 0.4529, "step": 1645 }, { "epoch": 0.112593200629318, "grad_norm": 2.1390236134195444, "learning_rate": 9.822289828941612e-06, "loss": 0.5291, "step": 1646 }, { "epoch": 0.11266160476092756, "grad_norm": 2.254748727439008, "learning_rate": 9.82199700153897e-06, "loss": 0.5042, "step": 1647 }, { "epoch": 0.1127300088925371, "grad_norm": 1.9469919117562153, "learning_rate": 9.821703937449431e-06, "loss": 0.4333, "step": 1648 }, { "epoch": 0.11279841302414666, "grad_norm": 2.418546863276165, "learning_rate": 9.82141063668738e-06, "loss": 0.6881, "step": 1649 }, { "epoch": 0.11286681715575621, "grad_norm": 1.5779074197853638, "learning_rate": 9.821117099267211e-06, "loss": 0.2242, "step": 1650 }, { "epoch": 0.11293522128736576, "grad_norm": 2.000053781292932, "learning_rate": 9.820823325203335e-06, "loss": 0.5178, "step": 1651 }, { "epoch": 0.11300362541897531, "grad_norm": 2.791138493454162, "learning_rate": 9.82052931451017e-06, "loss": 0.6862, "step": 1652 }, { "epoch": 0.11307202955058486, "grad_norm": 2.4241648523877504, "learning_rate": 9.820235067202148e-06, "loss": 0.4801, "step": 1653 }, { "epoch": 0.11314043368219441, "grad_norm": 1.4390413505375177, "learning_rate": 9.819940583293713e-06, "loss": 0.1785, "step": 1654 }, { "epoch": 0.11320883781380395, "grad_norm": 2.1650781761287528, "learning_rate": 9.819645862799318e-06, "loss": 0.4248, "step": 1655 }, { "epoch": 0.1132772419454135, "grad_norm": 2.043243364217847, "learning_rate": 9.819350905733431e-06, "loss": 0.387, "step": 1656 }, { "epoch": 0.11334564607702305, "grad_norm": 1.7849907078354195, "learning_rate": 9.81905571211053e-06, "loss": 0.228, "step": 1657 }, { "epoch": 0.1134140502086326, "grad_norm": 2.601295216894652, "learning_rate": 9.818760281945102e-06, "loss": 0.6696, "step": 1658 }, { "epoch": 0.11348245434024215, "grad_norm": 3.6842698319222262, "learning_rate": 9.81846461525165e-06, "loss": 0.2927, "step": 1659 }, { "epoch": 0.1135508584718517, "grad_norm": 2.3141133892019643, "learning_rate": 9.818168712044687e-06, "loss": 0.6132, "step": 1660 }, { "epoch": 0.11361926260346125, "grad_norm": 1.7471921269535158, "learning_rate": 9.817872572338737e-06, "loss": 0.2321, "step": 1661 }, { "epoch": 0.1136876667350708, "grad_norm": 1.9032919124639385, "learning_rate": 9.817576196148334e-06, "loss": 0.2749, "step": 1662 }, { "epoch": 0.11375607086668035, "grad_norm": 1.5821402394463524, "learning_rate": 9.81727958348803e-06, "loss": 0.1685, "step": 1663 }, { "epoch": 0.11382447499828989, "grad_norm": 2.0897035498250167, "learning_rate": 9.816982734372382e-06, "loss": 0.3918, "step": 1664 }, { "epoch": 0.11389287912989944, "grad_norm": 1.9499756408050257, "learning_rate": 9.81668564881596e-06, "loss": 0.425, "step": 1665 }, { "epoch": 0.11396128326150899, "grad_norm": 1.991125908533325, "learning_rate": 9.816388326833347e-06, "loss": 0.3657, "step": 1666 }, { "epoch": 0.11402968739311854, "grad_norm": 23.431496138563237, "learning_rate": 9.816090768439135e-06, "loss": 0.4279, "step": 1667 }, { "epoch": 0.11409809152472809, "grad_norm": 1.9053882738286823, "learning_rate": 9.81579297364793e-06, "loss": 0.3353, "step": 1668 }, { "epoch": 0.11416649565633764, "grad_norm": 2.6282644522882896, "learning_rate": 9.815494942474354e-06, "loss": 0.3517, "step": 1669 }, { "epoch": 0.1142348997879472, "grad_norm": 1.9057924823784955, "learning_rate": 9.815196674933031e-06, "loss": 0.3299, "step": 1670 }, { "epoch": 0.11430330391955675, "grad_norm": 2.339522350826572, "learning_rate": 9.814898171038604e-06, "loss": 0.4534, "step": 1671 }, { "epoch": 0.1143717080511663, "grad_norm": 3.0907840696502964, "learning_rate": 9.814599430805722e-06, "loss": 0.4096, "step": 1672 }, { "epoch": 0.11444011218277583, "grad_norm": 2.0853490428046504, "learning_rate": 9.814300454249053e-06, "loss": 0.3868, "step": 1673 }, { "epoch": 0.11450851631438538, "grad_norm": 2.8641138362497967, "learning_rate": 9.814001241383267e-06, "loss": 0.7177, "step": 1674 }, { "epoch": 0.11457692044599493, "grad_norm": 1.9261626819521958, "learning_rate": 9.813701792223056e-06, "loss": 0.4124, "step": 1675 }, { "epoch": 0.11464532457760448, "grad_norm": 1.5547266313046735, "learning_rate": 9.813402106783115e-06, "loss": 0.1787, "step": 1676 }, { "epoch": 0.11471372870921404, "grad_norm": 2.2214740565787974, "learning_rate": 9.813102185078154e-06, "loss": 0.6182, "step": 1677 }, { "epoch": 0.11478213284082359, "grad_norm": 1.8385601587599305, "learning_rate": 9.812802027122897e-06, "loss": 0.2369, "step": 1678 }, { "epoch": 0.11485053697243314, "grad_norm": 2.5012173788068135, "learning_rate": 9.812501632932074e-06, "loss": 0.6116, "step": 1679 }, { "epoch": 0.11491894110404269, "grad_norm": 1.9209674828614183, "learning_rate": 9.812201002520431e-06, "loss": 0.255, "step": 1680 }, { "epoch": 0.11498734523565224, "grad_norm": 1.8976970963461754, "learning_rate": 9.811900135902728e-06, "loss": 0.3529, "step": 1681 }, { "epoch": 0.11505574936726179, "grad_norm": 1.433152065786788, "learning_rate": 9.811599033093728e-06, "loss": 0.2407, "step": 1682 }, { "epoch": 0.11512415349887133, "grad_norm": 2.333138029121911, "learning_rate": 9.811297694108212e-06, "loss": 0.5837, "step": 1683 }, { "epoch": 0.11519255763048088, "grad_norm": 2.080994004108149, "learning_rate": 9.810996118960972e-06, "loss": 0.2852, "step": 1684 }, { "epoch": 0.11526096176209043, "grad_norm": 2.519288114896319, "learning_rate": 9.810694307666811e-06, "loss": 0.4851, "step": 1685 }, { "epoch": 0.11532936589369998, "grad_norm": 2.2744058008563957, "learning_rate": 9.810392260240543e-06, "loss": 0.411, "step": 1686 }, { "epoch": 0.11539777002530953, "grad_norm": 2.27705772789196, "learning_rate": 9.810089976696992e-06, "loss": 0.5342, "step": 1687 }, { "epoch": 0.11546617415691908, "grad_norm": 1.6914519090194131, "learning_rate": 9.809787457051e-06, "loss": 0.2866, "step": 1688 }, { "epoch": 0.11553457828852863, "grad_norm": 2.535233494973369, "learning_rate": 9.809484701317411e-06, "loss": 0.6733, "step": 1689 }, { "epoch": 0.11560298242013818, "grad_norm": 2.4876315857238605, "learning_rate": 9.80918170951109e-06, "loss": 0.5148, "step": 1690 }, { "epoch": 0.11567138655174773, "grad_norm": 1.894256882611363, "learning_rate": 9.808878481646905e-06, "loss": 0.3596, "step": 1691 }, { "epoch": 0.11573979068335727, "grad_norm": 2.1438819896058483, "learning_rate": 9.808575017739743e-06, "loss": 0.4783, "step": 1692 }, { "epoch": 0.11580819481496682, "grad_norm": 2.02942219439942, "learning_rate": 9.8082713178045e-06, "loss": 0.3403, "step": 1693 }, { "epoch": 0.11587659894657637, "grad_norm": 2.4078562162566, "learning_rate": 9.807967381856081e-06, "loss": 0.3911, "step": 1694 }, { "epoch": 0.11594500307818592, "grad_norm": 1.9409140206542737, "learning_rate": 9.807663209909406e-06, "loss": 0.2235, "step": 1695 }, { "epoch": 0.11601340720979547, "grad_norm": 2.1277899238866684, "learning_rate": 9.807358801979404e-06, "loss": 0.4806, "step": 1696 }, { "epoch": 0.11608181134140502, "grad_norm": 2.2772490152902765, "learning_rate": 9.807054158081017e-06, "loss": 0.4707, "step": 1697 }, { "epoch": 0.11615021547301457, "grad_norm": 2.8530489561511594, "learning_rate": 9.806749278229201e-06, "loss": 0.5371, "step": 1698 }, { "epoch": 0.11621861960462412, "grad_norm": 1.8073370647958118, "learning_rate": 9.806444162438917e-06, "loss": 0.1848, "step": 1699 }, { "epoch": 0.11628702373623367, "grad_norm": 2.36512502831759, "learning_rate": 9.806138810725144e-06, "loss": 0.6134, "step": 1700 }, { "epoch": 0.11635542786784321, "grad_norm": 2.2938706250656486, "learning_rate": 9.80583322310287e-06, "loss": 0.3959, "step": 1701 }, { "epoch": 0.11642383199945276, "grad_norm": 1.5478858703276868, "learning_rate": 9.805527399587093e-06, "loss": 0.213, "step": 1702 }, { "epoch": 0.11649223613106231, "grad_norm": 2.190439418953084, "learning_rate": 9.805221340192828e-06, "loss": 0.5303, "step": 1703 }, { "epoch": 0.11656064026267186, "grad_norm": 2.3430635845838443, "learning_rate": 9.804915044935092e-06, "loss": 0.4443, "step": 1704 }, { "epoch": 0.11662904439428141, "grad_norm": 2.1372385386566726, "learning_rate": 9.804608513828927e-06, "loss": 0.3219, "step": 1705 }, { "epoch": 0.11669744852589096, "grad_norm": 2.4888815163274214, "learning_rate": 9.804301746889372e-06, "loss": 0.4967, "step": 1706 }, { "epoch": 0.11676585265750052, "grad_norm": 2.37066576141466, "learning_rate": 9.803994744131488e-06, "loss": 0.3443, "step": 1707 }, { "epoch": 0.11683425678911007, "grad_norm": 2.496703513681642, "learning_rate": 9.803687505570343e-06, "loss": 0.6241, "step": 1708 }, { "epoch": 0.11690266092071962, "grad_norm": 3.915655227020817, "learning_rate": 9.80338003122102e-06, "loss": 0.5906, "step": 1709 }, { "epoch": 0.11697106505232915, "grad_norm": 2.600585685383308, "learning_rate": 9.803072321098609e-06, "loss": 0.6715, "step": 1710 }, { "epoch": 0.1170394691839387, "grad_norm": 2.118868670614508, "learning_rate": 9.802764375218215e-06, "loss": 0.5929, "step": 1711 }, { "epoch": 0.11710787331554826, "grad_norm": 2.2519605657978983, "learning_rate": 9.802456193594953e-06, "loss": 0.4869, "step": 1712 }, { "epoch": 0.1171762774471578, "grad_norm": 2.7464412453066855, "learning_rate": 9.80214777624395e-06, "loss": 0.4547, "step": 1713 }, { "epoch": 0.11724468157876736, "grad_norm": 2.397192342905208, "learning_rate": 9.801839123180346e-06, "loss": 0.6543, "step": 1714 }, { "epoch": 0.11731308571037691, "grad_norm": 2.6837934672349557, "learning_rate": 9.801530234419288e-06, "loss": 0.8123, "step": 1715 }, { "epoch": 0.11738148984198646, "grad_norm": 1.9832942527705997, "learning_rate": 9.80122110997594e-06, "loss": 0.3899, "step": 1716 }, { "epoch": 0.11744989397359601, "grad_norm": 1.7796538546549305, "learning_rate": 9.800911749865476e-06, "loss": 0.3511, "step": 1717 }, { "epoch": 0.11751829810520556, "grad_norm": 1.9854728072522685, "learning_rate": 9.80060215410308e-06, "loss": 0.3445, "step": 1718 }, { "epoch": 0.11758670223681511, "grad_norm": 2.184673636000553, "learning_rate": 9.800292322703949e-06, "loss": 0.6327, "step": 1719 }, { "epoch": 0.11765510636842465, "grad_norm": 2.3844987499676127, "learning_rate": 9.799982255683289e-06, "loss": 0.4964, "step": 1720 }, { "epoch": 0.1177235105000342, "grad_norm": 1.7959190355025108, "learning_rate": 9.799671953056322e-06, "loss": 0.2555, "step": 1721 }, { "epoch": 0.11779191463164375, "grad_norm": 1.9754835056030533, "learning_rate": 9.799361414838277e-06, "loss": 0.3434, "step": 1722 }, { "epoch": 0.1178603187632533, "grad_norm": 1.9175639764540056, "learning_rate": 9.799050641044398e-06, "loss": 0.3121, "step": 1723 }, { "epoch": 0.11792872289486285, "grad_norm": 2.078255369643166, "learning_rate": 9.79873963168994e-06, "loss": 0.4852, "step": 1724 }, { "epoch": 0.1179971270264724, "grad_norm": 2.3280549454869974, "learning_rate": 9.79842838679017e-06, "loss": 0.7114, "step": 1725 }, { "epoch": 0.11806553115808195, "grad_norm": 2.0587246037105564, "learning_rate": 9.79811690636036e-06, "loss": 0.5125, "step": 1726 }, { "epoch": 0.1181339352896915, "grad_norm": 2.4104357855628478, "learning_rate": 9.797805190415803e-06, "loss": 0.4539, "step": 1727 }, { "epoch": 0.11820233942130105, "grad_norm": 2.4686744412798443, "learning_rate": 9.7974932389718e-06, "loss": 0.5542, "step": 1728 }, { "epoch": 0.11827074355291059, "grad_norm": 2.355549350706986, "learning_rate": 9.79718105204366e-06, "loss": 0.5929, "step": 1729 }, { "epoch": 0.11833914768452014, "grad_norm": 1.684169644238789, "learning_rate": 9.79686862964671e-06, "loss": 0.335, "step": 1730 }, { "epoch": 0.11840755181612969, "grad_norm": 0.9768790399616731, "learning_rate": 9.796555971796285e-06, "loss": 0.1433, "step": 1731 }, { "epoch": 0.11847595594773924, "grad_norm": 2.1859242690410574, "learning_rate": 9.796243078507729e-06, "loss": 0.6255, "step": 1732 }, { "epoch": 0.11854436007934879, "grad_norm": 2.0089005331323175, "learning_rate": 9.795929949796402e-06, "loss": 0.5419, "step": 1733 }, { "epoch": 0.11861276421095834, "grad_norm": 1.5152375178195099, "learning_rate": 9.795616585677674e-06, "loss": 0.2619, "step": 1734 }, { "epoch": 0.1186811683425679, "grad_norm": 1.7628480755239522, "learning_rate": 9.795302986166926e-06, "loss": 0.2653, "step": 1735 }, { "epoch": 0.11874957247417744, "grad_norm": 2.0094387022098306, "learning_rate": 9.794989151279553e-06, "loss": 0.3386, "step": 1736 }, { "epoch": 0.118817976605787, "grad_norm": 2.0234618829856115, "learning_rate": 9.794675081030956e-06, "loss": 0.4995, "step": 1737 }, { "epoch": 0.11888638073739653, "grad_norm": 1.9845657932743246, "learning_rate": 9.794360775436552e-06, "loss": 0.4268, "step": 1738 }, { "epoch": 0.11895478486900608, "grad_norm": 1.6586448027290883, "learning_rate": 9.79404623451177e-06, "loss": 0.4002, "step": 1739 }, { "epoch": 0.11902318900061563, "grad_norm": 2.2667965162093857, "learning_rate": 9.793731458272049e-06, "loss": 0.5886, "step": 1740 }, { "epoch": 0.11909159313222518, "grad_norm": 2.7992274870591087, "learning_rate": 9.79341644673284e-06, "loss": 0.9156, "step": 1741 }, { "epoch": 0.11915999726383474, "grad_norm": 2.2225615174497926, "learning_rate": 9.793101199909603e-06, "loss": 0.6246, "step": 1742 }, { "epoch": 0.11922840139544429, "grad_norm": 2.5844211511976996, "learning_rate": 9.792785717817815e-06, "loss": 0.4368, "step": 1743 }, { "epoch": 0.11929680552705384, "grad_norm": 2.518983358501225, "learning_rate": 9.792470000472957e-06, "loss": 0.583, "step": 1744 }, { "epoch": 0.11936520965866339, "grad_norm": 3.306673855285047, "learning_rate": 9.792154047890531e-06, "loss": 0.4327, "step": 1745 }, { "epoch": 0.11943361379027294, "grad_norm": 2.863120138259554, "learning_rate": 9.791837860086045e-06, "loss": 0.3093, "step": 1746 }, { "epoch": 0.11950201792188248, "grad_norm": 2.402345815518119, "learning_rate": 9.791521437075015e-06, "loss": 0.5829, "step": 1747 }, { "epoch": 0.11957042205349203, "grad_norm": 2.1330317080685437, "learning_rate": 9.791204778872976e-06, "loss": 0.421, "step": 1748 }, { "epoch": 0.11963882618510158, "grad_norm": 1.813491369298944, "learning_rate": 9.790887885495467e-06, "loss": 0.3755, "step": 1749 }, { "epoch": 0.11970723031671113, "grad_norm": 2.495311342811197, "learning_rate": 9.79057075695805e-06, "loss": 0.4408, "step": 1750 }, { "epoch": 0.11977563444832068, "grad_norm": 3.4206838559545316, "learning_rate": 9.790253393276283e-06, "loss": 0.3239, "step": 1751 }, { "epoch": 0.11984403857993023, "grad_norm": 1.7960691718319302, "learning_rate": 9.78993579446575e-06, "loss": 0.4056, "step": 1752 }, { "epoch": 0.11991244271153978, "grad_norm": 2.313429719668884, "learning_rate": 9.789617960542037e-06, "loss": 0.2841, "step": 1753 }, { "epoch": 0.11998084684314933, "grad_norm": 1.7678841847529967, "learning_rate": 9.789299891520746e-06, "loss": 0.4583, "step": 1754 }, { "epoch": 0.12004925097475888, "grad_norm": 6.822404363681085, "learning_rate": 9.788981587417488e-06, "loss": 0.5134, "step": 1755 }, { "epoch": 0.12011765510636842, "grad_norm": 2.6467782058041642, "learning_rate": 9.788663048247888e-06, "loss": 0.7722, "step": 1756 }, { "epoch": 0.12018605923797797, "grad_norm": 2.250850688865531, "learning_rate": 9.788344274027584e-06, "loss": 0.6018, "step": 1757 }, { "epoch": 0.12025446336958752, "grad_norm": 2.6035055459379026, "learning_rate": 9.788025264772218e-06, "loss": 0.5506, "step": 1758 }, { "epoch": 0.12032286750119707, "grad_norm": 2.6178645923491226, "learning_rate": 9.787706020497451e-06, "loss": 0.5048, "step": 1759 }, { "epoch": 0.12039127163280662, "grad_norm": 2.886214257370465, "learning_rate": 9.787386541218953e-06, "loss": 0.8323, "step": 1760 }, { "epoch": 0.12045967576441617, "grad_norm": 2.241947373880213, "learning_rate": 9.787066826952407e-06, "loss": 0.4842, "step": 1761 }, { "epoch": 0.12052807989602572, "grad_norm": 2.0668983594614287, "learning_rate": 9.786746877713502e-06, "loss": 0.4695, "step": 1762 }, { "epoch": 0.12059648402763527, "grad_norm": 2.9244265100092823, "learning_rate": 9.786426693517947e-06, "loss": 0.8329, "step": 1763 }, { "epoch": 0.12066488815924482, "grad_norm": 1.9998872546365283, "learning_rate": 9.786106274381457e-06, "loss": 0.4922, "step": 1764 }, { "epoch": 0.12073329229085437, "grad_norm": 1.6181650662462232, "learning_rate": 9.785785620319757e-06, "loss": 0.202, "step": 1765 }, { "epoch": 0.12080169642246391, "grad_norm": 2.481171374792831, "learning_rate": 9.78546473134859e-06, "loss": 0.4245, "step": 1766 }, { "epoch": 0.12087010055407346, "grad_norm": 2.2162378959196136, "learning_rate": 9.785143607483705e-06, "loss": 0.3422, "step": 1767 }, { "epoch": 0.12093850468568301, "grad_norm": 2.2038801671066497, "learning_rate": 9.784822248740864e-06, "loss": 0.5456, "step": 1768 }, { "epoch": 0.12100690881729256, "grad_norm": 2.3740356520319326, "learning_rate": 9.784500655135842e-06, "loss": 0.6282, "step": 1769 }, { "epoch": 0.12107531294890211, "grad_norm": 2.042023814032604, "learning_rate": 9.784178826684422e-06, "loss": 0.2763, "step": 1770 }, { "epoch": 0.12114371708051166, "grad_norm": 1.7394728101999544, "learning_rate": 9.783856763402405e-06, "loss": 0.4007, "step": 1771 }, { "epoch": 0.12121212121212122, "grad_norm": 2.3005327566567786, "learning_rate": 9.783534465305598e-06, "loss": 0.4674, "step": 1772 }, { "epoch": 0.12128052534373077, "grad_norm": 2.589964062642088, "learning_rate": 9.783211932409818e-06, "loss": 0.6372, "step": 1773 }, { "epoch": 0.12134892947534032, "grad_norm": 2.3807105064008516, "learning_rate": 9.782889164730897e-06, "loss": 0.624, "step": 1774 }, { "epoch": 0.12141733360694985, "grad_norm": 2.4351230916472564, "learning_rate": 9.782566162284681e-06, "loss": 0.5733, "step": 1775 }, { "epoch": 0.1214857377385594, "grad_norm": 1.8233835812191066, "learning_rate": 9.782242925087024e-06, "loss": 0.3928, "step": 1776 }, { "epoch": 0.12155414187016896, "grad_norm": 2.4006553361163263, "learning_rate": 9.78191945315379e-06, "loss": 0.6362, "step": 1777 }, { "epoch": 0.1216225460017785, "grad_norm": 2.752909443413117, "learning_rate": 9.781595746500857e-06, "loss": 0.5138, "step": 1778 }, { "epoch": 0.12169095013338806, "grad_norm": 1.7663924566352103, "learning_rate": 9.781271805144115e-06, "loss": 0.1713, "step": 1779 }, { "epoch": 0.12175935426499761, "grad_norm": 1.7707394902290245, "learning_rate": 9.780947629099466e-06, "loss": 0.3899, "step": 1780 }, { "epoch": 0.12182775839660716, "grad_norm": 2.492836559763443, "learning_rate": 9.78062321838282e-06, "loss": 0.3878, "step": 1781 }, { "epoch": 0.12189616252821671, "grad_norm": 2.155463185705875, "learning_rate": 9.7802985730101e-06, "loss": 0.5471, "step": 1782 }, { "epoch": 0.12196456665982626, "grad_norm": 2.207539558952624, "learning_rate": 9.77997369299724e-06, "loss": 0.4865, "step": 1783 }, { "epoch": 0.1220329707914358, "grad_norm": 2.0783071940011353, "learning_rate": 9.779648578360192e-06, "loss": 0.5116, "step": 1784 }, { "epoch": 0.12210137492304535, "grad_norm": 1.5668197832736765, "learning_rate": 9.77932322911491e-06, "loss": 0.2819, "step": 1785 }, { "epoch": 0.1221697790546549, "grad_norm": 3.433998474640638, "learning_rate": 9.778997645277366e-06, "loss": 0.335, "step": 1786 }, { "epoch": 0.12223818318626445, "grad_norm": 2.5051422401216827, "learning_rate": 9.778671826863538e-06, "loss": 0.6743, "step": 1787 }, { "epoch": 0.122306587317874, "grad_norm": 2.349419672562344, "learning_rate": 9.778345773889422e-06, "loss": 0.4124, "step": 1788 }, { "epoch": 0.12237499144948355, "grad_norm": 1.990845871743225, "learning_rate": 9.778019486371022e-06, "loss": 0.4132, "step": 1789 }, { "epoch": 0.1224433955810931, "grad_norm": 2.455006576381662, "learning_rate": 9.77769296432435e-06, "loss": 0.7719, "step": 1790 }, { "epoch": 0.12251179971270265, "grad_norm": 3.1525412761155085, "learning_rate": 9.777366207765438e-06, "loss": 0.6021, "step": 1791 }, { "epoch": 0.1225802038443122, "grad_norm": 1.9231927858579843, "learning_rate": 9.777039216710322e-06, "loss": 0.2783, "step": 1792 }, { "epoch": 0.12264860797592174, "grad_norm": 2.736840831092922, "learning_rate": 9.776711991175054e-06, "loss": 0.803, "step": 1793 }, { "epoch": 0.12271701210753129, "grad_norm": 2.5923053785933954, "learning_rate": 9.776384531175696e-06, "loss": 0.5786, "step": 1794 }, { "epoch": 0.12278541623914084, "grad_norm": 1.5955139711064785, "learning_rate": 9.776056836728318e-06, "loss": 0.2884, "step": 1795 }, { "epoch": 0.12285382037075039, "grad_norm": 2.6712857260254026, "learning_rate": 9.775728907849008e-06, "loss": 0.5484, "step": 1796 }, { "epoch": 0.12292222450235994, "grad_norm": 1.7740108764633804, "learning_rate": 9.775400744553863e-06, "loss": 0.2463, "step": 1797 }, { "epoch": 0.12299062863396949, "grad_norm": 2.115089334039658, "learning_rate": 9.775072346858986e-06, "loss": 0.5128, "step": 1798 }, { "epoch": 0.12305903276557904, "grad_norm": 2.2157971249372097, "learning_rate": 9.774743714780502e-06, "loss": 0.5534, "step": 1799 }, { "epoch": 0.1231274368971886, "grad_norm": 3.213693475563733, "learning_rate": 9.77441484833454e-06, "loss": 0.7504, "step": 1800 }, { "epoch": 0.12319584102879814, "grad_norm": 2.5304850906075305, "learning_rate": 9.774085747537241e-06, "loss": 0.6191, "step": 1801 }, { "epoch": 0.1232642451604077, "grad_norm": 2.599927646290457, "learning_rate": 9.773756412404757e-06, "loss": 0.309, "step": 1802 }, { "epoch": 0.12333264929201723, "grad_norm": 2.8188844748094146, "learning_rate": 9.77342684295326e-06, "loss": 0.2909, "step": 1803 }, { "epoch": 0.12340105342362678, "grad_norm": 1.9900130653311872, "learning_rate": 9.773097039198921e-06, "loss": 0.3283, "step": 1804 }, { "epoch": 0.12346945755523633, "grad_norm": 2.380963424148949, "learning_rate": 9.772767001157932e-06, "loss": 0.4588, "step": 1805 }, { "epoch": 0.12353786168684588, "grad_norm": 1.9563996634042284, "learning_rate": 9.772436728846488e-06, "loss": 0.3247, "step": 1806 }, { "epoch": 0.12360626581845544, "grad_norm": 3.1993491131092386, "learning_rate": 9.772106222280806e-06, "loss": 0.4369, "step": 1807 }, { "epoch": 0.12367466995006499, "grad_norm": 2.506287928363212, "learning_rate": 9.771775481477105e-06, "loss": 0.206, "step": 1808 }, { "epoch": 0.12374307408167454, "grad_norm": 2.2130445744967813, "learning_rate": 9.771444506451622e-06, "loss": 0.4588, "step": 1809 }, { "epoch": 0.12381147821328409, "grad_norm": 1.4051948827613976, "learning_rate": 9.771113297220601e-06, "loss": 0.1764, "step": 1810 }, { "epoch": 0.12387988234489364, "grad_norm": 2.5125227096821834, "learning_rate": 9.7707818538003e-06, "loss": 0.6564, "step": 1811 }, { "epoch": 0.12394828647650317, "grad_norm": 2.140308666523795, "learning_rate": 9.770450176206987e-06, "loss": 0.3341, "step": 1812 }, { "epoch": 0.12401669060811273, "grad_norm": 1.848024053438744, "learning_rate": 9.770118264456943e-06, "loss": 0.392, "step": 1813 }, { "epoch": 0.12408509473972228, "grad_norm": 2.7450611783247223, "learning_rate": 9.769786118566462e-06, "loss": 0.7273, "step": 1814 }, { "epoch": 0.12415349887133183, "grad_norm": 1.7826372232624603, "learning_rate": 9.769453738551844e-06, "loss": 0.1831, "step": 1815 }, { "epoch": 0.12422190300294138, "grad_norm": 2.3875976230475136, "learning_rate": 9.769121124429404e-06, "loss": 0.4759, "step": 1816 }, { "epoch": 0.12429030713455093, "grad_norm": 3.0032918770557773, "learning_rate": 9.768788276215471e-06, "loss": 0.4111, "step": 1817 }, { "epoch": 0.12435871126616048, "grad_norm": 2.0668308276176206, "learning_rate": 9.768455193926379e-06, "loss": 0.494, "step": 1818 }, { "epoch": 0.12442711539777003, "grad_norm": 2.0340126324282917, "learning_rate": 9.76812187757848e-06, "loss": 0.324, "step": 1819 }, { "epoch": 0.12449551952937958, "grad_norm": 2.4038698140960655, "learning_rate": 9.767788327188135e-06, "loss": 0.3543, "step": 1820 }, { "epoch": 0.12456392366098912, "grad_norm": 2.2581309717618323, "learning_rate": 9.767454542771718e-06, "loss": 0.5045, "step": 1821 }, { "epoch": 0.12463232779259867, "grad_norm": 2.0937012552878613, "learning_rate": 9.767120524345605e-06, "loss": 0.4655, "step": 1822 }, { "epoch": 0.12470073192420822, "grad_norm": 2.175059522439141, "learning_rate": 9.7667862719262e-06, "loss": 0.3723, "step": 1823 }, { "epoch": 0.12476913605581777, "grad_norm": 1.9344684741556426, "learning_rate": 9.766451785529905e-06, "loss": 0.2474, "step": 1824 }, { "epoch": 0.12483754018742732, "grad_norm": 2.3617027843716802, "learning_rate": 9.76611706517314e-06, "loss": 0.5173, "step": 1825 }, { "epoch": 0.12490594431903687, "grad_norm": 2.6321942879875633, "learning_rate": 9.765782110872335e-06, "loss": 0.4946, "step": 1826 }, { "epoch": 0.12497434845064642, "grad_norm": 2.662596453007605, "learning_rate": 9.765446922643928e-06, "loss": 0.2868, "step": 1827 }, { "epoch": 0.12504275258225597, "grad_norm": 2.5315565082623706, "learning_rate": 9.765111500504376e-06, "loss": 0.5956, "step": 1828 }, { "epoch": 0.12511115671386552, "grad_norm": 2.158120163233529, "learning_rate": 9.76477584447014e-06, "loss": 0.4644, "step": 1829 }, { "epoch": 0.12517956084547507, "grad_norm": 2.2277006151750656, "learning_rate": 9.764439954557696e-06, "loss": 0.6209, "step": 1830 }, { "epoch": 0.12524796497708462, "grad_norm": 2.2016629311888214, "learning_rate": 9.764103830783533e-06, "loss": 0.4987, "step": 1831 }, { "epoch": 0.12531636910869418, "grad_norm": 2.431903803421225, "learning_rate": 9.763767473164149e-06, "loss": 0.5969, "step": 1832 }, { "epoch": 0.12538477324030373, "grad_norm": 1.760554087276849, "learning_rate": 9.763430881716052e-06, "loss": 0.3654, "step": 1833 }, { "epoch": 0.12545317737191328, "grad_norm": 2.585634213782241, "learning_rate": 9.763094056455766e-06, "loss": 0.6755, "step": 1834 }, { "epoch": 0.1255215815035228, "grad_norm": 2.2167614770155297, "learning_rate": 9.762756997399822e-06, "loss": 0.6892, "step": 1835 }, { "epoch": 0.12558998563513235, "grad_norm": 1.8395315095665907, "learning_rate": 9.762419704564765e-06, "loss": 0.3268, "step": 1836 }, { "epoch": 0.1256583897667419, "grad_norm": 2.091230983750278, "learning_rate": 9.762082177967153e-06, "loss": 0.474, "step": 1837 }, { "epoch": 0.12572679389835145, "grad_norm": 1.8749126135807466, "learning_rate": 9.76174441762355e-06, "loss": 0.3706, "step": 1838 }, { "epoch": 0.125795198029961, "grad_norm": 2.0483169136082453, "learning_rate": 9.761406423550539e-06, "loss": 0.3263, "step": 1839 }, { "epoch": 0.12586360216157055, "grad_norm": 2.027138363351625, "learning_rate": 9.761068195764706e-06, "loss": 0.538, "step": 1840 }, { "epoch": 0.1259320062931801, "grad_norm": 1.7365828028026298, "learning_rate": 9.760729734282656e-06, "loss": 0.361, "step": 1841 }, { "epoch": 0.12600041042478966, "grad_norm": 2.070516561851875, "learning_rate": 9.760391039121e-06, "loss": 0.336, "step": 1842 }, { "epoch": 0.1260688145563992, "grad_norm": 2.0762486484001856, "learning_rate": 9.760052110296367e-06, "loss": 0.5886, "step": 1843 }, { "epoch": 0.12613721868800876, "grad_norm": 2.0258225299588206, "learning_rate": 9.759712947825387e-06, "loss": 0.2904, "step": 1844 }, { "epoch": 0.1262056228196183, "grad_norm": 1.8588204398337242, "learning_rate": 9.759373551724713e-06, "loss": 0.2773, "step": 1845 }, { "epoch": 0.12627402695122786, "grad_norm": 1.8140355503272019, "learning_rate": 9.759033922011004e-06, "loss": 0.4796, "step": 1846 }, { "epoch": 0.1263424310828374, "grad_norm": 1.7893304093258204, "learning_rate": 9.758694058700925e-06, "loss": 0.2521, "step": 1847 }, { "epoch": 0.12641083521444696, "grad_norm": 2.3324403451914315, "learning_rate": 9.758353961811164e-06, "loss": 0.4404, "step": 1848 }, { "epoch": 0.1264792393460565, "grad_norm": 2.341256315504885, "learning_rate": 9.758013631358414e-06, "loss": 0.5682, "step": 1849 }, { "epoch": 0.12654764347766606, "grad_norm": 1.8243848124688127, "learning_rate": 9.757673067359378e-06, "loss": 0.3374, "step": 1850 }, { "epoch": 0.1266160476092756, "grad_norm": 2.4941029497256952, "learning_rate": 9.757332269830773e-06, "loss": 0.526, "step": 1851 }, { "epoch": 0.12668445174088516, "grad_norm": 2.355502209022456, "learning_rate": 9.756991238789327e-06, "loss": 0.7073, "step": 1852 }, { "epoch": 0.12675285587249469, "grad_norm": 2.4946425850643092, "learning_rate": 9.75664997425178e-06, "loss": 0.5503, "step": 1853 }, { "epoch": 0.12682126000410424, "grad_norm": 2.81227480728571, "learning_rate": 9.756308476234881e-06, "loss": 0.2349, "step": 1854 }, { "epoch": 0.1268896641357138, "grad_norm": 2.2885594645244947, "learning_rate": 9.755966744755396e-06, "loss": 0.4933, "step": 1855 }, { "epoch": 0.12695806826732334, "grad_norm": 2.269667320965354, "learning_rate": 9.755624779830098e-06, "loss": 0.4479, "step": 1856 }, { "epoch": 0.1270264723989329, "grad_norm": 1.8633220756104811, "learning_rate": 9.755282581475769e-06, "loss": 0.4048, "step": 1857 }, { "epoch": 0.12709487653054244, "grad_norm": 2.5560625587004644, "learning_rate": 9.754940149709209e-06, "loss": 0.2047, "step": 1858 }, { "epoch": 0.127163280662152, "grad_norm": 2.3383285316303333, "learning_rate": 9.754597484547223e-06, "loss": 0.2567, "step": 1859 }, { "epoch": 0.12723168479376154, "grad_norm": 1.5052739612226513, "learning_rate": 9.754254586006636e-06, "loss": 0.1461, "step": 1860 }, { "epoch": 0.1273000889253711, "grad_norm": 1.9052990511331511, "learning_rate": 9.753911454104272e-06, "loss": 0.2759, "step": 1861 }, { "epoch": 0.12736849305698064, "grad_norm": 1.6358282539157967, "learning_rate": 9.75356808885698e-06, "loss": 0.2396, "step": 1862 }, { "epoch": 0.1274368971885902, "grad_norm": 2.150163551904746, "learning_rate": 9.753224490281612e-06, "loss": 0.3399, "step": 1863 }, { "epoch": 0.12750530132019974, "grad_norm": 2.1868945578185457, "learning_rate": 9.75288065839503e-06, "loss": 0.373, "step": 1864 }, { "epoch": 0.1275737054518093, "grad_norm": 1.9812227578200603, "learning_rate": 9.752536593214118e-06, "loss": 0.4527, "step": 1865 }, { "epoch": 0.12764210958341884, "grad_norm": 1.7417562062079137, "learning_rate": 9.752192294755757e-06, "loss": 0.3648, "step": 1866 }, { "epoch": 0.1277105137150284, "grad_norm": 3.1999626272969595, "learning_rate": 9.75184776303685e-06, "loss": 0.6352, "step": 1867 }, { "epoch": 0.12777891784663795, "grad_norm": 2.22096870531945, "learning_rate": 9.751502998074311e-06, "loss": 0.3223, "step": 1868 }, { "epoch": 0.1278473219782475, "grad_norm": 1.9738558436051141, "learning_rate": 9.751157999885057e-06, "loss": 0.3826, "step": 1869 }, { "epoch": 0.12791572610985705, "grad_norm": 1.865106440445715, "learning_rate": 9.750812768486028e-06, "loss": 0.3312, "step": 1870 }, { "epoch": 0.1279841302414666, "grad_norm": 2.8243735179994314, "learning_rate": 9.750467303894165e-06, "loss": 0.7745, "step": 1871 }, { "epoch": 0.12805253437307612, "grad_norm": 2.0161902056611756, "learning_rate": 9.750121606126429e-06, "loss": 0.3783, "step": 1872 }, { "epoch": 0.12812093850468567, "grad_norm": 2.542477632084732, "learning_rate": 9.749775675199784e-06, "loss": 0.5436, "step": 1873 }, { "epoch": 0.12818934263629522, "grad_norm": 1.8415977516017559, "learning_rate": 9.749429511131213e-06, "loss": 0.3901, "step": 1874 }, { "epoch": 0.12825774676790477, "grad_norm": 1.8175993743710297, "learning_rate": 9.749083113937707e-06, "loss": 0.4232, "step": 1875 }, { "epoch": 0.12832615089951432, "grad_norm": 2.6459047421758792, "learning_rate": 9.748736483636269e-06, "loss": 0.6906, "step": 1876 }, { "epoch": 0.12839455503112387, "grad_norm": 1.8352452210692913, "learning_rate": 9.748389620243914e-06, "loss": 0.3298, "step": 1877 }, { "epoch": 0.12846295916273343, "grad_norm": 1.9235807037035044, "learning_rate": 9.748042523777665e-06, "loss": 0.3096, "step": 1878 }, { "epoch": 0.12853136329434298, "grad_norm": 2.662125831603516, "learning_rate": 9.747695194254561e-06, "loss": 0.5599, "step": 1879 }, { "epoch": 0.12859976742595253, "grad_norm": 2.276098653631322, "learning_rate": 9.747347631691652e-06, "loss": 0.4067, "step": 1880 }, { "epoch": 0.12866817155756208, "grad_norm": 1.5319105981046934, "learning_rate": 9.746999836105994e-06, "loss": 0.2812, "step": 1881 }, { "epoch": 0.12873657568917163, "grad_norm": 1.7676532539713876, "learning_rate": 9.746651807514663e-06, "loss": 0.3288, "step": 1882 }, { "epoch": 0.12880497982078118, "grad_norm": 2.233886098392588, "learning_rate": 9.74630354593474e-06, "loss": 0.4605, "step": 1883 }, { "epoch": 0.12887338395239073, "grad_norm": 1.9780863138908604, "learning_rate": 9.745955051383319e-06, "loss": 0.2954, "step": 1884 }, { "epoch": 0.12894178808400028, "grad_norm": 2.0523757984344067, "learning_rate": 9.745606323877506e-06, "loss": 0.2475, "step": 1885 }, { "epoch": 0.12901019221560983, "grad_norm": 1.429874303735376, "learning_rate": 9.745257363434418e-06, "loss": 0.2901, "step": 1886 }, { "epoch": 0.12907859634721938, "grad_norm": 2.2056505661388264, "learning_rate": 9.744908170071184e-06, "loss": 0.5005, "step": 1887 }, { "epoch": 0.12914700047882893, "grad_norm": 2.2107958057411574, "learning_rate": 9.744558743804944e-06, "loss": 0.4793, "step": 1888 }, { "epoch": 0.12921540461043848, "grad_norm": 2.304212249634942, "learning_rate": 9.74420908465285e-06, "loss": 0.5769, "step": 1889 }, { "epoch": 0.129283808742048, "grad_norm": 2.8498312933663326, "learning_rate": 9.743859192632065e-06, "loss": 0.384, "step": 1890 }, { "epoch": 0.12935221287365756, "grad_norm": 3.0780212413901995, "learning_rate": 9.743509067759761e-06, "loss": 0.7963, "step": 1891 }, { "epoch": 0.1294206170052671, "grad_norm": 1.9931385911494284, "learning_rate": 9.743158710053129e-06, "loss": 0.5036, "step": 1892 }, { "epoch": 0.12948902113687666, "grad_norm": 2.2675367149495043, "learning_rate": 9.74280811952936e-06, "loss": 0.4048, "step": 1893 }, { "epoch": 0.1295574252684862, "grad_norm": 2.431743683986515, "learning_rate": 9.742457296205668e-06, "loss": 0.6561, "step": 1894 }, { "epoch": 0.12962582940009576, "grad_norm": 2.6057319336614126, "learning_rate": 9.742106240099269e-06, "loss": 0.5389, "step": 1895 }, { "epoch": 0.1296942335317053, "grad_norm": 1.8663564692134216, "learning_rate": 9.741754951227398e-06, "loss": 0.1985, "step": 1896 }, { "epoch": 0.12976263766331486, "grad_norm": 2.404526192135938, "learning_rate": 9.741403429607294e-06, "loss": 0.3844, "step": 1897 }, { "epoch": 0.1298310417949244, "grad_norm": 2.9663447173002995, "learning_rate": 9.741051675256216e-06, "loss": 0.6152, "step": 1898 }, { "epoch": 0.12989944592653396, "grad_norm": 1.9287645511177196, "learning_rate": 9.740699688191426e-06, "loss": 0.3397, "step": 1899 }, { "epoch": 0.1299678500581435, "grad_norm": 2.4569039486169673, "learning_rate": 9.740347468430203e-06, "loss": 0.5622, "step": 1900 }, { "epoch": 0.13003625418975306, "grad_norm": 2.2024143777370364, "learning_rate": 9.739995015989835e-06, "loss": 0.5637, "step": 1901 }, { "epoch": 0.13010465832136262, "grad_norm": 2.239843063301165, "learning_rate": 9.739642330887624e-06, "loss": 0.5527, "step": 1902 }, { "epoch": 0.13017306245297217, "grad_norm": 2.0597818843866103, "learning_rate": 9.739289413140879e-06, "loss": 0.325, "step": 1903 }, { "epoch": 0.13024146658458172, "grad_norm": 2.257212701081403, "learning_rate": 9.738936262766923e-06, "loss": 0.5326, "step": 1904 }, { "epoch": 0.13030987071619127, "grad_norm": 2.308533243332008, "learning_rate": 9.738582879783091e-06, "loss": 0.6161, "step": 1905 }, { "epoch": 0.13037827484780082, "grad_norm": 2.75830867939752, "learning_rate": 9.73822926420673e-06, "loss": 0.8568, "step": 1906 }, { "epoch": 0.13044667897941037, "grad_norm": 2.136607528107145, "learning_rate": 9.737875416055196e-06, "loss": 0.3595, "step": 1907 }, { "epoch": 0.13051508311101992, "grad_norm": 2.5720944879904755, "learning_rate": 9.737521335345858e-06, "loss": 0.6937, "step": 1908 }, { "epoch": 0.13058348724262944, "grad_norm": 2.2053817864304803, "learning_rate": 9.737167022096095e-06, "loss": 0.4071, "step": 1909 }, { "epoch": 0.130651891374239, "grad_norm": 2.0807597122687587, "learning_rate": 9.736812476323299e-06, "loss": 0.44, "step": 1910 }, { "epoch": 0.13072029550584854, "grad_norm": 2.6938795067441172, "learning_rate": 9.736457698044873e-06, "loss": 0.8803, "step": 1911 }, { "epoch": 0.1307886996374581, "grad_norm": 1.7596945287841506, "learning_rate": 9.73610268727823e-06, "loss": 0.268, "step": 1912 }, { "epoch": 0.13085710376906765, "grad_norm": 2.5085837464681093, "learning_rate": 9.735747444040798e-06, "loss": 0.3404, "step": 1913 }, { "epoch": 0.1309255079006772, "grad_norm": 2.146217378180681, "learning_rate": 9.735391968350012e-06, "loss": 0.3723, "step": 1914 }, { "epoch": 0.13099391203228675, "grad_norm": 2.492980424309117, "learning_rate": 9.735036260223322e-06, "loss": 0.4639, "step": 1915 }, { "epoch": 0.1310623161638963, "grad_norm": 1.832371385310156, "learning_rate": 9.734680319678187e-06, "loss": 0.2616, "step": 1916 }, { "epoch": 0.13113072029550585, "grad_norm": 2.332137528750146, "learning_rate": 9.734324146732076e-06, "loss": 0.2967, "step": 1917 }, { "epoch": 0.1311991244271154, "grad_norm": 1.7693153232763632, "learning_rate": 9.733967741402477e-06, "loss": 0.3323, "step": 1918 }, { "epoch": 0.13126752855872495, "grad_norm": 2.2382068255502197, "learning_rate": 9.733611103706882e-06, "loss": 0.4987, "step": 1919 }, { "epoch": 0.1313359326903345, "grad_norm": 2.857701538855891, "learning_rate": 9.733254233662791e-06, "loss": 0.6157, "step": 1920 }, { "epoch": 0.13140433682194405, "grad_norm": 2.2837035949786313, "learning_rate": 9.732897131287731e-06, "loss": 0.3372, "step": 1921 }, { "epoch": 0.1314727409535536, "grad_norm": 2.089166684858882, "learning_rate": 9.73253979659922e-06, "loss": 0.4576, "step": 1922 }, { "epoch": 0.13154114508516315, "grad_norm": 2.3138175059362514, "learning_rate": 9.732182229614807e-06, "loss": 0.3858, "step": 1923 }, { "epoch": 0.1316095492167727, "grad_norm": 2.215998326796716, "learning_rate": 9.731824430352036e-06, "loss": 0.4477, "step": 1924 }, { "epoch": 0.13167795334838225, "grad_norm": 1.7751059728875564, "learning_rate": 9.731466398828474e-06, "loss": 0.2558, "step": 1925 }, { "epoch": 0.1317463574799918, "grad_norm": 1.7489935096552958, "learning_rate": 9.731108135061691e-06, "loss": 0.2561, "step": 1926 }, { "epoch": 0.13181476161160133, "grad_norm": 2.445314974067582, "learning_rate": 9.730749639069275e-06, "loss": 0.5215, "step": 1927 }, { "epoch": 0.13188316574321088, "grad_norm": 2.4728384402801957, "learning_rate": 9.730390910868821e-06, "loss": 0.3142, "step": 1928 }, { "epoch": 0.13195156987482043, "grad_norm": 2.170906317418152, "learning_rate": 9.73003195047794e-06, "loss": 0.4633, "step": 1929 }, { "epoch": 0.13201997400642998, "grad_norm": 2.8114799345095207, "learning_rate": 9.72967275791425e-06, "loss": 0.6864, "step": 1930 }, { "epoch": 0.13208837813803953, "grad_norm": 2.1956127205822127, "learning_rate": 9.729313333195383e-06, "loss": 0.411, "step": 1931 }, { "epoch": 0.13215678226964908, "grad_norm": 2.151224766266855, "learning_rate": 9.728953676338977e-06, "loss": 0.4941, "step": 1932 }, { "epoch": 0.13222518640125863, "grad_norm": 2.0507096995633023, "learning_rate": 9.72859378736269e-06, "loss": 0.3809, "step": 1933 }, { "epoch": 0.13229359053286818, "grad_norm": 1.7439124266712704, "learning_rate": 9.728233666284187e-06, "loss": 0.32, "step": 1934 }, { "epoch": 0.13236199466447773, "grad_norm": 2.1244189716242756, "learning_rate": 9.727873313121142e-06, "loss": 0.4596, "step": 1935 }, { "epoch": 0.13243039879608728, "grad_norm": 2.4204300292251406, "learning_rate": 9.727512727891242e-06, "loss": 0.7471, "step": 1936 }, { "epoch": 0.13249880292769683, "grad_norm": 1.9817450584777343, "learning_rate": 9.727151910612192e-06, "loss": 0.5003, "step": 1937 }, { "epoch": 0.13256720705930639, "grad_norm": 2.5543831693566723, "learning_rate": 9.726790861301696e-06, "loss": 0.3878, "step": 1938 }, { "epoch": 0.13263561119091594, "grad_norm": 2.1271780926678665, "learning_rate": 9.72642957997748e-06, "loss": 0.4599, "step": 1939 }, { "epoch": 0.1327040153225255, "grad_norm": 2.6586863844362583, "learning_rate": 9.726068066657279e-06, "loss": 0.6774, "step": 1940 }, { "epoch": 0.13277241945413504, "grad_norm": 2.635576229517165, "learning_rate": 9.725706321358832e-06, "loss": 0.6603, "step": 1941 }, { "epoch": 0.1328408235857446, "grad_norm": 2.5229031584625075, "learning_rate": 9.725344344099901e-06, "loss": 0.4033, "step": 1942 }, { "epoch": 0.13290922771735414, "grad_norm": 1.552052968536777, "learning_rate": 9.72498213489825e-06, "loss": 0.1749, "step": 1943 }, { "epoch": 0.1329776318489637, "grad_norm": 1.5690256107486313, "learning_rate": 9.724619693771658e-06, "loss": 0.1958, "step": 1944 }, { "epoch": 0.1330460359805732, "grad_norm": 2.281599880194962, "learning_rate": 9.724257020737918e-06, "loss": 0.3821, "step": 1945 }, { "epoch": 0.13311444011218276, "grad_norm": 1.9155817697510373, "learning_rate": 9.72389411581483e-06, "loss": 0.1854, "step": 1946 }, { "epoch": 0.13318284424379231, "grad_norm": 2.4884821273230107, "learning_rate": 9.723530979020208e-06, "loss": 0.5781, "step": 1947 }, { "epoch": 0.13325124837540187, "grad_norm": 2.901475573723365, "learning_rate": 9.723167610371876e-06, "loss": 0.742, "step": 1948 }, { "epoch": 0.13331965250701142, "grad_norm": 1.649407090329871, "learning_rate": 9.722804009887667e-06, "loss": 0.3278, "step": 1949 }, { "epoch": 0.13338805663862097, "grad_norm": 1.8498835738057033, "learning_rate": 9.722440177585433e-06, "loss": 0.3068, "step": 1950 }, { "epoch": 0.13345646077023052, "grad_norm": 2.4421589171823097, "learning_rate": 9.72207611348303e-06, "loss": 0.4016, "step": 1951 }, { "epoch": 0.13352486490184007, "grad_norm": 2.124816451840917, "learning_rate": 9.72171181759833e-06, "loss": 0.269, "step": 1952 }, { "epoch": 0.13359326903344962, "grad_norm": 2.269298410207959, "learning_rate": 9.721347289949212e-06, "loss": 0.32, "step": 1953 }, { "epoch": 0.13366167316505917, "grad_norm": 2.5444219248929967, "learning_rate": 9.72098253055357e-06, "loss": 0.2749, "step": 1954 }, { "epoch": 0.13373007729666872, "grad_norm": 2.4479708192124705, "learning_rate": 9.720617539429308e-06, "loss": 0.5894, "step": 1955 }, { "epoch": 0.13379848142827827, "grad_norm": 2.7307663565710447, "learning_rate": 9.720252316594344e-06, "loss": 0.6695, "step": 1956 }, { "epoch": 0.13386688555988782, "grad_norm": 2.519429002210049, "learning_rate": 9.7198868620666e-06, "loss": 0.6163, "step": 1957 }, { "epoch": 0.13393528969149737, "grad_norm": 2.3713024462337984, "learning_rate": 9.719521175864014e-06, "loss": 0.55, "step": 1958 }, { "epoch": 0.13400369382310692, "grad_norm": 2.1448449841641186, "learning_rate": 9.719155258004542e-06, "loss": 0.3758, "step": 1959 }, { "epoch": 0.13407209795471647, "grad_norm": 1.5366152947203988, "learning_rate": 9.718789108506141e-06, "loss": 0.2458, "step": 1960 }, { "epoch": 0.13414050208632602, "grad_norm": 3.0339636797915595, "learning_rate": 9.718422727386782e-06, "loss": 0.6408, "step": 1961 }, { "epoch": 0.13420890621793558, "grad_norm": 4.671436914425687, "learning_rate": 9.718056114664453e-06, "loss": 0.504, "step": 1962 }, { "epoch": 0.13427731034954513, "grad_norm": 2.458926247901266, "learning_rate": 9.717689270357145e-06, "loss": 0.5717, "step": 1963 }, { "epoch": 0.13434571448115465, "grad_norm": 1.8838378441141885, "learning_rate": 9.717322194482867e-06, "loss": 0.4188, "step": 1964 }, { "epoch": 0.1344141186127642, "grad_norm": 3.3449066514587917, "learning_rate": 9.716954887059636e-06, "loss": 1.1848, "step": 1965 }, { "epoch": 0.13448252274437375, "grad_norm": 2.439953645007891, "learning_rate": 9.71658734810548e-06, "loss": 0.334, "step": 1966 }, { "epoch": 0.1345509268759833, "grad_norm": 2.38381167240508, "learning_rate": 9.716219577638442e-06, "loss": 0.5066, "step": 1967 }, { "epoch": 0.13461933100759285, "grad_norm": 2.294347553218899, "learning_rate": 9.715851575676572e-06, "loss": 0.4672, "step": 1968 }, { "epoch": 0.1346877351392024, "grad_norm": 2.4364995890273953, "learning_rate": 9.715483342237935e-06, "loss": 0.6473, "step": 1969 }, { "epoch": 0.13475613927081195, "grad_norm": 2.793581029830044, "learning_rate": 9.715114877340604e-06, "loss": 0.7333, "step": 1970 }, { "epoch": 0.1348245434024215, "grad_norm": 2.1767510593287303, "learning_rate": 9.714746181002665e-06, "loss": 0.5939, "step": 1971 }, { "epoch": 0.13489294753403105, "grad_norm": 2.166915142654646, "learning_rate": 9.714377253242218e-06, "loss": 0.4489, "step": 1972 }, { "epoch": 0.1349613516656406, "grad_norm": 2.2530040356654504, "learning_rate": 9.714008094077367e-06, "loss": 0.4041, "step": 1973 }, { "epoch": 0.13502975579725016, "grad_norm": 1.5230437821471559, "learning_rate": 9.713638703526239e-06, "loss": 0.2935, "step": 1974 }, { "epoch": 0.1350981599288597, "grad_norm": 2.018569126824943, "learning_rate": 9.713269081606958e-06, "loss": 0.452, "step": 1975 }, { "epoch": 0.13516656406046926, "grad_norm": 1.8878803941957387, "learning_rate": 9.71289922833767e-06, "loss": 0.3135, "step": 1976 }, { "epoch": 0.1352349681920788, "grad_norm": 2.2328536115936504, "learning_rate": 9.712529143736533e-06, "loss": 0.4808, "step": 1977 }, { "epoch": 0.13530337232368836, "grad_norm": 2.1790928832856853, "learning_rate": 9.712158827821706e-06, "loss": 0.4548, "step": 1978 }, { "epoch": 0.1353717764552979, "grad_norm": 2.3841556578173893, "learning_rate": 9.711788280611371e-06, "loss": 0.5887, "step": 1979 }, { "epoch": 0.13544018058690746, "grad_norm": 2.084968516851938, "learning_rate": 9.711417502123714e-06, "loss": 0.3197, "step": 1980 }, { "epoch": 0.135508584718517, "grad_norm": 1.861134185774496, "learning_rate": 9.711046492376935e-06, "loss": 0.4027, "step": 1981 }, { "epoch": 0.13557698885012653, "grad_norm": 1.6430285263796418, "learning_rate": 9.710675251389243e-06, "loss": 0.3301, "step": 1982 }, { "epoch": 0.13564539298173608, "grad_norm": 1.3373422710162273, "learning_rate": 9.710303779178863e-06, "loss": 0.1939, "step": 1983 }, { "epoch": 0.13571379711334564, "grad_norm": 2.3767987799078076, "learning_rate": 9.709932075764028e-06, "loss": 0.5444, "step": 1984 }, { "epoch": 0.1357822012449552, "grad_norm": 2.0689039909178177, "learning_rate": 9.709560141162982e-06, "loss": 0.4748, "step": 1985 }, { "epoch": 0.13585060537656474, "grad_norm": 2.2642558992605317, "learning_rate": 9.709187975393983e-06, "loss": 0.6362, "step": 1986 }, { "epoch": 0.1359190095081743, "grad_norm": 2.258320319013879, "learning_rate": 9.708815578475297e-06, "loss": 0.5387, "step": 1987 }, { "epoch": 0.13598741363978384, "grad_norm": 1.9249574589982845, "learning_rate": 9.708442950425202e-06, "loss": 0.4808, "step": 1988 }, { "epoch": 0.1360558177713934, "grad_norm": 1.6881285132746202, "learning_rate": 9.708070091261994e-06, "loss": 0.3892, "step": 1989 }, { "epoch": 0.13612422190300294, "grad_norm": 2.118644482241303, "learning_rate": 9.707697001003968e-06, "loss": 0.3839, "step": 1990 }, { "epoch": 0.1361926260346125, "grad_norm": 2.364233622689171, "learning_rate": 9.707323679669441e-06, "loss": 0.4965, "step": 1991 }, { "epoch": 0.13626103016622204, "grad_norm": 3.199802359423151, "learning_rate": 9.706950127276738e-06, "loss": 0.4802, "step": 1992 }, { "epoch": 0.1363294342978316, "grad_norm": 2.204600563741413, "learning_rate": 9.70657634384419e-06, "loss": 0.5112, "step": 1993 }, { "epoch": 0.13639783842944114, "grad_norm": 1.782736392323246, "learning_rate": 9.706202329390148e-06, "loss": 0.2756, "step": 1994 }, { "epoch": 0.1364662425610507, "grad_norm": 2.2001299012370503, "learning_rate": 9.705828083932972e-06, "loss": 0.4285, "step": 1995 }, { "epoch": 0.13653464669266024, "grad_norm": 2.2148195732990446, "learning_rate": 9.705453607491027e-06, "loss": 0.6457, "step": 1996 }, { "epoch": 0.1366030508242698, "grad_norm": 2.369052643176135, "learning_rate": 9.705078900082696e-06, "loss": 0.3413, "step": 1997 }, { "epoch": 0.13667145495587935, "grad_norm": 2.362352551294313, "learning_rate": 9.704703961726374e-06, "loss": 0.5719, "step": 1998 }, { "epoch": 0.1367398590874889, "grad_norm": 1.5217825777318414, "learning_rate": 9.704328792440462e-06, "loss": 0.236, "step": 1999 }, { "epoch": 0.13680826321909845, "grad_norm": 2.036180856993218, "learning_rate": 9.703953392243377e-06, "loss": 0.4581, "step": 2000 }, { "epoch": 0.13687666735070797, "grad_norm": 1.8486653550271668, "learning_rate": 9.703577761153542e-06, "loss": 0.3318, "step": 2001 }, { "epoch": 0.13694507148231752, "grad_norm": 1.2987867162313211, "learning_rate": 9.703201899189399e-06, "loss": 0.2184, "step": 2002 }, { "epoch": 0.13701347561392707, "grad_norm": 1.7897481062738274, "learning_rate": 9.702825806369394e-06, "loss": 0.3741, "step": 2003 }, { "epoch": 0.13708187974553662, "grad_norm": 2.5418801827069535, "learning_rate": 9.702449482711989e-06, "loss": 0.1836, "step": 2004 }, { "epoch": 0.13715028387714617, "grad_norm": 1.7452305531196306, "learning_rate": 9.702072928235658e-06, "loss": 0.2937, "step": 2005 }, { "epoch": 0.13721868800875572, "grad_norm": 1.8067937177797997, "learning_rate": 9.701696142958878e-06, "loss": 0.2085, "step": 2006 }, { "epoch": 0.13728709214036527, "grad_norm": 2.0812669893665348, "learning_rate": 9.701319126900149e-06, "loss": 0.4172, "step": 2007 }, { "epoch": 0.13735549627197483, "grad_norm": 1.5174669644799275, "learning_rate": 9.700941880077974e-06, "loss": 0.2435, "step": 2008 }, { "epoch": 0.13742390040358438, "grad_norm": 2.10649241119347, "learning_rate": 9.700564402510871e-06, "loss": 0.4457, "step": 2009 }, { "epoch": 0.13749230453519393, "grad_norm": 2.206506717432214, "learning_rate": 9.700186694217369e-06, "loss": 0.49, "step": 2010 }, { "epoch": 0.13756070866680348, "grad_norm": 3.0885717423739956, "learning_rate": 9.699808755216007e-06, "loss": 0.5836, "step": 2011 }, { "epoch": 0.13762911279841303, "grad_norm": 2.3946775979752424, "learning_rate": 9.699430585525336e-06, "loss": 0.6148, "step": 2012 }, { "epoch": 0.13769751693002258, "grad_norm": 2.5847640011156665, "learning_rate": 9.699052185163917e-06, "loss": 0.738, "step": 2013 }, { "epoch": 0.13776592106163213, "grad_norm": 2.5480936563382537, "learning_rate": 9.698673554150327e-06, "loss": 0.5849, "step": 2014 }, { "epoch": 0.13783432519324168, "grad_norm": 1.3610430053563436, "learning_rate": 9.69829469250315e-06, "loss": 0.228, "step": 2015 }, { "epoch": 0.13790272932485123, "grad_norm": 2.2483209729567717, "learning_rate": 9.697915600240979e-06, "loss": 0.3754, "step": 2016 }, { "epoch": 0.13797113345646078, "grad_norm": 2.0270920452062704, "learning_rate": 9.697536277382426e-06, "loss": 0.3626, "step": 2017 }, { "epoch": 0.13803953758807033, "grad_norm": 2.453272438478107, "learning_rate": 9.697156723946108e-06, "loss": 0.4761, "step": 2018 }, { "epoch": 0.13810794171967986, "grad_norm": 1.5917288736433541, "learning_rate": 9.696776939950657e-06, "loss": 0.2658, "step": 2019 }, { "epoch": 0.1381763458512894, "grad_norm": 1.3937693151923212, "learning_rate": 9.696396925414711e-06, "loss": 0.1962, "step": 2020 }, { "epoch": 0.13824474998289896, "grad_norm": 1.7328042293278971, "learning_rate": 9.696016680356928e-06, "loss": 0.2696, "step": 2021 }, { "epoch": 0.1383131541145085, "grad_norm": 2.295154900590374, "learning_rate": 9.695636204795966e-06, "loss": 0.5574, "step": 2022 }, { "epoch": 0.13838155824611806, "grad_norm": 2.1042599578672565, "learning_rate": 9.695255498750506e-06, "loss": 0.3678, "step": 2023 }, { "epoch": 0.1384499623777276, "grad_norm": 2.3633904676184936, "learning_rate": 9.694874562239233e-06, "loss": 0.2503, "step": 2024 }, { "epoch": 0.13851836650933716, "grad_norm": 1.9122351896296228, "learning_rate": 9.694493395280846e-06, "loss": 0.3889, "step": 2025 }, { "epoch": 0.1385867706409467, "grad_norm": 1.7650504823251731, "learning_rate": 9.694111997894053e-06, "loss": 0.3249, "step": 2026 }, { "epoch": 0.13865517477255626, "grad_norm": 2.8098964932747044, "learning_rate": 9.693730370097576e-06, "loss": 0.7769, "step": 2027 }, { "epoch": 0.1387235789041658, "grad_norm": 1.9536058062977935, "learning_rate": 9.693348511910146e-06, "loss": 0.3124, "step": 2028 }, { "epoch": 0.13879198303577536, "grad_norm": 1.5576379613178652, "learning_rate": 9.692966423350505e-06, "loss": 0.2021, "step": 2029 }, { "epoch": 0.1388603871673849, "grad_norm": 2.1321474862065557, "learning_rate": 9.692584104437413e-06, "loss": 0.4331, "step": 2030 }, { "epoch": 0.13892879129899446, "grad_norm": 2.048851383975424, "learning_rate": 9.69220155518963e-06, "loss": 0.3186, "step": 2031 }, { "epoch": 0.13899719543060401, "grad_norm": 1.4580967972436072, "learning_rate": 9.691818775625939e-06, "loss": 0.2412, "step": 2032 }, { "epoch": 0.13906559956221357, "grad_norm": 2.4178104841853267, "learning_rate": 9.691435765765124e-06, "loss": 0.3115, "step": 2033 }, { "epoch": 0.13913400369382312, "grad_norm": 2.1553079717277974, "learning_rate": 9.691052525625989e-06, "loss": 0.4676, "step": 2034 }, { "epoch": 0.13920240782543267, "grad_norm": 2.3440548634935534, "learning_rate": 9.690669055227341e-06, "loss": 0.2184, "step": 2035 }, { "epoch": 0.13927081195704222, "grad_norm": 2.1908591160309707, "learning_rate": 9.690285354588007e-06, "loss": 0.4734, "step": 2036 }, { "epoch": 0.13933921608865177, "grad_norm": 1.8857450421004023, "learning_rate": 9.689901423726817e-06, "loss": 0.1736, "step": 2037 }, { "epoch": 0.1394076202202613, "grad_norm": 2.31130465981862, "learning_rate": 9.689517262662617e-06, "loss": 0.7169, "step": 2038 }, { "epoch": 0.13947602435187084, "grad_norm": 2.260899337678477, "learning_rate": 9.689132871414266e-06, "loss": 0.4768, "step": 2039 }, { "epoch": 0.1395444284834804, "grad_norm": 2.1717523567645367, "learning_rate": 9.68874825000063e-06, "loss": 0.5489, "step": 2040 }, { "epoch": 0.13961283261508994, "grad_norm": 1.8490050878568252, "learning_rate": 9.688363398440587e-06, "loss": 0.4488, "step": 2041 }, { "epoch": 0.1396812367466995, "grad_norm": 2.2785154276271755, "learning_rate": 9.68797831675303e-06, "loss": 0.2408, "step": 2042 }, { "epoch": 0.13974964087830904, "grad_norm": 1.7196518699286447, "learning_rate": 9.687593004956858e-06, "loss": 0.3034, "step": 2043 }, { "epoch": 0.1398180450099186, "grad_norm": 2.560873422525091, "learning_rate": 9.687207463070986e-06, "loss": 0.3931, "step": 2044 }, { "epoch": 0.13988644914152815, "grad_norm": 1.8359587850757195, "learning_rate": 9.686821691114339e-06, "loss": 0.2424, "step": 2045 }, { "epoch": 0.1399548532731377, "grad_norm": 2.2499690252397238, "learning_rate": 9.686435689105848e-06, "loss": 0.4995, "step": 2046 }, { "epoch": 0.14002325740474725, "grad_norm": 2.2015709773293692, "learning_rate": 9.686049457064463e-06, "loss": 0.6154, "step": 2047 }, { "epoch": 0.1400916615363568, "grad_norm": 2.218750151110362, "learning_rate": 9.685662995009145e-06, "loss": 0.5267, "step": 2048 }, { "epoch": 0.14016006566796635, "grad_norm": 1.7414583296794262, "learning_rate": 9.685276302958857e-06, "loss": 0.3711, "step": 2049 }, { "epoch": 0.1402284697995759, "grad_norm": 1.9223658179202698, "learning_rate": 9.684889380932584e-06, "loss": 0.4077, "step": 2050 }, { "epoch": 0.14029687393118545, "grad_norm": 2.265957354141773, "learning_rate": 9.684502228949317e-06, "loss": 0.5655, "step": 2051 }, { "epoch": 0.140365278062795, "grad_norm": 2.8575033461767245, "learning_rate": 9.68411484702806e-06, "loss": 0.8683, "step": 2052 }, { "epoch": 0.14043368219440455, "grad_norm": 2.037327623974184, "learning_rate": 9.683727235187826e-06, "loss": 0.3915, "step": 2053 }, { "epoch": 0.1405020863260141, "grad_norm": 2.059346367285291, "learning_rate": 9.683339393447644e-06, "loss": 0.3918, "step": 2054 }, { "epoch": 0.14057049045762365, "grad_norm": 1.888587606575812, "learning_rate": 9.682951321826546e-06, "loss": 0.3621, "step": 2055 }, { "epoch": 0.14063889458923318, "grad_norm": 2.347529193658053, "learning_rate": 9.682563020343585e-06, "loss": 0.4032, "step": 2056 }, { "epoch": 0.14070729872084273, "grad_norm": 2.183145695475487, "learning_rate": 9.682174489017819e-06, "loss": 0.1978, "step": 2057 }, { "epoch": 0.14077570285245228, "grad_norm": 1.9728351625191949, "learning_rate": 9.681785727868319e-06, "loss": 0.4631, "step": 2058 }, { "epoch": 0.14084410698406183, "grad_norm": 1.8511381065313721, "learning_rate": 9.681396736914169e-06, "loss": 0.3733, "step": 2059 }, { "epoch": 0.14091251111567138, "grad_norm": 2.1022022400984715, "learning_rate": 9.681007516174458e-06, "loss": 0.4377, "step": 2060 }, { "epoch": 0.14098091524728093, "grad_norm": 2.262492915400588, "learning_rate": 9.680618065668295e-06, "loss": 0.5854, "step": 2061 }, { "epoch": 0.14104931937889048, "grad_norm": 1.9233313369272318, "learning_rate": 9.680228385414795e-06, "loss": 0.4265, "step": 2062 }, { "epoch": 0.14111772351050003, "grad_norm": 2.0133095904699236, "learning_rate": 9.679838475433085e-06, "loss": 0.2846, "step": 2063 }, { "epoch": 0.14118612764210958, "grad_norm": 2.1438478864559247, "learning_rate": 9.679448335742304e-06, "loss": 0.416, "step": 2064 }, { "epoch": 0.14125453177371913, "grad_norm": 2.44221460006429, "learning_rate": 9.679057966361603e-06, "loss": 0.6043, "step": 2065 }, { "epoch": 0.14132293590532868, "grad_norm": 2.0719061745589675, "learning_rate": 9.67866736731014e-06, "loss": 0.4274, "step": 2066 }, { "epoch": 0.14139134003693823, "grad_norm": 1.727802355742209, "learning_rate": 9.67827653860709e-06, "loss": 0.3115, "step": 2067 }, { "epoch": 0.14145974416854779, "grad_norm": 2.010531302792559, "learning_rate": 9.677885480271637e-06, "loss": 0.4693, "step": 2068 }, { "epoch": 0.14152814830015734, "grad_norm": 2.0419783589475453, "learning_rate": 9.677494192322976e-06, "loss": 0.4363, "step": 2069 }, { "epoch": 0.1415965524317669, "grad_norm": 2.6348880318046533, "learning_rate": 9.677102674780312e-06, "loss": 0.68, "step": 2070 }, { "epoch": 0.14166495656337644, "grad_norm": 2.1912592398279496, "learning_rate": 9.676710927662863e-06, "loss": 0.4933, "step": 2071 }, { "epoch": 0.141733360694986, "grad_norm": 2.5309823124238133, "learning_rate": 9.676318950989858e-06, "loss": 0.3394, "step": 2072 }, { "epoch": 0.14180176482659554, "grad_norm": 2.3589964840902393, "learning_rate": 9.675926744780537e-06, "loss": 0.5221, "step": 2073 }, { "epoch": 0.14187016895820506, "grad_norm": 1.6297485588829246, "learning_rate": 9.67553430905415e-06, "loss": 0.3542, "step": 2074 }, { "epoch": 0.1419385730898146, "grad_norm": 2.361794664662599, "learning_rate": 9.675141643829964e-06, "loss": 0.7145, "step": 2075 }, { "epoch": 0.14200697722142416, "grad_norm": 2.046888183958968, "learning_rate": 9.67474874912725e-06, "loss": 0.4165, "step": 2076 }, { "epoch": 0.14207538135303371, "grad_norm": 1.8712954999747085, "learning_rate": 9.674355624965293e-06, "loss": 0.2801, "step": 2077 }, { "epoch": 0.14214378548464326, "grad_norm": 2.128694856280006, "learning_rate": 9.673962271363388e-06, "loss": 0.5386, "step": 2078 }, { "epoch": 0.14221218961625282, "grad_norm": 1.9483767890802042, "learning_rate": 9.673568688340846e-06, "loss": 0.4396, "step": 2079 }, { "epoch": 0.14228059374786237, "grad_norm": 2.611172490985139, "learning_rate": 9.673174875916984e-06, "loss": 0.5465, "step": 2080 }, { "epoch": 0.14234899787947192, "grad_norm": 2.346219732488669, "learning_rate": 9.672780834111131e-06, "loss": 0.5871, "step": 2081 }, { "epoch": 0.14241740201108147, "grad_norm": 2.5704039815374404, "learning_rate": 9.672386562942633e-06, "loss": 0.3931, "step": 2082 }, { "epoch": 0.14248580614269102, "grad_norm": 2.155405065598809, "learning_rate": 9.671992062430837e-06, "loss": 0.536, "step": 2083 }, { "epoch": 0.14255421027430057, "grad_norm": 2.697498966098407, "learning_rate": 9.671597332595111e-06, "loss": 0.8295, "step": 2084 }, { "epoch": 0.14262261440591012, "grad_norm": 2.3084173570815083, "learning_rate": 9.67120237345483e-06, "loss": 0.5222, "step": 2085 }, { "epoch": 0.14269101853751967, "grad_norm": 1.60351037455546, "learning_rate": 9.670807185029378e-06, "loss": 0.2718, "step": 2086 }, { "epoch": 0.14275942266912922, "grad_norm": 1.8581985309985896, "learning_rate": 9.670411767338155e-06, "loss": 0.4675, "step": 2087 }, { "epoch": 0.14282782680073877, "grad_norm": 2.802539193936775, "learning_rate": 9.670016120400568e-06, "loss": 0.2734, "step": 2088 }, { "epoch": 0.14289623093234832, "grad_norm": 1.9119739737804298, "learning_rate": 9.66962024423604e-06, "loss": 0.4351, "step": 2089 }, { "epoch": 0.14296463506395787, "grad_norm": 2.618243656278975, "learning_rate": 9.669224138863999e-06, "loss": 0.735, "step": 2090 }, { "epoch": 0.14303303919556742, "grad_norm": 1.5616221393741294, "learning_rate": 9.668827804303893e-06, "loss": 0.1733, "step": 2091 }, { "epoch": 0.14310144332717697, "grad_norm": 4.130159287163491, "learning_rate": 9.668431240575172e-06, "loss": 0.5376, "step": 2092 }, { "epoch": 0.1431698474587865, "grad_norm": 2.32791578389671, "learning_rate": 9.6680344476973e-06, "loss": 0.6292, "step": 2093 }, { "epoch": 0.14323825159039605, "grad_norm": 1.99425557073327, "learning_rate": 9.667637425689757e-06, "loss": 0.1918, "step": 2094 }, { "epoch": 0.1433066557220056, "grad_norm": 2.157650009324152, "learning_rate": 9.66724017457203e-06, "loss": 0.2503, "step": 2095 }, { "epoch": 0.14337505985361515, "grad_norm": 2.815847322871972, "learning_rate": 9.666842694363616e-06, "loss": 0.6139, "step": 2096 }, { "epoch": 0.1434434639852247, "grad_norm": 2.9217075565114943, "learning_rate": 9.666444985084027e-06, "loss": 0.5545, "step": 2097 }, { "epoch": 0.14351186811683425, "grad_norm": 2.3132466837662267, "learning_rate": 9.666047046752784e-06, "loss": 0.5213, "step": 2098 }, { "epoch": 0.1435802722484438, "grad_norm": 3.428514001764872, "learning_rate": 9.66564887938942e-06, "loss": 0.5929, "step": 2099 }, { "epoch": 0.14364867638005335, "grad_norm": 1.3041459518631873, "learning_rate": 9.66525048301348e-06, "loss": 0.1909, "step": 2100 }, { "epoch": 0.1437170805116629, "grad_norm": 1.7972157569870015, "learning_rate": 9.664851857644517e-06, "loss": 0.393, "step": 2101 }, { "epoch": 0.14378548464327245, "grad_norm": 2.3667401309655305, "learning_rate": 9.664453003302097e-06, "loss": 0.5968, "step": 2102 }, { "epoch": 0.143853888774882, "grad_norm": 2.714916546620888, "learning_rate": 9.664053920005802e-06, "loss": 0.6704, "step": 2103 }, { "epoch": 0.14392229290649156, "grad_norm": 2.3538029792078334, "learning_rate": 9.663654607775216e-06, "loss": 0.4812, "step": 2104 }, { "epoch": 0.1439906970381011, "grad_norm": 2.094655502999177, "learning_rate": 9.663255066629942e-06, "loss": 0.5781, "step": 2105 }, { "epoch": 0.14405910116971066, "grad_norm": 2.152260538143857, "learning_rate": 9.66285529658959e-06, "loss": 0.3499, "step": 2106 }, { "epoch": 0.1441275053013202, "grad_norm": 1.242663332833704, "learning_rate": 9.662455297673784e-06, "loss": 0.1808, "step": 2107 }, { "epoch": 0.14419590943292976, "grad_norm": 2.2664187915222254, "learning_rate": 9.662055069902158e-06, "loss": 0.478, "step": 2108 }, { "epoch": 0.1442643135645393, "grad_norm": 2.468725361278571, "learning_rate": 9.661654613294356e-06, "loss": 0.6861, "step": 2109 }, { "epoch": 0.14433271769614886, "grad_norm": 2.178979280054231, "learning_rate": 9.661253927870035e-06, "loss": 0.4723, "step": 2110 }, { "epoch": 0.14440112182775838, "grad_norm": 2.9327453016204963, "learning_rate": 9.660853013648861e-06, "loss": 0.2665, "step": 2111 }, { "epoch": 0.14446952595936793, "grad_norm": 2.639791012722927, "learning_rate": 9.660451870650514e-06, "loss": 0.7062, "step": 2112 }, { "epoch": 0.14453793009097748, "grad_norm": 2.315541483908433, "learning_rate": 9.660050498894685e-06, "loss": 0.2095, "step": 2113 }, { "epoch": 0.14460633422258704, "grad_norm": 2.092029099169495, "learning_rate": 9.659648898401075e-06, "loss": 0.34, "step": 2114 }, { "epoch": 0.14467473835419659, "grad_norm": 1.624568368367163, "learning_rate": 9.659247069189394e-06, "loss": 0.2361, "step": 2115 }, { "epoch": 0.14474314248580614, "grad_norm": 1.877842000976201, "learning_rate": 9.65884501127937e-06, "loss": 0.3009, "step": 2116 }, { "epoch": 0.1448115466174157, "grad_norm": 2.360279913916867, "learning_rate": 9.658442724690733e-06, "loss": 0.5544, "step": 2117 }, { "epoch": 0.14487995074902524, "grad_norm": 2.811039375804274, "learning_rate": 9.658040209443234e-06, "loss": 0.8833, "step": 2118 }, { "epoch": 0.1449483548806348, "grad_norm": 2.2720945527665832, "learning_rate": 9.657637465556626e-06, "loss": 0.427, "step": 2119 }, { "epoch": 0.14501675901224434, "grad_norm": 2.400597348479758, "learning_rate": 9.65723449305068e-06, "loss": 0.4437, "step": 2120 }, { "epoch": 0.1450851631438539, "grad_norm": 2.250059848890117, "learning_rate": 9.656831291945178e-06, "loss": 0.4431, "step": 2121 }, { "epoch": 0.14515356727546344, "grad_norm": 1.8622162692399853, "learning_rate": 9.656427862259904e-06, "loss": 0.519, "step": 2122 }, { "epoch": 0.145221971407073, "grad_norm": 1.7048549833811304, "learning_rate": 9.656024204014668e-06, "loss": 0.3212, "step": 2123 }, { "epoch": 0.14529037553868254, "grad_norm": 2.1115211518560835, "learning_rate": 9.65562031722928e-06, "loss": 0.6032, "step": 2124 }, { "epoch": 0.1453587796702921, "grad_norm": 2.206548246346544, "learning_rate": 9.655216201923565e-06, "loss": 0.5149, "step": 2125 }, { "epoch": 0.14542718380190164, "grad_norm": 1.857040350044752, "learning_rate": 9.654811858117359e-06, "loss": 0.3703, "step": 2126 }, { "epoch": 0.1454955879335112, "grad_norm": 1.843000445511775, "learning_rate": 9.654407285830509e-06, "loss": 0.3417, "step": 2127 }, { "epoch": 0.14556399206512075, "grad_norm": 2.09737785807819, "learning_rate": 9.654002485082874e-06, "loss": 0.4641, "step": 2128 }, { "epoch": 0.1456323961967303, "grad_norm": 2.2886580477712073, "learning_rate": 9.653597455894322e-06, "loss": 0.5964, "step": 2129 }, { "epoch": 0.14570080032833982, "grad_norm": 2.0262273993017104, "learning_rate": 9.653192198284735e-06, "loss": 0.3009, "step": 2130 }, { "epoch": 0.14576920445994937, "grad_norm": 1.848085289110202, "learning_rate": 9.652786712274005e-06, "loss": 0.3815, "step": 2131 }, { "epoch": 0.14583760859155892, "grad_norm": 2.290918860379821, "learning_rate": 9.652380997882035e-06, "loss": 0.6519, "step": 2132 }, { "epoch": 0.14590601272316847, "grad_norm": 2.373339875896237, "learning_rate": 9.651975055128741e-06, "loss": 0.5203, "step": 2133 }, { "epoch": 0.14597441685477802, "grad_norm": 2.0882915686378474, "learning_rate": 9.651568884034047e-06, "loss": 0.5337, "step": 2134 }, { "epoch": 0.14604282098638757, "grad_norm": 5.011674127060455, "learning_rate": 9.65116248461789e-06, "loss": 0.5411, "step": 2135 }, { "epoch": 0.14611122511799712, "grad_norm": 1.7487823239196032, "learning_rate": 9.650755856900216e-06, "loss": 0.3692, "step": 2136 }, { "epoch": 0.14617962924960667, "grad_norm": 1.8117213056315187, "learning_rate": 9.650349000900988e-06, "loss": 0.3813, "step": 2137 }, { "epoch": 0.14624803338121622, "grad_norm": 3.760885148264482, "learning_rate": 9.649941916640173e-06, "loss": 0.3994, "step": 2138 }, { "epoch": 0.14631643751282578, "grad_norm": 2.6071690183183596, "learning_rate": 9.649534604137755e-06, "loss": 0.3758, "step": 2139 }, { "epoch": 0.14638484164443533, "grad_norm": 1.7063622309380455, "learning_rate": 9.649127063413727e-06, "loss": 0.2426, "step": 2140 }, { "epoch": 0.14645324577604488, "grad_norm": 5.095246608545552, "learning_rate": 9.648719294488092e-06, "loss": 0.4455, "step": 2141 }, { "epoch": 0.14652164990765443, "grad_norm": 2.99158413931698, "learning_rate": 9.648311297380866e-06, "loss": 0.3983, "step": 2142 }, { "epoch": 0.14659005403926398, "grad_norm": 2.12374637243647, "learning_rate": 9.647903072112075e-06, "loss": 0.3937, "step": 2143 }, { "epoch": 0.14665845817087353, "grad_norm": 2.03596140919931, "learning_rate": 9.647494618701756e-06, "loss": 0.5472, "step": 2144 }, { "epoch": 0.14672686230248308, "grad_norm": 3.3936520086371, "learning_rate": 9.647085937169958e-06, "loss": 0.3384, "step": 2145 }, { "epoch": 0.14679526643409263, "grad_norm": 2.3151586752883535, "learning_rate": 9.646677027536744e-06, "loss": 0.6207, "step": 2146 }, { "epoch": 0.14686367056570218, "grad_norm": 1.9369731886743167, "learning_rate": 9.646267889822181e-06, "loss": 0.3349, "step": 2147 }, { "epoch": 0.1469320746973117, "grad_norm": 1.592436749618523, "learning_rate": 9.645858524046354e-06, "loss": 0.3292, "step": 2148 }, { "epoch": 0.14700047882892125, "grad_norm": 1.4246699457076548, "learning_rate": 9.645448930229356e-06, "loss": 0.2533, "step": 2149 }, { "epoch": 0.1470688829605308, "grad_norm": 1.7819039332336521, "learning_rate": 9.645039108391293e-06, "loss": 0.3733, "step": 2150 }, { "epoch": 0.14713728709214036, "grad_norm": 2.7206713694453364, "learning_rate": 9.644629058552277e-06, "loss": 0.7635, "step": 2151 }, { "epoch": 0.1472056912237499, "grad_norm": 1.618024093661319, "learning_rate": 9.64421878073244e-06, "loss": 0.392, "step": 2152 }, { "epoch": 0.14727409535535946, "grad_norm": 2.715292156703912, "learning_rate": 9.643808274951919e-06, "loss": 0.7555, "step": 2153 }, { "epoch": 0.147342499486969, "grad_norm": 7.082879295008917, "learning_rate": 9.643397541230862e-06, "loss": 0.4448, "step": 2154 }, { "epoch": 0.14741090361857856, "grad_norm": 2.253156603434126, "learning_rate": 9.64298657958943e-06, "loss": 0.2316, "step": 2155 }, { "epoch": 0.1474793077501881, "grad_norm": 1.9391994521708684, "learning_rate": 9.6425753900478e-06, "loss": 0.3919, "step": 2156 }, { "epoch": 0.14754771188179766, "grad_norm": 1.6838366074437578, "learning_rate": 9.642163972626147e-06, "loss": 0.4545, "step": 2157 }, { "epoch": 0.1476161160134072, "grad_norm": 2.084603722589718, "learning_rate": 9.641752327344671e-06, "loss": 0.4437, "step": 2158 }, { "epoch": 0.14768452014501676, "grad_norm": 2.263859886675678, "learning_rate": 9.641340454223576e-06, "loss": 0.3457, "step": 2159 }, { "epoch": 0.1477529242766263, "grad_norm": 2.836889312163441, "learning_rate": 9.64092835328308e-06, "loss": 0.3643, "step": 2160 }, { "epoch": 0.14782132840823586, "grad_norm": 1.5820725612191606, "learning_rate": 9.640516024543407e-06, "loss": 0.3457, "step": 2161 }, { "epoch": 0.14788973253984541, "grad_norm": 22.70215379353321, "learning_rate": 9.6401034680248e-06, "loss": 0.3901, "step": 2162 }, { "epoch": 0.14795813667145497, "grad_norm": 1.5613114294782426, "learning_rate": 9.639690683747507e-06, "loss": 0.32, "step": 2163 }, { "epoch": 0.14802654080306452, "grad_norm": 2.525169631363602, "learning_rate": 9.639277671731791e-06, "loss": 0.3585, "step": 2164 }, { "epoch": 0.14809494493467407, "grad_norm": 2.136405691753452, "learning_rate": 9.638864431997925e-06, "loss": 0.4952, "step": 2165 }, { "epoch": 0.14816334906628362, "grad_norm": 1.8061633165144269, "learning_rate": 9.638450964566191e-06, "loss": 0.3947, "step": 2166 }, { "epoch": 0.14823175319789314, "grad_norm": 1.9832775355218388, "learning_rate": 9.638037269456886e-06, "loss": 0.5227, "step": 2167 }, { "epoch": 0.1483001573295027, "grad_norm": 1.7617485525989502, "learning_rate": 9.637623346690313e-06, "loss": 0.284, "step": 2168 }, { "epoch": 0.14836856146111224, "grad_norm": 2.1330758026579177, "learning_rate": 9.637209196286792e-06, "loss": 0.2789, "step": 2169 }, { "epoch": 0.1484369655927218, "grad_norm": 1.8648550556608852, "learning_rate": 9.63679481826665e-06, "loss": 0.3374, "step": 2170 }, { "epoch": 0.14850536972433134, "grad_norm": 2.6223371714470995, "learning_rate": 9.636380212650229e-06, "loss": 0.5167, "step": 2171 }, { "epoch": 0.1485737738559409, "grad_norm": 1.8671368528224697, "learning_rate": 9.635965379457879e-06, "loss": 0.3823, "step": 2172 }, { "epoch": 0.14864217798755044, "grad_norm": 2.2630747154736546, "learning_rate": 9.63555031870996e-06, "loss": 0.508, "step": 2173 }, { "epoch": 0.14871058211916, "grad_norm": 1.8274827546298815, "learning_rate": 9.635135030426846e-06, "loss": 0.4184, "step": 2174 }, { "epoch": 0.14877898625076955, "grad_norm": 2.16853642635083, "learning_rate": 9.634719514628923e-06, "loss": 0.6382, "step": 2175 }, { "epoch": 0.1488473903823791, "grad_norm": 1.9474643234760636, "learning_rate": 9.634303771336584e-06, "loss": 0.4594, "step": 2176 }, { "epoch": 0.14891579451398865, "grad_norm": 2.47310099888228, "learning_rate": 9.633887800570238e-06, "loss": 0.5918, "step": 2177 }, { "epoch": 0.1489841986455982, "grad_norm": 4.263248829773394, "learning_rate": 9.633471602350302e-06, "loss": 0.4219, "step": 2178 }, { "epoch": 0.14905260277720775, "grad_norm": 2.3039101016897536, "learning_rate": 9.633055176697205e-06, "loss": 0.5851, "step": 2179 }, { "epoch": 0.1491210069088173, "grad_norm": 2.0195490321662204, "learning_rate": 9.632638523631388e-06, "loss": 0.2149, "step": 2180 }, { "epoch": 0.14918941104042685, "grad_norm": 1.6285313625403366, "learning_rate": 9.6322216431733e-06, "loss": 0.2023, "step": 2181 }, { "epoch": 0.1492578151720364, "grad_norm": 1.8032869915779504, "learning_rate": 9.631804535343404e-06, "loss": 0.3478, "step": 2182 }, { "epoch": 0.14932621930364595, "grad_norm": 2.1568979588947244, "learning_rate": 9.631387200162176e-06, "loss": 0.3087, "step": 2183 }, { "epoch": 0.1493946234352555, "grad_norm": 2.216515645072842, "learning_rate": 9.6309696376501e-06, "loss": 0.7174, "step": 2184 }, { "epoch": 0.14946302756686503, "grad_norm": 1.9056469870300112, "learning_rate": 9.630551847827672e-06, "loss": 0.4544, "step": 2185 }, { "epoch": 0.14953143169847458, "grad_norm": 1.6600109575309236, "learning_rate": 9.630133830715396e-06, "loss": 0.3257, "step": 2186 }, { "epoch": 0.14959983583008413, "grad_norm": 1.534251863953843, "learning_rate": 9.629715586333794e-06, "loss": 0.1324, "step": 2187 }, { "epoch": 0.14966823996169368, "grad_norm": 1.7625448928857979, "learning_rate": 9.629297114703395e-06, "loss": 0.4194, "step": 2188 }, { "epoch": 0.14973664409330323, "grad_norm": 1.7442019516948344, "learning_rate": 9.62887841584474e-06, "loss": 0.4148, "step": 2189 }, { "epoch": 0.14980504822491278, "grad_norm": 1.794573603766733, "learning_rate": 9.628459489778378e-06, "loss": 0.1837, "step": 2190 }, { "epoch": 0.14987345235652233, "grad_norm": 2.2385970553620482, "learning_rate": 9.628040336524875e-06, "loss": 0.3078, "step": 2191 }, { "epoch": 0.14994185648813188, "grad_norm": 2.105578255150083, "learning_rate": 9.627620956104801e-06, "loss": 0.3739, "step": 2192 }, { "epoch": 0.15001026061974143, "grad_norm": 1.968251057252657, "learning_rate": 9.627201348538746e-06, "loss": 0.3903, "step": 2193 }, { "epoch": 0.15007866475135098, "grad_norm": 2.335850041005697, "learning_rate": 9.626781513847305e-06, "loss": 0.5805, "step": 2194 }, { "epoch": 0.15014706888296053, "grad_norm": 1.972957323723561, "learning_rate": 9.626361452051084e-06, "loss": 0.55, "step": 2195 }, { "epoch": 0.15021547301457008, "grad_norm": 3.1207445543714267, "learning_rate": 9.625941163170702e-06, "loss": 0.2969, "step": 2196 }, { "epoch": 0.15028387714617963, "grad_norm": 2.647672701717264, "learning_rate": 9.625520647226788e-06, "loss": 0.4223, "step": 2197 }, { "epoch": 0.15035228127778918, "grad_norm": 2.8542149265877867, "learning_rate": 9.625099904239987e-06, "loss": 0.4703, "step": 2198 }, { "epoch": 0.15042068540939874, "grad_norm": 2.502228703088731, "learning_rate": 9.624678934230948e-06, "loss": 0.5446, "step": 2199 }, { "epoch": 0.1504890895410083, "grad_norm": 1.7222758627898944, "learning_rate": 9.624257737220333e-06, "loss": 0.4304, "step": 2200 }, { "epoch": 0.15055749367261784, "grad_norm": 1.7536489514613114, "learning_rate": 9.62383631322882e-06, "loss": 0.3473, "step": 2201 }, { "epoch": 0.1506258978042274, "grad_norm": 2.526928509379891, "learning_rate": 9.62341466227709e-06, "loss": 0.6633, "step": 2202 }, { "epoch": 0.1506943019358369, "grad_norm": 2.528530968133775, "learning_rate": 9.622992784385842e-06, "loss": 0.3827, "step": 2203 }, { "epoch": 0.15076270606744646, "grad_norm": 2.4309838936318067, "learning_rate": 9.622570679575787e-06, "loss": 0.6125, "step": 2204 }, { "epoch": 0.150831110199056, "grad_norm": 2.239587602315575, "learning_rate": 9.62214834786764e-06, "loss": 0.4868, "step": 2205 }, { "epoch": 0.15089951433066556, "grad_norm": 1.8578631932743188, "learning_rate": 9.62172578928213e-06, "loss": 0.1995, "step": 2206 }, { "epoch": 0.1509679184622751, "grad_norm": 1.914463114172423, "learning_rate": 9.62130300384e-06, "loss": 0.4998, "step": 2207 }, { "epoch": 0.15103632259388466, "grad_norm": 2.669081203268765, "learning_rate": 9.620879991562005e-06, "loss": 0.3088, "step": 2208 }, { "epoch": 0.15110472672549422, "grad_norm": 2.208402902853859, "learning_rate": 9.620456752468903e-06, "loss": 0.4568, "step": 2209 }, { "epoch": 0.15117313085710377, "grad_norm": 2.4060644102980047, "learning_rate": 9.620033286581475e-06, "loss": 0.4741, "step": 2210 }, { "epoch": 0.15124153498871332, "grad_norm": 1.7922254821091648, "learning_rate": 9.619609593920502e-06, "loss": 0.4006, "step": 2211 }, { "epoch": 0.15130993912032287, "grad_norm": 2.642167571910239, "learning_rate": 9.619185674506781e-06, "loss": 0.7166, "step": 2212 }, { "epoch": 0.15137834325193242, "grad_norm": 1.7134450452361185, "learning_rate": 9.618761528361123e-06, "loss": 0.3288, "step": 2213 }, { "epoch": 0.15144674738354197, "grad_norm": 1.8769682368564289, "learning_rate": 9.618337155504345e-06, "loss": 0.3854, "step": 2214 }, { "epoch": 0.15151515151515152, "grad_norm": 1.470104709720277, "learning_rate": 9.617912555957277e-06, "loss": 0.1681, "step": 2215 }, { "epoch": 0.15158355564676107, "grad_norm": 1.9507823627852205, "learning_rate": 9.617487729740761e-06, "loss": 0.3976, "step": 2216 }, { "epoch": 0.15165195977837062, "grad_norm": 2.4223899232383146, "learning_rate": 9.61706267687565e-06, "loss": 0.3877, "step": 2217 }, { "epoch": 0.15172036390998017, "grad_norm": 2.2735582070570577, "learning_rate": 9.616637397382807e-06, "loss": 0.3468, "step": 2218 }, { "epoch": 0.15178876804158972, "grad_norm": 0.9744117513519485, "learning_rate": 9.616211891283108e-06, "loss": 0.1729, "step": 2219 }, { "epoch": 0.15185717217319927, "grad_norm": 1.992801407114219, "learning_rate": 9.615786158597438e-06, "loss": 0.3074, "step": 2220 }, { "epoch": 0.15192557630480882, "grad_norm": 2.180812927986842, "learning_rate": 9.615360199346694e-06, "loss": 0.3792, "step": 2221 }, { "epoch": 0.15199398043641835, "grad_norm": 1.8358836957474027, "learning_rate": 9.614934013551781e-06, "loss": 0.4021, "step": 2222 }, { "epoch": 0.1520623845680279, "grad_norm": 1.673780192998661, "learning_rate": 9.614507601233625e-06, "loss": 0.2235, "step": 2223 }, { "epoch": 0.15213078869963745, "grad_norm": 2.763411433480792, "learning_rate": 9.61408096241315e-06, "loss": 0.7753, "step": 2224 }, { "epoch": 0.152199192831247, "grad_norm": 2.000374352914679, "learning_rate": 9.613654097111302e-06, "loss": 0.5419, "step": 2225 }, { "epoch": 0.15226759696285655, "grad_norm": 2.4074130158243574, "learning_rate": 9.61322700534903e-06, "loss": 0.4524, "step": 2226 }, { "epoch": 0.1523360010944661, "grad_norm": 2.1501464635840986, "learning_rate": 9.612799687147302e-06, "loss": 0.504, "step": 2227 }, { "epoch": 0.15240440522607565, "grad_norm": 1.8560079212834109, "learning_rate": 9.61237214252709e-06, "loss": 0.3014, "step": 2228 }, { "epoch": 0.1524728093576852, "grad_norm": 2.0250019208139562, "learning_rate": 9.611944371509378e-06, "loss": 0.3005, "step": 2229 }, { "epoch": 0.15254121348929475, "grad_norm": 2.3393706223887425, "learning_rate": 9.611516374115169e-06, "loss": 0.6339, "step": 2230 }, { "epoch": 0.1526096176209043, "grad_norm": 2.3403865649473383, "learning_rate": 9.611088150365464e-06, "loss": 0.6797, "step": 2231 }, { "epoch": 0.15267802175251385, "grad_norm": 2.157881169880393, "learning_rate": 9.610659700281288e-06, "loss": 0.274, "step": 2232 }, { "epoch": 0.1527464258841234, "grad_norm": 2.476575442259272, "learning_rate": 9.610231023883667e-06, "loss": 0.5521, "step": 2233 }, { "epoch": 0.15281483001573296, "grad_norm": 2.194535358842023, "learning_rate": 9.609802121193648e-06, "loss": 0.442, "step": 2234 }, { "epoch": 0.1528832341473425, "grad_norm": 2.3548807640728024, "learning_rate": 9.609372992232279e-06, "loss": 0.553, "step": 2235 }, { "epoch": 0.15295163827895206, "grad_norm": 2.503073650600163, "learning_rate": 9.608943637020625e-06, "loss": 0.6567, "step": 2236 }, { "epoch": 0.1530200424105616, "grad_norm": 1.920232815559692, "learning_rate": 9.60851405557976e-06, "loss": 0.4005, "step": 2237 }, { "epoch": 0.15308844654217116, "grad_norm": 1.7672893773064502, "learning_rate": 9.608084247930772e-06, "loss": 0.3429, "step": 2238 }, { "epoch": 0.1531568506737807, "grad_norm": 2.733506582505147, "learning_rate": 9.607654214094757e-06, "loss": 0.6171, "step": 2239 }, { "epoch": 0.15322525480539023, "grad_norm": 2.0436873548863397, "learning_rate": 9.60722395409282e-06, "loss": 0.3362, "step": 2240 }, { "epoch": 0.15329365893699978, "grad_norm": 1.723510435260913, "learning_rate": 9.606793467946088e-06, "loss": 0.297, "step": 2241 }, { "epoch": 0.15336206306860933, "grad_norm": 1.98640593279768, "learning_rate": 9.606362755675685e-06, "loss": 0.4841, "step": 2242 }, { "epoch": 0.15343046720021888, "grad_norm": 2.34697764041422, "learning_rate": 9.605931817302752e-06, "loss": 0.524, "step": 2243 }, { "epoch": 0.15349887133182843, "grad_norm": 2.223025996824526, "learning_rate": 9.605500652848447e-06, "loss": 0.5387, "step": 2244 }, { "epoch": 0.15356727546343799, "grad_norm": 3.1624064314800227, "learning_rate": 9.605069262333928e-06, "loss": 0.6898, "step": 2245 }, { "epoch": 0.15363567959504754, "grad_norm": 2.2424072131392707, "learning_rate": 9.604637645780373e-06, "loss": 0.5918, "step": 2246 }, { "epoch": 0.1537040837266571, "grad_norm": 1.9160883789559597, "learning_rate": 9.604205803208966e-06, "loss": 0.3825, "step": 2247 }, { "epoch": 0.15377248785826664, "grad_norm": 2.0649004535488333, "learning_rate": 9.603773734640905e-06, "loss": 0.3781, "step": 2248 }, { "epoch": 0.1538408919898762, "grad_norm": 1.9225805759069405, "learning_rate": 9.603341440097397e-06, "loss": 0.3683, "step": 2249 }, { "epoch": 0.15390929612148574, "grad_norm": 2.0897603633323367, "learning_rate": 9.602908919599662e-06, "loss": 0.3659, "step": 2250 }, { "epoch": 0.1539777002530953, "grad_norm": 2.099975355321597, "learning_rate": 9.60247617316893e-06, "loss": 0.3321, "step": 2251 }, { "epoch": 0.15404610438470484, "grad_norm": 1.644992392212224, "learning_rate": 9.602043200826443e-06, "loss": 0.2275, "step": 2252 }, { "epoch": 0.1541145085163144, "grad_norm": 2.805530198352986, "learning_rate": 9.601610002593451e-06, "loss": 0.9473, "step": 2253 }, { "epoch": 0.15418291264792394, "grad_norm": 1.6428647670046936, "learning_rate": 9.60117657849122e-06, "loss": 0.2986, "step": 2254 }, { "epoch": 0.1542513167795335, "grad_norm": 2.2833765778193893, "learning_rate": 9.600742928541025e-06, "loss": 0.4241, "step": 2255 }, { "epoch": 0.15431972091114304, "grad_norm": 2.4765881076776437, "learning_rate": 9.60030905276415e-06, "loss": 0.3666, "step": 2256 }, { "epoch": 0.1543881250427526, "grad_norm": 2.236911707531892, "learning_rate": 9.59987495118189e-06, "loss": 0.6149, "step": 2257 }, { "epoch": 0.15445652917436214, "grad_norm": 2.039517479776048, "learning_rate": 9.599440623815557e-06, "loss": 0.3609, "step": 2258 }, { "epoch": 0.15452493330597167, "grad_norm": 2.148288374840518, "learning_rate": 9.599006070686467e-06, "loss": 0.6327, "step": 2259 }, { "epoch": 0.15459333743758122, "grad_norm": 6.575142517285814, "learning_rate": 9.59857129181595e-06, "loss": 0.559, "step": 2260 }, { "epoch": 0.15466174156919077, "grad_norm": 1.7689855504648018, "learning_rate": 9.59813628722535e-06, "loss": 0.4193, "step": 2261 }, { "epoch": 0.15473014570080032, "grad_norm": 2.4850982716814736, "learning_rate": 9.597701056936014e-06, "loss": 0.683, "step": 2262 }, { "epoch": 0.15479854983240987, "grad_norm": 2.0984767893097684, "learning_rate": 9.59726560096931e-06, "loss": 0.5553, "step": 2263 }, { "epoch": 0.15486695396401942, "grad_norm": 1.6595716703826209, "learning_rate": 9.596829919346608e-06, "loss": 0.1753, "step": 2264 }, { "epoch": 0.15493535809562897, "grad_norm": 3.0128724180621274, "learning_rate": 9.596394012089298e-06, "loss": 0.5919, "step": 2265 }, { "epoch": 0.15500376222723852, "grad_norm": 2.2049404300646547, "learning_rate": 9.595957879218773e-06, "loss": 0.4598, "step": 2266 }, { "epoch": 0.15507216635884807, "grad_norm": 1.826539068993618, "learning_rate": 9.595521520756443e-06, "loss": 0.3516, "step": 2267 }, { "epoch": 0.15514057049045762, "grad_norm": 1.1965722196577244, "learning_rate": 9.595084936723725e-06, "loss": 0.1631, "step": 2268 }, { "epoch": 0.15520897462206718, "grad_norm": 3.2580532515598595, "learning_rate": 9.594648127142048e-06, "loss": 0.5278, "step": 2269 }, { "epoch": 0.15527737875367673, "grad_norm": 2.1551661877377786, "learning_rate": 9.594211092032853e-06, "loss": 0.3569, "step": 2270 }, { "epoch": 0.15534578288528628, "grad_norm": 1.8529457928751847, "learning_rate": 9.593773831417592e-06, "loss": 0.3754, "step": 2271 }, { "epoch": 0.15541418701689583, "grad_norm": 1.9414326399270236, "learning_rate": 9.59333634531773e-06, "loss": 0.3546, "step": 2272 }, { "epoch": 0.15548259114850538, "grad_norm": 1.9020401675210254, "learning_rate": 9.592898633754739e-06, "loss": 0.4208, "step": 2273 }, { "epoch": 0.15555099528011493, "grad_norm": 2.4554752887440934, "learning_rate": 9.592460696750103e-06, "loss": 0.3832, "step": 2274 }, { "epoch": 0.15561939941172448, "grad_norm": 1.5799653846496486, "learning_rate": 9.59202253432532e-06, "loss": 0.2551, "step": 2275 }, { "epoch": 0.15568780354333403, "grad_norm": 2.0430481751212337, "learning_rate": 9.591584146501896e-06, "loss": 0.4025, "step": 2276 }, { "epoch": 0.15575620767494355, "grad_norm": 1.6833451877505496, "learning_rate": 9.591145533301351e-06, "loss": 0.285, "step": 2277 }, { "epoch": 0.1558246118065531, "grad_norm": 1.95808179910577, "learning_rate": 9.590706694745211e-06, "loss": 0.3805, "step": 2278 }, { "epoch": 0.15589301593816265, "grad_norm": 2.0402364238193367, "learning_rate": 9.59026763085502e-06, "loss": 0.4091, "step": 2279 }, { "epoch": 0.1559614200697722, "grad_norm": 1.4811798240037455, "learning_rate": 9.589828341652326e-06, "loss": 0.2511, "step": 2280 }, { "epoch": 0.15602982420138176, "grad_norm": 1.7497053835009917, "learning_rate": 9.589388827158692e-06, "loss": 0.1967, "step": 2281 }, { "epoch": 0.1560982283329913, "grad_norm": 1.4550500393403547, "learning_rate": 9.588949087395694e-06, "loss": 0.1784, "step": 2282 }, { "epoch": 0.15616663246460086, "grad_norm": 1.6129979815500026, "learning_rate": 9.588509122384915e-06, "loss": 0.2009, "step": 2283 }, { "epoch": 0.1562350365962104, "grad_norm": 1.664294378932995, "learning_rate": 9.588068932147949e-06, "loss": 0.2175, "step": 2284 }, { "epoch": 0.15630344072781996, "grad_norm": 2.01247006062338, "learning_rate": 9.587628516706406e-06, "loss": 0.4795, "step": 2285 }, { "epoch": 0.1563718448594295, "grad_norm": 2.619125662013473, "learning_rate": 9.587187876081901e-06, "loss": 0.6372, "step": 2286 }, { "epoch": 0.15644024899103906, "grad_norm": 2.498708128928976, "learning_rate": 9.586747010296064e-06, "loss": 0.6686, "step": 2287 }, { "epoch": 0.1565086531226486, "grad_norm": 2.1056579688829435, "learning_rate": 9.586305919370535e-06, "loss": 0.4886, "step": 2288 }, { "epoch": 0.15657705725425816, "grad_norm": 3.107924722186384, "learning_rate": 9.585864603326964e-06, "loss": 0.4943, "step": 2289 }, { "epoch": 0.1566454613858677, "grad_norm": 2.269993260642479, "learning_rate": 9.585423062187014e-06, "loss": 0.6212, "step": 2290 }, { "epoch": 0.15671386551747726, "grad_norm": 2.2633692791759517, "learning_rate": 9.584981295972357e-06, "loss": 0.5507, "step": 2291 }, { "epoch": 0.15678226964908681, "grad_norm": 1.5754709351469696, "learning_rate": 9.584539304704676e-06, "loss": 0.2198, "step": 2292 }, { "epoch": 0.15685067378069636, "grad_norm": 1.923008971893619, "learning_rate": 9.58409708840567e-06, "loss": 0.3849, "step": 2293 }, { "epoch": 0.15691907791230592, "grad_norm": 2.6941210283250054, "learning_rate": 9.583654647097041e-06, "loss": 0.6694, "step": 2294 }, { "epoch": 0.15698748204391547, "grad_norm": 2.4652095468836697, "learning_rate": 9.583211980800507e-06, "loss": 0.3554, "step": 2295 }, { "epoch": 0.157055886175525, "grad_norm": 1.997481242761403, "learning_rate": 9.582769089537797e-06, "loss": 0.4544, "step": 2296 }, { "epoch": 0.15712429030713454, "grad_norm": 2.398661295486635, "learning_rate": 9.582325973330652e-06, "loss": 0.7057, "step": 2297 }, { "epoch": 0.1571926944387441, "grad_norm": 1.7524283114475754, "learning_rate": 9.581882632200819e-06, "loss": 0.346, "step": 2298 }, { "epoch": 0.15726109857035364, "grad_norm": 1.7153352020173556, "learning_rate": 9.58143906617006e-06, "loss": 0.2804, "step": 2299 }, { "epoch": 0.1573295027019632, "grad_norm": 1.8024636806209808, "learning_rate": 9.58099527526015e-06, "loss": 0.344, "step": 2300 }, { "epoch": 0.15739790683357274, "grad_norm": 2.6838990194722188, "learning_rate": 9.58055125949287e-06, "loss": 0.7231, "step": 2301 }, { "epoch": 0.1574663109651823, "grad_norm": 2.163443638694922, "learning_rate": 9.580107018890013e-06, "loss": 0.4954, "step": 2302 }, { "epoch": 0.15753471509679184, "grad_norm": 2.338582359602779, "learning_rate": 9.579662553473388e-06, "loss": 0.4768, "step": 2303 }, { "epoch": 0.1576031192284014, "grad_norm": 2.4023451208095206, "learning_rate": 9.57921786326481e-06, "loss": 0.5977, "step": 2304 }, { "epoch": 0.15767152336001095, "grad_norm": 2.20992033580209, "learning_rate": 9.578772948286106e-06, "loss": 0.4679, "step": 2305 }, { "epoch": 0.1577399274916205, "grad_norm": 2.2821547791173664, "learning_rate": 9.578327808559117e-06, "loss": 0.2702, "step": 2306 }, { "epoch": 0.15780833162323005, "grad_norm": 1.8927641266399058, "learning_rate": 9.577882444105687e-06, "loss": 0.5343, "step": 2307 }, { "epoch": 0.1578767357548396, "grad_norm": 1.976646368788007, "learning_rate": 9.577436854947683e-06, "loss": 0.2896, "step": 2308 }, { "epoch": 0.15794513988644915, "grad_norm": 2.449545706139162, "learning_rate": 9.576991041106973e-06, "loss": 0.3969, "step": 2309 }, { "epoch": 0.1580135440180587, "grad_norm": 2.139682896596448, "learning_rate": 9.57654500260544e-06, "loss": 0.4411, "step": 2310 }, { "epoch": 0.15808194814966825, "grad_norm": 1.9357985093328576, "learning_rate": 9.57609873946498e-06, "loss": 0.4675, "step": 2311 }, { "epoch": 0.1581503522812778, "grad_norm": 2.0232793422144684, "learning_rate": 9.575652251707495e-06, "loss": 0.1984, "step": 2312 }, { "epoch": 0.15821875641288735, "grad_norm": 2.2321791028758415, "learning_rate": 9.575205539354902e-06, "loss": 0.5097, "step": 2313 }, { "epoch": 0.15828716054449687, "grad_norm": 1.7814734720288674, "learning_rate": 9.574758602429128e-06, "loss": 0.2178, "step": 2314 }, { "epoch": 0.15835556467610643, "grad_norm": 2.0162777250432957, "learning_rate": 9.574311440952109e-06, "loss": 0.4358, "step": 2315 }, { "epoch": 0.15842396880771598, "grad_norm": 1.670429179449294, "learning_rate": 9.573864054945797e-06, "loss": 0.1493, "step": 2316 }, { "epoch": 0.15849237293932553, "grad_norm": 2.1439217602900142, "learning_rate": 9.57341644443215e-06, "loss": 0.2195, "step": 2317 }, { "epoch": 0.15856077707093508, "grad_norm": 1.6469931172859407, "learning_rate": 9.57296860943314e-06, "loss": 0.312, "step": 2318 }, { "epoch": 0.15862918120254463, "grad_norm": 2.0874384505512085, "learning_rate": 9.572520549970746e-06, "loss": 0.3703, "step": 2319 }, { "epoch": 0.15869758533415418, "grad_norm": 1.4033317246898838, "learning_rate": 9.572072266066963e-06, "loss": 0.207, "step": 2320 }, { "epoch": 0.15876598946576373, "grad_norm": 2.542324054757312, "learning_rate": 9.571623757743797e-06, "loss": 0.7559, "step": 2321 }, { "epoch": 0.15883439359737328, "grad_norm": 2.3778139246779553, "learning_rate": 9.57117502502326e-06, "loss": 0.6293, "step": 2322 }, { "epoch": 0.15890279772898283, "grad_norm": 1.641395147395906, "learning_rate": 9.570726067927378e-06, "loss": 0.3191, "step": 2323 }, { "epoch": 0.15897120186059238, "grad_norm": 1.9953690312501864, "learning_rate": 9.570276886478189e-06, "loss": 0.3614, "step": 2324 }, { "epoch": 0.15903960599220193, "grad_norm": 2.6641996689551037, "learning_rate": 9.569827480697742e-06, "loss": 0.5688, "step": 2325 }, { "epoch": 0.15910801012381148, "grad_norm": 3.1620125711572373, "learning_rate": 9.569377850608093e-06, "loss": 0.4368, "step": 2326 }, { "epoch": 0.15917641425542103, "grad_norm": 1.8880136125799196, "learning_rate": 9.568927996231314e-06, "loss": 0.3972, "step": 2327 }, { "epoch": 0.15924481838703058, "grad_norm": 2.063708671427346, "learning_rate": 9.568477917589486e-06, "loss": 0.2248, "step": 2328 }, { "epoch": 0.15931322251864014, "grad_norm": 2.166431393977833, "learning_rate": 9.568027614704701e-06, "loss": 0.5516, "step": 2329 }, { "epoch": 0.15938162665024969, "grad_norm": 1.6733666031865373, "learning_rate": 9.567577087599063e-06, "loss": 0.3074, "step": 2330 }, { "epoch": 0.15945003078185924, "grad_norm": 2.9594009257291747, "learning_rate": 9.567126336294684e-06, "loss": 0.8672, "step": 2331 }, { "epoch": 0.1595184349134688, "grad_norm": 1.9435093040714675, "learning_rate": 9.566675360813688e-06, "loss": 0.3714, "step": 2332 }, { "epoch": 0.1595868390450783, "grad_norm": 2.006427134807766, "learning_rate": 9.566224161178217e-06, "loss": 0.3686, "step": 2333 }, { "epoch": 0.15965524317668786, "grad_norm": 1.7735650504032372, "learning_rate": 9.565772737410412e-06, "loss": 0.2437, "step": 2334 }, { "epoch": 0.1597236473082974, "grad_norm": 2.5155066942153566, "learning_rate": 9.565321089532432e-06, "loss": 0.6334, "step": 2335 }, { "epoch": 0.15979205143990696, "grad_norm": 2.2186479410688387, "learning_rate": 9.564869217566447e-06, "loss": 0.6981, "step": 2336 }, { "epoch": 0.1598604555715165, "grad_norm": 2.407548931581213, "learning_rate": 9.56441712153464e-06, "loss": 0.3317, "step": 2337 }, { "epoch": 0.15992885970312606, "grad_norm": 2.1141998142101794, "learning_rate": 9.563964801459199e-06, "loss": 0.5906, "step": 2338 }, { "epoch": 0.15999726383473561, "grad_norm": 2.3407656138446415, "learning_rate": 9.563512257362325e-06, "loss": 0.5347, "step": 2339 }, { "epoch": 0.16006566796634517, "grad_norm": 1.4980142632109072, "learning_rate": 9.563059489266233e-06, "loss": 0.2865, "step": 2340 }, { "epoch": 0.16013407209795472, "grad_norm": 1.63627413170915, "learning_rate": 9.562606497193146e-06, "loss": 0.3476, "step": 2341 }, { "epoch": 0.16020247622956427, "grad_norm": 2.4663298369379993, "learning_rate": 9.5621532811653e-06, "loss": 0.8419, "step": 2342 }, { "epoch": 0.16027088036117382, "grad_norm": 1.8727071253072827, "learning_rate": 9.561699841204941e-06, "loss": 0.4353, "step": 2343 }, { "epoch": 0.16033928449278337, "grad_norm": 2.48384937538015, "learning_rate": 9.561246177334325e-06, "loss": 0.2945, "step": 2344 }, { "epoch": 0.16040768862439292, "grad_norm": 1.5311848154596057, "learning_rate": 9.56079228957572e-06, "loss": 0.1949, "step": 2345 }, { "epoch": 0.16047609275600247, "grad_norm": 1.2862759918134516, "learning_rate": 9.560338177951407e-06, "loss": 0.1636, "step": 2346 }, { "epoch": 0.16054449688761202, "grad_norm": 1.9366225632283065, "learning_rate": 9.559883842483672e-06, "loss": 0.3335, "step": 2347 }, { "epoch": 0.16061290101922157, "grad_norm": 1.8483025950638312, "learning_rate": 9.559429283194822e-06, "loss": 0.4417, "step": 2348 }, { "epoch": 0.16068130515083112, "grad_norm": 1.4344117587912415, "learning_rate": 9.558974500107163e-06, "loss": 0.3258, "step": 2349 }, { "epoch": 0.16074970928244067, "grad_norm": 2.8210296960582393, "learning_rate": 9.558519493243022e-06, "loss": 0.4201, "step": 2350 }, { "epoch": 0.1608181134140502, "grad_norm": 2.526701850091519, "learning_rate": 9.558064262624731e-06, "loss": 0.5455, "step": 2351 }, { "epoch": 0.16088651754565975, "grad_norm": 2.1121261457733733, "learning_rate": 9.557608808274634e-06, "loss": 0.63, "step": 2352 }, { "epoch": 0.1609549216772693, "grad_norm": 2.330376240668212, "learning_rate": 9.557153130215089e-06, "loss": 0.4741, "step": 2353 }, { "epoch": 0.16102332580887885, "grad_norm": 1.797075699917438, "learning_rate": 9.556697228468463e-06, "loss": 0.4226, "step": 2354 }, { "epoch": 0.1610917299404884, "grad_norm": 1.8503342188040615, "learning_rate": 9.556241103057132e-06, "loss": 0.5055, "step": 2355 }, { "epoch": 0.16116013407209795, "grad_norm": 2.0110586398138297, "learning_rate": 9.555784754003487e-06, "loss": 0.5633, "step": 2356 }, { "epoch": 0.1612285382037075, "grad_norm": 1.966291913969929, "learning_rate": 9.555328181329925e-06, "loss": 0.3766, "step": 2357 }, { "epoch": 0.16129694233531705, "grad_norm": 2.2537585429909295, "learning_rate": 9.55487138505886e-06, "loss": 0.3235, "step": 2358 }, { "epoch": 0.1613653464669266, "grad_norm": 1.617330199317452, "learning_rate": 9.55441436521271e-06, "loss": 0.2888, "step": 2359 }, { "epoch": 0.16143375059853615, "grad_norm": 2.448182015883553, "learning_rate": 9.553957121813912e-06, "loss": 0.5513, "step": 2360 }, { "epoch": 0.1615021547301457, "grad_norm": 2.0610389953447386, "learning_rate": 9.553499654884905e-06, "loss": 0.1725, "step": 2361 }, { "epoch": 0.16157055886175525, "grad_norm": 1.9933154484601416, "learning_rate": 9.553041964448148e-06, "loss": 0.4787, "step": 2362 }, { "epoch": 0.1616389629933648, "grad_norm": 2.545492516668301, "learning_rate": 9.552584050526105e-06, "loss": 0.2524, "step": 2363 }, { "epoch": 0.16170736712497435, "grad_norm": 2.2939929137116315, "learning_rate": 9.552125913141254e-06, "loss": 0.4494, "step": 2364 }, { "epoch": 0.1617757712565839, "grad_norm": 1.647577074770403, "learning_rate": 9.55166755231608e-06, "loss": 0.3546, "step": 2365 }, { "epoch": 0.16184417538819346, "grad_norm": 1.9245403501173601, "learning_rate": 9.551208968073082e-06, "loss": 0.3056, "step": 2366 }, { "epoch": 0.161912579519803, "grad_norm": 2.281197384005419, "learning_rate": 9.550750160434772e-06, "loss": 0.3817, "step": 2367 }, { "epoch": 0.16198098365141256, "grad_norm": 1.7931079041745377, "learning_rate": 9.550291129423667e-06, "loss": 0.2627, "step": 2368 }, { "epoch": 0.16204938778302208, "grad_norm": 2.0192612000764387, "learning_rate": 9.5498318750623e-06, "loss": 0.3906, "step": 2369 }, { "epoch": 0.16211779191463163, "grad_norm": 2.209391285783671, "learning_rate": 9.549372397373216e-06, "loss": 0.4083, "step": 2370 }, { "epoch": 0.16218619604624118, "grad_norm": 1.8150662637599762, "learning_rate": 9.548912696378965e-06, "loss": 0.3315, "step": 2371 }, { "epoch": 0.16225460017785073, "grad_norm": 1.7436489253676937, "learning_rate": 9.548452772102111e-06, "loss": 0.3573, "step": 2372 }, { "epoch": 0.16232300430946028, "grad_norm": 2.3492645365772824, "learning_rate": 9.547992624565233e-06, "loss": 0.5478, "step": 2373 }, { "epoch": 0.16239140844106983, "grad_norm": 1.987249610756272, "learning_rate": 9.547532253790912e-06, "loss": 0.4034, "step": 2374 }, { "epoch": 0.16245981257267939, "grad_norm": 1.7015145500860207, "learning_rate": 9.547071659801752e-06, "loss": 0.2706, "step": 2375 }, { "epoch": 0.16252821670428894, "grad_norm": 1.5681491671743357, "learning_rate": 9.546610842620355e-06, "loss": 0.2161, "step": 2376 }, { "epoch": 0.1625966208358985, "grad_norm": 1.8765011123062616, "learning_rate": 9.546149802269342e-06, "loss": 0.3336, "step": 2377 }, { "epoch": 0.16266502496750804, "grad_norm": 2.424595701296787, "learning_rate": 9.545688538771343e-06, "loss": 0.3325, "step": 2378 }, { "epoch": 0.1627334290991176, "grad_norm": 2.0868532049635267, "learning_rate": 9.545227052149002e-06, "loss": 0.4796, "step": 2379 }, { "epoch": 0.16280183323072714, "grad_norm": 2.4014476345342732, "learning_rate": 9.544765342424968e-06, "loss": 0.4143, "step": 2380 }, { "epoch": 0.1628702373623367, "grad_norm": 1.86004652190736, "learning_rate": 9.544303409621904e-06, "loss": 0.3252, "step": 2381 }, { "epoch": 0.16293864149394624, "grad_norm": 2.3645546501089263, "learning_rate": 9.543841253762484e-06, "loss": 0.5674, "step": 2382 }, { "epoch": 0.1630070456255558, "grad_norm": 1.9211944367534108, "learning_rate": 9.543378874869393e-06, "loss": 0.3993, "step": 2383 }, { "epoch": 0.16307544975716534, "grad_norm": 2.0625647007301717, "learning_rate": 9.542916272965327e-06, "loss": 0.4338, "step": 2384 }, { "epoch": 0.1631438538887749, "grad_norm": 1.6123548946962152, "learning_rate": 9.542453448072994e-06, "loss": 0.1563, "step": 2385 }, { "epoch": 0.16321225802038444, "grad_norm": 2.7437378774475074, "learning_rate": 9.541990400215111e-06, "loss": 0.236, "step": 2386 }, { "epoch": 0.163280662151994, "grad_norm": 1.9189547705963241, "learning_rate": 9.541527129414405e-06, "loss": 0.2123, "step": 2387 }, { "epoch": 0.16334906628360352, "grad_norm": 2.7449132517186126, "learning_rate": 9.541063635693617e-06, "loss": 0.2748, "step": 2388 }, { "epoch": 0.16341747041521307, "grad_norm": 2.5483642682207712, "learning_rate": 9.540599919075497e-06, "loss": 0.5756, "step": 2389 }, { "epoch": 0.16348587454682262, "grad_norm": 2.2391428595615244, "learning_rate": 9.540135979582807e-06, "loss": 0.5662, "step": 2390 }, { "epoch": 0.16355427867843217, "grad_norm": 2.028466841538812, "learning_rate": 9.539671817238318e-06, "loss": 0.343, "step": 2391 }, { "epoch": 0.16362268281004172, "grad_norm": 1.9751056911460676, "learning_rate": 9.539207432064816e-06, "loss": 0.2801, "step": 2392 }, { "epoch": 0.16369108694165127, "grad_norm": 2.078803496632967, "learning_rate": 9.538742824085092e-06, "loss": 0.3793, "step": 2393 }, { "epoch": 0.16375949107326082, "grad_norm": 2.110430827631485, "learning_rate": 9.538277993321955e-06, "loss": 0.5305, "step": 2394 }, { "epoch": 0.16382789520487037, "grad_norm": 2.0520242974620944, "learning_rate": 9.537812939798217e-06, "loss": 0.3936, "step": 2395 }, { "epoch": 0.16389629933647992, "grad_norm": 2.892221329633891, "learning_rate": 9.537347663536706e-06, "loss": 0.2144, "step": 2396 }, { "epoch": 0.16396470346808947, "grad_norm": 1.9469000110581058, "learning_rate": 9.536882164560264e-06, "loss": 0.4684, "step": 2397 }, { "epoch": 0.16403310759969902, "grad_norm": 2.631999691201905, "learning_rate": 9.536416442891735e-06, "loss": 0.3855, "step": 2398 }, { "epoch": 0.16410151173130857, "grad_norm": 2.0822423931651106, "learning_rate": 9.53595049855398e-06, "loss": 0.5516, "step": 2399 }, { "epoch": 0.16416991586291813, "grad_norm": 1.2372200190983804, "learning_rate": 9.535484331569873e-06, "loss": 0.1224, "step": 2400 }, { "epoch": 0.16423831999452768, "grad_norm": 2.0981042692508556, "learning_rate": 9.535017941962292e-06, "loss": 0.4706, "step": 2401 }, { "epoch": 0.16430672412613723, "grad_norm": 2.829481247536939, "learning_rate": 9.534551329754132e-06, "loss": 0.5657, "step": 2402 }, { "epoch": 0.16437512825774678, "grad_norm": 1.8604610260725976, "learning_rate": 9.534084494968296e-06, "loss": 0.2077, "step": 2403 }, { "epoch": 0.16444353238935633, "grad_norm": 1.9714365521596702, "learning_rate": 9.533617437627696e-06, "loss": 0.4246, "step": 2404 }, { "epoch": 0.16451193652096588, "grad_norm": 2.554665719968946, "learning_rate": 9.533150157755262e-06, "loss": 0.4292, "step": 2405 }, { "epoch": 0.1645803406525754, "grad_norm": 1.9242972639438982, "learning_rate": 9.532682655373927e-06, "loss": 0.312, "step": 2406 }, { "epoch": 0.16464874478418495, "grad_norm": 2.653985617873916, "learning_rate": 9.532214930506638e-06, "loss": 0.4983, "step": 2407 }, { "epoch": 0.1647171489157945, "grad_norm": 2.3184031712959547, "learning_rate": 9.531746983176356e-06, "loss": 0.3044, "step": 2408 }, { "epoch": 0.16478555304740405, "grad_norm": 2.6702601254116485, "learning_rate": 9.531278813406047e-06, "loss": 0.5747, "step": 2409 }, { "epoch": 0.1648539571790136, "grad_norm": 1.9572272059850013, "learning_rate": 9.530810421218693e-06, "loss": 0.2678, "step": 2410 }, { "epoch": 0.16492236131062316, "grad_norm": 1.8292286101137656, "learning_rate": 9.530341806637285e-06, "loss": 0.3516, "step": 2411 }, { "epoch": 0.1649907654422327, "grad_norm": 2.448620110075896, "learning_rate": 9.529872969684824e-06, "loss": 0.4529, "step": 2412 }, { "epoch": 0.16505916957384226, "grad_norm": 2.1172673326481695, "learning_rate": 9.529403910384322e-06, "loss": 0.5051, "step": 2413 }, { "epoch": 0.1651275737054518, "grad_norm": 2.7681567139831817, "learning_rate": 9.528934628758805e-06, "loss": 0.7439, "step": 2414 }, { "epoch": 0.16519597783706136, "grad_norm": 2.0983909016631186, "learning_rate": 9.528465124831307e-06, "loss": 0.423, "step": 2415 }, { "epoch": 0.1652643819686709, "grad_norm": 2.78371038800111, "learning_rate": 9.527995398624871e-06, "loss": 0.4633, "step": 2416 }, { "epoch": 0.16533278610028046, "grad_norm": 1.7842738309930775, "learning_rate": 9.527525450162558e-06, "loss": 0.3067, "step": 2417 }, { "epoch": 0.16540119023189, "grad_norm": 2.507885111889496, "learning_rate": 9.527055279467432e-06, "loss": 0.6526, "step": 2418 }, { "epoch": 0.16546959436349956, "grad_norm": 2.3842315450687876, "learning_rate": 9.526584886562571e-06, "loss": 0.6934, "step": 2419 }, { "epoch": 0.1655379984951091, "grad_norm": 2.352250743029446, "learning_rate": 9.526114271471065e-06, "loss": 0.5087, "step": 2420 }, { "epoch": 0.16560640262671866, "grad_norm": 1.5317565070525072, "learning_rate": 9.525643434216014e-06, "loss": 0.2894, "step": 2421 }, { "epoch": 0.1656748067583282, "grad_norm": 2.048323802862455, "learning_rate": 9.52517237482053e-06, "loss": 0.4677, "step": 2422 }, { "epoch": 0.16574321088993776, "grad_norm": 2.645247025870921, "learning_rate": 9.524701093307735e-06, "loss": 0.5617, "step": 2423 }, { "epoch": 0.16581161502154732, "grad_norm": 1.8705949058622824, "learning_rate": 9.524229589700759e-06, "loss": 0.4023, "step": 2424 }, { "epoch": 0.16588001915315684, "grad_norm": 1.8061114593028378, "learning_rate": 9.523757864022747e-06, "loss": 0.3524, "step": 2425 }, { "epoch": 0.1659484232847664, "grad_norm": 2.4195313219335866, "learning_rate": 9.523285916296856e-06, "loss": 0.565, "step": 2426 }, { "epoch": 0.16601682741637594, "grad_norm": 2.623286306626401, "learning_rate": 9.52281374654625e-06, "loss": 0.668, "step": 2427 }, { "epoch": 0.1660852315479855, "grad_norm": 1.6504301529162948, "learning_rate": 9.522341354794101e-06, "loss": 0.3207, "step": 2428 }, { "epoch": 0.16615363567959504, "grad_norm": 1.979509915318122, "learning_rate": 9.521868741063603e-06, "loss": 0.4662, "step": 2429 }, { "epoch": 0.1662220398112046, "grad_norm": 2.0952999299516484, "learning_rate": 9.52139590537795e-06, "loss": 0.2751, "step": 2430 }, { "epoch": 0.16629044394281414, "grad_norm": 1.8626841602069584, "learning_rate": 9.520922847760355e-06, "loss": 0.2926, "step": 2431 }, { "epoch": 0.1663588480744237, "grad_norm": 2.028866084233344, "learning_rate": 9.520449568234033e-06, "loss": 0.3993, "step": 2432 }, { "epoch": 0.16642725220603324, "grad_norm": 2.174548616651746, "learning_rate": 9.519976066822217e-06, "loss": 0.4892, "step": 2433 }, { "epoch": 0.1664956563376428, "grad_norm": 1.942945081553618, "learning_rate": 9.51950234354815e-06, "loss": 0.3966, "step": 2434 }, { "epoch": 0.16656406046925235, "grad_norm": 2.4723930881748317, "learning_rate": 9.519028398435082e-06, "loss": 0.6617, "step": 2435 }, { "epoch": 0.1666324646008619, "grad_norm": 2.2617287248513374, "learning_rate": 9.518554231506281e-06, "loss": 0.4346, "step": 2436 }, { "epoch": 0.16670086873247145, "grad_norm": 2.586927198619331, "learning_rate": 9.518079842785015e-06, "loss": 0.3499, "step": 2437 }, { "epoch": 0.166769272864081, "grad_norm": 2.2884666852224305, "learning_rate": 9.517605232294575e-06, "loss": 0.4898, "step": 2438 }, { "epoch": 0.16683767699569055, "grad_norm": 2.100169294350989, "learning_rate": 9.517130400058255e-06, "loss": 0.2122, "step": 2439 }, { "epoch": 0.1669060811273001, "grad_norm": 2.089786026059408, "learning_rate": 9.51665534609936e-06, "loss": 0.3381, "step": 2440 }, { "epoch": 0.16697448525890965, "grad_norm": 2.279248143110725, "learning_rate": 9.516180070441212e-06, "loss": 0.2685, "step": 2441 }, { "epoch": 0.1670428893905192, "grad_norm": 1.704540805990932, "learning_rate": 9.515704573107136e-06, "loss": 0.3429, "step": 2442 }, { "epoch": 0.16711129352212872, "grad_norm": 2.440878730240296, "learning_rate": 9.515228854120475e-06, "loss": 0.492, "step": 2443 }, { "epoch": 0.16717969765373827, "grad_norm": 2.3252137594760973, "learning_rate": 9.514752913504577e-06, "loss": 0.5762, "step": 2444 }, { "epoch": 0.16724810178534782, "grad_norm": 2.321391095786421, "learning_rate": 9.514276751282805e-06, "loss": 0.4501, "step": 2445 }, { "epoch": 0.16731650591695738, "grad_norm": 1.8259268366789392, "learning_rate": 9.51380036747853e-06, "loss": 0.2229, "step": 2446 }, { "epoch": 0.16738491004856693, "grad_norm": 1.2390630898227708, "learning_rate": 9.513323762115137e-06, "loss": 0.1472, "step": 2447 }, { "epoch": 0.16745331418017648, "grad_norm": 1.737464515749075, "learning_rate": 9.512846935216018e-06, "loss": 0.3724, "step": 2448 }, { "epoch": 0.16752171831178603, "grad_norm": 1.9241262718029943, "learning_rate": 9.51236988680458e-06, "loss": 0.3734, "step": 2449 }, { "epoch": 0.16759012244339558, "grad_norm": 2.1141270797519427, "learning_rate": 9.511892616904239e-06, "loss": 0.4086, "step": 2450 }, { "epoch": 0.16765852657500513, "grad_norm": 3.185636480534137, "learning_rate": 9.511415125538419e-06, "loss": 0.6929, "step": 2451 }, { "epoch": 0.16772693070661468, "grad_norm": 2.562562912657153, "learning_rate": 9.510937412730558e-06, "loss": 0.6494, "step": 2452 }, { "epoch": 0.16779533483822423, "grad_norm": 2.1569030436237755, "learning_rate": 9.510459478504109e-06, "loss": 0.4648, "step": 2453 }, { "epoch": 0.16786373896983378, "grad_norm": 1.6005115596487263, "learning_rate": 9.509981322882527e-06, "loss": 0.2518, "step": 2454 }, { "epoch": 0.16793214310144333, "grad_norm": 2.156373341805898, "learning_rate": 9.509502945889281e-06, "loss": 0.4645, "step": 2455 }, { "epoch": 0.16800054723305288, "grad_norm": 1.7457573436478535, "learning_rate": 9.509024347547858e-06, "loss": 0.2637, "step": 2456 }, { "epoch": 0.16806895136466243, "grad_norm": 1.8840040812923988, "learning_rate": 9.508545527881743e-06, "loss": 0.2743, "step": 2457 }, { "epoch": 0.16813735549627198, "grad_norm": 1.9225229899925522, "learning_rate": 9.508066486914442e-06, "loss": 0.2541, "step": 2458 }, { "epoch": 0.16820575962788153, "grad_norm": 1.5725739928941773, "learning_rate": 9.50758722466947e-06, "loss": 0.1702, "step": 2459 }, { "epoch": 0.16827416375949109, "grad_norm": 1.8678642606687528, "learning_rate": 9.50710774117035e-06, "loss": 0.2017, "step": 2460 }, { "epoch": 0.16834256789110064, "grad_norm": 1.7200772839263594, "learning_rate": 9.506628036440617e-06, "loss": 0.3316, "step": 2461 }, { "epoch": 0.16841097202271016, "grad_norm": 1.4286687666118185, "learning_rate": 9.506148110503818e-06, "loss": 0.216, "step": 2462 }, { "epoch": 0.1684793761543197, "grad_norm": 2.004203227830548, "learning_rate": 9.50566796338351e-06, "loss": 0.3756, "step": 2463 }, { "epoch": 0.16854778028592926, "grad_norm": 1.868077764197891, "learning_rate": 9.505187595103261e-06, "loss": 0.2952, "step": 2464 }, { "epoch": 0.1686161844175388, "grad_norm": 2.9638801788095632, "learning_rate": 9.504707005686647e-06, "loss": 0.7451, "step": 2465 }, { "epoch": 0.16868458854914836, "grad_norm": 1.8937647478234412, "learning_rate": 9.504226195157263e-06, "loss": 0.2771, "step": 2466 }, { "epoch": 0.1687529926807579, "grad_norm": 1.9917159401279834, "learning_rate": 9.503745163538704e-06, "loss": 0.5777, "step": 2467 }, { "epoch": 0.16882139681236746, "grad_norm": 2.3601000600147346, "learning_rate": 9.503263910854586e-06, "loss": 0.3709, "step": 2468 }, { "epoch": 0.16888980094397701, "grad_norm": 2.1623389510729885, "learning_rate": 9.50278243712853e-06, "loss": 0.5019, "step": 2469 }, { "epoch": 0.16895820507558656, "grad_norm": 1.800474393107946, "learning_rate": 9.502300742384167e-06, "loss": 0.3915, "step": 2470 }, { "epoch": 0.16902660920719612, "grad_norm": 6.113233591692226, "learning_rate": 9.501818826645142e-06, "loss": 0.6142, "step": 2471 }, { "epoch": 0.16909501333880567, "grad_norm": 2.3455535085005104, "learning_rate": 9.50133668993511e-06, "loss": 0.4625, "step": 2472 }, { "epoch": 0.16916341747041522, "grad_norm": 1.769304050402877, "learning_rate": 9.500854332277737e-06, "loss": 0.3327, "step": 2473 }, { "epoch": 0.16923182160202477, "grad_norm": 2.2128350889773913, "learning_rate": 9.500371753696698e-06, "loss": 0.3145, "step": 2474 }, { "epoch": 0.16930022573363432, "grad_norm": 1.2268235481577463, "learning_rate": 9.499888954215682e-06, "loss": 0.1788, "step": 2475 }, { "epoch": 0.16936862986524387, "grad_norm": 2.1129580760083377, "learning_rate": 9.499405933858386e-06, "loss": 0.4716, "step": 2476 }, { "epoch": 0.16943703399685342, "grad_norm": 2.2944479075470605, "learning_rate": 9.49892269264852e-06, "loss": 0.4414, "step": 2477 }, { "epoch": 0.16950543812846297, "grad_norm": 1.5526277301561016, "learning_rate": 9.498439230609804e-06, "loss": 0.1982, "step": 2478 }, { "epoch": 0.16957384226007252, "grad_norm": 2.3117744235405633, "learning_rate": 9.497955547765966e-06, "loss": 0.4474, "step": 2479 }, { "epoch": 0.16964224639168204, "grad_norm": 2.2075545745477947, "learning_rate": 9.49747164414075e-06, "loss": 0.5726, "step": 2480 }, { "epoch": 0.1697106505232916, "grad_norm": 1.9551611872205006, "learning_rate": 9.496987519757907e-06, "loss": 0.3344, "step": 2481 }, { "epoch": 0.16977905465490115, "grad_norm": 2.639064734160413, "learning_rate": 9.496503174641202e-06, "loss": 0.725, "step": 2482 }, { "epoch": 0.1698474587865107, "grad_norm": 2.7052682527455665, "learning_rate": 9.496018608814407e-06, "loss": 0.9301, "step": 2483 }, { "epoch": 0.16991586291812025, "grad_norm": 1.7680609617910599, "learning_rate": 9.495533822301307e-06, "loss": 0.3508, "step": 2484 }, { "epoch": 0.1699842670497298, "grad_norm": 1.4813247460607966, "learning_rate": 9.4950488151257e-06, "loss": 0.1903, "step": 2485 }, { "epoch": 0.17005267118133935, "grad_norm": 1.5887003003839881, "learning_rate": 9.49456358731139e-06, "loss": 0.1971, "step": 2486 }, { "epoch": 0.1701210753129489, "grad_norm": 2.2403566149192593, "learning_rate": 9.494078138882195e-06, "loss": 0.407, "step": 2487 }, { "epoch": 0.17018947944455845, "grad_norm": 2.11986157507441, "learning_rate": 9.493592469861944e-06, "loss": 0.6203, "step": 2488 }, { "epoch": 0.170257883576168, "grad_norm": 2.626024772794484, "learning_rate": 9.493106580274473e-06, "loss": 0.8577, "step": 2489 }, { "epoch": 0.17032628770777755, "grad_norm": 1.9747096299435958, "learning_rate": 9.492620470143636e-06, "loss": 0.4243, "step": 2490 }, { "epoch": 0.1703946918393871, "grad_norm": 1.9862412515283148, "learning_rate": 9.492134139493292e-06, "loss": 0.511, "step": 2491 }, { "epoch": 0.17046309597099665, "grad_norm": 1.2141350767419874, "learning_rate": 9.49164758834731e-06, "loss": 0.21, "step": 2492 }, { "epoch": 0.1705315001026062, "grad_norm": 2.7831326659803954, "learning_rate": 9.491160816729577e-06, "loss": 0.5177, "step": 2493 }, { "epoch": 0.17059990423421575, "grad_norm": 3.7160778542894692, "learning_rate": 9.490673824663983e-06, "loss": 0.3173, "step": 2494 }, { "epoch": 0.1706683083658253, "grad_norm": 2.337453235576621, "learning_rate": 9.49018661217443e-06, "loss": 0.6656, "step": 2495 }, { "epoch": 0.17073671249743486, "grad_norm": 1.897741716139916, "learning_rate": 9.489699179284837e-06, "loss": 0.1442, "step": 2496 }, { "epoch": 0.1708051166290444, "grad_norm": 1.8310427550880732, "learning_rate": 9.489211526019128e-06, "loss": 0.2239, "step": 2497 }, { "epoch": 0.17087352076065393, "grad_norm": 2.1492218111849097, "learning_rate": 9.48872365240124e-06, "loss": 0.4285, "step": 2498 }, { "epoch": 0.17094192489226348, "grad_norm": 2.412661445236072, "learning_rate": 9.488235558455118e-06, "loss": 0.5381, "step": 2499 }, { "epoch": 0.17101032902387303, "grad_norm": 1.8516450453115956, "learning_rate": 9.48774724420472e-06, "loss": 0.3513, "step": 2500 }, { "epoch": 0.17107873315548258, "grad_norm": 2.9030219064891885, "learning_rate": 9.487258709674019e-06, "loss": 0.6403, "step": 2501 }, { "epoch": 0.17114713728709213, "grad_norm": 2.4539933398998914, "learning_rate": 9.48676995488699e-06, "loss": 0.684, "step": 2502 }, { "epoch": 0.17121554141870168, "grad_norm": 1.6820211124070374, "learning_rate": 9.486280979867627e-06, "loss": 0.3042, "step": 2503 }, { "epoch": 0.17128394555031123, "grad_norm": 1.7195106687098334, "learning_rate": 9.485791784639926e-06, "loss": 0.2151, "step": 2504 }, { "epoch": 0.17135234968192078, "grad_norm": 1.8171936830463, "learning_rate": 9.485302369227905e-06, "loss": 0.3796, "step": 2505 }, { "epoch": 0.17142075381353034, "grad_norm": 2.3627070399293912, "learning_rate": 9.484812733655584e-06, "loss": 0.6758, "step": 2506 }, { "epoch": 0.1714891579451399, "grad_norm": 2.1694361145568117, "learning_rate": 9.484322877946997e-06, "loss": 0.5503, "step": 2507 }, { "epoch": 0.17155756207674944, "grad_norm": 1.626729457542612, "learning_rate": 9.48383280212619e-06, "loss": 0.2417, "step": 2508 }, { "epoch": 0.171625966208359, "grad_norm": 1.5907113053715851, "learning_rate": 9.483342506217214e-06, "loss": 0.2115, "step": 2509 }, { "epoch": 0.17169437033996854, "grad_norm": 1.9803001061139414, "learning_rate": 9.48285199024414e-06, "loss": 0.2641, "step": 2510 }, { "epoch": 0.1717627744715781, "grad_norm": 4.656182512651914, "learning_rate": 9.482361254231043e-06, "loss": 0.1662, "step": 2511 }, { "epoch": 0.17183117860318764, "grad_norm": 1.756038402048669, "learning_rate": 9.481870298202009e-06, "loss": 0.3139, "step": 2512 }, { "epoch": 0.1718995827347972, "grad_norm": 1.9517427665094622, "learning_rate": 9.48137912218114e-06, "loss": 0.391, "step": 2513 }, { "epoch": 0.17196798686640674, "grad_norm": 1.7231416429304658, "learning_rate": 9.480887726192542e-06, "loss": 0.4141, "step": 2514 }, { "epoch": 0.1720363909980163, "grad_norm": 2.3139312799663427, "learning_rate": 9.480396110260338e-06, "loss": 0.5129, "step": 2515 }, { "epoch": 0.17210479512962584, "grad_norm": 2.050116302226635, "learning_rate": 9.479904274408656e-06, "loss": 0.4525, "step": 2516 }, { "epoch": 0.17217319926123537, "grad_norm": 1.7706512207933613, "learning_rate": 9.479412218661637e-06, "loss": 0.3481, "step": 2517 }, { "epoch": 0.17224160339284492, "grad_norm": 2.1991801859860622, "learning_rate": 9.47891994304344e-06, "loss": 0.4405, "step": 2518 }, { "epoch": 0.17231000752445447, "grad_norm": 2.7440173572805966, "learning_rate": 9.47842744757822e-06, "loss": 0.5497, "step": 2519 }, { "epoch": 0.17237841165606402, "grad_norm": 2.1485403521800794, "learning_rate": 9.477934732290155e-06, "loss": 0.3394, "step": 2520 }, { "epoch": 0.17244681578767357, "grad_norm": 2.101562698534421, "learning_rate": 9.477441797203431e-06, "loss": 0.4417, "step": 2521 }, { "epoch": 0.17251521991928312, "grad_norm": 2.403959782115622, "learning_rate": 9.47694864234224e-06, "loss": 0.379, "step": 2522 }, { "epoch": 0.17258362405089267, "grad_norm": 2.036461764688032, "learning_rate": 9.476455267730792e-06, "loss": 0.4978, "step": 2523 }, { "epoch": 0.17265202818250222, "grad_norm": 1.2456307163767706, "learning_rate": 9.475961673393306e-06, "loss": 0.2375, "step": 2524 }, { "epoch": 0.17272043231411177, "grad_norm": 2.2720637546084936, "learning_rate": 9.475467859354004e-06, "loss": 0.5643, "step": 2525 }, { "epoch": 0.17278883644572132, "grad_norm": 2.0333509876964824, "learning_rate": 9.474973825637126e-06, "loss": 0.3608, "step": 2526 }, { "epoch": 0.17285724057733087, "grad_norm": 2.5291592947167976, "learning_rate": 9.474479572266926e-06, "loss": 0.5658, "step": 2527 }, { "epoch": 0.17292564470894042, "grad_norm": 2.0216360578895585, "learning_rate": 9.47398509926766e-06, "loss": 0.5599, "step": 2528 }, { "epoch": 0.17299404884054997, "grad_norm": 2.0694414753665007, "learning_rate": 9.4734904066636e-06, "loss": 0.3405, "step": 2529 }, { "epoch": 0.17306245297215953, "grad_norm": 2.4419839770543574, "learning_rate": 9.47299549447903e-06, "loss": 0.7923, "step": 2530 }, { "epoch": 0.17313085710376908, "grad_norm": 1.311383533187853, "learning_rate": 9.472500362738242e-06, "loss": 0.1904, "step": 2531 }, { "epoch": 0.17319926123537863, "grad_norm": 2.0175927337974326, "learning_rate": 9.47200501146554e-06, "loss": 0.4764, "step": 2532 }, { "epoch": 0.17326766536698818, "grad_norm": 1.6293927479323522, "learning_rate": 9.471509440685234e-06, "loss": 0.2858, "step": 2533 }, { "epoch": 0.17333606949859773, "grad_norm": 2.076528409128379, "learning_rate": 9.471013650421652e-06, "loss": 0.5763, "step": 2534 }, { "epoch": 0.17340447363020725, "grad_norm": 2.72865251376611, "learning_rate": 9.47051764069913e-06, "loss": 0.6735, "step": 2535 }, { "epoch": 0.1734728777618168, "grad_norm": 2.2206856518060514, "learning_rate": 9.470021411542014e-06, "loss": 0.5233, "step": 2536 }, { "epoch": 0.17354128189342635, "grad_norm": 1.771260080154046, "learning_rate": 9.469524962974663e-06, "loss": 0.3432, "step": 2537 }, { "epoch": 0.1736096860250359, "grad_norm": 2.0810280578427487, "learning_rate": 9.469028295021444e-06, "loss": 0.4928, "step": 2538 }, { "epoch": 0.17367809015664545, "grad_norm": 1.5458783541812877, "learning_rate": 9.468531407706733e-06, "loss": 0.2241, "step": 2539 }, { "epoch": 0.173746494288255, "grad_norm": 5.578477266018291, "learning_rate": 9.468034301054925e-06, "loss": 0.352, "step": 2540 }, { "epoch": 0.17381489841986456, "grad_norm": 2.0124032679998245, "learning_rate": 9.467536975090416e-06, "loss": 0.3303, "step": 2541 }, { "epoch": 0.1738833025514741, "grad_norm": 2.3779971636024544, "learning_rate": 9.467039429837619e-06, "loss": 0.6691, "step": 2542 }, { "epoch": 0.17395170668308366, "grad_norm": 2.1238078040799104, "learning_rate": 9.466541665320957e-06, "loss": 0.5858, "step": 2543 }, { "epoch": 0.1740201108146932, "grad_norm": 2.572178012917071, "learning_rate": 9.46604368156486e-06, "loss": 0.536, "step": 2544 }, { "epoch": 0.17408851494630276, "grad_norm": 2.410890878410778, "learning_rate": 9.465545478593772e-06, "loss": 0.4276, "step": 2545 }, { "epoch": 0.1741569190779123, "grad_norm": 1.5260298069915126, "learning_rate": 9.465047056432149e-06, "loss": 0.3202, "step": 2546 }, { "epoch": 0.17422532320952186, "grad_norm": 3.434660814105427, "learning_rate": 9.464548415104456e-06, "loss": 0.6383, "step": 2547 }, { "epoch": 0.1742937273411314, "grad_norm": 1.5806876234181442, "learning_rate": 9.464049554635167e-06, "loss": 0.2701, "step": 2548 }, { "epoch": 0.17436213147274096, "grad_norm": 2.0448984693118644, "learning_rate": 9.463550475048769e-06, "loss": 0.4608, "step": 2549 }, { "epoch": 0.1744305356043505, "grad_norm": 1.788478056217681, "learning_rate": 9.463051176369758e-06, "loss": 0.2794, "step": 2550 }, { "epoch": 0.17449893973596006, "grad_norm": 2.2804640696400114, "learning_rate": 9.462551658622646e-06, "loss": 0.6566, "step": 2551 }, { "epoch": 0.1745673438675696, "grad_norm": 1.6524258902043174, "learning_rate": 9.462051921831946e-06, "loss": 0.2407, "step": 2552 }, { "epoch": 0.17463574799917916, "grad_norm": 2.849774536931241, "learning_rate": 9.461551966022193e-06, "loss": 0.617, "step": 2553 }, { "epoch": 0.1747041521307887, "grad_norm": 1.8472323506021047, "learning_rate": 9.461051791217924e-06, "loss": 0.3133, "step": 2554 }, { "epoch": 0.17477255626239824, "grad_norm": 2.116415354560203, "learning_rate": 9.460551397443691e-06, "loss": 0.4196, "step": 2555 }, { "epoch": 0.1748409603940078, "grad_norm": 2.0101947456051135, "learning_rate": 9.460050784724056e-06, "loss": 0.3398, "step": 2556 }, { "epoch": 0.17490936452561734, "grad_norm": 2.195814095750056, "learning_rate": 9.45954995308359e-06, "loss": 0.6937, "step": 2557 }, { "epoch": 0.1749777686572269, "grad_norm": 1.6345580577675844, "learning_rate": 9.459048902546878e-06, "loss": 0.1803, "step": 2558 }, { "epoch": 0.17504617278883644, "grad_norm": 1.9667792220019518, "learning_rate": 9.458547633138515e-06, "loss": 0.4558, "step": 2559 }, { "epoch": 0.175114576920446, "grad_norm": 1.9265152729739854, "learning_rate": 9.458046144883103e-06, "loss": 0.4637, "step": 2560 }, { "epoch": 0.17518298105205554, "grad_norm": 2.123560993233004, "learning_rate": 9.457544437805258e-06, "loss": 0.485, "step": 2561 }, { "epoch": 0.1752513851836651, "grad_norm": 1.5568392015489219, "learning_rate": 9.457042511929606e-06, "loss": 0.3531, "step": 2562 }, { "epoch": 0.17531978931527464, "grad_norm": 2.021740566365618, "learning_rate": 9.456540367280787e-06, "loss": 0.4594, "step": 2563 }, { "epoch": 0.1753881934468842, "grad_norm": 1.8913869898562714, "learning_rate": 9.456038003883446e-06, "loss": 0.4432, "step": 2564 }, { "epoch": 0.17545659757849374, "grad_norm": 2.0868751904263885, "learning_rate": 9.455535421762238e-06, "loss": 0.4439, "step": 2565 }, { "epoch": 0.1755250017101033, "grad_norm": 1.9231754958125342, "learning_rate": 9.45503262094184e-06, "loss": 0.4866, "step": 2566 }, { "epoch": 0.17559340584171285, "grad_norm": 2.422965112044937, "learning_rate": 9.454529601446927e-06, "loss": 0.3627, "step": 2567 }, { "epoch": 0.1756618099733224, "grad_norm": 1.7313277994846443, "learning_rate": 9.45402636330219e-06, "loss": 0.3639, "step": 2568 }, { "epoch": 0.17573021410493195, "grad_norm": 1.6029303175965757, "learning_rate": 9.453522906532331e-06, "loss": 0.1939, "step": 2569 }, { "epoch": 0.1757986182365415, "grad_norm": 1.7768655259365875, "learning_rate": 9.453019231162062e-06, "loss": 0.3485, "step": 2570 }, { "epoch": 0.17586702236815105, "grad_norm": 1.6441574124824858, "learning_rate": 9.452515337216104e-06, "loss": 0.3132, "step": 2571 }, { "epoch": 0.17593542649976057, "grad_norm": 1.9156452874856484, "learning_rate": 9.452011224719195e-06, "loss": 0.4747, "step": 2572 }, { "epoch": 0.17600383063137012, "grad_norm": 2.0033688819295805, "learning_rate": 9.451506893696074e-06, "loss": 0.371, "step": 2573 }, { "epoch": 0.17607223476297967, "grad_norm": 1.5430154720910203, "learning_rate": 9.451002344171501e-06, "loss": 0.2259, "step": 2574 }, { "epoch": 0.17614063889458922, "grad_norm": 2.241670708463587, "learning_rate": 9.450497576170238e-06, "loss": 0.5827, "step": 2575 }, { "epoch": 0.17620904302619878, "grad_norm": 2.1539672571729107, "learning_rate": 9.449992589717063e-06, "loss": 0.5303, "step": 2576 }, { "epoch": 0.17627744715780833, "grad_norm": 2.3772666931200086, "learning_rate": 9.449487384836763e-06, "loss": 0.4722, "step": 2577 }, { "epoch": 0.17634585128941788, "grad_norm": 2.031791039084187, "learning_rate": 9.448981961554136e-06, "loss": 0.4236, "step": 2578 }, { "epoch": 0.17641425542102743, "grad_norm": 2.011463363880865, "learning_rate": 9.448476319893989e-06, "loss": 0.4773, "step": 2579 }, { "epoch": 0.17648265955263698, "grad_norm": 2.1483123326687026, "learning_rate": 9.447970459881144e-06, "loss": 0.3381, "step": 2580 }, { "epoch": 0.17655106368424653, "grad_norm": 1.4090478641133144, "learning_rate": 9.447464381540428e-06, "loss": 0.1535, "step": 2581 }, { "epoch": 0.17661946781585608, "grad_norm": 2.0075682072138967, "learning_rate": 9.446958084896686e-06, "loss": 0.4754, "step": 2582 }, { "epoch": 0.17668787194746563, "grad_norm": 2.485298494922841, "learning_rate": 9.446451569974766e-06, "loss": 0.7572, "step": 2583 }, { "epoch": 0.17675627607907518, "grad_norm": 1.4665888144771826, "learning_rate": 9.445944836799531e-06, "loss": 0.3349, "step": 2584 }, { "epoch": 0.17682468021068473, "grad_norm": 2.295769341010253, "learning_rate": 9.445437885395854e-06, "loss": 0.6102, "step": 2585 }, { "epoch": 0.17689308434229428, "grad_norm": 2.185206839364895, "learning_rate": 9.444930715788621e-06, "loss": 0.6371, "step": 2586 }, { "epoch": 0.17696148847390383, "grad_norm": 1.943700888832051, "learning_rate": 9.444423328002721e-06, "loss": 0.3997, "step": 2587 }, { "epoch": 0.17702989260551338, "grad_norm": 2.0606712267627856, "learning_rate": 9.443915722063064e-06, "loss": 0.4479, "step": 2588 }, { "epoch": 0.17709829673712293, "grad_norm": 2.310609876300989, "learning_rate": 9.443407897994563e-06, "loss": 0.4744, "step": 2589 }, { "epoch": 0.17716670086873249, "grad_norm": 1.7001491454910054, "learning_rate": 9.442899855822147e-06, "loss": 0.3254, "step": 2590 }, { "epoch": 0.177235105000342, "grad_norm": 1.9201782226174484, "learning_rate": 9.44239159557075e-06, "loss": 0.3939, "step": 2591 }, { "epoch": 0.17730350913195156, "grad_norm": 1.9412071502031434, "learning_rate": 9.441883117265322e-06, "loss": 0.2838, "step": 2592 }, { "epoch": 0.1773719132635611, "grad_norm": 2.2037399563260274, "learning_rate": 9.44137442093082e-06, "loss": 0.5071, "step": 2593 }, { "epoch": 0.17744031739517066, "grad_norm": 1.6369523994627344, "learning_rate": 9.440865506592215e-06, "loss": 0.2819, "step": 2594 }, { "epoch": 0.1775087215267802, "grad_norm": 2.1953781566100874, "learning_rate": 9.440356374274486e-06, "loss": 0.6081, "step": 2595 }, { "epoch": 0.17757712565838976, "grad_norm": 2.1789298193399866, "learning_rate": 9.439847024002626e-06, "loss": 0.6323, "step": 2596 }, { "epoch": 0.1776455297899993, "grad_norm": 1.767799561682307, "learning_rate": 9.439337455801632e-06, "loss": 0.3833, "step": 2597 }, { "epoch": 0.17771393392160886, "grad_norm": 2.4419850571208004, "learning_rate": 9.438827669696517e-06, "loss": 0.6384, "step": 2598 }, { "epoch": 0.1777823380532184, "grad_norm": 1.5570894262046449, "learning_rate": 9.438317665712308e-06, "loss": 0.2473, "step": 2599 }, { "epoch": 0.17785074218482796, "grad_norm": 1.4885847403000492, "learning_rate": 9.437807443874033e-06, "loss": 0.2309, "step": 2600 }, { "epoch": 0.17791914631643752, "grad_norm": 1.7817927713129889, "learning_rate": 9.437297004206742e-06, "loss": 0.2343, "step": 2601 }, { "epoch": 0.17798755044804707, "grad_norm": 2.1425716848633054, "learning_rate": 9.436786346735484e-06, "loss": 0.4874, "step": 2602 }, { "epoch": 0.17805595457965662, "grad_norm": 1.7912391098846745, "learning_rate": 9.436275471485328e-06, "loss": 0.4063, "step": 2603 }, { "epoch": 0.17812435871126617, "grad_norm": 10.94428324029346, "learning_rate": 9.43576437848135e-06, "loss": 0.4638, "step": 2604 }, { "epoch": 0.17819276284287572, "grad_norm": 2.1572156485354435, "learning_rate": 9.435253067748633e-06, "loss": 0.3916, "step": 2605 }, { "epoch": 0.17826116697448527, "grad_norm": 3.3512928816468217, "learning_rate": 9.43474153931228e-06, "loss": 0.4674, "step": 2606 }, { "epoch": 0.17832957110609482, "grad_norm": 1.6393456738997951, "learning_rate": 9.434229793197398e-06, "loss": 0.2465, "step": 2607 }, { "epoch": 0.17839797523770437, "grad_norm": 2.251180974653004, "learning_rate": 9.433717829429103e-06, "loss": 0.6196, "step": 2608 }, { "epoch": 0.1784663793693139, "grad_norm": 2.113891062301652, "learning_rate": 9.433205648032528e-06, "loss": 0.5546, "step": 2609 }, { "epoch": 0.17853478350092344, "grad_norm": 2.092413076899209, "learning_rate": 9.432693249032812e-06, "loss": 0.4691, "step": 2610 }, { "epoch": 0.178603187632533, "grad_norm": 1.8892619918229636, "learning_rate": 9.432180632455107e-06, "loss": 0.4336, "step": 2611 }, { "epoch": 0.17867159176414255, "grad_norm": 2.0309699085500705, "learning_rate": 9.431667798324573e-06, "loss": 0.2592, "step": 2612 }, { "epoch": 0.1787399958957521, "grad_norm": 2.26325624548913, "learning_rate": 9.431154746666382e-06, "loss": 0.3349, "step": 2613 }, { "epoch": 0.17880840002736165, "grad_norm": 2.040997146449788, "learning_rate": 9.43064147750572e-06, "loss": 0.38, "step": 2614 }, { "epoch": 0.1788768041589712, "grad_norm": 1.73732105481145, "learning_rate": 9.430127990867778e-06, "loss": 0.3035, "step": 2615 }, { "epoch": 0.17894520829058075, "grad_norm": 2.2242434253402514, "learning_rate": 9.429614286777762e-06, "loss": 0.5901, "step": 2616 }, { "epoch": 0.1790136124221903, "grad_norm": 2.0351854640745617, "learning_rate": 9.429100365260886e-06, "loss": 0.4907, "step": 2617 }, { "epoch": 0.17908201655379985, "grad_norm": 2.189723048386382, "learning_rate": 9.428586226342375e-06, "loss": 0.4398, "step": 2618 }, { "epoch": 0.1791504206854094, "grad_norm": 2.564253602470076, "learning_rate": 9.428071870047469e-06, "loss": 0.3835, "step": 2619 }, { "epoch": 0.17921882481701895, "grad_norm": 1.7639700594216021, "learning_rate": 9.427557296401411e-06, "loss": 0.3607, "step": 2620 }, { "epoch": 0.1792872289486285, "grad_norm": 1.9157029342977967, "learning_rate": 9.42704250542946e-06, "loss": 0.3761, "step": 2621 }, { "epoch": 0.17935563308023805, "grad_norm": 1.8553002692576512, "learning_rate": 9.426527497156885e-06, "loss": 0.5257, "step": 2622 }, { "epoch": 0.1794240372118476, "grad_norm": 1.684420641455901, "learning_rate": 9.426012271608967e-06, "loss": 0.3335, "step": 2623 }, { "epoch": 0.17949244134345715, "grad_norm": 2.422596128837807, "learning_rate": 9.425496828810992e-06, "loss": 0.5597, "step": 2624 }, { "epoch": 0.1795608454750667, "grad_norm": 2.300897345989752, "learning_rate": 9.42498116878826e-06, "loss": 0.628, "step": 2625 }, { "epoch": 0.17962924960667626, "grad_norm": 2.4071871307424266, "learning_rate": 9.424465291566088e-06, "loss": 0.3359, "step": 2626 }, { "epoch": 0.17969765373828578, "grad_norm": 2.121221900286262, "learning_rate": 9.423949197169792e-06, "loss": 0.2124, "step": 2627 }, { "epoch": 0.17976605786989533, "grad_norm": 2.0502487617554523, "learning_rate": 9.423432885624708e-06, "loss": 0.531, "step": 2628 }, { "epoch": 0.17983446200150488, "grad_norm": 2.5493263079310857, "learning_rate": 9.422916356956175e-06, "loss": 0.6223, "step": 2629 }, { "epoch": 0.17990286613311443, "grad_norm": 2.9309238520803853, "learning_rate": 9.422399611189553e-06, "loss": 0.3322, "step": 2630 }, { "epoch": 0.17997127026472398, "grad_norm": 1.8512772046575234, "learning_rate": 9.421882648350198e-06, "loss": 0.2724, "step": 2631 }, { "epoch": 0.18003967439633353, "grad_norm": 1.9057323386106932, "learning_rate": 9.421365468463493e-06, "loss": 0.2639, "step": 2632 }, { "epoch": 0.18010807852794308, "grad_norm": 1.7897014018498498, "learning_rate": 9.420848071554819e-06, "loss": 0.4349, "step": 2633 }, { "epoch": 0.18017648265955263, "grad_norm": 2.0320930052117685, "learning_rate": 9.420330457649573e-06, "loss": 0.2799, "step": 2634 }, { "epoch": 0.18024488679116218, "grad_norm": 2.0930213371980853, "learning_rate": 9.419812626773163e-06, "loss": 0.2096, "step": 2635 }, { "epoch": 0.18031329092277174, "grad_norm": 2.6455476010844867, "learning_rate": 9.419294578951006e-06, "loss": 0.5554, "step": 2636 }, { "epoch": 0.18038169505438129, "grad_norm": 1.7047049713812876, "learning_rate": 9.418776314208532e-06, "loss": 0.2687, "step": 2637 }, { "epoch": 0.18045009918599084, "grad_norm": 2.2133174730346883, "learning_rate": 9.418257832571176e-06, "loss": 0.5936, "step": 2638 }, { "epoch": 0.1805185033176004, "grad_norm": 2.0849705215698373, "learning_rate": 9.417739134064392e-06, "loss": 0.5457, "step": 2639 }, { "epoch": 0.18058690744920994, "grad_norm": 1.8464780704494765, "learning_rate": 9.417220218713636e-06, "loss": 0.2589, "step": 2640 }, { "epoch": 0.1806553115808195, "grad_norm": 2.2713201173153017, "learning_rate": 9.416701086544384e-06, "loss": 0.3117, "step": 2641 }, { "epoch": 0.18072371571242904, "grad_norm": 2.3993777441956547, "learning_rate": 9.416181737582112e-06, "loss": 0.4758, "step": 2642 }, { "epoch": 0.1807921198440386, "grad_norm": 2.16697762318101, "learning_rate": 9.415662171852317e-06, "loss": 0.5773, "step": 2643 }, { "epoch": 0.18086052397564814, "grad_norm": 1.7404126783448868, "learning_rate": 9.415142389380498e-06, "loss": 0.2783, "step": 2644 }, { "epoch": 0.1809289281072577, "grad_norm": 1.4949898444424041, "learning_rate": 9.414622390192171e-06, "loss": 0.3453, "step": 2645 }, { "epoch": 0.18099733223886721, "grad_norm": 1.4438995072075584, "learning_rate": 9.414102174312859e-06, "loss": 0.2372, "step": 2646 }, { "epoch": 0.18106573637047677, "grad_norm": 1.9267143789849224, "learning_rate": 9.413581741768096e-06, "loss": 0.2429, "step": 2647 }, { "epoch": 0.18113414050208632, "grad_norm": 2.7904531073520293, "learning_rate": 9.41306109258343e-06, "loss": 0.4634, "step": 2648 }, { "epoch": 0.18120254463369587, "grad_norm": 1.8370273926254699, "learning_rate": 9.412540226784413e-06, "loss": 0.3856, "step": 2649 }, { "epoch": 0.18127094876530542, "grad_norm": 1.9189446252719726, "learning_rate": 9.412019144396616e-06, "loss": 0.3916, "step": 2650 }, { "epoch": 0.18133935289691497, "grad_norm": 2.43709047251783, "learning_rate": 9.411497845445612e-06, "loss": 0.5569, "step": 2651 }, { "epoch": 0.18140775702852452, "grad_norm": 2.0347969338375016, "learning_rate": 9.410976329956992e-06, "loss": 0.1683, "step": 2652 }, { "epoch": 0.18147616116013407, "grad_norm": 2.1097724045699464, "learning_rate": 9.410454597956351e-06, "loss": 0.6015, "step": 2653 }, { "epoch": 0.18154456529174362, "grad_norm": 1.96228984539729, "learning_rate": 9.409932649469302e-06, "loss": 0.5205, "step": 2654 }, { "epoch": 0.18161296942335317, "grad_norm": 1.602145623838583, "learning_rate": 9.409410484521464e-06, "loss": 0.1611, "step": 2655 }, { "epoch": 0.18168137355496272, "grad_norm": 1.9375732930994778, "learning_rate": 9.408888103138467e-06, "loss": 0.3227, "step": 2656 }, { "epoch": 0.18174977768657227, "grad_norm": 9.223901686376882, "learning_rate": 9.40836550534595e-06, "loss": 0.2303, "step": 2657 }, { "epoch": 0.18181818181818182, "grad_norm": 2.0486226265165177, "learning_rate": 9.407842691169566e-06, "loss": 0.3358, "step": 2658 }, { "epoch": 0.18188658594979137, "grad_norm": 1.686691028460811, "learning_rate": 9.40731966063498e-06, "loss": 0.3672, "step": 2659 }, { "epoch": 0.18195499008140092, "grad_norm": 2.2619415463358177, "learning_rate": 9.40679641376786e-06, "loss": 0.3529, "step": 2660 }, { "epoch": 0.18202339421301048, "grad_norm": 1.5860100762433806, "learning_rate": 9.406272950593892e-06, "loss": 0.2703, "step": 2661 }, { "epoch": 0.18209179834462003, "grad_norm": 2.134360160608494, "learning_rate": 9.405749271138771e-06, "loss": 0.4408, "step": 2662 }, { "epoch": 0.18216020247622958, "grad_norm": 1.8227092656724593, "learning_rate": 9.4052253754282e-06, "loss": 0.4037, "step": 2663 }, { "epoch": 0.1822286066078391, "grad_norm": 1.7539424085394437, "learning_rate": 9.404701263487893e-06, "loss": 0.2555, "step": 2664 }, { "epoch": 0.18229701073944865, "grad_norm": 2.021698613262951, "learning_rate": 9.40417693534358e-06, "loss": 0.5047, "step": 2665 }, { "epoch": 0.1823654148710582, "grad_norm": 2.060528716237837, "learning_rate": 9.403652391020996e-06, "loss": 0.4486, "step": 2666 }, { "epoch": 0.18243381900266775, "grad_norm": 2.025721821794134, "learning_rate": 9.403127630545887e-06, "loss": 0.5139, "step": 2667 }, { "epoch": 0.1825022231342773, "grad_norm": 2.968831071248616, "learning_rate": 9.402602653944012e-06, "loss": 0.319, "step": 2668 }, { "epoch": 0.18257062726588685, "grad_norm": 2.0459292571083987, "learning_rate": 9.402077461241139e-06, "loss": 0.423, "step": 2669 }, { "epoch": 0.1826390313974964, "grad_norm": 1.328461315544582, "learning_rate": 9.401552052463046e-06, "loss": 0.1062, "step": 2670 }, { "epoch": 0.18270743552910595, "grad_norm": 2.3125621684197806, "learning_rate": 9.401026427635522e-06, "loss": 0.4412, "step": 2671 }, { "epoch": 0.1827758396607155, "grad_norm": 1.6802377730941944, "learning_rate": 9.40050058678437e-06, "loss": 0.3354, "step": 2672 }, { "epoch": 0.18284424379232506, "grad_norm": 2.218047514373858, "learning_rate": 9.3999745299354e-06, "loss": 0.5354, "step": 2673 }, { "epoch": 0.1829126479239346, "grad_norm": 2.0886919501848586, "learning_rate": 9.399448257114432e-06, "loss": 0.5341, "step": 2674 }, { "epoch": 0.18298105205554416, "grad_norm": 2.465689893238432, "learning_rate": 9.398921768347298e-06, "loss": 0.4261, "step": 2675 }, { "epoch": 0.1830494561871537, "grad_norm": 1.6469516249534444, "learning_rate": 9.398395063659841e-06, "loss": 0.2917, "step": 2676 }, { "epoch": 0.18311786031876326, "grad_norm": 3.7603775121001215, "learning_rate": 9.397868143077916e-06, "loss": 0.3657, "step": 2677 }, { "epoch": 0.1831862644503728, "grad_norm": 2.483703990504375, "learning_rate": 9.397341006627387e-06, "loss": 0.6388, "step": 2678 }, { "epoch": 0.18325466858198236, "grad_norm": 2.078786400541084, "learning_rate": 9.396813654334124e-06, "loss": 0.4568, "step": 2679 }, { "epoch": 0.1833230727135919, "grad_norm": 2.0348887203636727, "learning_rate": 9.396286086224017e-06, "loss": 0.3429, "step": 2680 }, { "epoch": 0.18339147684520146, "grad_norm": 2.028480247983894, "learning_rate": 9.39575830232296e-06, "loss": 0.6193, "step": 2681 }, { "epoch": 0.183459880976811, "grad_norm": 6.6899365416112975, "learning_rate": 9.395230302656856e-06, "loss": 0.5096, "step": 2682 }, { "epoch": 0.18352828510842054, "grad_norm": 2.9919803039685324, "learning_rate": 9.394702087251626e-06, "loss": 0.1604, "step": 2683 }, { "epoch": 0.1835966892400301, "grad_norm": 2.1348177667027874, "learning_rate": 9.394173656133195e-06, "loss": 0.2709, "step": 2684 }, { "epoch": 0.18366509337163964, "grad_norm": 2.1807295724925946, "learning_rate": 9.393645009327502e-06, "loss": 0.4724, "step": 2685 }, { "epoch": 0.1837334975032492, "grad_norm": 2.535306217830459, "learning_rate": 9.393116146860496e-06, "loss": 0.834, "step": 2686 }, { "epoch": 0.18380190163485874, "grad_norm": 2.4943891668789586, "learning_rate": 9.392587068758135e-06, "loss": 0.2532, "step": 2687 }, { "epoch": 0.1838703057664683, "grad_norm": 2.014096086812431, "learning_rate": 9.392057775046389e-06, "loss": 0.4795, "step": 2688 }, { "epoch": 0.18393870989807784, "grad_norm": 2.104311977379725, "learning_rate": 9.391528265751237e-06, "loss": 0.3132, "step": 2689 }, { "epoch": 0.1840071140296874, "grad_norm": 2.1848769128379564, "learning_rate": 9.390998540898672e-06, "loss": 0.5289, "step": 2690 }, { "epoch": 0.18407551816129694, "grad_norm": 1.8739051960452566, "learning_rate": 9.390468600514694e-06, "loss": 0.4255, "step": 2691 }, { "epoch": 0.1841439222929065, "grad_norm": 2.0196830887962496, "learning_rate": 9.389938444625318e-06, "loss": 0.3605, "step": 2692 }, { "epoch": 0.18421232642451604, "grad_norm": 2.259159930137765, "learning_rate": 9.389408073256563e-06, "loss": 0.4955, "step": 2693 }, { "epoch": 0.1842807305561256, "grad_norm": 1.7635570881644909, "learning_rate": 9.388877486434465e-06, "loss": 0.2002, "step": 2694 }, { "epoch": 0.18434913468773514, "grad_norm": 2.3056598939579236, "learning_rate": 9.388346684185063e-06, "loss": 0.6199, "step": 2695 }, { "epoch": 0.1844175388193447, "grad_norm": 1.9688055091177472, "learning_rate": 9.387815666534417e-06, "loss": 0.1968, "step": 2696 }, { "epoch": 0.18448594295095425, "grad_norm": 1.8654534655615094, "learning_rate": 9.38728443350859e-06, "loss": 0.4712, "step": 2697 }, { "epoch": 0.1845543470825638, "grad_norm": 1.803295573029018, "learning_rate": 9.386752985133656e-06, "loss": 0.4261, "step": 2698 }, { "epoch": 0.18462275121417335, "grad_norm": 1.885980126824415, "learning_rate": 9.386221321435702e-06, "loss": 0.1988, "step": 2699 }, { "epoch": 0.1846911553457829, "grad_norm": 1.9998393776588976, "learning_rate": 9.385689442440825e-06, "loss": 0.2518, "step": 2700 }, { "epoch": 0.18475955947739242, "grad_norm": 1.4033360450515782, "learning_rate": 9.385157348175131e-06, "loss": 0.1808, "step": 2701 }, { "epoch": 0.18482796360900197, "grad_norm": 1.9159181931713938, "learning_rate": 9.384625038664742e-06, "loss": 0.1946, "step": 2702 }, { "epoch": 0.18489636774061152, "grad_norm": 1.559179731271297, "learning_rate": 9.384092513935779e-06, "loss": 0.3693, "step": 2703 }, { "epoch": 0.18496477187222107, "grad_norm": 1.8539234392066126, "learning_rate": 9.383559774014386e-06, "loss": 0.3586, "step": 2704 }, { "epoch": 0.18503317600383062, "grad_norm": 1.5825201150480546, "learning_rate": 9.383026818926711e-06, "loss": 0.283, "step": 2705 }, { "epoch": 0.18510158013544017, "grad_norm": 2.5629256842758514, "learning_rate": 9.382493648698915e-06, "loss": 0.2935, "step": 2706 }, { "epoch": 0.18516998426704973, "grad_norm": 2.2619984112918647, "learning_rate": 9.381960263357168e-06, "loss": 0.5045, "step": 2707 }, { "epoch": 0.18523838839865928, "grad_norm": 1.9557756366787848, "learning_rate": 9.38142666292765e-06, "loss": 0.3973, "step": 2708 }, { "epoch": 0.18530679253026883, "grad_norm": 1.3030764975543632, "learning_rate": 9.380892847436555e-06, "loss": 0.1463, "step": 2709 }, { "epoch": 0.18537519666187838, "grad_norm": 1.8532888717951337, "learning_rate": 9.380358816910083e-06, "loss": 0.2939, "step": 2710 }, { "epoch": 0.18544360079348793, "grad_norm": 2.156400730559123, "learning_rate": 9.379824571374449e-06, "loss": 0.655, "step": 2711 }, { "epoch": 0.18551200492509748, "grad_norm": 2.3201358708672175, "learning_rate": 9.379290110855873e-06, "loss": 0.3417, "step": 2712 }, { "epoch": 0.18558040905670703, "grad_norm": 2.505336947850577, "learning_rate": 9.378755435380592e-06, "loss": 0.3376, "step": 2713 }, { "epoch": 0.18564881318831658, "grad_norm": 1.984864215920992, "learning_rate": 9.378220544974851e-06, "loss": 0.3387, "step": 2714 }, { "epoch": 0.18571721731992613, "grad_norm": 1.6413903284866693, "learning_rate": 9.377685439664902e-06, "loss": 0.182, "step": 2715 }, { "epoch": 0.18578562145153568, "grad_norm": 2.2905154970960933, "learning_rate": 9.377150119477011e-06, "loss": 0.5967, "step": 2716 }, { "epoch": 0.18585402558314523, "grad_norm": 2.1207214084862263, "learning_rate": 9.376614584437457e-06, "loss": 0.4705, "step": 2717 }, { "epoch": 0.18592242971475478, "grad_norm": 2.3451170050096035, "learning_rate": 9.376078834572525e-06, "loss": 0.637, "step": 2718 }, { "epoch": 0.18599083384636433, "grad_norm": 1.5301999169990956, "learning_rate": 9.375542869908509e-06, "loss": 0.12, "step": 2719 }, { "epoch": 0.18605923797797386, "grad_norm": 2.431268711067119, "learning_rate": 9.375006690471721e-06, "loss": 0.4966, "step": 2720 }, { "epoch": 0.1861276421095834, "grad_norm": 2.198150954859318, "learning_rate": 9.374470296288479e-06, "loss": 0.5321, "step": 2721 }, { "epoch": 0.18619604624119296, "grad_norm": 2.2357361055706146, "learning_rate": 9.373933687385111e-06, "loss": 0.5953, "step": 2722 }, { "epoch": 0.1862644503728025, "grad_norm": 1.707991330049267, "learning_rate": 9.373396863787955e-06, "loss": 0.3154, "step": 2723 }, { "epoch": 0.18633285450441206, "grad_norm": 2.0511785358530754, "learning_rate": 9.372859825523361e-06, "loss": 0.5515, "step": 2724 }, { "epoch": 0.1864012586360216, "grad_norm": 2.47257941248681, "learning_rate": 9.372322572617693e-06, "loss": 0.6247, "step": 2725 }, { "epoch": 0.18646966276763116, "grad_norm": 1.4405606640250384, "learning_rate": 9.371785105097317e-06, "loss": 0.2584, "step": 2726 }, { "epoch": 0.1865380668992407, "grad_norm": 2.2369100155132204, "learning_rate": 9.37124742298862e-06, "loss": 0.2485, "step": 2727 }, { "epoch": 0.18660647103085026, "grad_norm": 1.7285642627834383, "learning_rate": 9.370709526317987e-06, "loss": 0.3464, "step": 2728 }, { "epoch": 0.1866748751624598, "grad_norm": 1.8029906523650516, "learning_rate": 9.370171415111828e-06, "loss": 0.3813, "step": 2729 }, { "epoch": 0.18674327929406936, "grad_norm": 2.5576942435153014, "learning_rate": 9.36963308939655e-06, "loss": 0.825, "step": 2730 }, { "epoch": 0.18681168342567891, "grad_norm": 1.8068137544724392, "learning_rate": 9.36909454919858e-06, "loss": 0.3298, "step": 2731 }, { "epoch": 0.18688008755728847, "grad_norm": 1.839127127679421, "learning_rate": 9.368555794544352e-06, "loss": 0.2553, "step": 2732 }, { "epoch": 0.18694849168889802, "grad_norm": 1.9470590856861558, "learning_rate": 9.36801682546031e-06, "loss": 0.2352, "step": 2733 }, { "epoch": 0.18701689582050757, "grad_norm": 2.2558949389275926, "learning_rate": 9.367477641972908e-06, "loss": 0.6083, "step": 2734 }, { "epoch": 0.18708529995211712, "grad_norm": 1.4462142914705143, "learning_rate": 9.366938244108616e-06, "loss": 0.2149, "step": 2735 }, { "epoch": 0.18715370408372667, "grad_norm": 1.757293675468064, "learning_rate": 9.366398631893906e-06, "loss": 0.4399, "step": 2736 }, { "epoch": 0.18722210821533622, "grad_norm": 2.3991621767887383, "learning_rate": 9.365858805355263e-06, "loss": 0.61, "step": 2737 }, { "epoch": 0.18729051234694574, "grad_norm": 1.5391917924744065, "learning_rate": 9.36531876451919e-06, "loss": 0.3237, "step": 2738 }, { "epoch": 0.1873589164785553, "grad_norm": 1.8731824052164872, "learning_rate": 9.364778509412191e-06, "loss": 0.2336, "step": 2739 }, { "epoch": 0.18742732061016484, "grad_norm": 1.9979066547849662, "learning_rate": 9.364238040060787e-06, "loss": 0.4286, "step": 2740 }, { "epoch": 0.1874957247417744, "grad_norm": 1.9534410945692662, "learning_rate": 9.363697356491505e-06, "loss": 0.2588, "step": 2741 }, { "epoch": 0.18756412887338395, "grad_norm": 2.257610080977435, "learning_rate": 9.363156458730884e-06, "loss": 0.6484, "step": 2742 }, { "epoch": 0.1876325330049935, "grad_norm": 1.9274719005483454, "learning_rate": 9.362615346805475e-06, "loss": 0.4176, "step": 2743 }, { "epoch": 0.18770093713660305, "grad_norm": 2.3821082465869843, "learning_rate": 9.362074020741836e-06, "loss": 0.5513, "step": 2744 }, { "epoch": 0.1877693412682126, "grad_norm": 1.986921433742885, "learning_rate": 9.361532480566542e-06, "loss": 0.2246, "step": 2745 }, { "epoch": 0.18783774539982215, "grad_norm": 1.911530965383114, "learning_rate": 9.360990726306171e-06, "loss": 0.4572, "step": 2746 }, { "epoch": 0.1879061495314317, "grad_norm": 2.3423768789572, "learning_rate": 9.360448757987318e-06, "loss": 0.4021, "step": 2747 }, { "epoch": 0.18797455366304125, "grad_norm": 1.8092749694818524, "learning_rate": 9.359906575636582e-06, "loss": 0.3081, "step": 2748 }, { "epoch": 0.1880429577946508, "grad_norm": 1.5931455375171455, "learning_rate": 9.359364179280578e-06, "loss": 0.1914, "step": 2749 }, { "epoch": 0.18811136192626035, "grad_norm": 1.992327685192273, "learning_rate": 9.358821568945929e-06, "loss": 0.3721, "step": 2750 }, { "epoch": 0.1881797660578699, "grad_norm": 2.1151531288415324, "learning_rate": 9.358278744659268e-06, "loss": 0.1904, "step": 2751 }, { "epoch": 0.18824817018947945, "grad_norm": 2.3643426828699257, "learning_rate": 9.357735706447243e-06, "loss": 0.5296, "step": 2752 }, { "epoch": 0.188316574321089, "grad_norm": 2.1666766878654355, "learning_rate": 9.357192454336503e-06, "loss": 0.4244, "step": 2753 }, { "epoch": 0.18838497845269855, "grad_norm": 1.759571270048457, "learning_rate": 9.356648988353717e-06, "loss": 0.3069, "step": 2754 }, { "epoch": 0.1884533825843081, "grad_norm": 2.5349344588540608, "learning_rate": 9.356105308525563e-06, "loss": 0.3956, "step": 2755 }, { "epoch": 0.18852178671591766, "grad_norm": 1.8715762868027241, "learning_rate": 9.355561414878722e-06, "loss": 0.2603, "step": 2756 }, { "epoch": 0.18859019084752718, "grad_norm": 2.086976921052409, "learning_rate": 9.355017307439896e-06, "loss": 0.3657, "step": 2757 }, { "epoch": 0.18865859497913673, "grad_norm": 1.885997411207701, "learning_rate": 9.35447298623579e-06, "loss": 0.4178, "step": 2758 }, { "epoch": 0.18872699911074628, "grad_norm": 2.517994292127004, "learning_rate": 9.353928451293122e-06, "loss": 0.7078, "step": 2759 }, { "epoch": 0.18879540324235583, "grad_norm": 2.7836569111628386, "learning_rate": 9.35338370263862e-06, "loss": 0.776, "step": 2760 }, { "epoch": 0.18886380737396538, "grad_norm": 2.3182791781996315, "learning_rate": 9.352838740299025e-06, "loss": 0.6546, "step": 2761 }, { "epoch": 0.18893221150557493, "grad_norm": 1.4792742880322505, "learning_rate": 9.352293564301086e-06, "loss": 0.2127, "step": 2762 }, { "epoch": 0.18900061563718448, "grad_norm": 2.037968444362092, "learning_rate": 9.35174817467156e-06, "loss": 0.3936, "step": 2763 }, { "epoch": 0.18906901976879403, "grad_norm": 2.8300475756652963, "learning_rate": 9.351202571437219e-06, "loss": 0.6752, "step": 2764 }, { "epoch": 0.18913742390040358, "grad_norm": 2.1361142755788682, "learning_rate": 9.350656754624846e-06, "loss": 0.4845, "step": 2765 }, { "epoch": 0.18920582803201313, "grad_norm": 2.102153177458918, "learning_rate": 9.350110724261228e-06, "loss": 0.5035, "step": 2766 }, { "epoch": 0.18927423216362269, "grad_norm": 1.9778266565747757, "learning_rate": 9.34956448037317e-06, "loss": 0.3751, "step": 2767 }, { "epoch": 0.18934263629523224, "grad_norm": 1.9466115528368033, "learning_rate": 9.349018022987484e-06, "loss": 0.4526, "step": 2768 }, { "epoch": 0.1894110404268418, "grad_norm": 2.409056506894815, "learning_rate": 9.348471352130991e-06, "loss": 0.3644, "step": 2769 }, { "epoch": 0.18947944455845134, "grad_norm": 2.32482446423201, "learning_rate": 9.347924467830527e-06, "loss": 0.7108, "step": 2770 }, { "epoch": 0.1895478486900609, "grad_norm": 2.08143650528071, "learning_rate": 9.347377370112934e-06, "loss": 0.2217, "step": 2771 }, { "epoch": 0.18961625282167044, "grad_norm": 2.4070480032848236, "learning_rate": 9.346830059005066e-06, "loss": 0.4235, "step": 2772 }, { "epoch": 0.18968465695328, "grad_norm": 1.7760794701752656, "learning_rate": 9.346282534533787e-06, "loss": 0.428, "step": 2773 }, { "epoch": 0.18975306108488954, "grad_norm": 2.180120575516667, "learning_rate": 9.345734796725974e-06, "loss": 0.669, "step": 2774 }, { "epoch": 0.18982146521649906, "grad_norm": 2.499530581272534, "learning_rate": 9.345186845608512e-06, "loss": 0.6027, "step": 2775 }, { "epoch": 0.18988986934810861, "grad_norm": 2.06694787684539, "learning_rate": 9.344638681208298e-06, "loss": 0.6125, "step": 2776 }, { "epoch": 0.18995827347971816, "grad_norm": 2.027922556872958, "learning_rate": 9.344090303552236e-06, "loss": 0.245, "step": 2777 }, { "epoch": 0.19002667761132772, "grad_norm": 1.8585912511230704, "learning_rate": 9.343541712667245e-06, "loss": 0.2105, "step": 2778 }, { "epoch": 0.19009508174293727, "grad_norm": 1.766274048077285, "learning_rate": 9.342992908580252e-06, "loss": 0.293, "step": 2779 }, { "epoch": 0.19016348587454682, "grad_norm": 2.2748212372060004, "learning_rate": 9.342443891318194e-06, "loss": 0.369, "step": 2780 }, { "epoch": 0.19023189000615637, "grad_norm": 2.630992022301799, "learning_rate": 9.341894660908023e-06, "loss": 0.5107, "step": 2781 }, { "epoch": 0.19030029413776592, "grad_norm": 1.7456300128198277, "learning_rate": 9.341345217376692e-06, "loss": 0.3289, "step": 2782 }, { "epoch": 0.19036869826937547, "grad_norm": 2.023146722586807, "learning_rate": 9.340795560751175e-06, "loss": 0.3337, "step": 2783 }, { "epoch": 0.19043710240098502, "grad_norm": 2.348452948584794, "learning_rate": 9.340245691058451e-06, "loss": 0.6574, "step": 2784 }, { "epoch": 0.19050550653259457, "grad_norm": 1.8322141104660246, "learning_rate": 9.33969560832551e-06, "loss": 0.5132, "step": 2785 }, { "epoch": 0.19057391066420412, "grad_norm": 2.858206942927068, "learning_rate": 9.33914531257935e-06, "loss": 0.4038, "step": 2786 }, { "epoch": 0.19064231479581367, "grad_norm": 1.7616309738283908, "learning_rate": 9.338594803846987e-06, "loss": 0.3202, "step": 2787 }, { "epoch": 0.19071071892742322, "grad_norm": 2.1345051661520693, "learning_rate": 9.33804408215544e-06, "loss": 0.691, "step": 2788 }, { "epoch": 0.19077912305903277, "grad_norm": 1.703793805595464, "learning_rate": 9.337493147531739e-06, "loss": 0.4528, "step": 2789 }, { "epoch": 0.19084752719064232, "grad_norm": 2.231042727959554, "learning_rate": 9.336942000002931e-06, "loss": 0.5402, "step": 2790 }, { "epoch": 0.19091593132225188, "grad_norm": 1.8041662231236266, "learning_rate": 9.336390639596066e-06, "loss": 0.3576, "step": 2791 }, { "epoch": 0.19098433545386143, "grad_norm": 2.1632382858049133, "learning_rate": 9.335839066338209e-06, "loss": 0.501, "step": 2792 }, { "epoch": 0.19105273958547095, "grad_norm": 1.390846674106416, "learning_rate": 9.33528728025643e-06, "loss": 0.3011, "step": 2793 }, { "epoch": 0.1911211437170805, "grad_norm": 1.512651985882751, "learning_rate": 9.334735281377821e-06, "loss": 0.2271, "step": 2794 }, { "epoch": 0.19118954784869005, "grad_norm": 2.1431437939264515, "learning_rate": 9.33418306972947e-06, "loss": 0.5517, "step": 2795 }, { "epoch": 0.1912579519802996, "grad_norm": 2.374684369295619, "learning_rate": 9.333630645338484e-06, "loss": 0.5415, "step": 2796 }, { "epoch": 0.19132635611190915, "grad_norm": 2.415644420580257, "learning_rate": 9.333078008231976e-06, "loss": 0.3941, "step": 2797 }, { "epoch": 0.1913947602435187, "grad_norm": 2.6863360601354844, "learning_rate": 9.33252515843708e-06, "loss": 0.7233, "step": 2798 }, { "epoch": 0.19146316437512825, "grad_norm": 2.1807191383113618, "learning_rate": 9.331972095980927e-06, "loss": 0.7219, "step": 2799 }, { "epoch": 0.1915315685067378, "grad_norm": 1.8052550162523153, "learning_rate": 9.331418820890664e-06, "loss": 0.4292, "step": 2800 }, { "epoch": 0.19159997263834735, "grad_norm": 2.3450454946224517, "learning_rate": 9.330865333193449e-06, "loss": 0.4656, "step": 2801 }, { "epoch": 0.1916683767699569, "grad_norm": 1.8976615293200867, "learning_rate": 9.330311632916448e-06, "loss": 0.4705, "step": 2802 }, { "epoch": 0.19173678090156646, "grad_norm": 1.6743562948927138, "learning_rate": 9.329757720086845e-06, "loss": 0.3054, "step": 2803 }, { "epoch": 0.191805185033176, "grad_norm": 2.8188160653841052, "learning_rate": 9.329203594731821e-06, "loss": 0.6023, "step": 2804 }, { "epoch": 0.19187358916478556, "grad_norm": 2.010901388764432, "learning_rate": 9.32864925687858e-06, "loss": 0.3022, "step": 2805 }, { "epoch": 0.1919419932963951, "grad_norm": 1.930635124996899, "learning_rate": 9.328094706554332e-06, "loss": 0.3382, "step": 2806 }, { "epoch": 0.19201039742800466, "grad_norm": 1.9043946422030908, "learning_rate": 9.327539943786295e-06, "loss": 0.3799, "step": 2807 }, { "epoch": 0.1920788015596142, "grad_norm": 2.3810191112630097, "learning_rate": 9.326984968601701e-06, "loss": 0.659, "step": 2808 }, { "epoch": 0.19214720569122376, "grad_norm": 1.6802298313483306, "learning_rate": 9.32642978102779e-06, "loss": 0.2344, "step": 2809 }, { "epoch": 0.1922156098228333, "grad_norm": 2.2736463925982, "learning_rate": 9.32587438109181e-06, "loss": 0.3113, "step": 2810 }, { "epoch": 0.19228401395444286, "grad_norm": 2.369966392922643, "learning_rate": 9.32531876882103e-06, "loss": 0.2564, "step": 2811 }, { "epoch": 0.19235241808605238, "grad_norm": 2.4197677674073272, "learning_rate": 9.324762944242716e-06, "loss": 0.518, "step": 2812 }, { "epoch": 0.19242082221766194, "grad_norm": 1.853384803199068, "learning_rate": 9.324206907384154e-06, "loss": 0.5308, "step": 2813 }, { "epoch": 0.19248922634927149, "grad_norm": 1.7857418998643764, "learning_rate": 9.323650658272634e-06, "loss": 0.3386, "step": 2814 }, { "epoch": 0.19255763048088104, "grad_norm": 1.4863086002385795, "learning_rate": 9.323094196935461e-06, "loss": 0.1819, "step": 2815 }, { "epoch": 0.1926260346124906, "grad_norm": 1.847998089449791, "learning_rate": 9.322537523399951e-06, "loss": 0.4434, "step": 2816 }, { "epoch": 0.19269443874410014, "grad_norm": 2.0456520897306727, "learning_rate": 9.321980637693423e-06, "loss": 0.3396, "step": 2817 }, { "epoch": 0.1927628428757097, "grad_norm": 1.576673113644762, "learning_rate": 9.321423539843218e-06, "loss": 0.2492, "step": 2818 }, { "epoch": 0.19283124700731924, "grad_norm": 2.8483379265000606, "learning_rate": 9.320866229876674e-06, "loss": 0.4208, "step": 2819 }, { "epoch": 0.1928996511389288, "grad_norm": 1.5722701315489447, "learning_rate": 9.320308707821153e-06, "loss": 0.1599, "step": 2820 }, { "epoch": 0.19296805527053834, "grad_norm": 1.9693273197383898, "learning_rate": 9.319750973704017e-06, "loss": 0.3758, "step": 2821 }, { "epoch": 0.1930364594021479, "grad_norm": 1.864919322304031, "learning_rate": 9.319193027552643e-06, "loss": 0.3557, "step": 2822 }, { "epoch": 0.19310486353375744, "grad_norm": 1.9352750269721977, "learning_rate": 9.318634869394418e-06, "loss": 0.4693, "step": 2823 }, { "epoch": 0.193173267665367, "grad_norm": 1.9348001820131215, "learning_rate": 9.318076499256741e-06, "loss": 0.3704, "step": 2824 }, { "epoch": 0.19324167179697654, "grad_norm": 2.2479600632756167, "learning_rate": 9.317517917167016e-06, "loss": 0.6186, "step": 2825 }, { "epoch": 0.1933100759285861, "grad_norm": 2.285340319661985, "learning_rate": 9.316959123152662e-06, "loss": 0.6945, "step": 2826 }, { "epoch": 0.19337848006019565, "grad_norm": 2.234802952044442, "learning_rate": 9.31640011724111e-06, "loss": 0.3024, "step": 2827 }, { "epoch": 0.1934468841918052, "grad_norm": 1.8712342514134752, "learning_rate": 9.315840899459793e-06, "loss": 0.4842, "step": 2828 }, { "epoch": 0.19351528832341475, "grad_norm": 2.37423693796505, "learning_rate": 9.315281469836169e-06, "loss": 0.5998, "step": 2829 }, { "epoch": 0.19358369245502427, "grad_norm": 1.9722530406143188, "learning_rate": 9.314721828397687e-06, "loss": 0.517, "step": 2830 }, { "epoch": 0.19365209658663382, "grad_norm": 2.1890143738384076, "learning_rate": 9.314161975171825e-06, "loss": 0.511, "step": 2831 }, { "epoch": 0.19372050071824337, "grad_norm": 2.3554234002146037, "learning_rate": 9.313601910186058e-06, "loss": 0.5533, "step": 2832 }, { "epoch": 0.19378890484985292, "grad_norm": 2.302448267873291, "learning_rate": 9.31304163346788e-06, "loss": 0.6312, "step": 2833 }, { "epoch": 0.19385730898146247, "grad_norm": 3.0153985407893624, "learning_rate": 9.312481145044794e-06, "loss": 0.4811, "step": 2834 }, { "epoch": 0.19392571311307202, "grad_norm": 2.133853927080325, "learning_rate": 9.311920444944306e-06, "loss": 0.6209, "step": 2835 }, { "epoch": 0.19399411724468157, "grad_norm": 2.5789449197157523, "learning_rate": 9.311359533193942e-06, "loss": 0.7571, "step": 2836 }, { "epoch": 0.19406252137629112, "grad_norm": 1.8556241592775997, "learning_rate": 9.310798409821232e-06, "loss": 0.2396, "step": 2837 }, { "epoch": 0.19413092550790068, "grad_norm": 2.2205381676059233, "learning_rate": 9.310237074853717e-06, "loss": 0.4054, "step": 2838 }, { "epoch": 0.19419932963951023, "grad_norm": 1.9466138416573802, "learning_rate": 9.309675528318955e-06, "loss": 0.4449, "step": 2839 }, { "epoch": 0.19426773377111978, "grad_norm": 2.3098070664464614, "learning_rate": 9.309113770244507e-06, "loss": 0.6473, "step": 2840 }, { "epoch": 0.19433613790272933, "grad_norm": 2.1066690820391405, "learning_rate": 9.308551800657946e-06, "loss": 0.4716, "step": 2841 }, { "epoch": 0.19440454203433888, "grad_norm": 1.929339877566727, "learning_rate": 9.307989619586857e-06, "loss": 0.3591, "step": 2842 }, { "epoch": 0.19447294616594843, "grad_norm": 1.9996890705529906, "learning_rate": 9.307427227058834e-06, "loss": 0.3439, "step": 2843 }, { "epoch": 0.19454135029755798, "grad_norm": 1.7505193231402663, "learning_rate": 9.306864623101484e-06, "loss": 0.2461, "step": 2844 }, { "epoch": 0.19460975442916753, "grad_norm": 2.593011503910169, "learning_rate": 9.306301807742416e-06, "loss": 0.7841, "step": 2845 }, { "epoch": 0.19467815856077708, "grad_norm": 1.830041504354089, "learning_rate": 9.305738781009265e-06, "loss": 0.3395, "step": 2846 }, { "epoch": 0.19474656269238663, "grad_norm": 2.075937977460157, "learning_rate": 9.305175542929663e-06, "loss": 0.5236, "step": 2847 }, { "epoch": 0.19481496682399618, "grad_norm": 1.2493788233718077, "learning_rate": 9.304612093531254e-06, "loss": 0.1594, "step": 2848 }, { "epoch": 0.1948833709556057, "grad_norm": 1.742885569490264, "learning_rate": 9.304048432841695e-06, "loss": 0.4008, "step": 2849 }, { "epoch": 0.19495177508721526, "grad_norm": 1.7659660364456768, "learning_rate": 9.303484560888658e-06, "loss": 0.4122, "step": 2850 }, { "epoch": 0.1950201792188248, "grad_norm": 2.2103525607861956, "learning_rate": 9.302920477699814e-06, "loss": 0.3202, "step": 2851 }, { "epoch": 0.19508858335043436, "grad_norm": 1.8071455420053302, "learning_rate": 9.302356183302857e-06, "loss": 0.3428, "step": 2852 }, { "epoch": 0.1951569874820439, "grad_norm": 2.383251815696168, "learning_rate": 9.301791677725483e-06, "loss": 0.6093, "step": 2853 }, { "epoch": 0.19522539161365346, "grad_norm": 2.4010393524710425, "learning_rate": 9.301226960995399e-06, "loss": 0.4707, "step": 2854 }, { "epoch": 0.195293795745263, "grad_norm": 1.4196726516015996, "learning_rate": 9.300662033140325e-06, "loss": 0.1887, "step": 2855 }, { "epoch": 0.19536219987687256, "grad_norm": 2.378875243637893, "learning_rate": 9.300096894187991e-06, "loss": 0.6725, "step": 2856 }, { "epoch": 0.1954306040084821, "grad_norm": 2.05850055201312, "learning_rate": 9.299531544166138e-06, "loss": 0.6038, "step": 2857 }, { "epoch": 0.19549900814009166, "grad_norm": 2.1729825475409275, "learning_rate": 9.298965983102513e-06, "loss": 0.5533, "step": 2858 }, { "epoch": 0.1955674122717012, "grad_norm": 1.3717938263367884, "learning_rate": 9.298400211024878e-06, "loss": 0.1563, "step": 2859 }, { "epoch": 0.19563581640331076, "grad_norm": 2.364548108738984, "learning_rate": 9.297834227961005e-06, "loss": 0.6167, "step": 2860 }, { "epoch": 0.19570422053492031, "grad_norm": 2.7709574074082233, "learning_rate": 9.297268033938673e-06, "loss": 0.3873, "step": 2861 }, { "epoch": 0.19577262466652987, "grad_norm": 2.3836053373492128, "learning_rate": 9.296701628985675e-06, "loss": 0.7724, "step": 2862 }, { "epoch": 0.19584102879813942, "grad_norm": 3.012907760771198, "learning_rate": 9.296135013129812e-06, "loss": 0.3326, "step": 2863 }, { "epoch": 0.19590943292974897, "grad_norm": 2.8962943284186675, "learning_rate": 9.295568186398897e-06, "loss": 0.6621, "step": 2864 }, { "epoch": 0.19597783706135852, "grad_norm": 1.7127316269541277, "learning_rate": 9.295001148820754e-06, "loss": 0.3609, "step": 2865 }, { "epoch": 0.19604624119296807, "grad_norm": 2.0119842946613193, "learning_rate": 9.294433900423212e-06, "loss": 0.4952, "step": 2866 }, { "epoch": 0.1961146453245776, "grad_norm": 1.8330333506736358, "learning_rate": 9.293866441234119e-06, "loss": 0.4294, "step": 2867 }, { "epoch": 0.19618304945618714, "grad_norm": 1.812924446360908, "learning_rate": 9.293298771281325e-06, "loss": 0.4543, "step": 2868 }, { "epoch": 0.1962514535877967, "grad_norm": 1.4783438465605343, "learning_rate": 9.292730890592694e-06, "loss": 0.2378, "step": 2869 }, { "epoch": 0.19631985771940624, "grad_norm": 2.397620512396756, "learning_rate": 9.2921627991961e-06, "loss": 0.6802, "step": 2870 }, { "epoch": 0.1963882618510158, "grad_norm": 2.348670721243812, "learning_rate": 9.291594497119433e-06, "loss": 0.4468, "step": 2871 }, { "epoch": 0.19645666598262534, "grad_norm": 2.1130496125537253, "learning_rate": 9.291025984390583e-06, "loss": 0.4886, "step": 2872 }, { "epoch": 0.1965250701142349, "grad_norm": 2.0391953889068284, "learning_rate": 9.290457261037456e-06, "loss": 0.5356, "step": 2873 }, { "epoch": 0.19659347424584445, "grad_norm": 2.6337422503595893, "learning_rate": 9.289888327087967e-06, "loss": 0.6995, "step": 2874 }, { "epoch": 0.196661878377454, "grad_norm": 3.1032393148251036, "learning_rate": 9.289319182570047e-06, "loss": 0.2526, "step": 2875 }, { "epoch": 0.19673028250906355, "grad_norm": 2.467416801692611, "learning_rate": 9.288749827511625e-06, "loss": 0.2816, "step": 2876 }, { "epoch": 0.1967986866406731, "grad_norm": 2.604396959930401, "learning_rate": 9.288180261940654e-06, "loss": 0.5435, "step": 2877 }, { "epoch": 0.19686709077228265, "grad_norm": 1.542419142559085, "learning_rate": 9.287610485885086e-06, "loss": 0.3102, "step": 2878 }, { "epoch": 0.1969354949038922, "grad_norm": 2.1266141328842854, "learning_rate": 9.287040499372893e-06, "loss": 0.5751, "step": 2879 }, { "epoch": 0.19700389903550175, "grad_norm": 1.6497591396252547, "learning_rate": 9.286470302432046e-06, "loss": 0.2352, "step": 2880 }, { "epoch": 0.1970723031671113, "grad_norm": 1.949646140355533, "learning_rate": 9.285899895090542e-06, "loss": 0.4174, "step": 2881 }, { "epoch": 0.19714070729872085, "grad_norm": 1.7117823600702071, "learning_rate": 9.285329277376373e-06, "loss": 0.4072, "step": 2882 }, { "epoch": 0.1972091114303304, "grad_norm": 2.1631006106875703, "learning_rate": 9.284758449317551e-06, "loss": 0.52, "step": 2883 }, { "epoch": 0.19727751556193995, "grad_norm": 1.6641191598263505, "learning_rate": 9.284187410942091e-06, "loss": 0.3987, "step": 2884 }, { "epoch": 0.1973459196935495, "grad_norm": 2.7121243146385714, "learning_rate": 9.283616162278025e-06, "loss": 0.4441, "step": 2885 }, { "epoch": 0.19741432382515903, "grad_norm": 2.574307328756592, "learning_rate": 9.283044703353394e-06, "loss": 0.6058, "step": 2886 }, { "epoch": 0.19748272795676858, "grad_norm": 2.072577047721965, "learning_rate": 9.282473034196244e-06, "loss": 0.5158, "step": 2887 }, { "epoch": 0.19755113208837813, "grad_norm": 2.2826358883508577, "learning_rate": 9.28190115483464e-06, "loss": 0.6501, "step": 2888 }, { "epoch": 0.19761953621998768, "grad_norm": 1.7736382223802272, "learning_rate": 9.28132906529665e-06, "loss": 0.4595, "step": 2889 }, { "epoch": 0.19768794035159723, "grad_norm": 2.7848510360943504, "learning_rate": 9.280756765610354e-06, "loss": 0.1684, "step": 2890 }, { "epoch": 0.19775634448320678, "grad_norm": 2.2471007370329357, "learning_rate": 9.280184255803844e-06, "loss": 0.5044, "step": 2891 }, { "epoch": 0.19782474861481633, "grad_norm": 2.0730190723050645, "learning_rate": 9.279611535905223e-06, "loss": 0.2657, "step": 2892 }, { "epoch": 0.19789315274642588, "grad_norm": 1.885209192448715, "learning_rate": 9.279038605942603e-06, "loss": 0.322, "step": 2893 }, { "epoch": 0.19796155687803543, "grad_norm": 2.1130222824695997, "learning_rate": 9.278465465944102e-06, "loss": 0.3838, "step": 2894 }, { "epoch": 0.19802996100964498, "grad_norm": 2.3933378484799803, "learning_rate": 9.277892115937856e-06, "loss": 0.7031, "step": 2895 }, { "epoch": 0.19809836514125453, "grad_norm": 1.6117038543057214, "learning_rate": 9.277318555952009e-06, "loss": 0.3963, "step": 2896 }, { "epoch": 0.19816676927286409, "grad_norm": 2.4875283831505826, "learning_rate": 9.276744786014712e-06, "loss": 0.5546, "step": 2897 }, { "epoch": 0.19823517340447364, "grad_norm": 2.723145795291404, "learning_rate": 9.276170806154128e-06, "loss": 0.4118, "step": 2898 }, { "epoch": 0.1983035775360832, "grad_norm": 2.2350931980715916, "learning_rate": 9.275596616398431e-06, "loss": 0.5052, "step": 2899 }, { "epoch": 0.19837198166769274, "grad_norm": 1.5475104344155626, "learning_rate": 9.275022216775806e-06, "loss": 0.1807, "step": 2900 }, { "epoch": 0.1984403857993023, "grad_norm": 2.039195111226638, "learning_rate": 9.274447607314447e-06, "loss": 0.4505, "step": 2901 }, { "epoch": 0.19850878993091184, "grad_norm": 2.4777873647461113, "learning_rate": 9.273872788042559e-06, "loss": 0.5598, "step": 2902 }, { "epoch": 0.1985771940625214, "grad_norm": 2.0226077142095513, "learning_rate": 9.273297758988355e-06, "loss": 0.357, "step": 2903 }, { "epoch": 0.1986455981941309, "grad_norm": 1.9986027180286552, "learning_rate": 9.272722520180062e-06, "loss": 0.4063, "step": 2904 }, { "epoch": 0.19871400232574046, "grad_norm": 1.9518726224637462, "learning_rate": 9.272147071645913e-06, "loss": 0.5555, "step": 2905 }, { "epoch": 0.19878240645735, "grad_norm": 2.16926528377365, "learning_rate": 9.271571413414157e-06, "loss": 0.3826, "step": 2906 }, { "epoch": 0.19885081058895956, "grad_norm": 1.9851910347501285, "learning_rate": 9.270995545513051e-06, "loss": 0.3946, "step": 2907 }, { "epoch": 0.19891921472056912, "grad_norm": 1.761385604925556, "learning_rate": 9.270419467970857e-06, "loss": 0.3465, "step": 2908 }, { "epoch": 0.19898761885217867, "grad_norm": 1.6629686342221597, "learning_rate": 9.269843180815854e-06, "loss": 0.3514, "step": 2909 }, { "epoch": 0.19905602298378822, "grad_norm": 1.6648470412695728, "learning_rate": 9.26926668407633e-06, "loss": 0.3173, "step": 2910 }, { "epoch": 0.19912442711539777, "grad_norm": 2.2247241679675644, "learning_rate": 9.26868997778058e-06, "loss": 0.7025, "step": 2911 }, { "epoch": 0.19919283124700732, "grad_norm": 1.8819655693947945, "learning_rate": 9.268113061956914e-06, "loss": 0.4421, "step": 2912 }, { "epoch": 0.19926123537861687, "grad_norm": 2.0694130293135027, "learning_rate": 9.267535936633646e-06, "loss": 0.4862, "step": 2913 }, { "epoch": 0.19932963951022642, "grad_norm": 1.7451186262828189, "learning_rate": 9.266958601839108e-06, "loss": 0.3811, "step": 2914 }, { "epoch": 0.19939804364183597, "grad_norm": 1.8788051538372963, "learning_rate": 9.266381057601637e-06, "loss": 0.4192, "step": 2915 }, { "epoch": 0.19946644777344552, "grad_norm": 2.0124215833420194, "learning_rate": 9.26580330394958e-06, "loss": 0.4069, "step": 2916 }, { "epoch": 0.19953485190505507, "grad_norm": 1.3116669026046008, "learning_rate": 9.265225340911299e-06, "loss": 0.2222, "step": 2917 }, { "epoch": 0.19960325603666462, "grad_norm": 1.8528966965688267, "learning_rate": 9.264647168515161e-06, "loss": 0.3176, "step": 2918 }, { "epoch": 0.19967166016827417, "grad_norm": 2.666681501352254, "learning_rate": 9.264068786789546e-06, "loss": 0.4684, "step": 2919 }, { "epoch": 0.19974006429988372, "grad_norm": 2.260553827533805, "learning_rate": 9.263490195762846e-06, "loss": 0.7131, "step": 2920 }, { "epoch": 0.19980846843149327, "grad_norm": 1.870581152510716, "learning_rate": 9.262911395463456e-06, "loss": 0.3588, "step": 2921 }, { "epoch": 0.1998768725631028, "grad_norm": 1.7651375122912722, "learning_rate": 9.262332385919792e-06, "loss": 0.4396, "step": 2922 }, { "epoch": 0.19994527669471235, "grad_norm": 1.7168871885116304, "learning_rate": 9.261753167160271e-06, "loss": 0.2323, "step": 2923 }, { "epoch": 0.2000136808263219, "grad_norm": 2.2607199879950817, "learning_rate": 9.261173739213325e-06, "loss": 0.6954, "step": 2924 } ], "logging_steps": 1.0, "max_steps": 14619, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1462, "total_flos": 88824382791680.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }