{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1869,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016051364365971106,
"grad_norm": 0.5444720983505249,
"learning_rate": 4.973247726056715e-05,
"loss": 0.6974,
"step": 10
},
{
"epoch": 0.03210272873194221,
"grad_norm": 0.46184447407722473,
"learning_rate": 4.94649545211343e-05,
"loss": 0.6938,
"step": 20
},
{
"epoch": 0.048154093097913325,
"grad_norm": 0.6587526798248291,
"learning_rate": 4.919743178170145e-05,
"loss": 0.6926,
"step": 30
},
{
"epoch": 0.06420545746388442,
"grad_norm": 2.1942267417907715,
"learning_rate": 4.8929909042268596e-05,
"loss": 0.6628,
"step": 40
},
{
"epoch": 0.08025682182985554,
"grad_norm": 1.835483431816101,
"learning_rate": 4.8662386302835744e-05,
"loss": 0.6777,
"step": 50
},
{
"epoch": 0.09630818619582665,
"grad_norm": 1.6423420906066895,
"learning_rate": 4.839486356340289e-05,
"loss": 0.6748,
"step": 60
},
{
"epoch": 0.11235955056179775,
"grad_norm": 1.618177056312561,
"learning_rate": 4.812734082397004e-05,
"loss": 0.6585,
"step": 70
},
{
"epoch": 0.12841091492776885,
"grad_norm": 2.7368061542510986,
"learning_rate": 4.785981808453719e-05,
"loss": 0.6208,
"step": 80
},
{
"epoch": 0.14446227929373998,
"grad_norm": 2.0427486896514893,
"learning_rate": 4.759229534510434e-05,
"loss": 0.6447,
"step": 90
},
{
"epoch": 0.16051364365971107,
"grad_norm": 2.044725179672241,
"learning_rate": 4.7324772605671486e-05,
"loss": 0.6543,
"step": 100
},
{
"epoch": 0.17656500802568217,
"grad_norm": 1.4154126644134521,
"learning_rate": 4.7057249866238635e-05,
"loss": 0.6295,
"step": 110
},
{
"epoch": 0.1926163723916533,
"grad_norm": 1.4071663618087769,
"learning_rate": 4.678972712680578e-05,
"loss": 0.6088,
"step": 120
},
{
"epoch": 0.2086677367576244,
"grad_norm": 1.5164605379104614,
"learning_rate": 4.652220438737293e-05,
"loss": 0.6086,
"step": 130
},
{
"epoch": 0.2247191011235955,
"grad_norm": 1.1694097518920898,
"learning_rate": 4.625468164794008e-05,
"loss": 0.6131,
"step": 140
},
{
"epoch": 0.24077046548956663,
"grad_norm": 2.416905164718628,
"learning_rate": 4.598715890850723e-05,
"loss": 0.6392,
"step": 150
},
{
"epoch": 0.2568218298555377,
"grad_norm": 1.8865312337875366,
"learning_rate": 4.571963616907438e-05,
"loss": 0.5955,
"step": 160
},
{
"epoch": 0.27287319422150885,
"grad_norm": 1.4470267295837402,
"learning_rate": 4.5452113429641525e-05,
"loss": 0.6245,
"step": 170
},
{
"epoch": 0.28892455858747995,
"grad_norm": 2.124974489212036,
"learning_rate": 4.5184590690208673e-05,
"loss": 0.6251,
"step": 180
},
{
"epoch": 0.30497592295345105,
"grad_norm": 2.1896016597747803,
"learning_rate": 4.491706795077582e-05,
"loss": 0.631,
"step": 190
},
{
"epoch": 0.32102728731942215,
"grad_norm": 1.449318766593933,
"learning_rate": 4.4649545211342963e-05,
"loss": 0.5939,
"step": 200
},
{
"epoch": 0.33707865168539325,
"grad_norm": 2.2219743728637695,
"learning_rate": 4.438202247191011e-05,
"loss": 0.5746,
"step": 210
},
{
"epoch": 0.35313001605136435,
"grad_norm": 2.2409071922302246,
"learning_rate": 4.411449973247726e-05,
"loss": 0.6065,
"step": 220
},
{
"epoch": 0.36918138041733545,
"grad_norm": 1.1223793029785156,
"learning_rate": 4.384697699304441e-05,
"loss": 0.6123,
"step": 230
},
{
"epoch": 0.3852327447833066,
"grad_norm": 1.2747622728347778,
"learning_rate": 4.357945425361156e-05,
"loss": 0.5952,
"step": 240
},
{
"epoch": 0.4012841091492777,
"grad_norm": 1.8558810949325562,
"learning_rate": 4.3311931514178705e-05,
"loss": 0.6027,
"step": 250
},
{
"epoch": 0.4173354735152488,
"grad_norm": 1.1629371643066406,
"learning_rate": 4.3044408774745854e-05,
"loss": 0.6228,
"step": 260
},
{
"epoch": 0.4333868378812199,
"grad_norm": 1.4288586378097534,
"learning_rate": 4.2776886035313e-05,
"loss": 0.5734,
"step": 270
},
{
"epoch": 0.449438202247191,
"grad_norm": 3.359997034072876,
"learning_rate": 4.250936329588015e-05,
"loss": 0.6018,
"step": 280
},
{
"epoch": 0.4654895666131621,
"grad_norm": 1.8125280141830444,
"learning_rate": 4.22418405564473e-05,
"loss": 0.6288,
"step": 290
},
{
"epoch": 0.48154093097913325,
"grad_norm": 1.3432456254959106,
"learning_rate": 4.197431781701445e-05,
"loss": 0.5756,
"step": 300
},
{
"epoch": 0.49759229534510435,
"grad_norm": 3.372265100479126,
"learning_rate": 4.1706795077581596e-05,
"loss": 0.6197,
"step": 310
},
{
"epoch": 0.5136436597110754,
"grad_norm": 1.7587655782699585,
"learning_rate": 4.1439272338148744e-05,
"loss": 0.5808,
"step": 320
},
{
"epoch": 0.5296950240770465,
"grad_norm": 2.1404621601104736,
"learning_rate": 4.117174959871589e-05,
"loss": 0.6075,
"step": 330
},
{
"epoch": 0.5457463884430177,
"grad_norm": 1.6256980895996094,
"learning_rate": 4.090422685928304e-05,
"loss": 0.5765,
"step": 340
},
{
"epoch": 0.5617977528089888,
"grad_norm": 2.2659549713134766,
"learning_rate": 4.063670411985019e-05,
"loss": 0.6305,
"step": 350
},
{
"epoch": 0.5778491171749599,
"grad_norm": 1.9907615184783936,
"learning_rate": 4.036918138041734e-05,
"loss": 0.5743,
"step": 360
},
{
"epoch": 0.593900481540931,
"grad_norm": 2.26408314704895,
"learning_rate": 4.0101658640984486e-05,
"loss": 0.5842,
"step": 370
},
{
"epoch": 0.6099518459069021,
"grad_norm": 1.9207652807235718,
"learning_rate": 3.9834135901551634e-05,
"loss": 0.5719,
"step": 380
},
{
"epoch": 0.6260032102728732,
"grad_norm": 2.5978338718414307,
"learning_rate": 3.956661316211878e-05,
"loss": 0.5811,
"step": 390
},
{
"epoch": 0.6420545746388443,
"grad_norm": 1.6617166996002197,
"learning_rate": 3.929909042268593e-05,
"loss": 0.5819,
"step": 400
},
{
"epoch": 0.6581059390048154,
"grad_norm": 2.7522661685943604,
"learning_rate": 3.903156768325308e-05,
"loss": 0.605,
"step": 410
},
{
"epoch": 0.6741573033707865,
"grad_norm": 1.6527293920516968,
"learning_rate": 3.876404494382023e-05,
"loss": 0.6036,
"step": 420
},
{
"epoch": 0.6902086677367576,
"grad_norm": 1.6553492546081543,
"learning_rate": 3.8496522204387376e-05,
"loss": 0.5723,
"step": 430
},
{
"epoch": 0.7062600321027287,
"grad_norm": 1.9113073348999023,
"learning_rate": 3.8228999464954525e-05,
"loss": 0.5869,
"step": 440
},
{
"epoch": 0.7223113964686998,
"grad_norm": 1.493342399597168,
"learning_rate": 3.796147672552167e-05,
"loss": 0.5912,
"step": 450
},
{
"epoch": 0.7383627608346709,
"grad_norm": 1.3368749618530273,
"learning_rate": 3.769395398608882e-05,
"loss": 0.613,
"step": 460
},
{
"epoch": 0.7544141252006421,
"grad_norm": 2.699831008911133,
"learning_rate": 3.742643124665597e-05,
"loss": 0.5658,
"step": 470
},
{
"epoch": 0.7704654895666132,
"grad_norm": 1.0824522972106934,
"learning_rate": 3.715890850722312e-05,
"loss": 0.6116,
"step": 480
},
{
"epoch": 0.7865168539325843,
"grad_norm": 2.092763900756836,
"learning_rate": 3.689138576779027e-05,
"loss": 0.5711,
"step": 490
},
{
"epoch": 0.8025682182985554,
"grad_norm": 2.2845699787139893,
"learning_rate": 3.6623863028357415e-05,
"loss": 0.5825,
"step": 500
},
{
"epoch": 0.8186195826645265,
"grad_norm": 2.034006118774414,
"learning_rate": 3.6356340288924564e-05,
"loss": 0.5857,
"step": 510
},
{
"epoch": 0.8346709470304976,
"grad_norm": 1.5809577703475952,
"learning_rate": 3.608881754949171e-05,
"loss": 0.5539,
"step": 520
},
{
"epoch": 0.8507223113964687,
"grad_norm": 2.2914111614227295,
"learning_rate": 3.582129481005886e-05,
"loss": 0.5806,
"step": 530
},
{
"epoch": 0.8667736757624398,
"grad_norm": 1.6067487001419067,
"learning_rate": 3.555377207062601e-05,
"loss": 0.5986,
"step": 540
},
{
"epoch": 0.8828250401284109,
"grad_norm": 1.690928339958191,
"learning_rate": 3.528624933119316e-05,
"loss": 0.5713,
"step": 550
},
{
"epoch": 0.898876404494382,
"grad_norm": 2.0696561336517334,
"learning_rate": 3.5018726591760305e-05,
"loss": 0.5601,
"step": 560
},
{
"epoch": 0.9149277688603531,
"grad_norm": 1.693708896636963,
"learning_rate": 3.4751203852327454e-05,
"loss": 0.5812,
"step": 570
},
{
"epoch": 0.9309791332263242,
"grad_norm": 1.5981098413467407,
"learning_rate": 3.44836811128946e-05,
"loss": 0.5699,
"step": 580
},
{
"epoch": 0.9470304975922953,
"grad_norm": 1.4630780220031738,
"learning_rate": 3.421615837346175e-05,
"loss": 0.5917,
"step": 590
},
{
"epoch": 0.9630818619582665,
"grad_norm": 1.8144547939300537,
"learning_rate": 3.394863563402889e-05,
"loss": 0.5969,
"step": 600
},
{
"epoch": 0.9791332263242376,
"grad_norm": 2.242295265197754,
"learning_rate": 3.368111289459604e-05,
"loss": 0.5524,
"step": 610
},
{
"epoch": 0.9951845906902087,
"grad_norm": 3.1214287281036377,
"learning_rate": 3.341359015516319e-05,
"loss": 0.5872,
"step": 620
},
{
"epoch": 1.0112359550561798,
"grad_norm": 1.6238343715667725,
"learning_rate": 3.314606741573034e-05,
"loss": 0.5715,
"step": 630
},
{
"epoch": 1.0272873194221508,
"grad_norm": 1.7471458911895752,
"learning_rate": 3.2878544676297486e-05,
"loss": 0.5377,
"step": 640
},
{
"epoch": 1.043338683788122,
"grad_norm": 2.1529836654663086,
"learning_rate": 3.2611021936864634e-05,
"loss": 0.5542,
"step": 650
},
{
"epoch": 1.0593900481540932,
"grad_norm": 2.278169870376587,
"learning_rate": 3.234349919743178e-05,
"loss": 0.5616,
"step": 660
},
{
"epoch": 1.0754414125200642,
"grad_norm": 2.8219568729400635,
"learning_rate": 3.207597645799893e-05,
"loss": 0.5711,
"step": 670
},
{
"epoch": 1.0914927768860354,
"grad_norm": 1.5639195442199707,
"learning_rate": 3.180845371856608e-05,
"loss": 0.5439,
"step": 680
},
{
"epoch": 1.1075441412520064,
"grad_norm": 2.146303415298462,
"learning_rate": 3.154093097913323e-05,
"loss": 0.525,
"step": 690
},
{
"epoch": 1.1235955056179776,
"grad_norm": 1.7315692901611328,
"learning_rate": 3.1273408239700376e-05,
"loss": 0.5521,
"step": 700
},
{
"epoch": 1.1396468699839486,
"grad_norm": 2.457808494567871,
"learning_rate": 3.1005885500267525e-05,
"loss": 0.5209,
"step": 710
},
{
"epoch": 1.1556982343499198,
"grad_norm": 2.4021546840667725,
"learning_rate": 3.073836276083467e-05,
"loss": 0.5056,
"step": 720
},
{
"epoch": 1.1717495987158908,
"grad_norm": 3.025860548019409,
"learning_rate": 3.047084002140182e-05,
"loss": 0.4884,
"step": 730
},
{
"epoch": 1.187800963081862,
"grad_norm": 3.1893551349639893,
"learning_rate": 3.0203317281968966e-05,
"loss": 0.5203,
"step": 740
},
{
"epoch": 1.203852327447833,
"grad_norm": 4.527679443359375,
"learning_rate": 2.9935794542536115e-05,
"loss": 0.5114,
"step": 750
},
{
"epoch": 1.2199036918138042,
"grad_norm": 3.021358013153076,
"learning_rate": 2.9668271803103263e-05,
"loss": 0.5408,
"step": 760
},
{
"epoch": 1.2359550561797752,
"grad_norm": 3.0941548347473145,
"learning_rate": 2.940074906367041e-05,
"loss": 0.5122,
"step": 770
},
{
"epoch": 1.2520064205457464,
"grad_norm": 2.5667285919189453,
"learning_rate": 2.913322632423756e-05,
"loss": 0.5192,
"step": 780
},
{
"epoch": 1.2680577849117176,
"grad_norm": 3.7231733798980713,
"learning_rate": 2.886570358480471e-05,
"loss": 0.5609,
"step": 790
},
{
"epoch": 1.2841091492776886,
"grad_norm": 1.9197113513946533,
"learning_rate": 2.8598180845371857e-05,
"loss": 0.4747,
"step": 800
},
{
"epoch": 1.3001605136436596,
"grad_norm": 2.612793207168579,
"learning_rate": 2.8330658105939005e-05,
"loss": 0.5286,
"step": 810
},
{
"epoch": 1.3162118780096308,
"grad_norm": 2.353598117828369,
"learning_rate": 2.8063135366506153e-05,
"loss": 0.535,
"step": 820
},
{
"epoch": 1.332263242375602,
"grad_norm": 2.9524009227752686,
"learning_rate": 2.7795612627073302e-05,
"loss": 0.5402,
"step": 830
},
{
"epoch": 1.348314606741573,
"grad_norm": 2.1400082111358643,
"learning_rate": 2.752808988764045e-05,
"loss": 0.5487,
"step": 840
},
{
"epoch": 1.3643659711075442,
"grad_norm": 2.116978168487549,
"learning_rate": 2.72605671482076e-05,
"loss": 0.5586,
"step": 850
},
{
"epoch": 1.3804173354735152,
"grad_norm": 2.3243775367736816,
"learning_rate": 2.6993044408774747e-05,
"loss": 0.5402,
"step": 860
},
{
"epoch": 1.3964686998394864,
"grad_norm": 2.2850890159606934,
"learning_rate": 2.6725521669341895e-05,
"loss": 0.5151,
"step": 870
},
{
"epoch": 1.4125200642054574,
"grad_norm": 2.305981397628784,
"learning_rate": 2.6457998929909044e-05,
"loss": 0.5494,
"step": 880
},
{
"epoch": 1.4285714285714286,
"grad_norm": 1.8870021104812622,
"learning_rate": 2.6190476190476192e-05,
"loss": 0.5545,
"step": 890
},
{
"epoch": 1.4446227929373996,
"grad_norm": 2.1851470470428467,
"learning_rate": 2.592295345104334e-05,
"loss": 0.5205,
"step": 900
},
{
"epoch": 1.4606741573033708,
"grad_norm": 2.7136600017547607,
"learning_rate": 2.565543071161049e-05,
"loss": 0.5427,
"step": 910
},
{
"epoch": 1.476725521669342,
"grad_norm": 3.9425771236419678,
"learning_rate": 2.5387907972177637e-05,
"loss": 0.5209,
"step": 920
},
{
"epoch": 1.492776886035313,
"grad_norm": 2.84690260887146,
"learning_rate": 2.5120385232744786e-05,
"loss": 0.5562,
"step": 930
},
{
"epoch": 1.508828250401284,
"grad_norm": 2.375824213027954,
"learning_rate": 2.485286249331193e-05,
"loss": 0.521,
"step": 940
},
{
"epoch": 1.5248796147672552,
"grad_norm": 2.241267681121826,
"learning_rate": 2.458533975387908e-05,
"loss": 0.5576,
"step": 950
},
{
"epoch": 1.5409309791332264,
"grad_norm": 2.209796190261841,
"learning_rate": 2.4317817014446228e-05,
"loss": 0.5034,
"step": 960
},
{
"epoch": 1.5569823434991974,
"grad_norm": 2.9751803874969482,
"learning_rate": 2.4050294275013376e-05,
"loss": 0.5222,
"step": 970
},
{
"epoch": 1.5730337078651684,
"grad_norm": 3.5506584644317627,
"learning_rate": 2.3782771535580524e-05,
"loss": 0.5067,
"step": 980
},
{
"epoch": 1.5890850722311396,
"grad_norm": 2.4530675411224365,
"learning_rate": 2.3515248796147673e-05,
"loss": 0.5262,
"step": 990
},
{
"epoch": 1.6051364365971108,
"grad_norm": 2.639045476913452,
"learning_rate": 2.324772605671482e-05,
"loss": 0.5396,
"step": 1000
},
{
"epoch": 1.621187800963082,
"grad_norm": 3.676542043685913,
"learning_rate": 2.298020331728197e-05,
"loss": 0.5462,
"step": 1010
},
{
"epoch": 1.637239165329053,
"grad_norm": 2.813171148300171,
"learning_rate": 2.2712680577849118e-05,
"loss": 0.5792,
"step": 1020
},
{
"epoch": 1.653290529695024,
"grad_norm": 3.5937094688415527,
"learning_rate": 2.2445157838416266e-05,
"loss": 0.527,
"step": 1030
},
{
"epoch": 1.6693418940609952,
"grad_norm": 2.3738603591918945,
"learning_rate": 2.2177635098983415e-05,
"loss": 0.5431,
"step": 1040
},
{
"epoch": 1.6853932584269664,
"grad_norm": 2.7167625427246094,
"learning_rate": 2.1910112359550563e-05,
"loss": 0.5187,
"step": 1050
},
{
"epoch": 1.7014446227929374,
"grad_norm": 2.8609092235565186,
"learning_rate": 2.164258962011771e-05,
"loss": 0.5061,
"step": 1060
},
{
"epoch": 1.7174959871589084,
"grad_norm": 3.4374756813049316,
"learning_rate": 2.137506688068486e-05,
"loss": 0.5301,
"step": 1070
},
{
"epoch": 1.7335473515248796,
"grad_norm": 2.3880395889282227,
"learning_rate": 2.1107544141252008e-05,
"loss": 0.5572,
"step": 1080
},
{
"epoch": 1.7495987158908508,
"grad_norm": 2.2175981998443604,
"learning_rate": 2.0840021401819157e-05,
"loss": 0.4862,
"step": 1090
},
{
"epoch": 1.7656500802568218,
"grad_norm": 1.8051007986068726,
"learning_rate": 2.05724986623863e-05,
"loss": 0.5108,
"step": 1100
},
{
"epoch": 1.7817014446227928,
"grad_norm": 3.5168681144714355,
"learning_rate": 2.030497592295345e-05,
"loss": 0.538,
"step": 1110
},
{
"epoch": 1.797752808988764,
"grad_norm": 2.5539281368255615,
"learning_rate": 2.00374531835206e-05,
"loss": 0.5489,
"step": 1120
},
{
"epoch": 1.8138041733547352,
"grad_norm": 1.7118955850601196,
"learning_rate": 1.9769930444087747e-05,
"loss": 0.5242,
"step": 1130
},
{
"epoch": 1.8298555377207064,
"grad_norm": 2.2016992568969727,
"learning_rate": 1.9502407704654895e-05,
"loss": 0.5172,
"step": 1140
},
{
"epoch": 1.8459069020866774,
"grad_norm": 2.072165012359619,
"learning_rate": 1.9234884965222044e-05,
"loss": 0.5216,
"step": 1150
},
{
"epoch": 1.8619582664526484,
"grad_norm": 2.446287155151367,
"learning_rate": 1.8967362225789192e-05,
"loss": 0.5391,
"step": 1160
},
{
"epoch": 1.8780096308186196,
"grad_norm": 2.6297097206115723,
"learning_rate": 1.869983948635634e-05,
"loss": 0.5046,
"step": 1170
},
{
"epoch": 1.8940609951845908,
"grad_norm": 2.429002285003662,
"learning_rate": 1.843231674692349e-05,
"loss": 0.5081,
"step": 1180
},
{
"epoch": 1.9101123595505618,
"grad_norm": 2.9561805725097656,
"learning_rate": 1.8164794007490637e-05,
"loss": 0.5335,
"step": 1190
},
{
"epoch": 1.9261637239165328,
"grad_norm": 2.9521896839141846,
"learning_rate": 1.7897271268057786e-05,
"loss": 0.5307,
"step": 1200
},
{
"epoch": 1.942215088282504,
"grad_norm": 2.409397602081299,
"learning_rate": 1.7629748528624934e-05,
"loss": 0.5014,
"step": 1210
},
{
"epoch": 1.9582664526484752,
"grad_norm": 2.046233892440796,
"learning_rate": 1.7362225789192082e-05,
"loss": 0.5637,
"step": 1220
},
{
"epoch": 1.9743178170144462,
"grad_norm": 2.7884459495544434,
"learning_rate": 1.709470304975923e-05,
"loss": 0.5512,
"step": 1230
},
{
"epoch": 1.9903691813804172,
"grad_norm": 2.0197086334228516,
"learning_rate": 1.682718031032638e-05,
"loss": 0.5112,
"step": 1240
},
{
"epoch": 2.0064205457463884,
"grad_norm": 2.801968574523926,
"learning_rate": 1.6559657570893527e-05,
"loss": 0.5084,
"step": 1250
},
{
"epoch": 2.0224719101123596,
"grad_norm": 2.099778652191162,
"learning_rate": 1.6292134831460676e-05,
"loss": 0.4558,
"step": 1260
},
{
"epoch": 2.038523274478331,
"grad_norm": 3.728285789489746,
"learning_rate": 1.6024612092027824e-05,
"loss": 0.453,
"step": 1270
},
{
"epoch": 2.0545746388443016,
"grad_norm": 3.892178773880005,
"learning_rate": 1.5757089352594973e-05,
"loss": 0.4347,
"step": 1280
},
{
"epoch": 2.070626003210273,
"grad_norm": 3.3752293586730957,
"learning_rate": 1.548956661316212e-05,
"loss": 0.4382,
"step": 1290
},
{
"epoch": 2.086677367576244,
"grad_norm": 4.784174919128418,
"learning_rate": 1.5222043873729266e-05,
"loss": 0.4416,
"step": 1300
},
{
"epoch": 2.102728731942215,
"grad_norm": 2.877718448638916,
"learning_rate": 1.4954521134296414e-05,
"loss": 0.4402,
"step": 1310
},
{
"epoch": 2.1187800963081864,
"grad_norm": 3.0553367137908936,
"learning_rate": 1.4686998394863563e-05,
"loss": 0.4508,
"step": 1320
},
{
"epoch": 2.134831460674157,
"grad_norm": 3.8250510692596436,
"learning_rate": 1.4419475655430711e-05,
"loss": 0.4422,
"step": 1330
},
{
"epoch": 2.1508828250401284,
"grad_norm": 3.143554210662842,
"learning_rate": 1.415195291599786e-05,
"loss": 0.4572,
"step": 1340
},
{
"epoch": 2.1669341894060996,
"grad_norm": 1.8287140130996704,
"learning_rate": 1.3884430176565008e-05,
"loss": 0.4287,
"step": 1350
},
{
"epoch": 2.182985553772071,
"grad_norm": 5.451256275177002,
"learning_rate": 1.3616907437132156e-05,
"loss": 0.4481,
"step": 1360
},
{
"epoch": 2.1990369181380416,
"grad_norm": 4.044301986694336,
"learning_rate": 1.3349384697699305e-05,
"loss": 0.4448,
"step": 1370
},
{
"epoch": 2.215088282504013,
"grad_norm": 4.877999782562256,
"learning_rate": 1.3081861958266453e-05,
"loss": 0.4457,
"step": 1380
},
{
"epoch": 2.231139646869984,
"grad_norm": 4.929600238800049,
"learning_rate": 1.2814339218833602e-05,
"loss": 0.432,
"step": 1390
},
{
"epoch": 2.247191011235955,
"grad_norm": 3.4969890117645264,
"learning_rate": 1.254681647940075e-05,
"loss": 0.4387,
"step": 1400
},
{
"epoch": 2.263242375601926,
"grad_norm": 4.1156086921691895,
"learning_rate": 1.2279293739967898e-05,
"loss": 0.4406,
"step": 1410
},
{
"epoch": 2.279293739967897,
"grad_norm": 4.0271148681640625,
"learning_rate": 1.2011771000535047e-05,
"loss": 0.4283,
"step": 1420
},
{
"epoch": 2.2953451043338684,
"grad_norm": 3.6957242488861084,
"learning_rate": 1.1744248261102195e-05,
"loss": 0.4424,
"step": 1430
},
{
"epoch": 2.3113964686998396,
"grad_norm": 3.1115617752075195,
"learning_rate": 1.1476725521669344e-05,
"loss": 0.4475,
"step": 1440
},
{
"epoch": 2.3274478330658104,
"grad_norm": 4.844674587249756,
"learning_rate": 1.120920278223649e-05,
"loss": 0.4459,
"step": 1450
},
{
"epoch": 2.3434991974317816,
"grad_norm": 2.712280750274658,
"learning_rate": 1.0941680042803639e-05,
"loss": 0.4219,
"step": 1460
},
{
"epoch": 2.359550561797753,
"grad_norm": 2.9259140491485596,
"learning_rate": 1.0674157303370787e-05,
"loss": 0.4824,
"step": 1470
},
{
"epoch": 2.375601926163724,
"grad_norm": 4.346639633178711,
"learning_rate": 1.0406634563937935e-05,
"loss": 0.4364,
"step": 1480
},
{
"epoch": 2.391653290529695,
"grad_norm": 2.7212588787078857,
"learning_rate": 1.0139111824505084e-05,
"loss": 0.3956,
"step": 1490
},
{
"epoch": 2.407704654895666,
"grad_norm": 4.124871253967285,
"learning_rate": 9.871589085072232e-06,
"loss": 0.483,
"step": 1500
},
{
"epoch": 2.423756019261637,
"grad_norm": 4.835409641265869,
"learning_rate": 9.60406634563938e-06,
"loss": 0.4623,
"step": 1510
},
{
"epoch": 2.4398073836276084,
"grad_norm": 2.9034523963928223,
"learning_rate": 9.336543606206529e-06,
"loss": 0.4208,
"step": 1520
},
{
"epoch": 2.4558587479935796,
"grad_norm": 2.569786787033081,
"learning_rate": 9.069020866773677e-06,
"loss": 0.4141,
"step": 1530
},
{
"epoch": 2.4719101123595504,
"grad_norm": 4.244718551635742,
"learning_rate": 8.801498127340826e-06,
"loss": 0.4583,
"step": 1540
},
{
"epoch": 2.4879614767255216,
"grad_norm": 4.004569053649902,
"learning_rate": 8.533975387907972e-06,
"loss": 0.4266,
"step": 1550
},
{
"epoch": 2.504012841091493,
"grad_norm": 4.286050796508789,
"learning_rate": 8.26645264847512e-06,
"loss": 0.4117,
"step": 1560
},
{
"epoch": 2.520064205457464,
"grad_norm": 4.992115497589111,
"learning_rate": 7.99892990904227e-06,
"loss": 0.4103,
"step": 1570
},
{
"epoch": 2.5361155698234352,
"grad_norm": 4.154786586761475,
"learning_rate": 7.731407169609418e-06,
"loss": 0.4566,
"step": 1580
},
{
"epoch": 2.552166934189406,
"grad_norm": 4.701552391052246,
"learning_rate": 7.463884430176565e-06,
"loss": 0.4534,
"step": 1590
},
{
"epoch": 2.568218298555377,
"grad_norm": 3.9032320976257324,
"learning_rate": 7.1963616907437135e-06,
"loss": 0.4586,
"step": 1600
},
{
"epoch": 2.5842696629213484,
"grad_norm": 4.401456832885742,
"learning_rate": 6.928838951310862e-06,
"loss": 0.4219,
"step": 1610
},
{
"epoch": 2.600321027287319,
"grad_norm": 3.317080497741699,
"learning_rate": 6.66131621187801e-06,
"loss": 0.4522,
"step": 1620
},
{
"epoch": 2.6163723916532904,
"grad_norm": 3.893983840942383,
"learning_rate": 6.393793472445159e-06,
"loss": 0.4208,
"step": 1630
},
{
"epoch": 2.6324237560192616,
"grad_norm": 2.585857391357422,
"learning_rate": 6.126270733012306e-06,
"loss": 0.4535,
"step": 1640
},
{
"epoch": 2.648475120385233,
"grad_norm": 2.956127405166626,
"learning_rate": 5.858747993579455e-06,
"loss": 0.4587,
"step": 1650
},
{
"epoch": 2.664526484751204,
"grad_norm": 4.7360992431640625,
"learning_rate": 5.591225254146603e-06,
"loss": 0.4317,
"step": 1660
},
{
"epoch": 2.6805778491171752,
"grad_norm": 3.546750068664551,
"learning_rate": 5.323702514713751e-06,
"loss": 0.4707,
"step": 1670
},
{
"epoch": 2.696629213483146,
"grad_norm": 3.985381841659546,
"learning_rate": 5.056179775280899e-06,
"loss": 0.448,
"step": 1680
},
{
"epoch": 2.712680577849117,
"grad_norm": 3.1598966121673584,
"learning_rate": 4.788657035848047e-06,
"loss": 0.4656,
"step": 1690
},
{
"epoch": 2.7287319422150884,
"grad_norm": 2.8233871459960938,
"learning_rate": 4.521134296415196e-06,
"loss": 0.4827,
"step": 1700
},
{
"epoch": 2.744783306581059,
"grad_norm": 3.8089215755462646,
"learning_rate": 4.253611556982344e-06,
"loss": 0.4454,
"step": 1710
},
{
"epoch": 2.7608346709470304,
"grad_norm": 3.793998956680298,
"learning_rate": 3.986088817549492e-06,
"loss": 0.4626,
"step": 1720
},
{
"epoch": 2.7768860353130016,
"grad_norm": 4.2339768409729,
"learning_rate": 3.71856607811664e-06,
"loss": 0.4368,
"step": 1730
},
{
"epoch": 2.792937399678973,
"grad_norm": 3.96157169342041,
"learning_rate": 3.4510433386837885e-06,
"loss": 0.4189,
"step": 1740
},
{
"epoch": 2.808988764044944,
"grad_norm": 3.535388708114624,
"learning_rate": 3.1835205992509364e-06,
"loss": 0.4585,
"step": 1750
},
{
"epoch": 2.825040128410915,
"grad_norm": 3.344831943511963,
"learning_rate": 2.9159978598180844e-06,
"loss": 0.4366,
"step": 1760
},
{
"epoch": 2.841091492776886,
"grad_norm": 2.6424927711486816,
"learning_rate": 2.648475120385233e-06,
"loss": 0.4465,
"step": 1770
},
{
"epoch": 2.857142857142857,
"grad_norm": 3.244215250015259,
"learning_rate": 2.3809523809523808e-06,
"loss": 0.4404,
"step": 1780
},
{
"epoch": 2.8731942215088284,
"grad_norm": 3.6791014671325684,
"learning_rate": 2.113429641519529e-06,
"loss": 0.4533,
"step": 1790
},
{
"epoch": 2.889245585874799,
"grad_norm": 6.856778144836426,
"learning_rate": 1.8459069020866775e-06,
"loss": 0.4305,
"step": 1800
},
{
"epoch": 2.9052969502407704,
"grad_norm": 5.275630474090576,
"learning_rate": 1.5783841626538255e-06,
"loss": 0.5065,
"step": 1810
},
{
"epoch": 2.9213483146067416,
"grad_norm": 4.3441619873046875,
"learning_rate": 1.310861423220974e-06,
"loss": 0.3924,
"step": 1820
},
{
"epoch": 2.937399678972713,
"grad_norm": 4.840381145477295,
"learning_rate": 1.043338683788122e-06,
"loss": 0.455,
"step": 1830
},
{
"epoch": 2.953451043338684,
"grad_norm": 4.567574501037598,
"learning_rate": 7.758159443552703e-07,
"loss": 0.4415,
"step": 1840
},
{
"epoch": 2.969502407704655,
"grad_norm": 3.6461331844329834,
"learning_rate": 5.082932049224184e-07,
"loss": 0.4363,
"step": 1850
},
{
"epoch": 2.985553772070626,
"grad_norm": 5.579707145690918,
"learning_rate": 2.407704654895666e-07,
"loss": 0.4153,
"step": 1860
}
],
"logging_steps": 10,
"max_steps": 1869,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4324368475705500.0,
"train_batch_size": 50,
"trial_name": null,
"trial_params": null
}