baohuynhbk14 committed on
Commit
539ed93
·
verified ·
1 Parent(s): 3335fcc

Model save

Browse files
Files changed (3) hide show
  1. README.md +8 -1
  2. adapter_model.safetensors +1 -1
  3. trainer_state.json +402 -70
README.md CHANGED
@@ -14,6 +14,8 @@ should probably proofread and complete it, then remove this comment. -->
14
  # miniCPM_finetune_lora_viet_vqa
15
 
16
  This model is a fine-tuned version of [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) on an unknown dataset.
 
 
17
 
18
  ## Model description
19
 
@@ -44,10 +46,15 @@ The following hyperparameters were used during training:
44
  - optimizer: Adam with betas=(0.9,0.95) and epsilon=1e-08
45
  - lr_scheduler_type: cosine
46
  - lr_scheduler_warmup_ratio: 0.01
47
- - num_epochs: 2.0
48
 
49
  ### Training results
50
 
 
 
 
 
 
51
 
52
 
53
  ### Framework versions
 
14
  # miniCPM_finetune_lora_viet_vqa
15
 
16
  This model is a fine-tuned version of [openbmb/MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) on an unknown dataset.
17
+ It achieves the following results on the evaluation set:
18
+ - Loss: 1.6850
19
 
20
  ## Model description
21
 
 
46
  - optimizer: Adam with betas=(0.9,0.95) and epsilon=1e-08
47
  - lr_scheduler_type: cosine
48
  - lr_scheduler_warmup_ratio: 0.01
49
+ - num_epochs: 5.0
50
 
51
  ### Training results
52
 
53
+ | Training Loss | Epoch | Step | Validation Loss |
54
+ |:-------------:|:------:|:----:|:---------------:|
55
+ | 2.1566 | 1.3889 | 100 | 2.0881 |
56
+ | 1.8447 | 2.7778 | 200 | 1.8452 |
57
+ | 1.7103 | 4.1667 | 300 | 1.6850 |
58
 
59
 
60
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a93cdb910600fe2b82d0ab66ee95a2a8021cbdf41aa8c903620c4ff95904923
3
  size 2140797224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80e34b9592f57d69fb2c668c84a46ee56010ad58879663e16e882641d36c912e
3
  size 2140797224
trainer_state.json CHANGED
@@ -1,225 +1,557 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.0,
5
- "eval_steps": 1000,
6
- "global_step": 144,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06944444444444445,
13
- "grad_norm": 8.71909236907959,
14
  "learning_rate": 1e-06,
15
- "loss": 2.6593,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.1388888888888889,
20
- "grad_norm": 8.853459358215332,
21
  "learning_rate": 1e-06,
22
- "loss": 2.4766,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.20833333333333334,
27
- "grad_norm": 6.577568054199219,
28
  "learning_rate": 1e-06,
29
- "loss": 2.495,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.2777777777777778,
34
- "grad_norm": 8.3203706741333,
35
  "learning_rate": 1e-06,
36
- "loss": 2.6105,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.3472222222222222,
41
- "grad_norm": 7.732483863830566,
42
  "learning_rate": 1e-06,
43
- "loss": 2.4744,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.4166666666666667,
48
- "grad_norm": 6.6942548751831055,
49
  "learning_rate": 1e-06,
50
- "loss": 2.5163,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.4861111111111111,
55
- "grad_norm": 5.781284809112549,
56
  "learning_rate": 1e-06,
57
- "loss": 2.3854,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.5555555555555556,
62
- "grad_norm": 7.311328887939453,
63
  "learning_rate": 1e-06,
64
- "loss": 2.4442,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.625,
69
- "grad_norm": 6.254249572753906,
70
  "learning_rate": 1e-06,
71
- "loss": 2.2468,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.6944444444444444,
76
- "grad_norm": 7.6778669357299805,
77
  "learning_rate": 1e-06,
78
- "loss": 2.2565,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.7638888888888888,
83
- "grad_norm": 7.495645523071289,
84
  "learning_rate": 1e-06,
85
- "loss": 2.4391,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.8333333333333334,
90
- "grad_norm": 5.458991527557373,
91
  "learning_rate": 1e-06,
92
- "loss": 2.2362,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.9027777777777778,
97
- "grad_norm": 5.659170627593994,
98
  "learning_rate": 1e-06,
99
- "loss": 2.2188,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.9722222222222222,
104
- "grad_norm": 4.8217997550964355,
105
  "learning_rate": 1e-06,
106
- "loss": 2.154,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 1.0416666666666667,
111
- "grad_norm": 6.693627834320068,
112
  "learning_rate": 1e-06,
113
- "loss": 2.1182,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 1.1111111111111112,
118
- "grad_norm": 6.67255973815918,
119
  "learning_rate": 1e-06,
120
- "loss": 2.1879,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 1.1805555555555556,
125
- "grad_norm": 4.831326007843018,
126
  "learning_rate": 1e-06,
127
- "loss": 2.1077,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 1.25,
132
- "grad_norm": 4.830414295196533,
133
  "learning_rate": 1e-06,
134
- "loss": 2.026,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 1.3194444444444444,
139
- "grad_norm": 5.039080619812012,
140
  "learning_rate": 1e-06,
141
- "loss": 2.0585,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 1.3888888888888888,
146
- "grad_norm": 5.749475002288818,
147
  "learning_rate": 1e-06,
148
- "loss": 2.0486,
 
 
 
 
 
 
 
 
149
  "step": 100
150
  },
151
  {
152
  "epoch": 1.4583333333333333,
153
- "grad_norm": 5.0571770668029785,
154
  "learning_rate": 1e-06,
155
- "loss": 1.9616,
156
  "step": 105
157
  },
158
  {
159
  "epoch": 1.5277777777777777,
160
- "grad_norm": 4.597809314727783,
161
  "learning_rate": 1e-06,
162
- "loss": 1.9063,
163
  "step": 110
164
  },
165
  {
166
  "epoch": 1.5972222222222223,
167
- "grad_norm": 5.453224182128906,
168
  "learning_rate": 1e-06,
169
- "loss": 2.1802,
170
  "step": 115
171
  },
172
  {
173
  "epoch": 1.6666666666666665,
174
- "grad_norm": 4.519564628601074,
175
  "learning_rate": 1e-06,
176
- "loss": 2.2049,
177
  "step": 120
178
  },
179
  {
180
  "epoch": 1.7361111111111112,
181
- "grad_norm": 4.976806163787842,
182
  "learning_rate": 1e-06,
183
- "loss": 1.8881,
184
  "step": 125
185
  },
186
  {
187
  "epoch": 1.8055555555555556,
188
- "grad_norm": 4.543058395385742,
189
  "learning_rate": 1e-06,
190
- "loss": 2.0673,
191
  "step": 130
192
  },
193
  {
194
  "epoch": 1.875,
195
- "grad_norm": 4.89597225189209,
196
  "learning_rate": 1e-06,
197
- "loss": 2.0158,
198
  "step": 135
199
  },
200
  {
201
  "epoch": 1.9444444444444444,
202
- "grad_norm": 4.95186185836792,
203
  "learning_rate": 1e-06,
204
- "loss": 2.0052,
205
  "step": 140
206
  },
207
  {
208
- "epoch": 2.0,
209
- "step": 144,
210
- "total_flos": 9.601343958286336e+16,
211
- "train_loss": 2.2213990655210285,
212
- "train_runtime": 2501.8045,
213
- "train_samples_per_second": 0.921,
214
- "train_steps_per_second": 0.058
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  }
216
  ],
217
  "logging_steps": 5,
218
- "max_steps": 144,
219
  "num_input_tokens_seen": 0,
220
- "num_train_epochs": 2,
221
- "save_steps": 200,
222
- "total_flos": 9.601343958286336e+16,
223
  "train_batch_size": 4,
224
  "trial_name": null,
225
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 100,
6
+ "global_step": 360,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06944444444444445,
13
+ "grad_norm": 6.3636603355407715,
14
  "learning_rate": 1e-06,
15
+ "loss": 2.6727,
16
  "step": 5
17
  },
18
  {
19
  "epoch": 0.1388888888888889,
20
+ "grad_norm": 7.486879825592041,
21
  "learning_rate": 1e-06,
22
+ "loss": 2.3642,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.20833333333333334,
27
+ "grad_norm": 6.5991997718811035,
28
  "learning_rate": 1e-06,
29
+ "loss": 2.515,
30
  "step": 15
31
  },
32
  {
33
  "epoch": 0.2777777777777778,
34
+ "grad_norm": 7.580630779266357,
35
  "learning_rate": 1e-06,
36
+ "loss": 2.3997,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.3472222222222222,
41
+ "grad_norm": 7.5727410316467285,
42
  "learning_rate": 1e-06,
43
+ "loss": 2.4551,
44
  "step": 25
45
  },
46
  {
47
  "epoch": 0.4166666666666667,
48
+ "grad_norm": 8.835946083068848,
49
  "learning_rate": 1e-06,
50
+ "loss": 2.4476,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.4861111111111111,
55
+ "grad_norm": 7.495606899261475,
56
  "learning_rate": 1e-06,
57
+ "loss": 2.4191,
58
  "step": 35
59
  },
60
  {
61
  "epoch": 0.5555555555555556,
62
+ "grad_norm": 8.057035446166992,
63
  "learning_rate": 1e-06,
64
+ "loss": 2.441,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.625,
69
+ "grad_norm": 6.828744411468506,
70
  "learning_rate": 1e-06,
71
+ "loss": 2.3052,
72
  "step": 45
73
  },
74
  {
75
  "epoch": 0.6944444444444444,
76
+ "grad_norm": 7.163251876831055,
77
  "learning_rate": 1e-06,
78
+ "loss": 2.1357,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.7638888888888888,
83
+ "grad_norm": 5.414941787719727,
84
  "learning_rate": 1e-06,
85
+ "loss": 2.2248,
86
  "step": 55
87
  },
88
  {
89
  "epoch": 0.8333333333333334,
90
+ "grad_norm": 6.0801544189453125,
91
  "learning_rate": 1e-06,
92
+ "loss": 2.2934,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.9027777777777778,
97
+ "grad_norm": 6.054081439971924,
98
  "learning_rate": 1e-06,
99
+ "loss": 2.3014,
100
  "step": 65
101
  },
102
  {
103
  "epoch": 0.9722222222222222,
104
+ "grad_norm": 5.827741622924805,
105
  "learning_rate": 1e-06,
106
+ "loss": 2.2515,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 1.0416666666666667,
111
+ "grad_norm": 3.5676162242889404,
112
  "learning_rate": 1e-06,
113
+ "loss": 2.0915,
114
  "step": 75
115
  },
116
  {
117
  "epoch": 1.1111111111111112,
118
+ "grad_norm": 5.15900993347168,
119
  "learning_rate": 1e-06,
120
+ "loss": 2.0749,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 1.1805555555555556,
125
+ "grad_norm": 5.206437110900879,
126
  "learning_rate": 1e-06,
127
+ "loss": 2.0539,
128
  "step": 85
129
  },
130
  {
131
  "epoch": 1.25,
132
+ "grad_norm": 5.990969657897949,
133
  "learning_rate": 1e-06,
134
+ "loss": 2.1308,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 1.3194444444444444,
139
+ "grad_norm": 6.198008060455322,
140
  "learning_rate": 1e-06,
141
+ "loss": 2.3256,
142
  "step": 95
143
  },
144
  {
145
  "epoch": 1.3888888888888888,
146
+ "grad_norm": 5.184628486633301,
147
  "learning_rate": 1e-06,
148
+ "loss": 2.1566,
149
+ "step": 100
150
+ },
151
+ {
152
+ "epoch": 1.3888888888888888,
153
+ "eval_loss": 2.0880796909332275,
154
+ "eval_runtime": 34.0667,
155
+ "eval_samples_per_second": 2.935,
156
+ "eval_steps_per_second": 0.734,
157
  "step": 100
158
  },
159
  {
160
  "epoch": 1.4583333333333333,
161
+ "grad_norm": 5.412724494934082,
162
  "learning_rate": 1e-06,
163
+ "loss": 1.9085,
164
  "step": 105
165
  },
166
  {
167
  "epoch": 1.5277777777777777,
168
+ "grad_norm": 3.459959030151367,
169
  "learning_rate": 1e-06,
170
+ "loss": 1.9494,
171
  "step": 110
172
  },
173
  {
174
  "epoch": 1.5972222222222223,
175
+ "grad_norm": 5.159445762634277,
176
  "learning_rate": 1e-06,
177
+ "loss": 1.9334,
178
  "step": 115
179
  },
180
  {
181
  "epoch": 1.6666666666666665,
182
+ "grad_norm": 5.133082389831543,
183
  "learning_rate": 1e-06,
184
+ "loss": 2.0826,
185
  "step": 120
186
  },
187
  {
188
  "epoch": 1.7361111111111112,
189
+ "grad_norm": 4.473026752471924,
190
  "learning_rate": 1e-06,
191
+ "loss": 2.0585,
192
  "step": 125
193
  },
194
  {
195
  "epoch": 1.8055555555555556,
196
+ "grad_norm": 5.063863754272461,
197
  "learning_rate": 1e-06,
198
+ "loss": 2.1289,
199
  "step": 130
200
  },
201
  {
202
  "epoch": 1.875,
203
+ "grad_norm": 4.927737236022949,
204
  "learning_rate": 1e-06,
205
+ "loss": 1.9872,
206
  "step": 135
207
  },
208
  {
209
  "epoch": 1.9444444444444444,
210
+ "grad_norm": 5.563902854919434,
211
  "learning_rate": 1e-06,
212
+ "loss": 1.9803,
213
  "step": 140
214
  },
215
  {
216
+ "epoch": 2.013888888888889,
217
+ "grad_norm": 3.901442050933838,
218
+ "learning_rate": 1e-06,
219
+ "loss": 1.8309,
220
+ "step": 145
221
+ },
222
+ {
223
+ "epoch": 2.0833333333333335,
224
+ "grad_norm": 3.771136999130249,
225
+ "learning_rate": 1e-06,
226
+ "loss": 1.7758,
227
+ "step": 150
228
+ },
229
+ {
230
+ "epoch": 2.1527777777777777,
231
+ "grad_norm": 4.6159257888793945,
232
+ "learning_rate": 1e-06,
233
+ "loss": 1.9193,
234
+ "step": 155
235
+ },
236
+ {
237
+ "epoch": 2.2222222222222223,
238
+ "grad_norm": 3.758843183517456,
239
+ "learning_rate": 1e-06,
240
+ "loss": 1.9329,
241
+ "step": 160
242
+ },
243
+ {
244
+ "epoch": 2.2916666666666665,
245
+ "grad_norm": 4.267579078674316,
246
+ "learning_rate": 1e-06,
247
+ "loss": 2.0399,
248
+ "step": 165
249
+ },
250
+ {
251
+ "epoch": 2.361111111111111,
252
+ "grad_norm": 3.9819560050964355,
253
+ "learning_rate": 1e-06,
254
+ "loss": 1.9568,
255
+ "step": 170
256
+ },
257
+ {
258
+ "epoch": 2.4305555555555554,
259
+ "grad_norm": 3.8918192386627197,
260
+ "learning_rate": 1e-06,
261
+ "loss": 1.7377,
262
+ "step": 175
263
+ },
264
+ {
265
+ "epoch": 2.5,
266
+ "grad_norm": 3.9746928215026855,
267
+ "learning_rate": 1e-06,
268
+ "loss": 1.8949,
269
+ "step": 180
270
+ },
271
+ {
272
+ "epoch": 2.5694444444444446,
273
+ "grad_norm": 3.328784704208374,
274
+ "learning_rate": 1e-06,
275
+ "loss": 1.6509,
276
+ "step": 185
277
+ },
278
+ {
279
+ "epoch": 2.638888888888889,
280
+ "grad_norm": 3.835324287414551,
281
+ "learning_rate": 1e-06,
282
+ "loss": 1.8321,
283
+ "step": 190
284
+ },
285
+ {
286
+ "epoch": 2.7083333333333335,
287
+ "grad_norm": 3.3603885173797607,
288
+ "learning_rate": 1e-06,
289
+ "loss": 1.8628,
290
+ "step": 195
291
+ },
292
+ {
293
+ "epoch": 2.7777777777777777,
294
+ "grad_norm": 3.7577502727508545,
295
+ "learning_rate": 1e-06,
296
+ "loss": 1.8447,
297
+ "step": 200
298
+ },
299
+ {
300
+ "epoch": 2.7777777777777777,
301
+ "eval_loss": 1.8452154397964478,
302
+ "eval_runtime": 34.0911,
303
+ "eval_samples_per_second": 2.933,
304
+ "eval_steps_per_second": 0.733,
305
+ "step": 200
306
+ },
307
+ {
308
+ "epoch": 2.8472222222222223,
309
+ "grad_norm": 4.379385948181152,
310
+ "learning_rate": 1e-06,
311
+ "loss": 1.8212,
312
+ "step": 205
313
+ },
314
+ {
315
+ "epoch": 2.9166666666666665,
316
+ "grad_norm": 3.7095022201538086,
317
+ "learning_rate": 1e-06,
318
+ "loss": 1.7862,
319
+ "step": 210
320
+ },
321
+ {
322
+ "epoch": 2.986111111111111,
323
+ "grad_norm": 4.164438724517822,
324
+ "learning_rate": 1e-06,
325
+ "loss": 1.8046,
326
+ "step": 215
327
+ },
328
+ {
329
+ "epoch": 3.0555555555555554,
330
+ "grad_norm": 3.6749582290649414,
331
+ "learning_rate": 1e-06,
332
+ "loss": 1.6358,
333
+ "step": 220
334
+ },
335
+ {
336
+ "epoch": 3.125,
337
+ "grad_norm": 3.7247958183288574,
338
+ "learning_rate": 1e-06,
339
+ "loss": 1.791,
340
+ "step": 225
341
+ },
342
+ {
343
+ "epoch": 3.1944444444444446,
344
+ "grad_norm": 2.9533472061157227,
345
+ "learning_rate": 1e-06,
346
+ "loss": 1.6251,
347
+ "step": 230
348
+ },
349
+ {
350
+ "epoch": 3.263888888888889,
351
+ "grad_norm": 4.062502384185791,
352
+ "learning_rate": 1e-06,
353
+ "loss": 1.6976,
354
+ "step": 235
355
+ },
356
+ {
357
+ "epoch": 3.3333333333333335,
358
+ "grad_norm": 4.328882217407227,
359
+ "learning_rate": 1e-06,
360
+ "loss": 1.8438,
361
+ "step": 240
362
+ },
363
+ {
364
+ "epoch": 3.4027777777777777,
365
+ "grad_norm": 4.158596038818359,
366
+ "learning_rate": 1e-06,
367
+ "loss": 1.8998,
368
+ "step": 245
369
+ },
370
+ {
371
+ "epoch": 3.4722222222222223,
372
+ "grad_norm": 5.7752556800842285,
373
+ "learning_rate": 1e-06,
374
+ "loss": 1.7517,
375
+ "step": 250
376
+ },
377
+ {
378
+ "epoch": 3.5416666666666665,
379
+ "grad_norm": 4.568635940551758,
380
+ "learning_rate": 1e-06,
381
+ "loss": 1.6835,
382
+ "step": 255
383
+ },
384
+ {
385
+ "epoch": 3.611111111111111,
386
+ "grad_norm": 3.6611974239349365,
387
+ "learning_rate": 1e-06,
388
+ "loss": 1.7852,
389
+ "step": 260
390
+ },
391
+ {
392
+ "epoch": 3.6805555555555554,
393
+ "grad_norm": 4.026912212371826,
394
+ "learning_rate": 1e-06,
395
+ "loss": 1.7916,
396
+ "step": 265
397
+ },
398
+ {
399
+ "epoch": 3.75,
400
+ "grad_norm": 4.750195026397705,
401
+ "learning_rate": 1e-06,
402
+ "loss": 1.7584,
403
+ "step": 270
404
+ },
405
+ {
406
+ "epoch": 3.8194444444444446,
407
+ "grad_norm": 3.936798572540283,
408
+ "learning_rate": 1e-06,
409
+ "loss": 1.5877,
410
+ "step": 275
411
+ },
412
+ {
413
+ "epoch": 3.888888888888889,
414
+ "grad_norm": 4.1127800941467285,
415
+ "learning_rate": 1e-06,
416
+ "loss": 1.5392,
417
+ "step": 280
418
+ },
419
+ {
420
+ "epoch": 3.9583333333333335,
421
+ "grad_norm": 3.6437580585479736,
422
+ "learning_rate": 1e-06,
423
+ "loss": 1.6125,
424
+ "step": 285
425
+ },
426
+ {
427
+ "epoch": 4.027777777777778,
428
+ "grad_norm": 3.641177177429199,
429
+ "learning_rate": 1e-06,
430
+ "loss": 1.687,
431
+ "step": 290
432
+ },
433
+ {
434
+ "epoch": 4.097222222222222,
435
+ "grad_norm": 3.797327995300293,
436
+ "learning_rate": 1e-06,
437
+ "loss": 1.7779,
438
+ "step": 295
439
+ },
440
+ {
441
+ "epoch": 4.166666666666667,
442
+ "grad_norm": 5.071943283081055,
443
+ "learning_rate": 1e-06,
444
+ "loss": 1.7103,
445
+ "step": 300
446
+ },
447
+ {
448
+ "epoch": 4.166666666666667,
449
+ "eval_loss": 1.6850143671035767,
450
+ "eval_runtime": 34.4694,
451
+ "eval_samples_per_second": 2.901,
452
+ "eval_steps_per_second": 0.725,
453
+ "step": 300
454
+ },
455
+ {
456
+ "epoch": 4.236111111111111,
457
+ "grad_norm": 6.09140682220459,
458
+ "learning_rate": 1e-06,
459
+ "loss": 1.6347,
460
+ "step": 305
461
+ },
462
+ {
463
+ "epoch": 4.305555555555555,
464
+ "grad_norm": 5.452902317047119,
465
+ "learning_rate": 1e-06,
466
+ "loss": 1.7689,
467
+ "step": 310
468
+ },
469
+ {
470
+ "epoch": 4.375,
471
+ "grad_norm": 3.5834009647369385,
472
+ "learning_rate": 1e-06,
473
+ "loss": 1.6514,
474
+ "step": 315
475
+ },
476
+ {
477
+ "epoch": 4.444444444444445,
478
+ "grad_norm": 3.288220167160034,
479
+ "learning_rate": 1e-06,
480
+ "loss": 1.4941,
481
+ "step": 320
482
+ },
483
+ {
484
+ "epoch": 4.513888888888889,
485
+ "grad_norm": 4.202756404876709,
486
+ "learning_rate": 1e-06,
487
+ "loss": 1.5374,
488
+ "step": 325
489
+ },
490
+ {
491
+ "epoch": 4.583333333333333,
492
+ "grad_norm": 3.9757556915283203,
493
+ "learning_rate": 1e-06,
494
+ "loss": 1.6289,
495
+ "step": 330
496
+ },
497
+ {
498
+ "epoch": 4.652777777777778,
499
+ "grad_norm": 3.3575947284698486,
500
+ "learning_rate": 1e-06,
501
+ "loss": 1.5446,
502
+ "step": 335
503
+ },
504
+ {
505
+ "epoch": 4.722222222222222,
506
+ "grad_norm": 4.207667350769043,
507
+ "learning_rate": 1e-06,
508
+ "loss": 1.5668,
509
+ "step": 340
510
+ },
511
+ {
512
+ "epoch": 4.791666666666667,
513
+ "grad_norm": 3.2263221740722656,
514
+ "learning_rate": 1e-06,
515
+ "loss": 1.4529,
516
+ "step": 345
517
+ },
518
+ {
519
+ "epoch": 4.861111111111111,
520
+ "grad_norm": 3.272395610809326,
521
+ "learning_rate": 1e-06,
522
+ "loss": 1.5215,
523
+ "step": 350
524
+ },
525
+ {
526
+ "epoch": 4.930555555555555,
527
+ "grad_norm": 3.4315106868743896,
528
+ "learning_rate": 1e-06,
529
+ "loss": 1.5781,
530
+ "step": 355
531
+ },
532
+ {
533
+ "epoch": 5.0,
534
+ "grad_norm": 3.9581406116485596,
535
+ "learning_rate": 1e-06,
536
+ "loss": 1.5001,
537
+ "step": 360
538
+ },
539
+ {
540
+ "epoch": 5.0,
541
+ "step": 360,
542
+ "total_flos": 2.3797808143060173e+17,
543
+ "train_loss": 1.9143991947174073,
544
+ "train_runtime": 6464.4185,
545
+ "train_samples_per_second": 0.891,
546
+ "train_steps_per_second": 0.056
547
  }
548
  ],
549
  "logging_steps": 5,
550
+ "max_steps": 360,
551
  "num_input_tokens_seen": 0,
552
+ "num_train_epochs": 5,
553
+ "save_steps": 100,
554
+ "total_flos": 2.3797808143060173e+17,
555
  "train_batch_size": 4,
556
  "trial_name": null,
557
  "trial_params": null