agentlans committed on
Commit
5af5012
·
verified ·
1 Parent(s): ce9cc67

Upload 13 files

Browse files
README.md CHANGED
@@ -5,19 +5,19 @@ base_model: agentlans/deberta-v3-base-zyda-2
5
  tags:
6
  - generated_from_trainer
7
  model-index:
8
- - name: deberta-v3-base-zyda-2-readability
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
- # deberta-v3-base-zyda-2-readability
16
 
17
  This model is a fine-tuned version of [agentlans/deberta-v3-base-zyda-2](https://huggingface.co/agentlans/deberta-v3-base-zyda-2) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.7407
20
- - Mse: 0.7407
21
 
22
  ## Model description
23
 
@@ -48,9 +48,9 @@ The following hyperparameters were used during training:
48
 
49
  | Training Loss | Epoch | Step | Validation Loss | Mse |
50
  |:-------------:|:-----:|:-----:|:---------------:|:------:|
51
- | 0.6839 | 1.0 | 13589 | 1.0938 | 1.0938 |
52
- | 0.5281 | 2.0 | 27178 | 0.7477 | 0.7477 |
53
- | 0.4484 | 3.0 | 40767 | 0.7407 | 0.7407 |
54
 
55
 
56
  ### Framework versions
 
5
  tags:
6
  - generated_from_trainer
7
  model-index:
8
+ - name: deberta-v3-base-zyda-2-transformed-readability
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
+ # deberta-v3-base-zyda-2-transformed-readability
16
 
17
  This model is a fine-tuned version of [agentlans/deberta-v3-base-zyda-2](https://huggingface.co/agentlans/deberta-v3-base-zyda-2) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.0267
20
+ - Mse: 0.0267
21
 
22
  ## Model description
23
 
 
48
 
49
  | Training Loss | Epoch | Step | Validation Loss | Mse |
50
  |:-------------:|:-----:|:-----:|:---------------:|:------:|
51
+ | 0.0288 | 1.0 | 13589 | 0.0286 | 0.0286 |
52
+ | 0.023 | 2.0 | 27178 | 0.0272 | 0.0272 |
53
+ | 0.0189 | 3.0 | 40767 | 0.0267 | 0.0267 |
54
 
55
 
56
  ### Framework versions
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.7406834363937378,
4
- "eval_mse": 0.7406833936714095,
5
- "eval_runtime": 76.9379,
6
  "eval_samples": 50000,
7
- "eval_samples_per_second": 649.875,
8
- "eval_steps_per_second": 81.234,
9
  "total_flos": 1.7161499914378214e+17,
10
- "train_loss": 0.702953228300284,
11
- "train_runtime": 12856.1609,
12
  "train_samples": 869663,
13
- "train_samples_per_second": 202.937,
14
- "train_steps_per_second": 3.171
15
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.0266929492354393,
4
+ "eval_mse": 0.026692949063357767,
5
+ "eval_runtime": 80.8621,
6
  "eval_samples": 50000,
7
+ "eval_samples_per_second": 618.336,
8
+ "eval_steps_per_second": 77.292,
9
  "total_flos": 1.7161499914378214e+17,
10
+ "train_loss": 0.026733717791019525,
11
+ "train_runtime": 12981.5942,
12
  "train_samples": 869663,
13
+ "train_samples_per_second": 200.976,
14
+ "train_steps_per_second": 3.14
15
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_loss": 0.7406834363937378,
4
- "eval_mse": 0.7406833936714095,
5
- "eval_runtime": 76.9379,
6
  "eval_samples": 50000,
7
- "eval_samples_per_second": 649.875,
8
- "eval_steps_per_second": 81.234
9
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_loss": 0.0266929492354393,
4
+ "eval_mse": 0.026692949063357767,
5
+ "eval_runtime": 80.8621,
6
  "eval_samples": 50000,
7
+ "eval_samples_per_second": 618.336,
8
+ "eval_steps_per_second": 77.292
9
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:53bf4ffe628aa3961a9fdab80547f51c481cda969333051ab370f1d027ee347b
3
  size 737716196
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd7ee0b4f3b36635420950278424e552045bef8f1ac02256a55d5a26b486a5ba
3
  size 737716196
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 1.7161499914378214e+17,
4
- "train_loss": 0.702953228300284,
5
- "train_runtime": 12856.1609,
6
  "train_samples": 869663,
7
- "train_samples_per_second": 202.937,
8
- "train_steps_per_second": 3.171
9
  }
 
1
  {
2
  "epoch": 3.0,
3
  "total_flos": 1.7161499914378214e+17,
4
+ "train_loss": 0.026733717791019525,
5
+ "train_runtime": 12981.5942,
6
  "train_samples": 869663,
7
+ "train_samples_per_second": 200.976,
8
+ "train_steps_per_second": 3.14
9
  }
trainer_state.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "best_metric": 0.7406834363937378,
3
- "best_model_checkpoint": "deberta-v3-base-zyda-2-readability/checkpoint-40767",
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
  "global_step": 40767,
@@ -10,606 +10,606 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.03679446611229671,
13
- "grad_norm": 22.72686767578125,
14
  "learning_rate": 4.9386758898128385e-05,
15
- "loss": 6.5572,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 0.07358893222459342,
20
- "grad_norm": 42.41002655029297,
21
  "learning_rate": 4.877351779625678e-05,
22
- "loss": 1.3129,
23
  "step": 1000
24
  },
25
  {
26
  "epoch": 0.11038339833689013,
27
- "grad_norm": 13.57180404663086,
28
  "learning_rate": 4.8160276694385164e-05,
29
- "loss": 1.099,
30
  "step": 1500
31
  },
32
  {
33
  "epoch": 0.14717786444918685,
34
- "grad_norm": 15.051002502441406,
35
  "learning_rate": 4.754703559251355e-05,
36
- "loss": 0.9744,
37
  "step": 2000
38
  },
39
  {
40
  "epoch": 0.18397233056148354,
41
- "grad_norm": 20.83298683166504,
42
  "learning_rate": 4.693379449064194e-05,
43
- "loss": 0.9412,
44
  "step": 2500
45
  },
46
  {
47
  "epoch": 0.22076679667378027,
48
- "grad_norm": 20.301353454589844,
49
  "learning_rate": 4.632055338877033e-05,
50
- "loss": 0.8845,
51
  "step": 3000
52
  },
53
  {
54
  "epoch": 0.257561262786077,
55
- "grad_norm": 4.867860317230225,
56
  "learning_rate": 4.570731228689872e-05,
57
- "loss": 0.8894,
58
  "step": 3500
59
  },
60
  {
61
  "epoch": 0.2943557288983737,
62
- "grad_norm": 11.473809242248535,
63
  "learning_rate": 4.509407118502711e-05,
64
- "loss": 0.8605,
65
  "step": 4000
66
  },
67
  {
68
  "epoch": 0.3311501950106704,
69
- "grad_norm": 47.68638610839844,
70
  "learning_rate": 4.448083008315549e-05,
71
- "loss": 1.2506,
72
  "step": 4500
73
  },
74
  {
75
  "epoch": 0.3679446611229671,
76
- "grad_norm": 17.02402687072754,
77
  "learning_rate": 4.386758898128389e-05,
78
- "loss": 1.0712,
79
  "step": 5000
80
  },
81
  {
82
  "epoch": 0.40473912723526384,
83
- "grad_norm": 13.133755683898926,
84
  "learning_rate": 4.325434787941227e-05,
85
- "loss": 0.7953,
86
  "step": 5500
87
  },
88
  {
89
  "epoch": 0.44153359334756054,
90
- "grad_norm": 11.83011245727539,
91
  "learning_rate": 4.264110677754066e-05,
92
- "loss": 0.7575,
93
  "step": 6000
94
  },
95
  {
96
  "epoch": 0.47832805945985724,
97
- "grad_norm": 24.706212997436523,
98
  "learning_rate": 4.202786567566905e-05,
99
- "loss": 0.7491,
100
  "step": 6500
101
  },
102
  {
103
  "epoch": 0.515122525572154,
104
- "grad_norm": 16.181684494018555,
105
  "learning_rate": 4.141462457379743e-05,
106
- "loss": 1.0325,
107
  "step": 7000
108
  },
109
  {
110
  "epoch": 0.5519169916844506,
111
- "grad_norm": 14.158419609069824,
112
  "learning_rate": 4.080138347192582e-05,
113
- "loss": 1.082,
114
  "step": 7500
115
  },
116
  {
117
  "epoch": 0.5887114577967474,
118
- "grad_norm": 34.472206115722656,
119
  "learning_rate": 4.018814237005421e-05,
120
- "loss": 1.2578,
121
  "step": 8000
122
  },
123
  {
124
  "epoch": 0.625505923909044,
125
- "grad_norm": 18.01128578186035,
126
  "learning_rate": 3.95749012681826e-05,
127
- "loss": 0.76,
128
  "step": 8500
129
  },
130
  {
131
  "epoch": 0.6623003900213408,
132
- "grad_norm": 9.049446105957031,
133
  "learning_rate": 3.896166016631099e-05,
134
- "loss": 0.7493,
135
  "step": 9000
136
  },
137
  {
138
  "epoch": 0.6990948561336375,
139
- "grad_norm": 112.888427734375,
140
  "learning_rate": 3.834841906443938e-05,
141
- "loss": 0.7287,
142
  "step": 9500
143
  },
144
  {
145
  "epoch": 0.7358893222459342,
146
- "grad_norm": 55.44353485107422,
147
  "learning_rate": 3.773517796256776e-05,
148
- "loss": 0.7615,
149
  "step": 10000
150
  },
151
  {
152
  "epoch": 0.7726837883582309,
153
- "grad_norm": 18.046951293945312,
154
  "learning_rate": 3.712193686069616e-05,
155
- "loss": 0.7182,
156
  "step": 10500
157
  },
158
  {
159
  "epoch": 0.8094782544705277,
160
- "grad_norm": 6.221084117889404,
161
  "learning_rate": 3.650869575882454e-05,
162
- "loss": 0.6961,
163
  "step": 11000
164
  },
165
  {
166
  "epoch": 0.8462727205828243,
167
- "grad_norm": 19.79126739501953,
168
  "learning_rate": 3.589545465695293e-05,
169
- "loss": 0.6857,
170
  "step": 11500
171
  },
172
  {
173
  "epoch": 0.8830671866951211,
174
- "grad_norm": 11.714865684509277,
175
  "learning_rate": 3.528221355508132e-05,
176
- "loss": 0.6733,
177
  "step": 12000
178
  },
179
  {
180
  "epoch": 0.9198616528074177,
181
- "grad_norm": 19.167461395263672,
182
  "learning_rate": 3.466897245320971e-05,
183
- "loss": 0.6798,
184
  "step": 12500
185
  },
186
  {
187
  "epoch": 0.9566561189197145,
188
- "grad_norm": 9.230766296386719,
189
  "learning_rate": 3.405573135133809e-05,
190
- "loss": 0.6825,
191
  "step": 13000
192
  },
193
  {
194
  "epoch": 0.9934505850320112,
195
- "grad_norm": 13.260143280029297,
196
  "learning_rate": 3.344249024946648e-05,
197
- "loss": 0.6839,
198
  "step": 13500
199
  },
200
  {
201
  "epoch": 1.0,
202
- "eval_loss": 1.0938260555267334,
203
- "eval_mse": 1.0938261270842415,
204
- "eval_runtime": 82.8373,
205
- "eval_samples_per_second": 603.593,
206
- "eval_steps_per_second": 75.449,
207
  "step": 13589
208
  },
209
  {
210
  "epoch": 1.030245051144308,
211
- "grad_norm": 7.3761396408081055,
212
  "learning_rate": 3.282924914759487e-05,
213
- "loss": 0.6166,
214
  "step": 14000
215
  },
216
  {
217
  "epoch": 1.0670395172566045,
218
- "grad_norm": 3.998417854309082,
219
  "learning_rate": 3.221600804572326e-05,
220
- "loss": 0.5892,
221
  "step": 14500
222
  },
223
  {
224
  "epoch": 1.1038339833689013,
225
- "grad_norm": 7.822746753692627,
226
  "learning_rate": 3.160276694385165e-05,
227
- "loss": 0.5916,
228
  "step": 15000
229
  },
230
  {
231
  "epoch": 1.140628449481198,
232
- "grad_norm": 5.118077278137207,
233
  "learning_rate": 3.098952584198003e-05,
234
- "loss": 0.593,
235
  "step": 15500
236
  },
237
  {
238
  "epoch": 1.1774229155934948,
239
- "grad_norm": 6.34403133392334,
240
  "learning_rate": 3.0376284740108423e-05,
241
- "loss": 0.6,
242
  "step": 16000
243
  },
244
  {
245
  "epoch": 1.2142173817057915,
246
- "grad_norm": 28.41265296936035,
247
  "learning_rate": 2.976304363823681e-05,
248
- "loss": 0.589,
249
  "step": 16500
250
  },
251
  {
252
  "epoch": 1.2510118478180883,
253
- "grad_norm": 28.579113006591797,
254
  "learning_rate": 2.91498025363652e-05,
255
- "loss": 0.5984,
256
  "step": 17000
257
  },
258
  {
259
  "epoch": 1.2878063139303848,
260
- "grad_norm": 8.48458480834961,
261
  "learning_rate": 2.8536561434493587e-05,
262
- "loss": 0.5795,
263
  "step": 17500
264
  },
265
  {
266
  "epoch": 1.3246007800426816,
267
- "grad_norm": 15.21429443359375,
268
  "learning_rate": 2.7923320332621977e-05,
269
- "loss": 0.587,
270
  "step": 18000
271
  },
272
  {
273
  "epoch": 1.3613952461549783,
274
- "grad_norm": 5.001615047454834,
275
  "learning_rate": 2.7310079230750363e-05,
276
- "loss": 0.592,
277
  "step": 18500
278
  },
279
  {
280
  "epoch": 1.398189712267275,
281
- "grad_norm": 14.064582824707031,
282
  "learning_rate": 2.6696838128878755e-05,
283
- "loss": 0.5892,
284
  "step": 19000
285
  },
286
  {
287
  "epoch": 1.4349841783795716,
288
- "grad_norm": 4.501893997192383,
289
  "learning_rate": 2.6083597027007138e-05,
290
- "loss": 0.5849,
291
  "step": 19500
292
  },
293
  {
294
  "epoch": 1.4717786444918683,
295
- "grad_norm": 15.239348411560059,
296
  "learning_rate": 2.5470355925135524e-05,
297
- "loss": 0.5799,
298
  "step": 20000
299
  },
300
  {
301
  "epoch": 1.508573110604165,
302
- "grad_norm": 19.09524154663086,
303
  "learning_rate": 2.4857114823263916e-05,
304
- "loss": 0.556,
305
  "step": 20500
306
  },
307
  {
308
  "epoch": 1.5453675767164619,
309
- "grad_norm": 13.683536529541016,
310
  "learning_rate": 2.4243873721392306e-05,
311
- "loss": 0.58,
312
  "step": 21000
313
  },
314
  {
315
  "epoch": 1.5821620428287586,
316
- "grad_norm": 10.286214828491211,
317
  "learning_rate": 2.3630632619520692e-05,
318
- "loss": 0.5485,
319
  "step": 21500
320
  },
321
  {
322
  "epoch": 1.6189565089410554,
323
- "grad_norm": 23.011327743530273,
324
  "learning_rate": 2.301739151764908e-05,
325
- "loss": 0.555,
326
  "step": 22000
327
  },
328
  {
329
  "epoch": 1.6557509750533521,
330
- "grad_norm": 4.562350273132324,
331
  "learning_rate": 2.2404150415777467e-05,
332
- "loss": 0.5543,
333
  "step": 22500
334
  },
335
  {
336
  "epoch": 1.6925454411656486,
337
- "grad_norm": 6.733220100402832,
338
  "learning_rate": 2.1790909313905856e-05,
339
- "loss": 0.5579,
340
  "step": 23000
341
  },
342
  {
343
  "epoch": 1.7293399072779454,
344
- "grad_norm": 10.043594360351562,
345
  "learning_rate": 2.1177668212034242e-05,
346
- "loss": 0.5484,
347
  "step": 23500
348
  },
349
  {
350
  "epoch": 1.7661343733902422,
351
- "grad_norm": 26.73402976989746,
352
  "learning_rate": 2.056442711016263e-05,
353
- "loss": 0.5553,
354
  "step": 24000
355
  },
356
  {
357
  "epoch": 1.8029288395025387,
358
- "grad_norm": 37.953338623046875,
359
  "learning_rate": 1.995118600829102e-05,
360
- "loss": 0.5412,
361
  "step": 24500
362
  },
363
  {
364
  "epoch": 1.8397233056148354,
365
- "grad_norm": 24.06332015991211,
366
  "learning_rate": 1.933794490641941e-05,
367
- "loss": 0.5412,
368
  "step": 25000
369
  },
370
  {
371
  "epoch": 1.8765177717271322,
372
- "grad_norm": 14.054511070251465,
373
  "learning_rate": 1.8724703804547796e-05,
374
- "loss": 0.539,
375
  "step": 25500
376
  },
377
  {
378
  "epoch": 1.913312237839429,
379
- "grad_norm": 6.154812335968018,
380
  "learning_rate": 1.8111462702676185e-05,
381
- "loss": 0.5343,
382
  "step": 26000
383
  },
384
  {
385
  "epoch": 1.9501067039517257,
386
- "grad_norm": 9.982664108276367,
387
  "learning_rate": 1.7498221600804575e-05,
388
- "loss": 0.5404,
389
  "step": 26500
390
  },
391
  {
392
  "epoch": 1.9869011700640224,
393
- "grad_norm": 4.059072017669678,
394
  "learning_rate": 1.688498049893296e-05,
395
- "loss": 0.5281,
396
  "step": 27000
397
  },
398
  {
399
  "epoch": 2.0,
400
- "eval_loss": 0.7477120161056519,
401
- "eval_mse": 0.7477119884569656,
402
- "eval_runtime": 81.1709,
403
- "eval_samples_per_second": 615.984,
404
- "eval_steps_per_second": 76.998,
405
  "step": 27178
406
  },
407
  {
408
  "epoch": 2.023695636176319,
409
- "grad_norm": 4.173586368560791,
410
  "learning_rate": 1.627173939706135e-05,
411
- "loss": 0.4911,
412
  "step": 27500
413
  },
414
  {
415
  "epoch": 2.060490102288616,
416
- "grad_norm": 10.17249584197998,
417
  "learning_rate": 1.565849829518974e-05,
418
- "loss": 0.4734,
419
  "step": 28000
420
  },
421
  {
422
  "epoch": 2.0972845684009127,
423
- "grad_norm": 30.416034698486328,
424
  "learning_rate": 1.5045257193318127e-05,
425
- "loss": 0.473,
426
  "step": 28500
427
  },
428
  {
429
  "epoch": 2.134079034513209,
430
- "grad_norm": 8.076013565063477,
431
  "learning_rate": 1.4432016091446513e-05,
432
- "loss": 0.4683,
433
  "step": 29000
434
  },
435
  {
436
  "epoch": 2.1708735006255058,
437
- "grad_norm": 4.578775405883789,
438
  "learning_rate": 1.38187749895749e-05,
439
- "loss": 0.4629,
440
  "step": 29500
441
  },
442
  {
443
  "epoch": 2.2076679667378025,
444
- "grad_norm": 6.670838356018066,
445
  "learning_rate": 1.320553388770329e-05,
446
- "loss": 0.4682,
447
  "step": 30000
448
  },
449
  {
450
  "epoch": 2.2444624328500993,
451
- "grad_norm": 20.175901412963867,
452
  "learning_rate": 1.2592292785831677e-05,
453
- "loss": 0.4716,
454
  "step": 30500
455
  },
456
  {
457
  "epoch": 2.281256898962396,
458
- "grad_norm": 6.051453113555908,
459
  "learning_rate": 1.1979051683960066e-05,
460
- "loss": 0.4606,
461
  "step": 31000
462
  },
463
  {
464
  "epoch": 2.318051365074693,
465
- "grad_norm": 19.612579345703125,
466
  "learning_rate": 1.1365810582088454e-05,
467
- "loss": 0.4661,
468
  "step": 31500
469
  },
470
  {
471
  "epoch": 2.3548458311869895,
472
- "grad_norm": 7.283798694610596,
473
  "learning_rate": 1.0752569480216842e-05,
474
- "loss": 0.4607,
475
  "step": 32000
476
  },
477
  {
478
  "epoch": 2.3916402972992863,
479
- "grad_norm": 6.703380584716797,
480
  "learning_rate": 1.0139328378345231e-05,
481
- "loss": 0.4596,
482
  "step": 32500
483
  },
484
  {
485
  "epoch": 2.428434763411583,
486
- "grad_norm": 5.354931831359863,
487
  "learning_rate": 9.526087276473619e-06,
488
- "loss": 0.4622,
489
  "step": 33000
490
  },
491
  {
492
  "epoch": 2.46522922952388,
493
- "grad_norm": 4.186372756958008,
494
  "learning_rate": 8.912846174602008e-06,
495
- "loss": 0.4528,
496
  "step": 33500
497
  },
498
  {
499
  "epoch": 2.5020236956361765,
500
- "grad_norm": 15.311373710632324,
501
  "learning_rate": 8.299605072730394e-06,
502
- "loss": 0.4632,
503
  "step": 34000
504
  },
505
  {
506
  "epoch": 2.5388181617484733,
507
- "grad_norm": 5.3915791511535645,
508
  "learning_rate": 7.686363970858783e-06,
509
- "loss": 0.4531,
510
  "step": 34500
511
  },
512
  {
513
  "epoch": 2.5756126278607696,
514
- "grad_norm": 11.797430038452148,
515
  "learning_rate": 7.073122868987171e-06,
516
- "loss": 0.4632,
517
  "step": 35000
518
  },
519
  {
520
  "epoch": 2.6124070939730664,
521
- "grad_norm": 3.9882755279541016,
522
  "learning_rate": 6.459881767115559e-06,
523
- "loss": 0.4546,
524
  "step": 35500
525
  },
526
  {
527
  "epoch": 2.649201560085363,
528
- "grad_norm": 3.36647629737854,
529
  "learning_rate": 5.846640665243948e-06,
530
- "loss": 0.4562,
531
  "step": 36000
532
  },
533
  {
534
  "epoch": 2.68599602619766,
535
- "grad_norm": 3.5512359142303467,
536
  "learning_rate": 5.233399563372335e-06,
537
- "loss": 0.4481,
538
  "step": 36500
539
  },
540
  {
541
  "epoch": 2.7227904923099566,
542
- "grad_norm": 4.66156005859375,
543
  "learning_rate": 4.620158461500724e-06,
544
- "loss": 0.4548,
545
  "step": 37000
546
  },
547
  {
548
  "epoch": 2.7595849584222534,
549
- "grad_norm": 4.355463027954102,
550
  "learning_rate": 4.006917359629112e-06,
551
- "loss": 0.4474,
552
  "step": 37500
553
  },
554
  {
555
  "epoch": 2.79637942453455,
556
- "grad_norm": 10.520413398742676,
557
  "learning_rate": 3.3936762577575e-06,
558
- "loss": 0.4483,
559
  "step": 38000
560
  },
561
  {
562
  "epoch": 2.8331738906468464,
563
- "grad_norm": 6.027377605438232,
564
  "learning_rate": 2.7804351558858883e-06,
565
- "loss": 0.4408,
566
  "step": 38500
567
  },
568
  {
569
  "epoch": 2.869968356759143,
570
- "grad_norm": 9.411751747131348,
571
  "learning_rate": 2.1671940540142763e-06,
572
- "loss": 0.448,
573
  "step": 39000
574
  },
575
  {
576
  "epoch": 2.90676282287144,
577
- "grad_norm": 8.813983917236328,
578
  "learning_rate": 1.5539529521426646e-06,
579
- "loss": 0.4374,
580
  "step": 39500
581
  },
582
  {
583
  "epoch": 2.9435572889837367,
584
- "grad_norm": 15.45693588256836,
585
  "learning_rate": 9.407118502710525e-07,
586
- "loss": 0.4431,
587
  "step": 40000
588
  },
589
  {
590
  "epoch": 2.9803517550960335,
591
- "grad_norm": 5.461585521697998,
592
  "learning_rate": 3.2747074839944075e-07,
593
- "loss": 0.4484,
594
  "step": 40500
595
  },
596
  {
597
  "epoch": 3.0,
598
- "eval_loss": 0.7406834363937378,
599
- "eval_mse": 0.7406833936714095,
600
- "eval_runtime": 77.325,
601
- "eval_samples_per_second": 646.622,
602
- "eval_steps_per_second": 80.828,
603
  "step": 40767
604
  },
605
  {
606
  "epoch": 3.0,
607
  "step": 40767,
608
  "total_flos": 1.7161499914378214e+17,
609
- "train_loss": 0.702953228300284,
610
- "train_runtime": 12856.1609,
611
- "train_samples_per_second": 202.937,
612
- "train_steps_per_second": 3.171
613
  }
614
  ],
615
  "logging_steps": 500,
 
1
  {
2
+ "best_metric": 0.0266929492354393,
3
+ "best_model_checkpoint": "deberta-v3-base-zyda-2-transformed-readability/checkpoint-40767",
4
  "epoch": 3.0,
5
  "eval_steps": 500,
6
  "global_step": 40767,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.03679446611229671,
13
+ "grad_norm": 2.2327511310577393,
14
  "learning_rate": 4.9386758898128385e-05,
15
+ "loss": 0.0956,
16
  "step": 500
17
  },
18
  {
19
  "epoch": 0.07358893222459342,
20
+ "grad_norm": 0.9045169353485107,
21
  "learning_rate": 4.877351779625678e-05,
22
+ "loss": 0.0524,
23
  "step": 1000
24
  },
25
  {
26
  "epoch": 0.11038339833689013,
27
+ "grad_norm": 0.6377315521240234,
28
  "learning_rate": 4.8160276694385164e-05,
29
+ "loss": 0.0439,
30
  "step": 1500
31
  },
32
  {
33
  "epoch": 0.14717786444918685,
34
+ "grad_norm": 1.2090165615081787,
35
  "learning_rate": 4.754703559251355e-05,
36
+ "loss": 0.0395,
37
  "step": 2000
38
  },
39
  {
40
  "epoch": 0.18397233056148354,
41
+ "grad_norm": 1.0190351009368896,
42
  "learning_rate": 4.693379449064194e-05,
43
+ "loss": 0.0387,
44
  "step": 2500
45
  },
46
  {
47
  "epoch": 0.22076679667378027,
48
+ "grad_norm": 0.6785000562667847,
49
  "learning_rate": 4.632055338877033e-05,
50
+ "loss": 0.0364,
51
  "step": 3000
52
  },
53
  {
54
  "epoch": 0.257561262786077,
55
+ "grad_norm": 0.37076541781425476,
56
  "learning_rate": 4.570731228689872e-05,
57
+ "loss": 0.0427,
58
  "step": 3500
59
  },
60
  {
61
  "epoch": 0.2943557288983737,
62
+ "grad_norm": 0.2937301695346832,
63
  "learning_rate": 4.509407118502711e-05,
64
+ "loss": 0.0359,
65
  "step": 4000
66
  },
67
  {
68
  "epoch": 0.3311501950106704,
69
+ "grad_norm": 0.3934372663497925,
70
  "learning_rate": 4.448083008315549e-05,
71
+ "loss": 0.0354,
72
  "step": 4500
73
  },
74
  {
75
  "epoch": 0.3679446611229671,
76
+ "grad_norm": 0.8720031380653381,
77
  "learning_rate": 4.386758898128389e-05,
78
+ "loss": 0.0343,
79
  "step": 5000
80
  },
81
  {
82
  "epoch": 0.40473912723526384,
83
+ "grad_norm": 0.2753826677799225,
84
  "learning_rate": 4.325434787941227e-05,
85
+ "loss": 0.0333,
86
  "step": 5500
87
  },
88
  {
89
  "epoch": 0.44153359334756054,
90
+ "grad_norm": 0.9069143533706665,
91
  "learning_rate": 4.264110677754066e-05,
92
+ "loss": 0.0319,
93
  "step": 6000
94
  },
95
  {
96
  "epoch": 0.47832805945985724,
97
+ "grad_norm": 1.0423845052719116,
98
  "learning_rate": 4.202786567566905e-05,
99
+ "loss": 0.0319,
100
  "step": 6500
101
  },
102
  {
103
  "epoch": 0.515122525572154,
104
+ "grad_norm": 0.5137051939964294,
105
  "learning_rate": 4.141462457379743e-05,
106
+ "loss": 0.0321,
107
  "step": 7000
108
  },
109
  {
110
  "epoch": 0.5519169916844506,
111
+ "grad_norm": 0.34184473752975464,
112
  "learning_rate": 4.080138347192582e-05,
113
+ "loss": 0.0316,
114
  "step": 7500
115
  },
116
  {
117
  "epoch": 0.5887114577967474,
118
+ "grad_norm": 0.5334771275520325,
119
  "learning_rate": 4.018814237005421e-05,
120
+ "loss": 0.0318,
121
  "step": 8000
122
  },
123
  {
124
  "epoch": 0.625505923909044,
125
+ "grad_norm": 0.27346959710121155,
126
  "learning_rate": 3.95749012681826e-05,
127
+ "loss": 0.0314,
128
  "step": 8500
129
  },
130
  {
131
  "epoch": 0.6623003900213408,
132
+ "grad_norm": 0.46926313638687134,
133
  "learning_rate": 3.896166016631099e-05,
134
+ "loss": 0.0308,
135
  "step": 9000
136
  },
137
  {
138
  "epoch": 0.6990948561336375,
139
+ "grad_norm": 0.661072850227356,
140
  "learning_rate": 3.834841906443938e-05,
141
+ "loss": 0.0309,
142
  "step": 9500
143
  },
144
  {
145
  "epoch": 0.7358893222459342,
146
+ "grad_norm": 0.3445192575454712,
147
  "learning_rate": 3.773517796256776e-05,
148
+ "loss": 0.03,
149
  "step": 10000
150
  },
151
  {
152
  "epoch": 0.7726837883582309,
153
+ "grad_norm": 0.5244751572608948,
154
  "learning_rate": 3.712193686069616e-05,
155
+ "loss": 0.0299,
156
  "step": 10500
157
  },
158
  {
159
  "epoch": 0.8094782544705277,
160
+ "grad_norm": 0.401460200548172,
161
  "learning_rate": 3.650869575882454e-05,
162
+ "loss": 0.0294,
163
  "step": 11000
164
  },
165
  {
166
  "epoch": 0.8462727205828243,
167
+ "grad_norm": 0.23478317260742188,
168
  "learning_rate": 3.589545465695293e-05,
169
+ "loss": 0.029,
170
  "step": 11500
171
  },
172
  {
173
  "epoch": 0.8830671866951211,
174
+ "grad_norm": 0.4309717118740082,
175
  "learning_rate": 3.528221355508132e-05,
176
+ "loss": 0.029,
177
  "step": 12000
178
  },
179
  {
180
  "epoch": 0.9198616528074177,
181
+ "grad_norm": 0.3477807641029358,
182
  "learning_rate": 3.466897245320971e-05,
183
+ "loss": 0.0295,
184
  "step": 12500
185
  },
186
  {
187
  "epoch": 0.9566561189197145,
188
+ "grad_norm": 0.21652667224407196,
189
  "learning_rate": 3.405573135133809e-05,
190
+ "loss": 0.0293,
191
  "step": 13000
192
  },
193
  {
194
  "epoch": 0.9934505850320112,
195
+ "grad_norm": 0.46980977058410645,
196
  "learning_rate": 3.344249024946648e-05,
197
+ "loss": 0.0288,
198
  "step": 13500
199
  },
200
  {
201
  "epoch": 1.0,
202
+ "eval_loss": 0.028563737869262695,
203
+ "eval_mse": 0.028563737035006864,
204
+ "eval_runtime": 98.1597,
205
+ "eval_samples_per_second": 509.374,
206
+ "eval_steps_per_second": 63.672,
207
  "step": 13589
208
  },
209
  {
210
  "epoch": 1.030245051144308,
211
+ "grad_norm": 0.32912561297416687,
212
  "learning_rate": 3.282924914759487e-05,
213
+ "loss": 0.0258,
214
  "step": 14000
215
  },
216
  {
217
  "epoch": 1.0670395172566045,
218
+ "grad_norm": 0.2001865804195404,
219
  "learning_rate": 3.221600804572326e-05,
220
+ "loss": 0.0251,
221
  "step": 14500
222
  },
223
  {
224
  "epoch": 1.1038339833689013,
225
+ "grad_norm": 0.4719059467315674,
226
  "learning_rate": 3.160276694385165e-05,
227
+ "loss": 0.0243,
228
  "step": 15000
229
  },
230
  {
231
  "epoch": 1.140628449481198,
232
+ "grad_norm": 0.401038259267807,
233
  "learning_rate": 3.098952584198003e-05,
234
+ "loss": 0.0246,
235
  "step": 15500
236
  },
237
  {
238
  "epoch": 1.1774229155934948,
239
+ "grad_norm": 0.24117255210876465,
240
  "learning_rate": 3.0376284740108423e-05,
241
+ "loss": 0.0248,
242
  "step": 16000
243
  },
244
  {
245
  "epoch": 1.2142173817057915,
246
+ "grad_norm": 0.24041427671909332,
247
  "learning_rate": 2.976304363823681e-05,
248
+ "loss": 0.025,
249
  "step": 16500
250
  },
251
  {
252
  "epoch": 1.2510118478180883,
253
+ "grad_norm": 0.3739044666290283,
254
  "learning_rate": 2.91498025363652e-05,
255
+ "loss": 0.0257,
256
  "step": 17000
257
  },
258
  {
259
  "epoch": 1.2878063139303848,
260
+ "grad_norm": 0.4344153106212616,
261
  "learning_rate": 2.8536561434493587e-05,
262
+ "loss": 0.0244,
263
  "step": 17500
264
  },
265
  {
266
  "epoch": 1.3246007800426816,
267
+ "grad_norm": 0.6248531341552734,
268
  "learning_rate": 2.7923320332621977e-05,
269
+ "loss": 0.0249,
270
  "step": 18000
271
  },
272
  {
273
  "epoch": 1.3613952461549783,
274
+ "grad_norm": 0.34284424781799316,
275
  "learning_rate": 2.7310079230750363e-05,
276
+ "loss": 0.0247,
277
  "step": 18500
278
  },
279
  {
280
  "epoch": 1.398189712267275,
281
+ "grad_norm": 0.33926498889923096,
282
  "learning_rate": 2.6696838128878755e-05,
283
+ "loss": 0.0248,
284
  "step": 19000
285
  },
286
  {
287
  "epoch": 1.4349841783795716,
288
+ "grad_norm": 0.2008136361837387,
289
  "learning_rate": 2.6083597027007138e-05,
290
+ "loss": 0.0248,
291
  "step": 19500
292
  },
293
  {
294
  "epoch": 1.4717786444918683,
295
+ "grad_norm": 0.5362450480461121,
296
  "learning_rate": 2.5470355925135524e-05,
297
+ "loss": 0.0246,
298
  "step": 20000
299
  },
300
  {
301
  "epoch": 1.508573110604165,
302
+ "grad_norm": 0.4919290840625763,
303
  "learning_rate": 2.4857114823263916e-05,
304
+ "loss": 0.0238,
305
  "step": 20500
306
  },
307
  {
308
  "epoch": 1.5453675767164619,
309
+ "grad_norm": 0.3778747618198395,
310
  "learning_rate": 2.4243873721392306e-05,
311
+ "loss": 0.0248,
312
  "step": 21000
313
  },
314
  {
315
  "epoch": 1.5821620428287586,
316
+ "grad_norm": 0.2485371083021164,
317
  "learning_rate": 2.3630632619520692e-05,
318
+ "loss": 0.0237,
319
  "step": 21500
320
  },
321
  {
322
  "epoch": 1.6189565089410554,
323
+ "grad_norm": 0.2995116412639618,
324
  "learning_rate": 2.301739151764908e-05,
325
+ "loss": 0.0232,
326
  "step": 22000
327
  },
328
  {
329
  "epoch": 1.6557509750533521,
330
+ "grad_norm": 0.3775917887687683,
331
  "learning_rate": 2.2404150415777467e-05,
332
+ "loss": 0.0239,
333
  "step": 22500
334
  },
335
  {
336
  "epoch": 1.6925454411656486,
337
+ "grad_norm": 0.25416481494903564,
338
  "learning_rate": 2.1790909313905856e-05,
339
+ "loss": 0.0242,
340
  "step": 23000
341
  },
342
  {
343
  "epoch": 1.7293399072779454,
344
+ "grad_norm": 0.5196259617805481,
345
  "learning_rate": 2.1177668212034242e-05,
346
+ "loss": 0.0238,
347
  "step": 23500
348
  },
349
  {
350
  "epoch": 1.7661343733902422,
351
+ "grad_norm": 3.1327126026153564,
352
  "learning_rate": 2.056442711016263e-05,
353
+ "loss": 0.0308,
354
  "step": 24000
355
  },
356
  {
357
  "epoch": 1.8029288395025387,
358
+ "grad_norm": 1.1925427913665771,
359
  "learning_rate": 1.995118600829102e-05,
360
+ "loss": 0.0268,
361
  "step": 24500
362
  },
363
  {
364
  "epoch": 1.8397233056148354,
365
+ "grad_norm": 0.5257470011711121,
366
  "learning_rate": 1.933794490641941e-05,
367
+ "loss": 0.0249,
368
  "step": 25000
369
  },
370
  {
371
  "epoch": 1.8765177717271322,
372
+ "grad_norm": 0.4024732708930969,
373
  "learning_rate": 1.8724703804547796e-05,
374
+ "loss": 0.0237,
375
  "step": 25500
376
  },
377
  {
378
  "epoch": 1.913312237839429,
379
+ "grad_norm": 0.5063018798828125,
380
  "learning_rate": 1.8111462702676185e-05,
381
+ "loss": 0.0231,
382
  "step": 26000
383
  },
384
  {
385
  "epoch": 1.9501067039517257,
386
+ "grad_norm": 0.264139860868454,
387
  "learning_rate": 1.7498221600804575e-05,
388
+ "loss": 0.0261,
389
  "step": 26500
390
  },
391
  {
392
  "epoch": 1.9869011700640224,
393
+ "grad_norm": 0.19682620465755463,
394
  "learning_rate": 1.688498049893296e-05,
395
+ "loss": 0.023,
396
  "step": 27000
397
  },
398
  {
399
  "epoch": 2.0,
400
+ "eval_loss": 0.027232788503170013,
401
+ "eval_mse": 0.02723278669797115,
402
+ "eval_runtime": 77.4011,
403
+ "eval_samples_per_second": 645.986,
404
+ "eval_steps_per_second": 80.748,
405
  "step": 27178
406
  },
407
  {
408
  "epoch": 2.023695636176319,
409
+ "grad_norm": 0.3318944275379181,
410
  "learning_rate": 1.627173939706135e-05,
411
+ "loss": 0.0208,
412
  "step": 27500
413
  },
414
  {
415
  "epoch": 2.060490102288616,
416
+ "grad_norm": 0.20372678339481354,
417
  "learning_rate": 1.565849829518974e-05,
418
+ "loss": 0.0199,
419
  "step": 28000
420
  },
421
  {
422
  "epoch": 2.0972845684009127,
423
+ "grad_norm": 0.4932423233985901,
424
  "learning_rate": 1.5045257193318127e-05,
425
+ "loss": 0.0201,
426
  "step": 28500
427
  },
428
  {
429
  "epoch": 2.134079034513209,
430
+ "grad_norm": 0.24097684025764465,
431
  "learning_rate": 1.4432016091446513e-05,
432
+ "loss": 0.0199,
433
  "step": 29000
434
  },
435
  {
436
  "epoch": 2.1708735006255058,
437
+ "grad_norm": 0.46340519189834595,
438
  "learning_rate": 1.38187749895749e-05,
439
+ "loss": 0.0194,
440
  "step": 29500
441
  },
442
  {
443
  "epoch": 2.2076679667378025,
444
+ "grad_norm": 0.17476551234722137,
445
  "learning_rate": 1.320553388770329e-05,
446
+ "loss": 0.0199,
447
  "step": 30000
448
  },
449
  {
450
  "epoch": 2.2444624328500993,
451
+ "grad_norm": 0.7477974891662598,
452
  "learning_rate": 1.2592292785831677e-05,
453
+ "loss": 0.0202,
454
  "step": 30500
455
  },
456
  {
457
  "epoch": 2.281256898962396,
458
+ "grad_norm": 0.21329531073570251,
459
  "learning_rate": 1.1979051683960066e-05,
460
+ "loss": 0.0196,
461
  "step": 31000
462
  },
463
  {
464
  "epoch": 2.318051365074693,
465
+ "grad_norm": 0.39124202728271484,
466
  "learning_rate": 1.1365810582088454e-05,
467
+ "loss": 0.0198,
468
  "step": 31500
469
  },
470
  {
471
  "epoch": 2.3548458311869895,
472
+ "grad_norm": 0.30534929037094116,
473
  "learning_rate": 1.0752569480216842e-05,
474
+ "loss": 0.0196,
475
  "step": 32000
476
  },
477
  {
478
  "epoch": 2.3916402972992863,
479
+ "grad_norm": 0.2803601324558258,
480
  "learning_rate": 1.0139328378345231e-05,
481
+ "loss": 0.0197,
482
  "step": 32500
483
  },
484
  {
485
  "epoch": 2.428434763411583,
486
+ "grad_norm": 0.30882009863853455,
487
  "learning_rate": 9.526087276473619e-06,
488
+ "loss": 0.0194,
489
  "step": 33000
490
  },
491
  {
492
  "epoch": 2.46522922952388,
493
+ "grad_norm": 0.310523122549057,
494
  "learning_rate": 8.912846174602008e-06,
495
+ "loss": 0.0193,
496
  "step": 33500
497
  },
498
  {
499
  "epoch": 2.5020236956361765,
500
+ "grad_norm": 0.3990231156349182,
501
  "learning_rate": 8.299605072730394e-06,
502
+ "loss": 0.0198,
503
  "step": 34000
504
  },
505
  {
506
  "epoch": 2.5388181617484733,
507
+ "grad_norm": 0.41601407527923584,
508
  "learning_rate": 7.686363970858783e-06,
509
+ "loss": 0.0192,
510
  "step": 34500
511
  },
512
  {
513
  "epoch": 2.5756126278607696,
514
+ "grad_norm": 0.2621209919452667,
515
  "learning_rate": 7.073122868987171e-06,
516
+ "loss": 0.0198,
517
  "step": 35000
518
  },
519
  {
520
  "epoch": 2.6124070939730664,
521
+ "grad_norm": 0.231684148311615,
522
  "learning_rate": 6.459881767115559e-06,
523
+ "loss": 0.0192,
524
  "step": 35500
525
  },
526
  {
527
  "epoch": 2.649201560085363,
528
+ "grad_norm": 0.23939248919487,
529
  "learning_rate": 5.846640665243948e-06,
530
+ "loss": 0.0191,
531
  "step": 36000
532
  },
533
  {
534
  "epoch": 2.68599602619766,
535
+ "grad_norm": 0.22479325532913208,
536
  "learning_rate": 5.233399563372335e-06,
537
+ "loss": 0.0192,
538
  "step": 36500
539
  },
540
  {
541
  "epoch": 2.7227904923099566,
542
+ "grad_norm": 0.27915650606155396,
543
  "learning_rate": 4.620158461500724e-06,
544
+ "loss": 0.0193,
545
  "step": 37000
546
  },
547
  {
548
  "epoch": 2.7595849584222534,
549
+ "grad_norm": 0.19762490689754486,
550
  "learning_rate": 4.006917359629112e-06,
551
+ "loss": 0.0191,
552
  "step": 37500
553
  },
554
  {
555
  "epoch": 2.79637942453455,
556
+ "grad_norm": 0.42420724034309387,
557
  "learning_rate": 3.3936762577575e-06,
558
+ "loss": 0.0193,
559
  "step": 38000
560
  },
561
  {
562
  "epoch": 2.8331738906468464,
563
+ "grad_norm": 0.34259703755378723,
564
  "learning_rate": 2.7804351558858883e-06,
565
+ "loss": 0.0188,
566
  "step": 38500
567
  },
568
  {
569
  "epoch": 2.869968356759143,
570
+ "grad_norm": 0.2734413743019104,
571
  "learning_rate": 2.1671940540142763e-06,
572
+ "loss": 0.019,
573
  "step": 39000
574
  },
575
  {
576
  "epoch": 2.90676282287144,
577
+ "grad_norm": 0.16011129319667816,
578
  "learning_rate": 1.5539529521426646e-06,
579
+ "loss": 0.0186,
580
  "step": 39500
581
  },
582
  {
583
  "epoch": 2.9435572889837367,
584
+ "grad_norm": 0.4719178080558777,
585
  "learning_rate": 9.407118502710525e-07,
586
+ "loss": 0.0188,
587
  "step": 40000
588
  },
589
  {
590
  "epoch": 2.9803517550960335,
591
+ "grad_norm": 0.22022365033626556,
592
  "learning_rate": 3.2747074839944075e-07,
593
+ "loss": 0.0189,
594
  "step": 40500
595
  },
596
  {
597
  "epoch": 3.0,
598
+ "eval_loss": 0.0266929492354393,
599
+ "eval_mse": 0.026692949063357767,
600
+ "eval_runtime": 78.6017,
601
+ "eval_samples_per_second": 636.118,
602
+ "eval_steps_per_second": 79.515,
603
  "step": 40767
604
  },
605
  {
606
  "epoch": 3.0,
607
  "step": 40767,
608
  "total_flos": 1.7161499914378214e+17,
609
+ "train_loss": 0.026733717791019525,
610
+ "train_runtime": 12981.5942,
611
+ "train_samples_per_second": 200.976,
612
+ "train_steps_per_second": 3.14
613
  }
614
  ],
615
  "logging_steps": 500,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:abdf47d0eea436423adf4d8495595522bdc635700b31d7723e5f0caee85fd1bc
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c535b070a527587ded137592bb39f04574a3871d0d0a867537a635b06da923bd
3
  size 5368