|
{ |
|
"best_metric": 0.5701812505722046, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-250", |
|
"epoch": 2.9304029304029307, |
|
"eval_steps": 50, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007326007326007326, |
|
"grad_norm": 1.0746264457702637, |
|
"learning_rate": 2.9999999999999997e-05, |
|
"loss": 0.7495, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.007326007326007326, |
|
"eval_loss": 1.1676084995269775, |
|
"eval_runtime": 73.5757, |
|
"eval_samples_per_second": 7.435, |
|
"eval_steps_per_second": 7.435, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.014652014652014652, |
|
"grad_norm": 0.2758323550224304, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 0.9081, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.02197802197802198, |
|
"grad_norm": 0.29802656173706055, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 0.8211, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.029304029304029304, |
|
"grad_norm": 0.3587598502635956, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 0.8768, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03663003663003663, |
|
"grad_norm": 0.5663146376609802, |
|
"learning_rate": 0.00015, |
|
"loss": 0.7956, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04395604395604396, |
|
"grad_norm": 0.49756062030792236, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 0.9633, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.05128205128205128, |
|
"grad_norm": 0.5426256656646729, |
|
"learning_rate": 0.00020999999999999998, |
|
"loss": 0.9303, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05860805860805861, |
|
"grad_norm": 0.44440993666648865, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 0.8358, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.06593406593406594, |
|
"grad_norm": 0.3511325418949127, |
|
"learning_rate": 0.00027, |
|
"loss": 0.8668, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.07326007326007326, |
|
"grad_norm": 0.31668755412101746, |
|
"learning_rate": 0.0003, |
|
"loss": 0.7562, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08058608058608059, |
|
"grad_norm": 0.35746482014656067, |
|
"learning_rate": 0.0002999953270341234, |
|
"loss": 0.779, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.08791208791208792, |
|
"grad_norm": 0.3406325876712799, |
|
"learning_rate": 0.00029998130842764855, |
|
"loss": 0.7256, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.09523809523809523, |
|
"grad_norm": 0.37537914514541626, |
|
"learning_rate": 0.00029995794505402164, |
|
"loss": 0.7405, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.10256410256410256, |
|
"grad_norm": 0.4480861723423004, |
|
"learning_rate": 0.00029992523836892604, |
|
"loss": 0.7201, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.10989010989010989, |
|
"grad_norm": 0.5216184854507446, |
|
"learning_rate": 0.00029988319041019133, |
|
"loss": 0.7125, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.11721611721611722, |
|
"grad_norm": 0.43949222564697266, |
|
"learning_rate": 0.00029983180379766647, |
|
"loss": 0.7151, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.12454212454212454, |
|
"grad_norm": 0.3697758615016937, |
|
"learning_rate": 0.0002997710817330568, |
|
"loss": 0.6527, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.13186813186813187, |
|
"grad_norm": 0.29461607336997986, |
|
"learning_rate": 0.000299701027999724, |
|
"loss": 0.5593, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1391941391941392, |
|
"grad_norm": 0.3185078203678131, |
|
"learning_rate": 0.00029962164696245076, |
|
"loss": 0.6053, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.14652014652014653, |
|
"grad_norm": 0.4523352086544037, |
|
"learning_rate": 0.000299532943567169, |
|
"loss": 0.8082, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 0.3355499505996704, |
|
"learning_rate": 0.000299434923340651, |
|
"loss": 0.4784, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.16117216117216118, |
|
"grad_norm": 0.38593626022338867, |
|
"learning_rate": 0.0002993275923901659, |
|
"loss": 0.7062, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.1684981684981685, |
|
"grad_norm": 0.3636136054992676, |
|
"learning_rate": 0.00029921095740309854, |
|
"loss": 0.689, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.17582417582417584, |
|
"grad_norm": 0.37379103899002075, |
|
"learning_rate": 0.0002990850256465331, |
|
"loss": 0.6864, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.18315018315018314, |
|
"grad_norm": 0.44014841318130493, |
|
"learning_rate": 0.0002989498049668004, |
|
"loss": 0.5692, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.19047619047619047, |
|
"grad_norm": 0.36174437403678894, |
|
"learning_rate": 0.0002988053037889886, |
|
"loss": 0.6588, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1978021978021978, |
|
"grad_norm": 0.3935069143772125, |
|
"learning_rate": 0.0002986515311164188, |
|
"loss": 0.6105, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.20512820512820512, |
|
"grad_norm": 0.4123885929584503, |
|
"learning_rate": 0.00029848849653008356, |
|
"loss": 0.5963, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.21245421245421245, |
|
"grad_norm": 0.4219466745853424, |
|
"learning_rate": 0.00029831621018805036, |
|
"loss": 0.598, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.21978021978021978, |
|
"grad_norm": 0.40744897723197937, |
|
"learning_rate": 0.00029813468282482856, |
|
"loss": 0.6318, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2271062271062271, |
|
"grad_norm": 0.41496506333351135, |
|
"learning_rate": 0.0002979439257507004, |
|
"loss": 0.5648, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.23443223443223443, |
|
"grad_norm": 0.4135119318962097, |
|
"learning_rate": 0.0002977439508510166, |
|
"loss": 0.5706, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.24175824175824176, |
|
"grad_norm": 0.5413480997085571, |
|
"learning_rate": 0.00029753477058545543, |
|
"loss": 0.6367, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2490842490842491, |
|
"grad_norm": 0.7710244059562683, |
|
"learning_rate": 0.00029731639798724705, |
|
"loss": 0.5927, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.2564102564102564, |
|
"grad_norm": 0.29940611124038696, |
|
"learning_rate": 0.0002970888466623606, |
|
"loss": 0.6232, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.26373626373626374, |
|
"grad_norm": 0.3256644606590271, |
|
"learning_rate": 0.00029685213078865715, |
|
"loss": 0.6949, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.27106227106227104, |
|
"grad_norm": 0.2492157518863678, |
|
"learning_rate": 0.00029660626511500605, |
|
"loss": 0.6002, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2783882783882784, |
|
"grad_norm": 0.25613823533058167, |
|
"learning_rate": 0.000296351264960366, |
|
"loss": 0.6237, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.24584399163722992, |
|
"learning_rate": 0.00029608714621283063, |
|
"loss": 0.688, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.29304029304029305, |
|
"grad_norm": 0.23079101741313934, |
|
"learning_rate": 0.0002958139253286385, |
|
"loss": 0.7131, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.30036630036630035, |
|
"grad_norm": 0.28000855445861816, |
|
"learning_rate": 0.0002955316193311479, |
|
"loss": 0.6817, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 0.2555966377258301, |
|
"learning_rate": 0.000295240245809776, |
|
"loss": 0.6851, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.315018315018315, |
|
"grad_norm": 0.24511605501174927, |
|
"learning_rate": 0.00029493982291890324, |
|
"loss": 0.6946, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.32234432234432236, |
|
"grad_norm": 0.24352319538593292, |
|
"learning_rate": 0.00029463036937674175, |
|
"loss": 0.6592, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.32967032967032966, |
|
"grad_norm": 0.27086544036865234, |
|
"learning_rate": 0.0002943119044641694, |
|
"loss": 0.6949, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.336996336996337, |
|
"grad_norm": 0.25139376521110535, |
|
"learning_rate": 0.00029398444802352855, |
|
"loss": 0.6856, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3443223443223443, |
|
"grad_norm": 0.23585085570812225, |
|
"learning_rate": 0.0002936480204573894, |
|
"loss": 0.6426, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3516483516483517, |
|
"grad_norm": 0.24448701739311218, |
|
"learning_rate": 0.00029330264272727917, |
|
"loss": 0.5887, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.358974358974359, |
|
"grad_norm": 0.2956358790397644, |
|
"learning_rate": 0.00029294833635237587, |
|
"loss": 0.6925, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.3663003663003663, |
|
"grad_norm": 0.242750346660614, |
|
"learning_rate": 0.0002925851234081674, |
|
"loss": 0.5302, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3663003663003663, |
|
"eval_loss": 0.6130701303482056, |
|
"eval_runtime": 73.3992, |
|
"eval_samples_per_second": 7.452, |
|
"eval_steps_per_second": 7.452, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.37362637362637363, |
|
"grad_norm": 0.2964664399623871, |
|
"learning_rate": 0.0002922130265250764, |
|
"loss": 0.6552, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.38095238095238093, |
|
"grad_norm": 0.3150418996810913, |
|
"learning_rate": 0.00029183206888705004, |
|
"loss": 0.6041, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3882783882783883, |
|
"grad_norm": 0.2626626491546631, |
|
"learning_rate": 0.00029144227423011555, |
|
"loss": 0.6236, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3956043956043956, |
|
"grad_norm": 0.31966981291770935, |
|
"learning_rate": 0.0002910436668409013, |
|
"loss": 0.6936, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.40293040293040294, |
|
"grad_norm": 0.3115909695625305, |
|
"learning_rate": 0.00029063627155512375, |
|
"loss": 0.5992, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.41025641025641024, |
|
"grad_norm": 0.29618480801582336, |
|
"learning_rate": 0.0002902201137560397, |
|
"loss": 0.5781, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.4175824175824176, |
|
"grad_norm": 0.2827432453632355, |
|
"learning_rate": 0.0002897952193728652, |
|
"loss": 0.5961, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.4249084249084249, |
|
"grad_norm": 0.30941715836524963, |
|
"learning_rate": 0.00028936161487915955, |
|
"loss": 0.6069, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.43223443223443225, |
|
"grad_norm": 0.34647953510284424, |
|
"learning_rate": 0.00028891932729117615, |
|
"loss": 0.6544, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.43956043956043955, |
|
"grad_norm": 0.34131157398223877, |
|
"learning_rate": 0.0002884683841661791, |
|
"loss": 0.5973, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4468864468864469, |
|
"grad_norm": 0.3378750681877136, |
|
"learning_rate": 0.00028800881360072615, |
|
"loss": 0.5805, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4542124542124542, |
|
"grad_norm": 0.3864741027355194, |
|
"learning_rate": 0.0002875406442289183, |
|
"loss": 0.5829, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 0.31040650606155396, |
|
"learning_rate": 0.0002870639052206155, |
|
"loss": 0.5408, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.46886446886446886, |
|
"grad_norm": 0.3510590195655823, |
|
"learning_rate": 0.0002865786262796193, |
|
"loss": 0.5623, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 0.34330350160598755, |
|
"learning_rate": 0.00028608483764182215, |
|
"loss": 0.5304, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4835164835164835, |
|
"grad_norm": 0.3788071274757385, |
|
"learning_rate": 0.0002855825700733235, |
|
"loss": 0.4966, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.4908424908424908, |
|
"grad_norm": 0.38324546813964844, |
|
"learning_rate": 0.00028507185486851275, |
|
"loss": 0.44, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.4981684981684982, |
|
"grad_norm": 0.4564480185508728, |
|
"learning_rate": 0.0002845527238481195, |
|
"loss": 0.4097, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.5054945054945055, |
|
"grad_norm": 0.238909512758255, |
|
"learning_rate": 0.0002840252093572311, |
|
"loss": 0.6646, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"grad_norm": 0.2642434239387512, |
|
"learning_rate": 0.000283489344263277, |
|
"loss": 0.6419, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5201465201465202, |
|
"grad_norm": 0.2626118063926697, |
|
"learning_rate": 0.00028294516195398125, |
|
"loss": 0.6318, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.5274725274725275, |
|
"grad_norm": 0.23919053375720978, |
|
"learning_rate": 0.00028239269633528204, |
|
"loss": 0.6266, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5347985347985348, |
|
"grad_norm": 0.25669869780540466, |
|
"learning_rate": 0.000281831981829219, |
|
"loss": 0.6594, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.5421245421245421, |
|
"grad_norm": 0.21138758957386017, |
|
"learning_rate": 0.00028126305337178905, |
|
"loss": 0.5986, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.5494505494505495, |
|
"grad_norm": 0.24748112261295319, |
|
"learning_rate": 0.0002806859464107689, |
|
"loss": 0.6108, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5567765567765568, |
|
"grad_norm": 0.23385199904441833, |
|
"learning_rate": 0.00028010069690350716, |
|
"loss": 0.6049, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.5641025641025641, |
|
"grad_norm": 0.2671392560005188, |
|
"learning_rate": 0.00027950734131468346, |
|
"loss": 0.6324, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.20720772445201874, |
|
"learning_rate": 0.00027890591661403676, |
|
"loss": 0.538, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5787545787545788, |
|
"grad_norm": 0.26165518164634705, |
|
"learning_rate": 0.00027829646027406174, |
|
"loss": 0.6713, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5860805860805861, |
|
"grad_norm": 0.285157173871994, |
|
"learning_rate": 0.00027767901026767416, |
|
"loss": 0.7407, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5934065934065934, |
|
"grad_norm": 0.24118804931640625, |
|
"learning_rate": 0.00027705360506584484, |
|
"loss": 0.6414, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.6007326007326007, |
|
"grad_norm": 0.2540411651134491, |
|
"learning_rate": 0.00027642028363520255, |
|
"loss": 0.6584, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.608058608058608, |
|
"grad_norm": 0.2713707685470581, |
|
"learning_rate": 0.0002757790854356066, |
|
"loss": 0.6356, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 0.25061967968940735, |
|
"learning_rate": 0.0002751300504176876, |
|
"loss": 0.5919, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.6227106227106227, |
|
"grad_norm": 0.2813633382320404, |
|
"learning_rate": 0.0002744732190203589, |
|
"loss": 0.6507, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.63003663003663, |
|
"grad_norm": 0.36446070671081543, |
|
"learning_rate": 0.00027380863216829645, |
|
"loss": 0.6146, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.6373626373626373, |
|
"grad_norm": 0.2936757802963257, |
|
"learning_rate": 0.00027313633126938936, |
|
"loss": 0.5807, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.6446886446886447, |
|
"grad_norm": 0.32543811202049255, |
|
"learning_rate": 0.0002724563582121598, |
|
"loss": 0.633, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.652014652014652, |
|
"grad_norm": 0.26980024576187134, |
|
"learning_rate": 0.0002717687553631531, |
|
"loss": 0.5771, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.6593406593406593, |
|
"grad_norm": 0.3081660568714142, |
|
"learning_rate": 0.0002710735655642978, |
|
"loss": 0.5795, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.30959567427635193, |
|
"learning_rate": 0.0002703708321302367, |
|
"loss": 0.5278, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.673992673992674, |
|
"grad_norm": 0.30625343322753906, |
|
"learning_rate": 0.0002696605988456279, |
|
"loss": 0.5816, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6813186813186813, |
|
"grad_norm": 0.31666725873947144, |
|
"learning_rate": 0.00026894290996241677, |
|
"loss": 0.5393, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6886446886446886, |
|
"grad_norm": 0.31827884912490845, |
|
"learning_rate": 0.0002682178101970788, |
|
"loss": 0.5869, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6959706959706959, |
|
"grad_norm": 0.3418588936328888, |
|
"learning_rate": 0.00026748534472783355, |
|
"loss": 0.5939, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7032967032967034, |
|
"grad_norm": 0.35669511556625366, |
|
"learning_rate": 0.00026674555919182943, |
|
"loss": 0.6145, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7106227106227107, |
|
"grad_norm": 0.3676556348800659, |
|
"learning_rate": 0.00026599849968230084, |
|
"loss": 0.5306, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.717948717948718, |
|
"grad_norm": 0.3910490870475769, |
|
"learning_rate": 0.00026524421274569556, |
|
"loss": 0.6035, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.7252747252747253, |
|
"grad_norm": 0.3907155394554138, |
|
"learning_rate": 0.00026448274537877527, |
|
"loss": 0.5149, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.7326007326007326, |
|
"grad_norm": 0.3669157326221466, |
|
"learning_rate": 0.0002637141450256868, |
|
"loss": 0.4153, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7326007326007326, |
|
"eval_loss": 0.5930299162864685, |
|
"eval_runtime": 73.3788, |
|
"eval_samples_per_second": 7.454, |
|
"eval_steps_per_second": 7.454, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.73992673992674, |
|
"grad_norm": 0.5483953952789307, |
|
"learning_rate": 0.0002629384595750065, |
|
"loss": 0.4924, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.7472527472527473, |
|
"grad_norm": 0.44320136308670044, |
|
"learning_rate": 0.00026215573735675635, |
|
"loss": 0.4333, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.7545787545787546, |
|
"grad_norm": 0.2921634316444397, |
|
"learning_rate": 0.0002613660271393924, |
|
"loss": 0.5333, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.7619047619047619, |
|
"grad_norm": 0.3033207654953003, |
|
"learning_rate": 0.0002605693781267668, |
|
"loss": 0.6723, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 0.21631714701652527, |
|
"learning_rate": 0.00025976583995506147, |
|
"loss": 0.7413, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.7765567765567766, |
|
"grad_norm": 0.2756958305835724, |
|
"learning_rate": 0.0002589554626896959, |
|
"loss": 0.6334, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.7838827838827839, |
|
"grad_norm": 0.23255576193332672, |
|
"learning_rate": 0.0002581382968222075, |
|
"loss": 0.6241, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.7912087912087912, |
|
"grad_norm": 0.2933904230594635, |
|
"learning_rate": 0.00025731439326710564, |
|
"loss": 0.6238, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7985347985347986, |
|
"grad_norm": 0.24506813287734985, |
|
"learning_rate": 0.0002564838033586999, |
|
"loss": 0.7049, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.8058608058608059, |
|
"grad_norm": 0.21781164407730103, |
|
"learning_rate": 0.0002556465788479008, |
|
"loss": 0.5831, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8131868131868132, |
|
"grad_norm": 0.23479105532169342, |
|
"learning_rate": 0.00025480277189899594, |
|
"loss": 0.6347, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.8205128205128205, |
|
"grad_norm": 0.22827807068824768, |
|
"learning_rate": 0.00025395243508639974, |
|
"loss": 0.5881, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.8278388278388278, |
|
"grad_norm": 0.24758267402648926, |
|
"learning_rate": 0.00025309562139137765, |
|
"loss": 0.6288, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.8351648351648352, |
|
"grad_norm": 0.23672336339950562, |
|
"learning_rate": 0.00025223238419874505, |
|
"loss": 0.623, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.8424908424908425, |
|
"grad_norm": 0.25866055488586426, |
|
"learning_rate": 0.0002513627772935413, |
|
"loss": 0.6076, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.8498168498168498, |
|
"grad_norm": 0.23350456357002258, |
|
"learning_rate": 0.00025048685485767826, |
|
"loss": 0.6124, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 0.26988059282302856, |
|
"learning_rate": 0.00024960467146656465, |
|
"loss": 0.667, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.8644688644688645, |
|
"grad_norm": 0.2606026530265808, |
|
"learning_rate": 0.0002487162820857056, |
|
"loss": 0.514, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.8717948717948718, |
|
"grad_norm": 0.26499950885772705, |
|
"learning_rate": 0.00024782174206727797, |
|
"loss": 0.5818, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.8791208791208791, |
|
"grad_norm": 0.2778685390949249, |
|
"learning_rate": 0.00024692110714668144, |
|
"loss": 0.6545, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8864468864468864, |
|
"grad_norm": 0.2645077705383301, |
|
"learning_rate": 0.00024601443343906607, |
|
"loss": 0.5948, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.8937728937728938, |
|
"grad_norm": 0.2799476385116577, |
|
"learning_rate": 0.00024510177743583577, |
|
"loss": 0.6282, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.9010989010989011, |
|
"grad_norm": 0.3192484676837921, |
|
"learning_rate": 0.00024418319600112877, |
|
"loss": 0.6482, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.9084249084249084, |
|
"grad_norm": 0.28705793619155884, |
|
"learning_rate": 0.00024325874636827428, |
|
"loss": 0.6307, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.9157509157509157, |
|
"grad_norm": 0.30531764030456543, |
|
"learning_rate": 0.00024232848613622686, |
|
"loss": 0.5717, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 0.31991544365882874, |
|
"learning_rate": 0.00024139247326597748, |
|
"loss": 0.5068, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.9304029304029304, |
|
"grad_norm": 0.40226104855537415, |
|
"learning_rate": 0.00024045076607694216, |
|
"loss": 0.7029, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.9377289377289377, |
|
"grad_norm": 0.38639435172080994, |
|
"learning_rate": 0.0002395034232433284, |
|
"loss": 0.5574, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.945054945054945, |
|
"grad_norm": 0.35603398084640503, |
|
"learning_rate": 0.00023855050379047922, |
|
"loss": 0.568, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 0.3355114459991455, |
|
"learning_rate": 0.00023759206709119595, |
|
"loss": 0.4997, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9597069597069597, |
|
"grad_norm": 0.39614033699035645, |
|
"learning_rate": 0.0002366281728620383, |
|
"loss": 0.5225, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.967032967032967, |
|
"grad_norm": 0.33072763681411743, |
|
"learning_rate": 0.0002356588811596042, |
|
"loss": 0.427, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.9743589743589743, |
|
"grad_norm": 0.5783118605613708, |
|
"learning_rate": 0.00023468425237678769, |
|
"loss": 0.5669, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.9816849816849816, |
|
"grad_norm": 0.44978731870651245, |
|
"learning_rate": 0.000233704347239016, |
|
"loss": 0.5213, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.989010989010989, |
|
"grad_norm": 0.4447237551212311, |
|
"learning_rate": 0.0002327192268004661, |
|
"loss": 0.4526, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.9963369963369964, |
|
"grad_norm": 0.5178239941596985, |
|
"learning_rate": 0.00023172895244026056, |
|
"loss": 0.419, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.0036630036630036, |
|
"grad_norm": 0.5518823266029358, |
|
"learning_rate": 0.0002307335858586433, |
|
"loss": 0.9609, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.010989010989011, |
|
"grad_norm": 0.19367220997810364, |
|
"learning_rate": 0.0002297331890731352, |
|
"loss": 0.6718, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.0183150183150182, |
|
"grad_norm": 0.2209685742855072, |
|
"learning_rate": 0.0002287278244146702, |
|
"loss": 0.6591, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"grad_norm": 0.2094419300556183, |
|
"learning_rate": 0.00022771755452371162, |
|
"loss": 0.5926, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.032967032967033, |
|
"grad_norm": 0.22118069231510162, |
|
"learning_rate": 0.00022670244234634902, |
|
"loss": 0.6281, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.0402930402930404, |
|
"grad_norm": 0.23426313698291779, |
|
"learning_rate": 0.00022568255113037658, |
|
"loss": 0.5911, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.0476190476190477, |
|
"grad_norm": 0.22640864551067352, |
|
"learning_rate": 0.00022465794442135244, |
|
"loss": 0.6435, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.054945054945055, |
|
"grad_norm": 0.2409924864768982, |
|
"learning_rate": 0.000223628686058639, |
|
"loss": 0.594, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.0622710622710623, |
|
"grad_norm": 0.2898027002811432, |
|
"learning_rate": 0.00022259484017142582, |
|
"loss": 0.578, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.0695970695970696, |
|
"grad_norm": 0.2668663561344147, |
|
"learning_rate": 0.00022155647117473343, |
|
"loss": 0.6175, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 0.25386887788772583, |
|
"learning_rate": 0.00022051364376540045, |
|
"loss": 0.5452, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.0842490842490842, |
|
"grad_norm": 0.2420186847448349, |
|
"learning_rate": 0.00021946642291805214, |
|
"loss": 0.5267, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.0915750915750915, |
|
"grad_norm": 0.26154059171676636, |
|
"learning_rate": 0.00021841487388105235, |
|
"loss": 0.5904, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.098901098901099, |
|
"grad_norm": 0.25012269616127014, |
|
"learning_rate": 0.00021735906217243802, |
|
"loss": 0.6004, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.098901098901099, |
|
"eval_loss": 0.5869444012641907, |
|
"eval_runtime": 73.5969, |
|
"eval_samples_per_second": 7.432, |
|
"eval_steps_per_second": 7.432, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.1062271062271063, |
|
"grad_norm": 0.2936391234397888, |
|
"learning_rate": 0.00021629905357583687, |
|
"loss": 0.6069, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.1135531135531136, |
|
"grad_norm": 0.3034394383430481, |
|
"learning_rate": 0.00021523491413636894, |
|
"loss": 0.4688, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.120879120879121, |
|
"grad_norm": 0.2803559899330139, |
|
"learning_rate": 0.00021416671015653144, |
|
"loss": 0.5647, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.1282051282051282, |
|
"grad_norm": 0.3114618957042694, |
|
"learning_rate": 0.00021309450819206763, |
|
"loss": 0.4767, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.1355311355311355, |
|
"grad_norm": 0.3169534206390381, |
|
"learning_rate": 0.0002120183750478201, |
|
"loss": 0.5048, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 0.3803180754184723, |
|
"learning_rate": 0.00021093837777356835, |
|
"loss": 0.5439, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.15018315018315, |
|
"grad_norm": 0.3590928614139557, |
|
"learning_rate": 0.00020985458365985112, |
|
"loss": 0.6012, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.1575091575091574, |
|
"grad_norm": 0.3438403606414795, |
|
"learning_rate": 0.00020876706023377394, |
|
"loss": 0.4902, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.164835164835165, |
|
"grad_norm": 0.35391849279403687, |
|
"learning_rate": 0.0002076758752548016, |
|
"loss": 0.4655, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.1721611721611722, |
|
"grad_norm": 0.3842090666294098, |
|
"learning_rate": 0.0002065810967105364, |
|
"loss": 0.4753, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.1794871794871795, |
|
"grad_norm": 0.35003578662872314, |
|
"learning_rate": 0.000205482792812482, |
|
"loss": 0.4344, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.1868131868131868, |
|
"grad_norm": 0.3732411563396454, |
|
"learning_rate": 0.0002043810319917937, |
|
"loss": 0.4107, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.1941391941391941, |
|
"grad_norm": 0.4202706515789032, |
|
"learning_rate": 0.00020327588289501425, |
|
"loss": 0.5001, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.2014652014652014, |
|
"grad_norm": 0.41824397444725037, |
|
"learning_rate": 0.00020216741437979735, |
|
"loss": 0.4344, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.2087912087912087, |
|
"grad_norm": 0.5380794405937195, |
|
"learning_rate": 0.00020105569551061693, |
|
"loss": 0.4992, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.2161172161172162, |
|
"grad_norm": 0.557085394859314, |
|
"learning_rate": 0.00019994079555446417, |
|
"loss": 0.4411, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.2234432234432235, |
|
"grad_norm": 0.485733300447464, |
|
"learning_rate": 0.00019882278397653175, |
|
"loss": 0.4401, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 1.3742977380752563, |
|
"learning_rate": 0.0001977017304358857, |
|
"loss": 0.3275, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.2380952380952381, |
|
"grad_norm": 0.5038601756095886, |
|
"learning_rate": 0.00019657770478112533, |
|
"loss": 0.3036, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.2454212454212454, |
|
"grad_norm": 0.615625262260437, |
|
"learning_rate": 0.00019545077704603088, |
|
"loss": 0.3079, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.2527472527472527, |
|
"grad_norm": 0.4520106315612793, |
|
"learning_rate": 0.00019432101744520052, |
|
"loss": 0.4792, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.26007326007326, |
|
"grad_norm": 0.4422815442085266, |
|
"learning_rate": 0.00019318849636967497, |
|
"loss": 0.5819, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.2673992673992673, |
|
"grad_norm": 0.39762523770332336, |
|
"learning_rate": 0.00019205328438255215, |
|
"loss": 0.6133, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.2747252747252746, |
|
"grad_norm": 0.308088481426239, |
|
"learning_rate": 0.00019091545221459048, |
|
"loss": 0.5021, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.282051282051282, |
|
"grad_norm": 0.3217431902885437, |
|
"learning_rate": 0.0001897750707598018, |
|
"loss": 0.5656, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.2893772893772895, |
|
"grad_norm": 0.34072449803352356, |
|
"learning_rate": 0.00018863221107103453, |
|
"loss": 0.6147, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.2967032967032968, |
|
"grad_norm": 0.3127173185348511, |
|
"learning_rate": 0.00018748694435554626, |
|
"loss": 0.5093, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.304029304029304, |
|
"grad_norm": 0.28147441148757935, |
|
"learning_rate": 0.00018633934197056747, |
|
"loss": 0.5148, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.3113553113553114, |
|
"grad_norm": 0.32677727937698364, |
|
"learning_rate": 0.00018518947541885532, |
|
"loss": 0.5213, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.3186813186813187, |
|
"grad_norm": 0.3397771120071411, |
|
"learning_rate": 0.0001840374163442385, |
|
"loss": 0.5752, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.326007326007326, |
|
"grad_norm": 0.32977497577667236, |
|
"learning_rate": 0.00018288323652715372, |
|
"loss": 0.6049, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.3196227550506592, |
|
"learning_rate": 0.00018172700788017285, |
|
"loss": 0.6103, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.3406593406593408, |
|
"grad_norm": 0.3648357689380646, |
|
"learning_rate": 0.0001805688024435228, |
|
"loss": 0.6024, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.347985347985348, |
|
"grad_norm": 0.3467482924461365, |
|
"learning_rate": 0.00017940869238059672, |
|
"loss": 0.6184, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.3553113553113554, |
|
"grad_norm": 0.34417080879211426, |
|
"learning_rate": 0.00017824674997345765, |
|
"loss": 0.5201, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.3626373626373627, |
|
"grad_norm": 0.32738953828811646, |
|
"learning_rate": 0.0001770830476183353, |
|
"loss": 0.4373, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.36996336996337, |
|
"grad_norm": 0.3495752513408661, |
|
"learning_rate": 0.00017591765782111493, |
|
"loss": 0.5156, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.3772893772893773, |
|
"grad_norm": 0.38357013463974, |
|
"learning_rate": 0.0001747506531928199, |
|
"loss": 0.5689, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 0.3947698473930359, |
|
"learning_rate": 0.00017358210644508761, |
|
"loss": 0.5288, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.3919413919413919, |
|
"grad_norm": 0.39951273798942566, |
|
"learning_rate": 0.00017241209038563915, |
|
"loss": 0.4936, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.3992673992673992, |
|
"grad_norm": 0.46843069791793823, |
|
"learning_rate": 0.0001712406779137427, |
|
"loss": 0.5193, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.4065934065934065, |
|
"grad_norm": 0.4092109799385071, |
|
"learning_rate": 0.00017006794201567186, |
|
"loss": 0.4587, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.4139194139194138, |
|
"grad_norm": 0.5061143636703491, |
|
"learning_rate": 0.00016889395576015763, |
|
"loss": 0.4816, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.4212454212454213, |
|
"grad_norm": 0.4533427655696869, |
|
"learning_rate": 0.0001677187922938362, |
|
"loss": 0.5173, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.4554060399532318, |
|
"learning_rate": 0.00016654252483669124, |
|
"loss": 0.4767, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.435897435897436, |
|
"grad_norm": 0.4680671691894531, |
|
"learning_rate": 0.00016536522667749196, |
|
"loss": 0.4486, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.4432234432234432, |
|
"grad_norm": 0.5479559898376465, |
|
"learning_rate": 0.00016418697116922672, |
|
"loss": 0.4627, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.4505494505494505, |
|
"grad_norm": 0.5161874890327454, |
|
"learning_rate": 0.00016300783172453265, |
|
"loss": 0.4527, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.4578754578754578, |
|
"grad_norm": 0.4987958073616028, |
|
"learning_rate": 0.00016182788181112146, |
|
"loss": 0.43, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.4652014652014653, |
|
"grad_norm": 0.5920132994651794, |
|
"learning_rate": 0.00016064719494720238, |
|
"loss": 0.4271, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4652014652014653, |
|
"eval_loss": 0.5827460885047913, |
|
"eval_runtime": 73.578, |
|
"eval_samples_per_second": 7.434, |
|
"eval_steps_per_second": 7.434, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4725274725274726, |
|
"grad_norm": 0.5107991695404053, |
|
"learning_rate": 0.000159465844696901, |
|
"loss": 0.3252, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.47985347985348, |
|
"grad_norm": 0.5588508248329163, |
|
"learning_rate": 0.0001582839046656762, |
|
"loss": 0.3086, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.4871794871794872, |
|
"grad_norm": 0.5706549882888794, |
|
"learning_rate": 0.0001571014484957337, |
|
"loss": 0.2613, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.4945054945054945, |
|
"grad_norm": 0.8393925428390503, |
|
"learning_rate": 0.0001559185498614379, |
|
"loss": 0.3406, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.5018315018315018, |
|
"grad_norm": 0.6459693312644958, |
|
"learning_rate": 0.0001547352824647216, |
|
"loss": 0.3553, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.5091575091575091, |
|
"grad_norm": 0.313265323638916, |
|
"learning_rate": 0.00015355172003049357, |
|
"loss": 0.4846, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.5164835164835164, |
|
"grad_norm": 0.3253680169582367, |
|
"learning_rate": 0.00015236793630204554, |
|
"loss": 0.5412, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.5238095238095237, |
|
"grad_norm": 0.35269081592559814, |
|
"learning_rate": 0.000151184005036457, |
|
"loss": 0.6062, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.531135531135531, |
|
"grad_norm": 0.3606890141963959, |
|
"learning_rate": 0.00015, |
|
"loss": 0.5544, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.36298248171806335, |
|
"learning_rate": 0.000148815994963543, |
|
"loss": 0.542, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5457875457875456, |
|
"grad_norm": 0.36222681403160095, |
|
"learning_rate": 0.00014763206369795446, |
|
"loss": 0.6695, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.5531135531135531, |
|
"grad_norm": 0.4112010598182678, |
|
"learning_rate": 0.00014644827996950643, |
|
"loss": 0.6308, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.5604395604395604, |
|
"grad_norm": 0.33088386058807373, |
|
"learning_rate": 0.00014526471753527838, |
|
"loss": 0.5061, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.5677655677655677, |
|
"grad_norm": 0.3446553349494934, |
|
"learning_rate": 0.0001440814501385621, |
|
"loss": 0.5424, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.575091575091575, |
|
"grad_norm": 0.3574959933757782, |
|
"learning_rate": 0.0001428985515042663, |
|
"loss": 0.448, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.5824175824175826, |
|
"grad_norm": 0.35467544198036194, |
|
"learning_rate": 0.00014171609533432378, |
|
"loss": 0.5307, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.5897435897435899, |
|
"grad_norm": 0.34874802827835083, |
|
"learning_rate": 0.00014053415530309896, |
|
"loss": 0.5321, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.5970695970695972, |
|
"grad_norm": 0.37879034876823425, |
|
"learning_rate": 0.0001393528050527976, |
|
"loss": 0.4935, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.6043956043956045, |
|
"grad_norm": 0.3754430115222931, |
|
"learning_rate": 0.00013817211818887852, |
|
"loss": 0.5807, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.6117216117216118, |
|
"grad_norm": 0.40496763586997986, |
|
"learning_rate": 0.0001369921682754674, |
|
"loss": 0.6015, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.619047619047619, |
|
"grad_norm": 0.38105273246765137, |
|
"learning_rate": 0.00013581302883077325, |
|
"loss": 0.512, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.6263736263736264, |
|
"grad_norm": 0.45323583483695984, |
|
"learning_rate": 0.000134634773322508, |
|
"loss": 0.5284, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.6336996336996337, |
|
"grad_norm": 0.4019657075405121, |
|
"learning_rate": 0.00013345747516330873, |
|
"loss": 0.5855, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.641025641025641, |
|
"grad_norm": 0.40215152502059937, |
|
"learning_rate": 0.0001322812077061638, |
|
"loss": 0.472, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.6483516483516483, |
|
"grad_norm": 0.4318372309207916, |
|
"learning_rate": 0.0001311060442398424, |
|
"loss": 0.4597, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.6556776556776556, |
|
"grad_norm": 0.39995384216308594, |
|
"learning_rate": 0.00012993205798432814, |
|
"loss": 0.4269, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.6630036630036629, |
|
"grad_norm": 0.44160038232803345, |
|
"learning_rate": 0.0001287593220862573, |
|
"loss": 0.495, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.6703296703296702, |
|
"grad_norm": 0.4725303649902344, |
|
"learning_rate": 0.00012758790961436083, |
|
"loss": 0.4705, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.6776556776556777, |
|
"grad_norm": 0.7378038763999939, |
|
"learning_rate": 0.0001264178935549124, |
|
"loss": 0.4881, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.684981684981685, |
|
"grad_norm": 0.5113095045089722, |
|
"learning_rate": 0.0001252493468071801, |
|
"loss": 0.4921, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"grad_norm": 0.5403740406036377, |
|
"learning_rate": 0.00012408234217888508, |
|
"loss": 0.4288, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.6996336996336996, |
|
"grad_norm": 0.5715883374214172, |
|
"learning_rate": 0.0001229169523816647, |
|
"loss": 0.3966, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.7069597069597071, |
|
"grad_norm": 0.6109308004379272, |
|
"learning_rate": 0.00012175325002654229, |
|
"loss": 0.4353, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 0.5063544511795044, |
|
"learning_rate": 0.00012059130761940328, |
|
"loss": 0.3582, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.7216117216117217, |
|
"grad_norm": 0.5469954609870911, |
|
"learning_rate": 0.00011943119755647719, |
|
"loss": 0.3257, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.728937728937729, |
|
"grad_norm": 0.6388989686965942, |
|
"learning_rate": 0.00011827299211982714, |
|
"loss": 0.3494, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.7362637362637363, |
|
"grad_norm": 0.6123701333999634, |
|
"learning_rate": 0.00011711676347284631, |
|
"loss": 0.2883, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.7435897435897436, |
|
"grad_norm": 0.8964057564735413, |
|
"learning_rate": 0.00011596258365576144, |
|
"loss": 0.3418, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.750915750915751, |
|
"grad_norm": 0.5625249147415161, |
|
"learning_rate": 0.0001148105245811447, |
|
"loss": 0.3956, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.7582417582417582, |
|
"grad_norm": 0.3839769661426544, |
|
"learning_rate": 0.00011366065802943254, |
|
"loss": 0.5822, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.7655677655677655, |
|
"grad_norm": 0.38849931955337524, |
|
"learning_rate": 0.00011251305564445375, |
|
"loss": 0.7068, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.7728937728937728, |
|
"grad_norm": 0.3844453990459442, |
|
"learning_rate": 0.00011136778892896552, |
|
"loss": 0.5259, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.7802197802197801, |
|
"grad_norm": 0.39755213260650635, |
|
"learning_rate": 0.00011022492924019817, |
|
"loss": 0.5587, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.7875457875457874, |
|
"grad_norm": 0.4066022038459778, |
|
"learning_rate": 0.00010908454778540952, |
|
"loss": 0.5985, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.7948717948717947, |
|
"grad_norm": 0.453279048204422, |
|
"learning_rate": 0.00010794671561744785, |
|
"loss": 0.6355, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.8021978021978022, |
|
"grad_norm": 0.44758322834968567, |
|
"learning_rate": 0.00010681150363032503, |
|
"loss": 0.5131, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.8095238095238095, |
|
"grad_norm": 0.39223358035087585, |
|
"learning_rate": 0.00010567898255479951, |
|
"loss": 0.5434, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.8168498168498168, |
|
"grad_norm": 0.38072723150253296, |
|
"learning_rate": 0.00010454922295396907, |
|
"loss": 0.5052, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.8241758241758241, |
|
"grad_norm": 0.3893994092941284, |
|
"learning_rate": 0.00010342229521887467, |
|
"loss": 0.5188, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.8315018315018317, |
|
"grad_norm": 0.3957141041755676, |
|
"learning_rate": 0.00010229826956411426, |
|
"loss": 0.4949, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.8315018315018317, |
|
"eval_loss": 0.5701812505722046, |
|
"eval_runtime": 73.3521, |
|
"eval_samples_per_second": 7.457, |
|
"eval_steps_per_second": 7.457, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.838827838827839, |
|
"grad_norm": 0.3748125433921814, |
|
"learning_rate": 0.00010117721602346823, |
|
"loss": 0.482, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 0.35637104511260986, |
|
"learning_rate": 0.00010005920444553586, |
|
"loss": 0.4196, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.8534798534798536, |
|
"grad_norm": 0.45035073161125183, |
|
"learning_rate": 9.894430448938305e-05, |
|
"loss": 0.5593, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.8608058608058609, |
|
"grad_norm": 0.42238089442253113, |
|
"learning_rate": 9.783258562020263e-05, |
|
"loss": 0.4647, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.8681318681318682, |
|
"grad_norm": 0.3975355625152588, |
|
"learning_rate": 9.672411710498575e-05, |
|
"loss": 0.5369, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.8754578754578755, |
|
"grad_norm": 0.4017747640609741, |
|
"learning_rate": 9.561896800820633e-05, |
|
"loss": 0.4813, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.8827838827838828, |
|
"grad_norm": 0.3966550827026367, |
|
"learning_rate": 9.4517207187518e-05, |
|
"loss": 0.425, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.89010989010989, |
|
"grad_norm": 0.4850409924983978, |
|
"learning_rate": 9.341890328946358e-05, |
|
"loss": 0.6023, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.8974358974358974, |
|
"grad_norm": 0.4538116753101349, |
|
"learning_rate": 9.23241247451984e-05, |
|
"loss": 0.4504, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 0.42693740129470825, |
|
"learning_rate": 9.123293976622602e-05, |
|
"loss": 0.4383, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.912087912087912, |
|
"grad_norm": 0.529269814491272, |
|
"learning_rate": 9.014541634014885e-05, |
|
"loss": 0.5292, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.9194139194139193, |
|
"grad_norm": 0.520923912525177, |
|
"learning_rate": 8.906162222643167e-05, |
|
"loss": 0.5155, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.9267399267399268, |
|
"grad_norm": 0.47154876589775085, |
|
"learning_rate": 8.798162495217989e-05, |
|
"loss": 0.3989, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.934065934065934, |
|
"grad_norm": 0.5603944659233093, |
|
"learning_rate": 8.690549180793239e-05, |
|
"loss": 0.4806, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.9413919413919414, |
|
"grad_norm": 0.5125073194503784, |
|
"learning_rate": 8.583328984346854e-05, |
|
"loss": 0.4296, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.9487179487179487, |
|
"grad_norm": 0.519232988357544, |
|
"learning_rate": 8.476508586363106e-05, |
|
"loss": 0.4025, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.9560439560439562, |
|
"grad_norm": 0.5697182416915894, |
|
"learning_rate": 8.370094642416314e-05, |
|
"loss": 0.3478, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.9633699633699635, |
|
"grad_norm": 0.516194760799408, |
|
"learning_rate": 8.264093782756195e-05, |
|
"loss": 0.3657, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.9706959706959708, |
|
"grad_norm": 0.6397049427032471, |
|
"learning_rate": 8.158512611894759e-05, |
|
"loss": 0.4872, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.978021978021978, |
|
"grad_norm": 0.6545494794845581, |
|
"learning_rate": 8.053357708194783e-05, |
|
"loss": 0.3736, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.9853479853479854, |
|
"grad_norm": 0.6731343269348145, |
|
"learning_rate": 7.948635623459958e-05, |
|
"loss": 0.3911, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.9926739926739927, |
|
"grad_norm": 0.8151178359985352, |
|
"learning_rate": 7.844352882526661e-05, |
|
"loss": 0.2981, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.9851661920547485, |
|
"learning_rate": 7.740515982857419e-05, |
|
"loss": 0.5828, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.0073260073260073, |
|
"grad_norm": 0.26004815101623535, |
|
"learning_rate": 7.637131394136096e-05, |
|
"loss": 0.45, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.0146520146520146, |
|
"grad_norm": 0.27779507637023926, |
|
"learning_rate": 7.534205557864752e-05, |
|
"loss": 0.5448, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.021978021978022, |
|
"grad_norm": 0.31077465415000916, |
|
"learning_rate": 7.431744886962338e-05, |
|
"loss": 0.5772, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.029304029304029, |
|
"grad_norm": 0.3218604326248169, |
|
"learning_rate": 7.329755765365101e-05, |
|
"loss": 0.4763, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.0366300366300365, |
|
"grad_norm": 0.35660311579704285, |
|
"learning_rate": 7.228244547628837e-05, |
|
"loss": 0.5446, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.043956043956044, |
|
"grad_norm": 0.369620680809021, |
|
"learning_rate": 7.127217558532974e-05, |
|
"loss": 0.5142, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.051282051282051, |
|
"grad_norm": 0.374860942363739, |
|
"learning_rate": 7.02668109268648e-05, |
|
"loss": 0.4916, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.0586080586080584, |
|
"grad_norm": 0.38749629259109497, |
|
"learning_rate": 6.926641414135674e-05, |
|
"loss": 0.4357, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.065934065934066, |
|
"grad_norm": 0.405636191368103, |
|
"learning_rate": 6.827104755973947e-05, |
|
"loss": 0.4206, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.0732600732600734, |
|
"grad_norm": 0.40589308738708496, |
|
"learning_rate": 6.728077319953388e-05, |
|
"loss": 0.4861, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.0805860805860807, |
|
"grad_norm": 0.4325176775455475, |
|
"learning_rate": 6.629565276098398e-05, |
|
"loss": 0.4732, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.087912087912088, |
|
"grad_norm": 0.4640852212905884, |
|
"learning_rate": 6.531574762321226e-05, |
|
"loss": 0.4558, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.0952380952380953, |
|
"grad_norm": 0.43299245834350586, |
|
"learning_rate": 6.434111884039579e-05, |
|
"loss": 0.4278, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.1025641025641026, |
|
"grad_norm": 0.4622572660446167, |
|
"learning_rate": 6.337182713796172e-05, |
|
"loss": 0.4749, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.10989010989011, |
|
"grad_norm": 0.47975558042526245, |
|
"learning_rate": 6.240793290880404e-05, |
|
"loss": 0.3462, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.1172161172161172, |
|
"grad_norm": 0.48463815450668335, |
|
"learning_rate": 6.144949620952074e-05, |
|
"loss": 0.4606, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.1245421245421245, |
|
"grad_norm": 0.5988311767578125, |
|
"learning_rate": 6.049657675667161e-05, |
|
"loss": 0.3281, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.131868131868132, |
|
"grad_norm": 0.43833500146865845, |
|
"learning_rate": 5.954923392305783e-05, |
|
"loss": 0.3536, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.139194139194139, |
|
"grad_norm": 0.5085727572441101, |
|
"learning_rate": 5.860752673402253e-05, |
|
"loss": 0.3745, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.1465201465201464, |
|
"grad_norm": 0.5323147177696228, |
|
"learning_rate": 5.767151386377313e-05, |
|
"loss": 0.4184, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 0.5333714485168457, |
|
"learning_rate": 5.6741253631725734e-05, |
|
"loss": 0.2953, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.161172161172161, |
|
"grad_norm": 0.5652948021888733, |
|
"learning_rate": 5.581680399887123e-05, |
|
"loss": 0.3118, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.1684981684981683, |
|
"grad_norm": 0.6583665609359741, |
|
"learning_rate": 5.4898222564164196e-05, |
|
"loss": 0.2828, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.1758241758241756, |
|
"grad_norm": 0.603283703327179, |
|
"learning_rate": 5.398556656093393e-05, |
|
"loss": 0.31, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.183150183150183, |
|
"grad_norm": 0.770128607749939, |
|
"learning_rate": 5.307889285331851e-05, |
|
"loss": 0.3279, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.1904761904761907, |
|
"grad_norm": 0.6317210793495178, |
|
"learning_rate": 5.2178257932721996e-05, |
|
"loss": 0.3703, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.197802197802198, |
|
"grad_norm": 0.6448614597320557, |
|
"learning_rate": 5.128371791429436e-05, |
|
"loss": 0.3123, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.197802197802198, |
|
"eval_loss": 0.6170002222061157, |
|
"eval_runtime": 73.4287, |
|
"eval_samples_per_second": 7.449, |
|
"eval_steps_per_second": 7.449, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.2051282051282053, |
|
"grad_norm": 0.599308431148529, |
|
"learning_rate": 5.039532853343533e-05, |
|
"loss": 0.249, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.2124542124542126, |
|
"grad_norm": 0.7922989130020142, |
|
"learning_rate": 4.951314514232175e-05, |
|
"loss": 0.2308, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.21978021978022, |
|
"grad_norm": 0.8403403162956238, |
|
"learning_rate": 4.863722270645869e-05, |
|
"loss": 0.2293, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.227106227106227, |
|
"grad_norm": 0.8893972039222717, |
|
"learning_rate": 4.776761580125495e-05, |
|
"loss": 0.2041, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.2344322344322345, |
|
"grad_norm": 0.6155855059623718, |
|
"learning_rate": 4.690437860862234e-05, |
|
"loss": 0.1374, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.241758241758242, |
|
"grad_norm": 0.7354236245155334, |
|
"learning_rate": 4.6047564913600234e-05, |
|
"loss": 0.1482, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.249084249084249, |
|
"grad_norm": 2.1080105304718018, |
|
"learning_rate": 4.519722810100403e-05, |
|
"loss": 0.1127, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.2564102564102564, |
|
"grad_norm": 0.4302568733692169, |
|
"learning_rate": 4.435342115209916e-05, |
|
"loss": 0.5135, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.2637362637362637, |
|
"grad_norm": 0.8538389205932617, |
|
"learning_rate": 4.35161966413001e-05, |
|
"loss": 0.6252, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.271062271062271, |
|
"grad_norm": 0.7556006908416748, |
|
"learning_rate": 4.2685606732894316e-05, |
|
"loss": 0.4975, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.2783882783882783, |
|
"grad_norm": 0.58868408203125, |
|
"learning_rate": 4.186170317779257e-05, |
|
"loss": 0.4612, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.2857142857142856, |
|
"grad_norm": 0.7426422238349915, |
|
"learning_rate": 4.1044537310304135e-05, |
|
"loss": 0.458, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.293040293040293, |
|
"grad_norm": 0.6092411279678345, |
|
"learning_rate": 4.023416004493849e-05, |
|
"loss": 0.5098, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.3003663003663, |
|
"grad_norm": 0.6772862076759338, |
|
"learning_rate": 3.943062187323317e-05, |
|
"loss": 0.5537, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 0.5766953229904175, |
|
"learning_rate": 3.863397286060752e-05, |
|
"loss": 0.4449, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.315018315018315, |
|
"grad_norm": 0.5632515549659729, |
|
"learning_rate": 3.784426264324364e-05, |
|
"loss": 0.4455, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.3223443223443225, |
|
"grad_norm": 0.4638958275318146, |
|
"learning_rate": 3.7061540424993455e-05, |
|
"loss": 0.3575, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.32967032967033, |
|
"grad_norm": 0.5185256004333496, |
|
"learning_rate": 3.628585497431319e-05, |
|
"loss": 0.4092, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.336996336996337, |
|
"grad_norm": 0.4830188751220703, |
|
"learning_rate": 3.551725462122475e-05, |
|
"loss": 0.4283, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.3443223443223444, |
|
"grad_norm": 0.5283904075622559, |
|
"learning_rate": 3.47557872543044e-05, |
|
"loss": 0.4279, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.3516483516483517, |
|
"grad_norm": 0.48210349678993225, |
|
"learning_rate": 3.400150031769916e-05, |
|
"loss": 0.3868, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.358974358974359, |
|
"grad_norm": 0.49750009179115295, |
|
"learning_rate": 3.325444080817054e-05, |
|
"loss": 0.4265, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.3663003663003663, |
|
"grad_norm": 0.5228102803230286, |
|
"learning_rate": 3.251465527216644e-05, |
|
"loss": 0.3988, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.3736263736263736, |
|
"grad_norm": 0.4787524938583374, |
|
"learning_rate": 3.178218980292116e-05, |
|
"loss": 0.4338, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"grad_norm": 0.5247374773025513, |
|
"learning_rate": 3.1057090037583195e-05, |
|
"loss": 0.457, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.3882783882783882, |
|
"grad_norm": 0.49824175238609314, |
|
"learning_rate": 3.03394011543721e-05, |
|
"loss": 0.3382, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.3956043956043955, |
|
"grad_norm": 0.5834226012229919, |
|
"learning_rate": 2.9629167869763314e-05, |
|
"loss": 0.4105, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.402930402930403, |
|
"grad_norm": 0.5177899599075317, |
|
"learning_rate": 2.8926434435702213e-05, |
|
"loss": 0.3676, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.41025641025641, |
|
"grad_norm": 0.5421662926673889, |
|
"learning_rate": 2.823124463684692e-05, |
|
"loss": 0.3891, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.4175824175824174, |
|
"grad_norm": 0.6656437516212463, |
|
"learning_rate": 2.7543641787840137e-05, |
|
"loss": 0.4003, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.4249084249084247, |
|
"grad_norm": 0.5354308485984802, |
|
"learning_rate": 2.6863668730610628e-05, |
|
"loss": 0.334, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.4322344322344325, |
|
"grad_norm": 0.5242961049079895, |
|
"learning_rate": 2.6191367831703597e-05, |
|
"loss": 0.3575, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.4395604395604398, |
|
"grad_norm": 0.5668814182281494, |
|
"learning_rate": 2.5526780979641132e-05, |
|
"loss": 0.251, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.446886446886447, |
|
"grad_norm": 0.6647098064422607, |
|
"learning_rate": 2.486994958231238e-05, |
|
"loss": 0.3012, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.4542124542124544, |
|
"grad_norm": 0.590238630771637, |
|
"learning_rate": 2.422091456439338e-05, |
|
"loss": 0.2125, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 0.6544525027275085, |
|
"learning_rate": 2.3579716364797406e-05, |
|
"loss": 0.2345, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.468864468864469, |
|
"grad_norm": 0.6543833017349243, |
|
"learning_rate": 2.294639493415517e-05, |
|
"loss": 0.2455, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.4761904761904763, |
|
"grad_norm": 0.8714612126350403, |
|
"learning_rate": 2.2320989732325816e-05, |
|
"loss": 0.2462, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.4835164835164836, |
|
"grad_norm": 0.8587841391563416, |
|
"learning_rate": 2.170353972593825e-05, |
|
"loss": 0.1519, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.490842490842491, |
|
"grad_norm": 0.6549997329711914, |
|
"learning_rate": 2.1094083385963202e-05, |
|
"loss": 0.1206, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.498168498168498, |
|
"grad_norm": 0.5828280448913574, |
|
"learning_rate": 2.049265868531651e-05, |
|
"loss": 0.0964, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.5054945054945055, |
|
"grad_norm": 0.29954391717910767, |
|
"learning_rate": 1.989930309649282e-05, |
|
"loss": 0.4719, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.5128205128205128, |
|
"grad_norm": 0.41014474630355835, |
|
"learning_rate": 1.9314053589231067e-05, |
|
"loss": 0.5926, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.52014652014652, |
|
"grad_norm": 0.42200765013694763, |
|
"learning_rate": 1.873694662821096e-05, |
|
"loss": 0.5305, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.5274725274725274, |
|
"grad_norm": 0.4672837257385254, |
|
"learning_rate": 1.816801817078093e-05, |
|
"loss": 0.5177, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.5347985347985347, |
|
"grad_norm": 0.45549672842025757, |
|
"learning_rate": 1.760730366471796e-05, |
|
"loss": 0.4906, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.542124542124542, |
|
"grad_norm": 0.42634275555610657, |
|
"learning_rate": 1.705483804601871e-05, |
|
"loss": 0.4466, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.5494505494505493, |
|
"grad_norm": 0.5157731175422668, |
|
"learning_rate": 1.6510655736722967e-05, |
|
"loss": 0.5153, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.5567765567765566, |
|
"grad_norm": 0.6015162467956543, |
|
"learning_rate": 1.5974790642768903e-05, |
|
"loss": 0.4401, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"grad_norm": 0.5981739163398743, |
|
"learning_rate": 1.5447276151880473e-05, |
|
"loss": 0.5157, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"eval_loss": 0.6206538081169128, |
|
"eval_runtime": 73.5137, |
|
"eval_samples_per_second": 7.441, |
|
"eval_steps_per_second": 7.441, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 0.5820057988166809, |
|
"learning_rate": 1.4928145131487267e-05, |
|
"loss": 0.4702, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.578754578754579, |
|
"grad_norm": 0.5424510836601257, |
|
"learning_rate": 1.4417429926676482e-05, |
|
"loss": 0.4174, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.586080586080586, |
|
"grad_norm": 0.6577489376068115, |
|
"learning_rate": 1.39151623581778e-05, |
|
"loss": 0.4169, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.5934065934065935, |
|
"grad_norm": 0.5944687128067017, |
|
"learning_rate": 1.3421373720380669e-05, |
|
"loss": 0.4272, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.600732600732601, |
|
"grad_norm": 0.7125936150550842, |
|
"learning_rate": 1.2936094779384486e-05, |
|
"loss": 0.4438, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.608058608058608, |
|
"grad_norm": 0.679352879524231, |
|
"learning_rate": 1.245935577108168e-05, |
|
"loss": 0.4949, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 1.0750776529312134, |
|
"learning_rate": 1.199118639927385e-05, |
|
"loss": 0.3986, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.6227106227106227, |
|
"grad_norm": 0.8349664211273193, |
|
"learning_rate": 1.1531615833820906e-05, |
|
"loss": 0.4152, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.63003663003663, |
|
"grad_norm": 0.770355761051178, |
|
"learning_rate": 1.108067270882384e-05, |
|
"loss": 0.4878, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.6373626373626373, |
|
"grad_norm": 0.7165803909301758, |
|
"learning_rate": 1.0638385120840414e-05, |
|
"loss": 0.3852, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.6446886446886446, |
|
"grad_norm": 0.7379334568977356, |
|
"learning_rate": 1.0204780627134784e-05, |
|
"loss": 0.4758, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.652014652014652, |
|
"grad_norm": 0.6477860808372498, |
|
"learning_rate": 9.77988624396025e-06, |
|
"loss": 0.3718, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.659340659340659, |
|
"grad_norm": 0.6215366721153259, |
|
"learning_rate": 9.363728444876239e-06, |
|
"loss": 0.3146, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.7191038727760315, |
|
"learning_rate": 8.956333159098677e-06, |
|
"loss": 0.2567, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.6739926739926743, |
|
"grad_norm": 0.6438663005828857, |
|
"learning_rate": 8.557725769884444e-06, |
|
"loss": 0.3366, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.6813186813186816, |
|
"grad_norm": 0.7834159135818481, |
|
"learning_rate": 8.167931112949955e-06, |
|
"loss": 0.2802, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.688644688644689, |
|
"grad_norm": 0.687300443649292, |
|
"learning_rate": 7.786973474923569e-06, |
|
"loss": 0.3275, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.695970695970696, |
|
"grad_norm": 0.633350133895874, |
|
"learning_rate": 7.41487659183258e-06, |
|
"loss": 0.2462, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.7032967032967035, |
|
"grad_norm": 0.8299592137336731, |
|
"learning_rate": 7.051663647624117e-06, |
|
"loss": 0.2847, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.7106227106227108, |
|
"grad_norm": 0.6400296688079834, |
|
"learning_rate": 6.697357272720782e-06, |
|
"loss": 0.239, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.717948717948718, |
|
"grad_norm": 0.620663583278656, |
|
"learning_rate": 6.35197954261058e-06, |
|
"loss": 0.183, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.7252747252747254, |
|
"grad_norm": 0.7446674108505249, |
|
"learning_rate": 6.015551976471433e-06, |
|
"loss": 0.2334, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.7326007326007327, |
|
"grad_norm": 0.8476528525352478, |
|
"learning_rate": 5.688095535830573e-06, |
|
"loss": 0.1747, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.73992673992674, |
|
"grad_norm": 0.8313474655151367, |
|
"learning_rate": 5.369630623258248e-06, |
|
"loss": 0.189, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.7472527472527473, |
|
"grad_norm": 0.9323949217796326, |
|
"learning_rate": 5.060177081096728e-06, |
|
"loss": 0.0911, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.7545787545787546, |
|
"grad_norm": 0.33731380105018616, |
|
"learning_rate": 4.759754190223925e-06, |
|
"loss": 0.4471, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.761904761904762, |
|
"grad_norm": 0.37849536538124084, |
|
"learning_rate": 4.468380668852068e-06, |
|
"loss": 0.6115, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 0.4009314775466919, |
|
"learning_rate": 4.186074671361456e-06, |
|
"loss": 0.575, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.7765567765567765, |
|
"grad_norm": 0.3839149475097656, |
|
"learning_rate": 3.912853787169345e-06, |
|
"loss": 0.5056, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.7838827838827838, |
|
"grad_norm": 0.40725114941596985, |
|
"learning_rate": 3.6487350396339597e-06, |
|
"loss": 0.4408, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.791208791208791, |
|
"grad_norm": 0.4696556329727173, |
|
"learning_rate": 3.3937348849939204e-06, |
|
"loss": 0.5321, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.7985347985347984, |
|
"grad_norm": 0.43771475553512573, |
|
"learning_rate": 3.147869211342818e-06, |
|
"loss": 0.4356, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.8058608058608057, |
|
"grad_norm": 0.5101984739303589, |
|
"learning_rate": 2.911153337639388e-06, |
|
"loss": 0.4499, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.813186813186813, |
|
"grad_norm": 0.5081220865249634, |
|
"learning_rate": 2.683602012752939e-06, |
|
"loss": 0.4756, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.8205128205128203, |
|
"grad_norm": 0.5413169264793396, |
|
"learning_rate": 2.4652294145445226e-06, |
|
"loss": 0.4866, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.8278388278388276, |
|
"grad_norm": 0.5008475184440613, |
|
"learning_rate": 2.256049148983441e-06, |
|
"loss": 0.4641, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.8351648351648353, |
|
"grad_norm": 0.5848665833473206, |
|
"learning_rate": 2.0560742492995885e-06, |
|
"loss": 0.4572, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.8424908424908426, |
|
"grad_norm": 0.6183776259422302, |
|
"learning_rate": 1.8653171751714379e-06, |
|
"loss": 0.5198, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.84981684981685, |
|
"grad_norm": 0.5888795852661133, |
|
"learning_rate": 1.6837898119496263e-06, |
|
"loss": 0.451, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.5763834118843079, |
|
"learning_rate": 1.5115034699164308e-06, |
|
"loss": 0.4926, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.8644688644688645, |
|
"grad_norm": 0.6053770184516907, |
|
"learning_rate": 1.348468883581183e-06, |
|
"loss": 0.443, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.871794871794872, |
|
"grad_norm": 0.5707501769065857, |
|
"learning_rate": 1.19469621101132e-06, |
|
"loss": 0.4184, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.879120879120879, |
|
"grad_norm": 0.5886608958244324, |
|
"learning_rate": 1.0501950331995578e-06, |
|
"loss": 0.4002, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.8864468864468864, |
|
"grad_norm": 0.5797408223152161, |
|
"learning_rate": 9.149743534668353e-07, |
|
"loss": 0.4117, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.8937728937728937, |
|
"grad_norm": 0.7185364961624146, |
|
"learning_rate": 7.890425969014625e-07, |
|
"loss": 0.4059, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.901098901098901, |
|
"grad_norm": 0.7297194004058838, |
|
"learning_rate": 6.724076098341247e-07, |
|
"loss": 0.3407, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.9084249084249083, |
|
"grad_norm": 0.6948420405387878, |
|
"learning_rate": 5.650766593489897e-07, |
|
"loss": 0.3564, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.9157509157509156, |
|
"grad_norm": 0.6854040026664734, |
|
"learning_rate": 4.6705643283102003e-07, |
|
"loss": 0.4018, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 0.7185117602348328, |
|
"learning_rate": 3.7835303754918943e-07, |
|
"loss": 0.3607, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.9304029304029307, |
|
"grad_norm": 0.6749255061149597, |
|
"learning_rate": 2.9897200027598767e-07, |
|
"loss": 0.276, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.9304029304029307, |
|
"eval_loss": 0.60358726978302, |
|
"eval_runtime": 73.527, |
|
"eval_samples_per_second": 7.439, |
|
"eval_steps_per_second": 7.439, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 408, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 3 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.659682780951347e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|