{ "best_metric": 0.8232, "best_model_checkpoint": "swin-tiny-patch4-window7-224-swinnn/checkpoint-17550", "epoch": 49.89339019189765, "eval_steps": 500, "global_step": 17550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028429282160625444, "grad_norm": 1.8907402753829956, "learning_rate": 2.8490028490028494e-07, "loss": 0.2926, "step": 10 }, { "epoch": 0.05685856432125089, "grad_norm": 2.258711576461792, "learning_rate": 5.698005698005699e-07, "loss": 0.293, "step": 20 }, { "epoch": 0.08528784648187633, "grad_norm": 2.0438408851623535, "learning_rate": 8.547008547008548e-07, "loss": 0.2904, "step": 30 }, { "epoch": 0.11371712864250177, "grad_norm": 2.178166627883911, "learning_rate": 1.1396011396011398e-06, "loss": 0.2915, "step": 40 }, { "epoch": 0.14214641080312723, "grad_norm": 2.442103147506714, "learning_rate": 1.4245014245014246e-06, "loss": 0.2901, "step": 50 }, { "epoch": 0.17057569296375266, "grad_norm": 2.6239702701568604, "learning_rate": 1.7094017094017097e-06, "loss": 0.2906, "step": 60 }, { "epoch": 0.19900497512437812, "grad_norm": 2.1954970359802246, "learning_rate": 1.9943019943019943e-06, "loss": 0.2895, "step": 70 }, { "epoch": 0.22743425728500355, "grad_norm": 2.1789145469665527, "learning_rate": 2.2792022792022796e-06, "loss": 0.289, "step": 80 }, { "epoch": 0.255863539445629, "grad_norm": 2.859412908554077, "learning_rate": 2.564102564102564e-06, "loss": 0.2905, "step": 90 }, { "epoch": 0.28429282160625446, "grad_norm": 2.286804676055908, "learning_rate": 2.8490028490028492e-06, "loss": 0.2884, "step": 100 }, { "epoch": 0.31272210376687987, "grad_norm": 2.054844856262207, "learning_rate": 3.133903133903134e-06, "loss": 0.2918, "step": 110 }, { "epoch": 0.3411513859275053, "grad_norm": 2.4455275535583496, "learning_rate": 3.4188034188034193e-06, "loss": 0.2877, "step": 120 }, { "epoch": 0.3695806680881308, "grad_norm": 2.3894708156585693, "learning_rate": 3.7037037037037037e-06, "loss": 0.283, "step": 130 }, { "epoch": 0.39800995024875624, "grad_norm": 2.3629746437072754, "learning_rate": 3.988603988603989e-06, "loss": 0.2861, "step": 140 }, { "epoch": 0.42643923240938164, "grad_norm": 2.7381527423858643, "learning_rate": 4.273504273504274e-06, "loss": 0.2892, "step": 150 }, { "epoch": 0.4548685145700071, "grad_norm": 2.8951902389526367, "learning_rate": 4.558404558404559e-06, "loss": 0.2884, "step": 160 }, { "epoch": 0.48329779673063256, "grad_norm": 2.5735270977020264, "learning_rate": 4.8433048433048435e-06, "loss": 0.289, "step": 170 }, { "epoch": 0.511727078891258, "grad_norm": 4.083412170410156, "learning_rate": 5.128205128205128e-06, "loss": 0.2885, "step": 180 }, { "epoch": 0.5401563610518835, "grad_norm": 1.951905369758606, "learning_rate": 5.413105413105413e-06, "loss": 0.2901, "step": 190 }, { "epoch": 0.5685856432125089, "grad_norm": 3.175370216369629, "learning_rate": 5.6980056980056985e-06, "loss": 0.2863, "step": 200 }, { "epoch": 0.5970149253731343, "grad_norm": 2.883253812789917, "learning_rate": 5.982905982905984e-06, "loss": 0.2853, "step": 210 }, { "epoch": 0.6254442075337597, "grad_norm": 3.3479135036468506, "learning_rate": 6.267806267806268e-06, "loss": 0.2828, "step": 220 }, { "epoch": 0.6538734896943852, "grad_norm": 3.148728847503662, "learning_rate": 6.5527065527065525e-06, "loss": 0.285, "step": 230 }, { "epoch": 0.6823027718550106, "grad_norm": 2.233570098876953, "learning_rate": 6.837606837606839e-06, "loss": 0.2861, "step": 240 }, { "epoch": 0.7107320540156361, "grad_norm": 3.506321668624878, "learning_rate": 7.122507122507123e-06, "loss": 0.2857, "step": 250 }, { "epoch": 0.7391613361762616, "grad_norm": 2.235628128051758, "learning_rate": 7.4074074074074075e-06, "loss": 0.2836, "step": 260 }, { "epoch": 0.767590618336887, "grad_norm": 2.8699324131011963, "learning_rate": 7.692307692307694e-06, "loss": 0.2862, "step": 270 }, { "epoch": 0.7960199004975125, "grad_norm": 3.3909108638763428, "learning_rate": 7.977207977207977e-06, "loss": 0.2842, "step": 280 }, { "epoch": 0.8244491826581379, "grad_norm": 5.050898551940918, "learning_rate": 8.262108262108262e-06, "loss": 0.2873, "step": 290 }, { "epoch": 0.8528784648187633, "grad_norm": 2.575756311416626, "learning_rate": 8.547008547008548e-06, "loss": 0.2836, "step": 300 }, { "epoch": 0.8813077469793887, "grad_norm": 2.999044895172119, "learning_rate": 8.831908831908831e-06, "loss": 0.2819, "step": 310 }, { "epoch": 0.9097370291400142, "grad_norm": 4.6781511306762695, "learning_rate": 9.116809116809118e-06, "loss": 0.2835, "step": 320 }, { "epoch": 0.9381663113006397, "grad_norm": 4.386454105377197, "learning_rate": 9.401709401709402e-06, "loss": 0.2807, "step": 330 }, { "epoch": 0.9665955934612651, "grad_norm": 1.9225760698318481, "learning_rate": 9.686609686609687e-06, "loss": 0.285, "step": 340 }, { "epoch": 0.9950248756218906, "grad_norm": 3.636368989944458, "learning_rate": 9.971509971509972e-06, "loss": 0.2802, "step": 350 }, { "epoch": 0.997867803837953, "eval_accuracy": 0.3222, "eval_loss": 0.27834293246269226, "eval_runtime": 13.4401, "eval_samples_per_second": 372.021, "eval_steps_per_second": 11.681, "step": 351 }, { "epoch": 1.023454157782516, "grad_norm": 3.545668125152588, "learning_rate": 1.0256410256410256e-05, "loss": 0.2866, "step": 360 }, { "epoch": 1.0518834399431414, "grad_norm": 3.710625171661377, "learning_rate": 1.0541310541310543e-05, "loss": 0.2828, "step": 370 }, { "epoch": 1.080312722103767, "grad_norm": 3.078389883041382, "learning_rate": 1.0826210826210826e-05, "loss": 0.2816, "step": 380 }, { "epoch": 1.1087420042643923, "grad_norm": 4.719541549682617, "learning_rate": 1.1111111111111112e-05, "loss": 0.2872, "step": 390 }, { "epoch": 1.1371712864250179, "grad_norm": 3.3766567707061768, "learning_rate": 1.1396011396011397e-05, "loss": 0.2846, "step": 400 }, { "epoch": 1.1656005685856432, "grad_norm": 3.108663320541382, "learning_rate": 1.168091168091168e-05, "loss": 0.2811, "step": 410 }, { "epoch": 1.1940298507462686, "grad_norm": 4.949700832366943, "learning_rate": 1.1965811965811967e-05, "loss": 0.2825, "step": 420 }, { "epoch": 1.2224591329068941, "grad_norm": 5.564762115478516, "learning_rate": 1.2250712250712251e-05, "loss": 0.2827, "step": 430 }, { "epoch": 1.2508884150675195, "grad_norm": 3.055199146270752, "learning_rate": 1.2535612535612536e-05, "loss": 0.2782, "step": 440 }, { "epoch": 1.279317697228145, "grad_norm": 3.3172607421875, "learning_rate": 1.282051282051282e-05, "loss": 0.2832, "step": 450 }, { "epoch": 1.3077469793887704, "grad_norm": 3.498727321624756, "learning_rate": 1.3105413105413105e-05, "loss": 0.2731, "step": 460 }, { "epoch": 1.336176261549396, "grad_norm": 6.585111141204834, "learning_rate": 1.3390313390313392e-05, "loss": 0.2822, "step": 470 }, { "epoch": 1.3646055437100213, "grad_norm": 3.937904119491577, "learning_rate": 1.3675213675213677e-05, "loss": 0.2799, "step": 480 }, { "epoch": 1.3930348258706466, "grad_norm": 3.094285726547241, "learning_rate": 1.3960113960113961e-05, "loss": 0.2756, "step": 490 }, { "epoch": 1.4214641080312722, "grad_norm": 4.038396835327148, "learning_rate": 1.4245014245014246e-05, "loss": 0.2802, "step": 500 }, { "epoch": 1.4498933901918978, "grad_norm": 3.947652816772461, "learning_rate": 1.4529914529914531e-05, "loss": 0.277, "step": 510 }, { "epoch": 1.4783226723525231, "grad_norm": 3.3318724632263184, "learning_rate": 1.4814814814814815e-05, "loss": 0.2755, "step": 520 }, { "epoch": 1.5067519545131485, "grad_norm": 5.542562961578369, "learning_rate": 1.50997150997151e-05, "loss": 0.2793, "step": 530 }, { "epoch": 1.535181236673774, "grad_norm": 3.7330291271209717, "learning_rate": 1.5384615384615387e-05, "loss": 0.2809, "step": 540 }, { "epoch": 1.5636105188343994, "grad_norm": 3.7486541271209717, "learning_rate": 1.566951566951567e-05, "loss": 0.2799, "step": 550 }, { "epoch": 1.5920398009950247, "grad_norm": 3.3346805572509766, "learning_rate": 1.5954415954415954e-05, "loss": 0.2796, "step": 560 }, { "epoch": 1.6204690831556503, "grad_norm": 3.4785587787628174, "learning_rate": 1.623931623931624e-05, "loss": 0.2766, "step": 570 }, { "epoch": 1.6488983653162759, "grad_norm": 5.285679817199707, "learning_rate": 1.6524216524216525e-05, "loss": 0.2743, "step": 580 }, { "epoch": 1.6773276474769012, "grad_norm": 4.10180139541626, "learning_rate": 1.680911680911681e-05, "loss": 0.2779, "step": 590 }, { "epoch": 1.7057569296375266, "grad_norm": 5.630993843078613, "learning_rate": 1.7094017094017095e-05, "loss": 0.272, "step": 600 }, { "epoch": 1.7341862117981521, "grad_norm": 3.9752230644226074, "learning_rate": 1.737891737891738e-05, "loss": 0.2736, "step": 610 }, { "epoch": 1.7626154939587777, "grad_norm": 5.526058197021484, "learning_rate": 1.7663817663817662e-05, "loss": 0.2743, "step": 620 }, { "epoch": 1.7910447761194028, "grad_norm": 3.795504093170166, "learning_rate": 1.794871794871795e-05, "loss": 0.2748, "step": 630 }, { "epoch": 1.8194740582800284, "grad_norm": 6.265020847320557, "learning_rate": 1.8233618233618236e-05, "loss": 0.2755, "step": 640 }, { "epoch": 1.847903340440654, "grad_norm": 3.0405287742614746, "learning_rate": 1.8518518518518518e-05, "loss": 0.2796, "step": 650 }, { "epoch": 1.8763326226012793, "grad_norm": 6.968567848205566, "learning_rate": 1.8803418803418804e-05, "loss": 0.2788, "step": 660 }, { "epoch": 1.9047619047619047, "grad_norm": 5.8073649406433105, "learning_rate": 1.908831908831909e-05, "loss": 0.2709, "step": 670 }, { "epoch": 1.9331911869225302, "grad_norm": 3.541447877883911, "learning_rate": 1.9373219373219374e-05, "loss": 0.2755, "step": 680 }, { "epoch": 1.9616204690831558, "grad_norm": 6.208255290985107, "learning_rate": 1.965811965811966e-05, "loss": 0.2741, "step": 690 }, { "epoch": 1.9900497512437811, "grad_norm": 5.17614221572876, "learning_rate": 1.9943019943019945e-05, "loss": 0.2702, "step": 700 }, { "epoch": 1.9985785358919688, "eval_accuracy": 0.376, "eval_loss": 0.2651675343513489, "eval_runtime": 13.4872, "eval_samples_per_second": 370.721, "eval_steps_per_second": 11.641, "step": 703 }, { "epoch": 2.0184790334044065, "grad_norm": 5.432563781738281, "learning_rate": 2.022792022792023e-05, "loss": 0.2788, "step": 710 }, { "epoch": 2.046908315565032, "grad_norm": 4.879900932312012, "learning_rate": 2.0512820512820512e-05, "loss": 0.2691, "step": 720 }, { "epoch": 2.0753375977256576, "grad_norm": 2.3853085041046143, "learning_rate": 2.07977207977208e-05, "loss": 0.2691, "step": 730 }, { "epoch": 2.1037668798862827, "grad_norm": 4.834916591644287, "learning_rate": 2.1082621082621086e-05, "loss": 0.2726, "step": 740 }, { "epoch": 2.1321961620469083, "grad_norm": 5.892274379730225, "learning_rate": 2.1367521367521368e-05, "loss": 0.2703, "step": 750 }, { "epoch": 2.160625444207534, "grad_norm": 5.287324905395508, "learning_rate": 2.1652421652421653e-05, "loss": 0.2655, "step": 760 }, { "epoch": 2.189054726368159, "grad_norm": 4.9049201011657715, "learning_rate": 2.1937321937321938e-05, "loss": 0.2728, "step": 770 }, { "epoch": 2.2174840085287846, "grad_norm": 2.830197334289551, "learning_rate": 2.2222222222222223e-05, "loss": 0.2709, "step": 780 }, { "epoch": 2.24591329068941, "grad_norm": 3.5926570892333984, "learning_rate": 2.250712250712251e-05, "loss": 0.2689, "step": 790 }, { "epoch": 2.2743425728500357, "grad_norm": 4.939948081970215, "learning_rate": 2.2792022792022794e-05, "loss": 0.2695, "step": 800 }, { "epoch": 2.302771855010661, "grad_norm": 6.520457744598389, "learning_rate": 2.307692307692308e-05, "loss": 0.2721, "step": 810 }, { "epoch": 2.3312011371712864, "grad_norm": 2.3680639266967773, "learning_rate": 2.336182336182336e-05, "loss": 0.2693, "step": 820 }, { "epoch": 2.359630419331912, "grad_norm": 2.8569042682647705, "learning_rate": 2.364672364672365e-05, "loss": 0.2726, "step": 830 }, { "epoch": 2.388059701492537, "grad_norm": 3.889040231704712, "learning_rate": 2.3931623931623935e-05, "loss": 0.2637, "step": 840 }, { "epoch": 2.4164889836531627, "grad_norm": 3.36212158203125, "learning_rate": 2.4216524216524217e-05, "loss": 0.2648, "step": 850 }, { "epoch": 2.4449182658137882, "grad_norm": 3.47865629196167, "learning_rate": 2.4501424501424502e-05, "loss": 0.2649, "step": 860 }, { "epoch": 2.473347547974414, "grad_norm": 3.2204861640930176, "learning_rate": 2.4786324786324787e-05, "loss": 0.2672, "step": 870 }, { "epoch": 2.501776830135039, "grad_norm": 6.172186851501465, "learning_rate": 2.5071225071225073e-05, "loss": 0.2667, "step": 880 }, { "epoch": 2.5302061122956645, "grad_norm": 3.048217296600342, "learning_rate": 2.535612535612536e-05, "loss": 0.2664, "step": 890 }, { "epoch": 2.55863539445629, "grad_norm": 3.8980777263641357, "learning_rate": 2.564102564102564e-05, "loss": 0.2639, "step": 900 }, { "epoch": 2.587064676616915, "grad_norm": 5.806455135345459, "learning_rate": 2.5925925925925925e-05, "loss": 0.267, "step": 910 }, { "epoch": 2.6154939587775408, "grad_norm": 6.2620930671691895, "learning_rate": 2.621082621082621e-05, "loss": 0.2675, "step": 920 }, { "epoch": 2.6439232409381663, "grad_norm": 5.409758567810059, "learning_rate": 2.64957264957265e-05, "loss": 0.2606, "step": 930 }, { "epoch": 2.672352523098792, "grad_norm": 3.452788829803467, "learning_rate": 2.6780626780626784e-05, "loss": 0.26, "step": 940 }, { "epoch": 2.7007818052594175, "grad_norm": 5.087782859802246, "learning_rate": 2.706552706552707e-05, "loss": 0.2601, "step": 950 }, { "epoch": 2.7292110874200426, "grad_norm": 3.9471144676208496, "learning_rate": 2.7350427350427355e-05, "loss": 0.2635, "step": 960 }, { "epoch": 2.757640369580668, "grad_norm": 7.521236896514893, "learning_rate": 2.7635327635327633e-05, "loss": 0.2616, "step": 970 }, { "epoch": 2.7860696517412933, "grad_norm": 6.5749311447143555, "learning_rate": 2.7920227920227922e-05, "loss": 0.2624, "step": 980 }, { "epoch": 2.814498933901919, "grad_norm": 5.974452018737793, "learning_rate": 2.8205128205128207e-05, "loss": 0.2585, "step": 990 }, { "epoch": 2.8429282160625444, "grad_norm": 4.7054314613342285, "learning_rate": 2.8490028490028492e-05, "loss": 0.2616, "step": 1000 }, { "epoch": 2.87135749822317, "grad_norm": 3.2259464263916016, "learning_rate": 2.8774928774928778e-05, "loss": 0.2573, "step": 1010 }, { "epoch": 2.8997867803837956, "grad_norm": 3.2185564041137695, "learning_rate": 2.9059829059829063e-05, "loss": 0.2578, "step": 1020 }, { "epoch": 2.9282160625444207, "grad_norm": 3.6937203407287598, "learning_rate": 2.9344729344729345e-05, "loss": 0.2571, "step": 1030 }, { "epoch": 2.9566453447050463, "grad_norm": 4.159379959106445, "learning_rate": 2.962962962962963e-05, "loss": 0.2563, "step": 1040 }, { "epoch": 2.9850746268656714, "grad_norm": 6.00156307220459, "learning_rate": 2.9914529914529915e-05, "loss": 0.2565, "step": 1050 }, { "epoch": 2.9992892679459846, "eval_accuracy": 0.431, "eval_loss": 0.24740619957447052, "eval_runtime": 13.4887, "eval_samples_per_second": 370.68, "eval_steps_per_second": 11.639, "step": 1055 }, { "epoch": 3.013503909026297, "grad_norm": 4.579977512359619, "learning_rate": 3.01994301994302e-05, "loss": 0.2592, "step": 1060 }, { "epoch": 3.0419331911869225, "grad_norm": 4.566761493682861, "learning_rate": 3.0484330484330486e-05, "loss": 0.2563, "step": 1070 }, { "epoch": 3.070362473347548, "grad_norm": 5.73409366607666, "learning_rate": 3.0769230769230774e-05, "loss": 0.2545, "step": 1080 }, { "epoch": 3.098791755508173, "grad_norm": 4.306080341339111, "learning_rate": 3.105413105413106e-05, "loss": 0.2534, "step": 1090 }, { "epoch": 3.1272210376687988, "grad_norm": 6.322807312011719, "learning_rate": 3.133903133903134e-05, "loss": 0.2604, "step": 1100 }, { "epoch": 3.1556503198294243, "grad_norm": 4.54417085647583, "learning_rate": 3.162393162393162e-05, "loss": 0.2574, "step": 1110 }, { "epoch": 3.18407960199005, "grad_norm": 3.0040600299835205, "learning_rate": 3.190883190883191e-05, "loss": 0.2584, "step": 1120 }, { "epoch": 3.212508884150675, "grad_norm": 3.5668346881866455, "learning_rate": 3.2193732193732194e-05, "loss": 0.2513, "step": 1130 }, { "epoch": 3.2409381663113006, "grad_norm": 4.641059398651123, "learning_rate": 3.247863247863248e-05, "loss": 0.2586, "step": 1140 }, { "epoch": 3.269367448471926, "grad_norm": 5.613424777984619, "learning_rate": 3.2763532763532764e-05, "loss": 0.26, "step": 1150 }, { "epoch": 3.2977967306325517, "grad_norm": 5.7787957191467285, "learning_rate": 3.304843304843305e-05, "loss": 0.252, "step": 1160 }, { "epoch": 3.326226012793177, "grad_norm": 5.8402204513549805, "learning_rate": 3.3333333333333335e-05, "loss": 0.253, "step": 1170 }, { "epoch": 3.3546552949538024, "grad_norm": 6.315359115600586, "learning_rate": 3.361823361823362e-05, "loss": 0.254, "step": 1180 }, { "epoch": 3.383084577114428, "grad_norm": 3.9695117473602295, "learning_rate": 3.3903133903133905e-05, "loss": 0.2546, "step": 1190 }, { "epoch": 3.411513859275053, "grad_norm": 5.717206001281738, "learning_rate": 3.418803418803419e-05, "loss": 0.2541, "step": 1200 }, { "epoch": 3.4399431414356787, "grad_norm": 9.703412055969238, "learning_rate": 3.4472934472934476e-05, "loss": 0.2596, "step": 1210 }, { "epoch": 3.4683724235963043, "grad_norm": 4.471320629119873, "learning_rate": 3.475783475783476e-05, "loss": 0.253, "step": 1220 }, { "epoch": 3.49680170575693, "grad_norm": 4.092626094818115, "learning_rate": 3.504273504273504e-05, "loss": 0.2558, "step": 1230 }, { "epoch": 3.525230987917555, "grad_norm": 7.080715656280518, "learning_rate": 3.5327635327635325e-05, "loss": 0.2487, "step": 1240 }, { "epoch": 3.5536602700781805, "grad_norm": 2.4141533374786377, "learning_rate": 3.561253561253561e-05, "loss": 0.2507, "step": 1250 }, { "epoch": 3.582089552238806, "grad_norm": 5.367834568023682, "learning_rate": 3.58974358974359e-05, "loss": 0.2497, "step": 1260 }, { "epoch": 3.610518834399431, "grad_norm": 3.128957509994507, "learning_rate": 3.618233618233619e-05, "loss": 0.2499, "step": 1270 }, { "epoch": 3.638948116560057, "grad_norm": 3.8073232173919678, "learning_rate": 3.646723646723647e-05, "loss": 0.253, "step": 1280 }, { "epoch": 3.6673773987206824, "grad_norm": 3.4332194328308105, "learning_rate": 3.675213675213676e-05, "loss": 0.243, "step": 1290 }, { "epoch": 3.695806680881308, "grad_norm": 4.036149501800537, "learning_rate": 3.7037037037037037e-05, "loss": 0.2475, "step": 1300 }, { "epoch": 3.724235963041933, "grad_norm": 3.786414384841919, "learning_rate": 3.732193732193732e-05, "loss": 0.2543, "step": 1310 }, { "epoch": 3.7526652452025586, "grad_norm": 3.2915213108062744, "learning_rate": 3.760683760683761e-05, "loss": 0.2456, "step": 1320 }, { "epoch": 3.781094527363184, "grad_norm": 4.569310665130615, "learning_rate": 3.789173789173789e-05, "loss": 0.2498, "step": 1330 }, { "epoch": 3.8095238095238093, "grad_norm": 5.0229597091674805, "learning_rate": 3.817663817663818e-05, "loss": 0.2487, "step": 1340 }, { "epoch": 3.837953091684435, "grad_norm": 5.367189884185791, "learning_rate": 3.846153846153846e-05, "loss": 0.2463, "step": 1350 }, { "epoch": 3.8663823738450604, "grad_norm": 5.307685852050781, "learning_rate": 3.874643874643875e-05, "loss": 0.2456, "step": 1360 }, { "epoch": 3.894811656005686, "grad_norm": 4.55289888381958, "learning_rate": 3.903133903133903e-05, "loss": 0.2425, "step": 1370 }, { "epoch": 3.923240938166311, "grad_norm": 3.7516965866088867, "learning_rate": 3.931623931623932e-05, "loss": 0.2476, "step": 1380 }, { "epoch": 3.9516702203269367, "grad_norm": 6.223262786865234, "learning_rate": 3.9601139601139604e-05, "loss": 0.2449, "step": 1390 }, { "epoch": 3.9800995024875623, "grad_norm": 3.2988734245300293, "learning_rate": 3.988603988603989e-05, "loss": 0.2448, "step": 1400 }, { "epoch": 4.0, "eval_accuracy": 0.4558, "eval_loss": 0.2358027845621109, "eval_runtime": 13.4713, "eval_samples_per_second": 371.161, "eval_steps_per_second": 11.654, "step": 1407 }, { "epoch": 4.008528784648187, "grad_norm": 4.976803302764893, "learning_rate": 4.0170940170940174e-05, "loss": 0.2474, "step": 1410 }, { "epoch": 4.036958066808813, "grad_norm": 5.573189735412598, "learning_rate": 4.045584045584046e-05, "loss": 0.2421, "step": 1420 }, { "epoch": 4.0653873489694385, "grad_norm": 4.221163749694824, "learning_rate": 4.074074074074074e-05, "loss": 0.2468, "step": 1430 }, { "epoch": 4.093816631130064, "grad_norm": 5.250073432922363, "learning_rate": 4.1025641025641023e-05, "loss": 0.2433, "step": 1440 }, { "epoch": 4.12224591329069, "grad_norm": 4.930447101593018, "learning_rate": 4.131054131054131e-05, "loss": 0.2449, "step": 1450 }, { "epoch": 4.150675195451315, "grad_norm": 3.9071946144104004, "learning_rate": 4.15954415954416e-05, "loss": 0.2402, "step": 1460 }, { "epoch": 4.17910447761194, "grad_norm": 4.902606010437012, "learning_rate": 4.1880341880341886e-05, "loss": 0.2402, "step": 1470 }, { "epoch": 4.2075337597725655, "grad_norm": 4.128215789794922, "learning_rate": 4.216524216524217e-05, "loss": 0.2459, "step": 1480 }, { "epoch": 4.235963041933191, "grad_norm": 3.6796748638153076, "learning_rate": 4.2450142450142457e-05, "loss": 0.2403, "step": 1490 }, { "epoch": 4.264392324093817, "grad_norm": 3.4276349544525146, "learning_rate": 4.2735042735042735e-05, "loss": 0.2381, "step": 1500 }, { "epoch": 4.292821606254442, "grad_norm": 7.670202255249023, "learning_rate": 4.301994301994302e-05, "loss": 0.2426, "step": 1510 }, { "epoch": 4.321250888415068, "grad_norm": 6.68562126159668, "learning_rate": 4.3304843304843306e-05, "loss": 0.246, "step": 1520 }, { "epoch": 4.349680170575693, "grad_norm": 10.20826244354248, "learning_rate": 4.358974358974359e-05, "loss": 0.2382, "step": 1530 }, { "epoch": 4.378109452736318, "grad_norm": 3.3424441814422607, "learning_rate": 4.3874643874643876e-05, "loss": 0.2451, "step": 1540 }, { "epoch": 4.406538734896944, "grad_norm": 3.3360562324523926, "learning_rate": 4.415954415954416e-05, "loss": 0.2423, "step": 1550 }, { "epoch": 4.434968017057569, "grad_norm": 4.240778923034668, "learning_rate": 4.4444444444444447e-05, "loss": 0.2404, "step": 1560 }, { "epoch": 4.463397299218195, "grad_norm": 4.9358954429626465, "learning_rate": 4.472934472934473e-05, "loss": 0.2428, "step": 1570 }, { "epoch": 4.49182658137882, "grad_norm": 3.456130266189575, "learning_rate": 4.501424501424502e-05, "loss": 0.2372, "step": 1580 }, { "epoch": 4.520255863539446, "grad_norm": 4.79456901550293, "learning_rate": 4.52991452991453e-05, "loss": 0.2361, "step": 1590 }, { "epoch": 4.548685145700071, "grad_norm": 3.8571829795837402, "learning_rate": 4.558404558404559e-05, "loss": 0.2331, "step": 1600 }, { "epoch": 4.577114427860696, "grad_norm": 3.5238330364227295, "learning_rate": 4.586894586894587e-05, "loss": 0.2439, "step": 1610 }, { "epoch": 4.605543710021322, "grad_norm": 5.498716354370117, "learning_rate": 4.615384615384616e-05, "loss": 0.2353, "step": 1620 }, { "epoch": 4.633972992181947, "grad_norm": 6.618678092956543, "learning_rate": 4.643874643874644e-05, "loss": 0.2438, "step": 1630 }, { "epoch": 4.662402274342573, "grad_norm": 7.758936405181885, "learning_rate": 4.672364672364672e-05, "loss": 0.2349, "step": 1640 }, { "epoch": 4.690831556503198, "grad_norm": 6.209959506988525, "learning_rate": 4.700854700854701e-05, "loss": 0.237, "step": 1650 }, { "epoch": 4.719260838663824, "grad_norm": 4.108484745025635, "learning_rate": 4.72934472934473e-05, "loss": 0.2322, "step": 1660 }, { "epoch": 4.7476901208244495, "grad_norm": 4.291415214538574, "learning_rate": 4.7578347578347584e-05, "loss": 0.2364, "step": 1670 }, { "epoch": 4.776119402985074, "grad_norm": 4.442831516265869, "learning_rate": 4.786324786324787e-05, "loss": 0.231, "step": 1680 }, { "epoch": 4.8045486851457, "grad_norm": 3.837928533554077, "learning_rate": 4.814814814814815e-05, "loss": 0.2317, "step": 1690 }, { "epoch": 4.832977967306325, "grad_norm": 7.176934719085693, "learning_rate": 4.8433048433048433e-05, "loss": 0.2347, "step": 1700 }, { "epoch": 4.861407249466951, "grad_norm": 5.241628646850586, "learning_rate": 4.871794871794872e-05, "loss": 0.2315, "step": 1710 }, { "epoch": 4.8898365316275765, "grad_norm": 6.158897876739502, "learning_rate": 4.9002849002849004e-05, "loss": 0.2375, "step": 1720 }, { "epoch": 4.918265813788202, "grad_norm": 3.2543437480926514, "learning_rate": 4.928774928774929e-05, "loss": 0.2314, "step": 1730 }, { "epoch": 4.946695095948828, "grad_norm": 3.572920799255371, "learning_rate": 4.9572649572649575e-05, "loss": 0.2389, "step": 1740 }, { "epoch": 4.975124378109452, "grad_norm": 5.096522331237793, "learning_rate": 4.985754985754986e-05, "loss": 0.2433, "step": 1750 }, { "epoch": 4.997867803837953, "eval_accuracy": 0.4994, "eval_loss": 0.22225263714790344, "eval_runtime": 13.438, "eval_samples_per_second": 372.079, "eval_steps_per_second": 11.683, "step": 1758 }, { "epoch": 5.003553660270078, "grad_norm": 6.091975212097168, "learning_rate": 4.998417220639443e-05, "loss": 0.2377, "step": 1760 }, { "epoch": 5.031982942430703, "grad_norm": 5.004733085632324, "learning_rate": 4.995251661918329e-05, "loss": 0.2349, "step": 1770 }, { "epoch": 5.060412224591329, "grad_norm": 3.959138870239258, "learning_rate": 4.9920861031972145e-05, "loss": 0.2286, "step": 1780 }, { "epoch": 5.088841506751955, "grad_norm": 5.394062519073486, "learning_rate": 4.9889205444761e-05, "loss": 0.2386, "step": 1790 }, { "epoch": 5.11727078891258, "grad_norm": 8.798538208007812, "learning_rate": 4.985754985754986e-05, "loss": 0.2324, "step": 1800 }, { "epoch": 5.145700071073206, "grad_norm": 5.837261199951172, "learning_rate": 4.982589427033872e-05, "loss": 0.2281, "step": 1810 }, { "epoch": 5.174129353233831, "grad_norm": 9.308436393737793, "learning_rate": 4.9794238683127575e-05, "loss": 0.2329, "step": 1820 }, { "epoch": 5.202558635394456, "grad_norm": 5.427538871765137, "learning_rate": 4.976258309591643e-05, "loss": 0.2301, "step": 1830 }, { "epoch": 5.2309879175550815, "grad_norm": 4.305994510650635, "learning_rate": 4.973092750870529e-05, "loss": 0.2341, "step": 1840 }, { "epoch": 5.259417199715707, "grad_norm": 4.874300003051758, "learning_rate": 4.9699271921494144e-05, "loss": 0.2238, "step": 1850 }, { "epoch": 5.287846481876333, "grad_norm": 5.12693977355957, "learning_rate": 4.9667616334283e-05, "loss": 0.2299, "step": 1860 }, { "epoch": 5.316275764036958, "grad_norm": 6.230199813842773, "learning_rate": 4.963596074707186e-05, "loss": 0.2299, "step": 1870 }, { "epoch": 5.344705046197584, "grad_norm": 4.379709243774414, "learning_rate": 4.960430515986072e-05, "loss": 0.232, "step": 1880 }, { "epoch": 5.373134328358209, "grad_norm": 8.380620956420898, "learning_rate": 4.9572649572649575e-05, "loss": 0.2348, "step": 1890 }, { "epoch": 5.401563610518834, "grad_norm": 6.815150737762451, "learning_rate": 4.9540993985438435e-05, "loss": 0.2323, "step": 1900 }, { "epoch": 5.42999289267946, "grad_norm": 10.341625213623047, "learning_rate": 4.950933839822729e-05, "loss": 0.2322, "step": 1910 }, { "epoch": 5.458422174840085, "grad_norm": 6.199831008911133, "learning_rate": 4.9477682811016144e-05, "loss": 0.2279, "step": 1920 }, { "epoch": 5.486851457000711, "grad_norm": 3.859348773956299, "learning_rate": 4.9446027223805005e-05, "loss": 0.2234, "step": 1930 }, { "epoch": 5.515280739161336, "grad_norm": 6.276079177856445, "learning_rate": 4.941437163659386e-05, "loss": 0.2302, "step": 1940 }, { "epoch": 5.543710021321962, "grad_norm": 4.312353610992432, "learning_rate": 4.938271604938271e-05, "loss": 0.226, "step": 1950 }, { "epoch": 5.572139303482587, "grad_norm": 6.3399882316589355, "learning_rate": 4.935106046217158e-05, "loss": 0.2284, "step": 1960 }, { "epoch": 5.600568585643212, "grad_norm": 6.7359490394592285, "learning_rate": 4.9319404874960435e-05, "loss": 0.2245, "step": 1970 }, { "epoch": 5.628997867803838, "grad_norm": 6.008596420288086, "learning_rate": 4.928774928774929e-05, "loss": 0.2266, "step": 1980 }, { "epoch": 5.657427149964463, "grad_norm": 7.1867451667785645, "learning_rate": 4.925609370053815e-05, "loss": 0.221, "step": 1990 }, { "epoch": 5.685856432125089, "grad_norm": 5.440988063812256, "learning_rate": 4.9224438113327004e-05, "loss": 0.2342, "step": 2000 }, { "epoch": 5.714285714285714, "grad_norm": 5.188518047332764, "learning_rate": 4.919278252611586e-05, "loss": 0.2197, "step": 2010 }, { "epoch": 5.74271499644634, "grad_norm": 6.648231029510498, "learning_rate": 4.916112693890472e-05, "loss": 0.2195, "step": 2020 }, { "epoch": 5.7711442786069655, "grad_norm": 4.315269470214844, "learning_rate": 4.912947135169358e-05, "loss": 0.2296, "step": 2030 }, { "epoch": 5.79957356076759, "grad_norm": 7.448794841766357, "learning_rate": 4.9097815764482435e-05, "loss": 0.2317, "step": 2040 }, { "epoch": 5.828002842928216, "grad_norm": 4.203551769256592, "learning_rate": 4.906616017727129e-05, "loss": 0.2196, "step": 2050 }, { "epoch": 5.856432125088841, "grad_norm": 11.099379539489746, "learning_rate": 4.903450459006015e-05, "loss": 0.2257, "step": 2060 }, { "epoch": 5.884861407249467, "grad_norm": 7.174654483795166, "learning_rate": 4.9002849002849004e-05, "loss": 0.2222, "step": 2070 }, { "epoch": 5.9132906894100925, "grad_norm": 4.160901069641113, "learning_rate": 4.8971193415637865e-05, "loss": 0.2229, "step": 2080 }, { "epoch": 5.941719971570718, "grad_norm": 5.662876605987549, "learning_rate": 4.893953782842672e-05, "loss": 0.2307, "step": 2090 }, { "epoch": 5.970149253731344, "grad_norm": 4.971590042114258, "learning_rate": 4.890788224121557e-05, "loss": 0.2122, "step": 2100 }, { "epoch": 5.998578535891969, "grad_norm": 6.247657299041748, "learning_rate": 4.8876226654004434e-05, "loss": 0.2095, "step": 2110 }, { "epoch": 5.998578535891969, "eval_accuracy": 0.5434, "eval_loss": 0.20578816533088684, "eval_runtime": 13.48, "eval_samples_per_second": 370.92, "eval_steps_per_second": 11.647, "step": 2110 }, { "epoch": 6.027007818052594, "grad_norm": 10.330707550048828, "learning_rate": 4.8844571066793295e-05, "loss": 0.2158, "step": 2120 }, { "epoch": 6.0554371002132195, "grad_norm": 4.683455467224121, "learning_rate": 4.881291547958215e-05, "loss": 0.2156, "step": 2130 }, { "epoch": 6.083866382373845, "grad_norm": 4.233691692352295, "learning_rate": 4.8781259892371004e-05, "loss": 0.2225, "step": 2140 }, { "epoch": 6.112295664534471, "grad_norm": 6.733424663543701, "learning_rate": 4.8749604305159865e-05, "loss": 0.2274, "step": 2150 }, { "epoch": 6.140724946695096, "grad_norm": 8.281678199768066, "learning_rate": 4.871794871794872e-05, "loss": 0.2198, "step": 2160 }, { "epoch": 6.169154228855722, "grad_norm": 6.473422527313232, "learning_rate": 4.868629313073757e-05, "loss": 0.2191, "step": 2170 }, { "epoch": 6.197583511016346, "grad_norm": 8.645085334777832, "learning_rate": 4.8654637543526434e-05, "loss": 0.2197, "step": 2180 }, { "epoch": 6.226012793176972, "grad_norm": 5.5490312576293945, "learning_rate": 4.8622981956315295e-05, "loss": 0.2096, "step": 2190 }, { "epoch": 6.2544420753375976, "grad_norm": 3.9064080715179443, "learning_rate": 4.859132636910415e-05, "loss": 0.2215, "step": 2200 }, { "epoch": 6.282871357498223, "grad_norm": 7.356064319610596, "learning_rate": 4.855967078189301e-05, "loss": 0.2113, "step": 2210 }, { "epoch": 6.311300639658849, "grad_norm": 5.092055320739746, "learning_rate": 4.8528015194681864e-05, "loss": 0.222, "step": 2220 }, { "epoch": 6.339729921819474, "grad_norm": 3.6763720512390137, "learning_rate": 4.849635960747072e-05, "loss": 0.2189, "step": 2230 }, { "epoch": 6.3681592039801, "grad_norm": 8.912233352661133, "learning_rate": 4.846470402025958e-05, "loss": 0.2242, "step": 2240 }, { "epoch": 6.396588486140725, "grad_norm": 6.6447296142578125, "learning_rate": 4.8433048433048433e-05, "loss": 0.2226, "step": 2250 }, { "epoch": 6.42501776830135, "grad_norm": 9.668810844421387, "learning_rate": 4.840139284583729e-05, "loss": 0.2232, "step": 2260 }, { "epoch": 6.453447050461976, "grad_norm": 7.601034164428711, "learning_rate": 4.8369737258626155e-05, "loss": 0.2169, "step": 2270 }, { "epoch": 6.481876332622601, "grad_norm": 5.693022727966309, "learning_rate": 4.833808167141501e-05, "loss": 0.2162, "step": 2280 }, { "epoch": 6.510305614783227, "grad_norm": 3.8609538078308105, "learning_rate": 4.8306426084203864e-05, "loss": 0.2206, "step": 2290 }, { "epoch": 6.538734896943852, "grad_norm": 3.6102852821350098, "learning_rate": 4.8274770496992725e-05, "loss": 0.2168, "step": 2300 }, { "epoch": 6.567164179104478, "grad_norm": 6.371620178222656, "learning_rate": 4.824311490978158e-05, "loss": 0.2212, "step": 2310 }, { "epoch": 6.5955934612651035, "grad_norm": 4.227074146270752, "learning_rate": 4.821145932257043e-05, "loss": 0.2188, "step": 2320 }, { "epoch": 6.624022743425728, "grad_norm": 5.018686294555664, "learning_rate": 4.8179803735359294e-05, "loss": 0.2128, "step": 2330 }, { "epoch": 6.652452025586354, "grad_norm": 4.595715045928955, "learning_rate": 4.814814814814815e-05, "loss": 0.2143, "step": 2340 }, { "epoch": 6.680881307746979, "grad_norm": 5.826360702514648, "learning_rate": 4.811649256093701e-05, "loss": 0.2082, "step": 2350 }, { "epoch": 6.709310589907605, "grad_norm": 7.087152004241943, "learning_rate": 4.808483697372586e-05, "loss": 0.2155, "step": 2360 }, { "epoch": 6.73773987206823, "grad_norm": 8.325740814208984, "learning_rate": 4.8053181386514724e-05, "loss": 0.2117, "step": 2370 }, { "epoch": 6.766169154228856, "grad_norm": 7.611214637756348, "learning_rate": 4.802152579930358e-05, "loss": 0.2085, "step": 2380 }, { "epoch": 6.794598436389482, "grad_norm": 9.194658279418945, "learning_rate": 4.798987021209243e-05, "loss": 0.2179, "step": 2390 }, { "epoch": 6.823027718550106, "grad_norm": 4.227519512176514, "learning_rate": 4.7958214624881294e-05, "loss": 0.2146, "step": 2400 }, { "epoch": 6.851457000710732, "grad_norm": 6.55830192565918, "learning_rate": 4.792655903767015e-05, "loss": 0.2151, "step": 2410 }, { "epoch": 6.879886282871357, "grad_norm": 6.308530330657959, "learning_rate": 4.789490345045901e-05, "loss": 0.2154, "step": 2420 }, { "epoch": 6.908315565031983, "grad_norm": 6.225198745727539, "learning_rate": 4.786324786324787e-05, "loss": 0.2108, "step": 2430 }, { "epoch": 6.9367448471926085, "grad_norm": 7.941949844360352, "learning_rate": 4.7831592276036724e-05, "loss": 0.2125, "step": 2440 }, { "epoch": 6.965174129353234, "grad_norm": 9.239226341247559, "learning_rate": 4.779993668882558e-05, "loss": 0.2123, "step": 2450 }, { "epoch": 6.99360341151386, "grad_norm": 8.558223724365234, "learning_rate": 4.776828110161444e-05, "loss": 0.2197, "step": 2460 }, { "epoch": 6.999289267945985, "eval_accuracy": 0.568, "eval_loss": 0.19627775251865387, "eval_runtime": 13.4805, "eval_samples_per_second": 370.906, "eval_steps_per_second": 11.646, "step": 2462 }, { "epoch": 7.022032693674484, "grad_norm": 10.356009483337402, "learning_rate": 4.773662551440329e-05, "loss": 0.2048, "step": 2470 }, { "epoch": 7.05046197583511, "grad_norm": 4.4935622215271, "learning_rate": 4.770496992719215e-05, "loss": 0.2044, "step": 2480 }, { "epoch": 7.0788912579957355, "grad_norm": 5.347179412841797, "learning_rate": 4.767331433998101e-05, "loss": 0.2159, "step": 2490 }, { "epoch": 7.107320540156361, "grad_norm": 6.232418537139893, "learning_rate": 4.764165875276987e-05, "loss": 0.2095, "step": 2500 }, { "epoch": 7.135749822316987, "grad_norm": 4.670558929443359, "learning_rate": 4.7610003165558723e-05, "loss": 0.2085, "step": 2510 }, { "epoch": 7.164179104477612, "grad_norm": 5.811947345733643, "learning_rate": 4.7578347578347584e-05, "loss": 0.2078, "step": 2520 }, { "epoch": 7.192608386638238, "grad_norm": 5.025790691375732, "learning_rate": 4.754669199113644e-05, "loss": 0.1992, "step": 2530 }, { "epoch": 7.221037668798862, "grad_norm": 9.463619232177734, "learning_rate": 4.751503640392529e-05, "loss": 0.2109, "step": 2540 }, { "epoch": 7.249466950959488, "grad_norm": 5.140215873718262, "learning_rate": 4.7483380816714154e-05, "loss": 0.2078, "step": 2550 }, { "epoch": 7.277896233120114, "grad_norm": 5.988222122192383, "learning_rate": 4.745172522950301e-05, "loss": 0.2093, "step": 2560 }, { "epoch": 7.306325515280739, "grad_norm": 4.7362284660339355, "learning_rate": 4.742006964229186e-05, "loss": 0.2088, "step": 2570 }, { "epoch": 7.334754797441365, "grad_norm": 5.376959800720215, "learning_rate": 4.738841405508073e-05, "loss": 0.2073, "step": 2580 }, { "epoch": 7.36318407960199, "grad_norm": 8.916358947753906, "learning_rate": 4.7356758467869584e-05, "loss": 0.2174, "step": 2590 }, { "epoch": 7.391613361762616, "grad_norm": 6.023611068725586, "learning_rate": 4.732510288065844e-05, "loss": 0.2134, "step": 2600 }, { "epoch": 7.4200426439232405, "grad_norm": 9.97637939453125, "learning_rate": 4.72934472934473e-05, "loss": 0.2105, "step": 2610 }, { "epoch": 7.448471926083866, "grad_norm": 4.836955547332764, "learning_rate": 4.726179170623615e-05, "loss": 0.2155, "step": 2620 }, { "epoch": 7.476901208244492, "grad_norm": 4.887229919433594, "learning_rate": 4.723013611902501e-05, "loss": 0.2097, "step": 2630 }, { "epoch": 7.505330490405117, "grad_norm": 5.565708160400391, "learning_rate": 4.719848053181387e-05, "loss": 0.2096, "step": 2640 }, { "epoch": 7.533759772565743, "grad_norm": 6.370345592498779, "learning_rate": 4.716682494460272e-05, "loss": 0.2057, "step": 2650 }, { "epoch": 7.562189054726368, "grad_norm": 4.098349571228027, "learning_rate": 4.7135169357391584e-05, "loss": 0.212, "step": 2660 }, { "epoch": 7.590618336886994, "grad_norm": 8.181506156921387, "learning_rate": 4.710351377018044e-05, "loss": 0.2087, "step": 2670 }, { "epoch": 7.619047619047619, "grad_norm": 6.234516143798828, "learning_rate": 4.70718581829693e-05, "loss": 0.2094, "step": 2680 }, { "epoch": 7.647476901208244, "grad_norm": 6.469677925109863, "learning_rate": 4.704020259575815e-05, "loss": 0.2057, "step": 2690 }, { "epoch": 7.67590618336887, "grad_norm": 7.101709365844727, "learning_rate": 4.700854700854701e-05, "loss": 0.2081, "step": 2700 }, { "epoch": 7.704335465529495, "grad_norm": 6.342074394226074, "learning_rate": 4.697689142133587e-05, "loss": 0.213, "step": 2710 }, { "epoch": 7.732764747690121, "grad_norm": 7.016164302825928, "learning_rate": 4.694523583412472e-05, "loss": 0.2073, "step": 2720 }, { "epoch": 7.7611940298507465, "grad_norm": 3.3445522785186768, "learning_rate": 4.691358024691358e-05, "loss": 0.2029, "step": 2730 }, { "epoch": 7.789623312011372, "grad_norm": 9.680062294006348, "learning_rate": 4.6881924659702444e-05, "loss": 0.2135, "step": 2740 }, { "epoch": 7.818052594171997, "grad_norm": 10.709121704101562, "learning_rate": 4.68502690724913e-05, "loss": 0.2094, "step": 2750 }, { "epoch": 7.846481876332622, "grad_norm": 6.3846917152404785, "learning_rate": 4.681861348528015e-05, "loss": 0.2055, "step": 2760 }, { "epoch": 7.874911158493248, "grad_norm": 8.69446849822998, "learning_rate": 4.6786957898069014e-05, "loss": 0.2108, "step": 2770 }, { "epoch": 7.903340440653873, "grad_norm": 10.49174690246582, "learning_rate": 4.675530231085787e-05, "loss": 0.2141, "step": 2780 }, { "epoch": 7.931769722814499, "grad_norm": 11.51611614227295, "learning_rate": 4.672364672364672e-05, "loss": 0.2032, "step": 2790 }, { "epoch": 7.960199004975125, "grad_norm": 2.9337503910064697, "learning_rate": 4.669199113643558e-05, "loss": 0.2034, "step": 2800 }, { "epoch": 7.98862828713575, "grad_norm": 4.53941535949707, "learning_rate": 4.666033554922444e-05, "loss": 0.2093, "step": 2810 }, { "epoch": 8.0, "eval_accuracy": 0.5764, "eval_loss": 0.19058294594287872, "eval_runtime": 13.5193, "eval_samples_per_second": 369.842, "eval_steps_per_second": 11.613, "step": 2814 }, { "epoch": 8.017057569296375, "grad_norm": 8.945481300354004, "learning_rate": 4.66286799620133e-05, "loss": 0.2048, "step": 2820 }, { "epoch": 8.045486851457001, "grad_norm": 6.670810699462891, "learning_rate": 4.659702437480216e-05, "loss": 0.2041, "step": 2830 }, { "epoch": 8.073916133617626, "grad_norm": 4.744898319244385, "learning_rate": 4.656536878759101e-05, "loss": 0.2001, "step": 2840 }, { "epoch": 8.102345415778252, "grad_norm": 7.565896511077881, "learning_rate": 4.653371320037987e-05, "loss": 0.2004, "step": 2850 }, { "epoch": 8.130774697938877, "grad_norm": 8.042109489440918, "learning_rate": 4.650205761316873e-05, "loss": 0.2044, "step": 2860 }, { "epoch": 8.159203980099502, "grad_norm": 6.9921770095825195, "learning_rate": 4.647040202595758e-05, "loss": 0.2062, "step": 2870 }, { "epoch": 8.187633262260128, "grad_norm": 10.717667579650879, "learning_rate": 4.643874643874644e-05, "loss": 0.2058, "step": 2880 }, { "epoch": 8.216062544420753, "grad_norm": 3.7526729106903076, "learning_rate": 4.64070908515353e-05, "loss": 0.206, "step": 2890 }, { "epoch": 8.24449182658138, "grad_norm": 6.880955696105957, "learning_rate": 4.637543526432416e-05, "loss": 0.2077, "step": 2900 }, { "epoch": 8.272921108742004, "grad_norm": 6.426712512969971, "learning_rate": 4.634377967711301e-05, "loss": 0.1961, "step": 2910 }, { "epoch": 8.30135039090263, "grad_norm": 4.4087324142456055, "learning_rate": 4.6312124089901874e-05, "loss": 0.195, "step": 2920 }, { "epoch": 8.329779673063255, "grad_norm": 6.602668762207031, "learning_rate": 4.628046850269073e-05, "loss": 0.1963, "step": 2930 }, { "epoch": 8.35820895522388, "grad_norm": 8.466628074645996, "learning_rate": 4.624881291547958e-05, "loss": 0.1995, "step": 2940 }, { "epoch": 8.386638237384506, "grad_norm": 4.413565635681152, "learning_rate": 4.621715732826844e-05, "loss": 0.2, "step": 2950 }, { "epoch": 8.415067519545131, "grad_norm": 8.149608612060547, "learning_rate": 4.61855017410573e-05, "loss": 0.2002, "step": 2960 }, { "epoch": 8.443496801705757, "grad_norm": 7.0931806564331055, "learning_rate": 4.615384615384616e-05, "loss": 0.201, "step": 2970 }, { "epoch": 8.471926083866382, "grad_norm": 4.603878498077393, "learning_rate": 4.612219056663501e-05, "loss": 0.2006, "step": 2980 }, { "epoch": 8.500355366027009, "grad_norm": 5.25785493850708, "learning_rate": 4.609053497942387e-05, "loss": 0.1983, "step": 2990 }, { "epoch": 8.528784648187633, "grad_norm": 5.518628120422363, "learning_rate": 4.605887939221273e-05, "loss": 0.1987, "step": 3000 }, { "epoch": 8.557213930348258, "grad_norm": 5.253413677215576, "learning_rate": 4.602722380500158e-05, "loss": 0.201, "step": 3010 }, { "epoch": 8.585643212508884, "grad_norm": 8.48166561126709, "learning_rate": 4.599556821779044e-05, "loss": 0.2032, "step": 3020 }, { "epoch": 8.614072494669509, "grad_norm": 9.760549545288086, "learning_rate": 4.59639126305793e-05, "loss": 0.2002, "step": 3030 }, { "epoch": 8.642501776830136, "grad_norm": 3.8356821537017822, "learning_rate": 4.593225704336815e-05, "loss": 0.192, "step": 3040 }, { "epoch": 8.67093105899076, "grad_norm": 6.869128704071045, "learning_rate": 4.590060145615702e-05, "loss": 0.1973, "step": 3050 }, { "epoch": 8.699360341151387, "grad_norm": 11.256450653076172, "learning_rate": 4.586894586894587e-05, "loss": 0.2072, "step": 3060 }, { "epoch": 8.727789623312011, "grad_norm": 4.46504020690918, "learning_rate": 4.583729028173473e-05, "loss": 0.2031, "step": 3070 }, { "epoch": 8.756218905472636, "grad_norm": 8.85410213470459, "learning_rate": 4.580563469452359e-05, "loss": 0.1986, "step": 3080 }, { "epoch": 8.784648187633262, "grad_norm": 4.037339210510254, "learning_rate": 4.577397910731244e-05, "loss": 0.2046, "step": 3090 }, { "epoch": 8.813077469793887, "grad_norm": 6.662086009979248, "learning_rate": 4.5742323520101296e-05, "loss": 0.2074, "step": 3100 }, { "epoch": 8.841506751954514, "grad_norm": 9.254980087280273, "learning_rate": 4.571066793289016e-05, "loss": 0.2031, "step": 3110 }, { "epoch": 8.869936034115138, "grad_norm": 8.44653606414795, "learning_rate": 4.567901234567901e-05, "loss": 0.1995, "step": 3120 }, { "epoch": 8.898365316275765, "grad_norm": 6.454211711883545, "learning_rate": 4.564735675846787e-05, "loss": 0.211, "step": 3130 }, { "epoch": 8.92679459843639, "grad_norm": 12.527981758117676, "learning_rate": 4.5615701171256733e-05, "loss": 0.2043, "step": 3140 }, { "epoch": 8.955223880597014, "grad_norm": 4.566003799438477, "learning_rate": 4.558404558404559e-05, "loss": 0.1963, "step": 3150 }, { "epoch": 8.98365316275764, "grad_norm": 6.329031944274902, "learning_rate": 4.555238999683444e-05, "loss": 0.2047, "step": 3160 }, { "epoch": 8.997867803837954, "eval_accuracy": 0.5874, "eval_loss": 0.18877100944519043, "eval_runtime": 13.4728, "eval_samples_per_second": 371.118, "eval_steps_per_second": 11.653, "step": 3165 }, { "epoch": 9.012082444918265, "grad_norm": 7.717807292938232, "learning_rate": 4.55207344096233e-05, "loss": 0.2108, "step": 3170 }, { "epoch": 9.040511727078892, "grad_norm": 5.965142250061035, "learning_rate": 4.548907882241216e-05, "loss": 0.2004, "step": 3180 }, { "epoch": 9.068941009239516, "grad_norm": 4.294879913330078, "learning_rate": 4.545742323520101e-05, "loss": 0.1996, "step": 3190 }, { "epoch": 9.097370291400143, "grad_norm": 7.199387073516846, "learning_rate": 4.542576764798987e-05, "loss": 0.198, "step": 3200 }, { "epoch": 9.125799573560768, "grad_norm": 11.50894546508789, "learning_rate": 4.539411206077873e-05, "loss": 0.2025, "step": 3210 }, { "epoch": 9.154228855721392, "grad_norm": 4.251418590545654, "learning_rate": 4.536245647356759e-05, "loss": 0.1952, "step": 3220 }, { "epoch": 9.182658137882019, "grad_norm": 7.604278564453125, "learning_rate": 4.533080088635645e-05, "loss": 0.1988, "step": 3230 }, { "epoch": 9.211087420042643, "grad_norm": 6.027789115905762, "learning_rate": 4.52991452991453e-05, "loss": 0.1973, "step": 3240 }, { "epoch": 9.23951670220327, "grad_norm": 5.859224319458008, "learning_rate": 4.5267489711934157e-05, "loss": 0.1968, "step": 3250 }, { "epoch": 9.267945984363894, "grad_norm": 5.541887283325195, "learning_rate": 4.523583412472302e-05, "loss": 0.2046, "step": 3260 }, { "epoch": 9.296375266524521, "grad_norm": 7.097010612487793, "learning_rate": 4.520417853751187e-05, "loss": 0.2054, "step": 3270 }, { "epoch": 9.324804548685146, "grad_norm": 6.034319877624512, "learning_rate": 4.517252295030073e-05, "loss": 0.2038, "step": 3280 }, { "epoch": 9.35323383084577, "grad_norm": 4.420682907104492, "learning_rate": 4.514086736308959e-05, "loss": 0.196, "step": 3290 }, { "epoch": 9.381663113006397, "grad_norm": 10.714386940002441, "learning_rate": 4.510921177587845e-05, "loss": 0.1949, "step": 3300 }, { "epoch": 9.410092395167021, "grad_norm": 5.9246907234191895, "learning_rate": 4.50775561886673e-05, "loss": 0.1914, "step": 3310 }, { "epoch": 9.438521677327648, "grad_norm": 10.435689926147461, "learning_rate": 4.5045900601456156e-05, "loss": 0.1989, "step": 3320 }, { "epoch": 9.466950959488273, "grad_norm": 11.238248825073242, "learning_rate": 4.501424501424502e-05, "loss": 0.1915, "step": 3330 }, { "epoch": 9.495380241648899, "grad_norm": 12.31617546081543, "learning_rate": 4.498258942703387e-05, "loss": 0.1927, "step": 3340 }, { "epoch": 9.523809523809524, "grad_norm": 9.57353687286377, "learning_rate": 4.4950933839822725e-05, "loss": 0.1965, "step": 3350 }, { "epoch": 9.552238805970148, "grad_norm": 4.310195446014404, "learning_rate": 4.491927825261159e-05, "loss": 0.199, "step": 3360 }, { "epoch": 9.580668088130775, "grad_norm": 5.755348205566406, "learning_rate": 4.488762266540045e-05, "loss": 0.1898, "step": 3370 }, { "epoch": 9.6090973702914, "grad_norm": 5.95635986328125, "learning_rate": 4.48559670781893e-05, "loss": 0.1893, "step": 3380 }, { "epoch": 9.637526652452026, "grad_norm": 8.524093627929688, "learning_rate": 4.482431149097816e-05, "loss": 0.1981, "step": 3390 }, { "epoch": 9.66595593461265, "grad_norm": 4.436422348022461, "learning_rate": 4.479265590376702e-05, "loss": 0.1956, "step": 3400 }, { "epoch": 9.694385216773277, "grad_norm": 3.2499125003814697, "learning_rate": 4.476100031655587e-05, "loss": 0.1892, "step": 3410 }, { "epoch": 9.722814498933902, "grad_norm": 5.634009838104248, "learning_rate": 4.472934472934473e-05, "loss": 0.1955, "step": 3420 }, { "epoch": 9.751243781094526, "grad_norm": 7.401211261749268, "learning_rate": 4.4697689142133586e-05, "loss": 0.1949, "step": 3430 }, { "epoch": 9.779673063255153, "grad_norm": 5.247729301452637, "learning_rate": 4.466603355492245e-05, "loss": 0.1991, "step": 3440 }, { "epoch": 9.808102345415778, "grad_norm": 4.464122295379639, "learning_rate": 4.463437796771131e-05, "loss": 0.194, "step": 3450 }, { "epoch": 9.836531627576404, "grad_norm": 10.124085426330566, "learning_rate": 4.460272238050016e-05, "loss": 0.1968, "step": 3460 }, { "epoch": 9.864960909737029, "grad_norm": 14.410158157348633, "learning_rate": 4.4571066793289016e-05, "loss": 0.1978, "step": 3470 }, { "epoch": 9.893390191897655, "grad_norm": 8.860930442810059, "learning_rate": 4.453941120607788e-05, "loss": 0.19, "step": 3480 }, { "epoch": 9.92181947405828, "grad_norm": 9.4293794631958, "learning_rate": 4.450775561886673e-05, "loss": 0.1933, "step": 3490 }, { "epoch": 9.950248756218905, "grad_norm": 4.803533554077148, "learning_rate": 4.4476100031655586e-05, "loss": 0.1899, "step": 3500 }, { "epoch": 9.978678038379531, "grad_norm": 12.175983428955078, "learning_rate": 4.4444444444444447e-05, "loss": 0.1952, "step": 3510 }, { "epoch": 9.99857853589197, "eval_accuracy": 0.6192, "eval_loss": 0.1743256151676178, "eval_runtime": 13.4826, "eval_samples_per_second": 370.849, "eval_steps_per_second": 11.645, "step": 3517 }, { "epoch": 10.007107320540156, "grad_norm": 7.631377220153809, "learning_rate": 4.441278885723331e-05, "loss": 0.1872, "step": 3520 }, { "epoch": 10.035536602700782, "grad_norm": 7.324371337890625, "learning_rate": 4.438113327002216e-05, "loss": 0.1896, "step": 3530 }, { "epoch": 10.063965884861407, "grad_norm": 6.50758171081543, "learning_rate": 4.4349477682811016e-05, "loss": 0.1812, "step": 3540 }, { "epoch": 10.092395167022033, "grad_norm": 5.858243942260742, "learning_rate": 4.431782209559988e-05, "loss": 0.1927, "step": 3550 }, { "epoch": 10.120824449182658, "grad_norm": 7.763025283813477, "learning_rate": 4.428616650838873e-05, "loss": 0.1935, "step": 3560 }, { "epoch": 10.149253731343283, "grad_norm": 7.317619800567627, "learning_rate": 4.425451092117759e-05, "loss": 0.1868, "step": 3570 }, { "epoch": 10.17768301350391, "grad_norm": 5.185365200042725, "learning_rate": 4.4222855333966446e-05, "loss": 0.1961, "step": 3580 }, { "epoch": 10.206112295664534, "grad_norm": 7.347925662994385, "learning_rate": 4.41911997467553e-05, "loss": 0.1849, "step": 3590 }, { "epoch": 10.23454157782516, "grad_norm": 6.95358419418335, "learning_rate": 4.415954415954416e-05, "loss": 0.1992, "step": 3600 }, { "epoch": 10.262970859985785, "grad_norm": 12.613680839538574, "learning_rate": 4.412788857233302e-05, "loss": 0.19, "step": 3610 }, { "epoch": 10.291400142146411, "grad_norm": 6.729465484619141, "learning_rate": 4.4096232985121876e-05, "loss": 0.1932, "step": 3620 }, { "epoch": 10.319829424307036, "grad_norm": 5.531039237976074, "learning_rate": 4.406457739791073e-05, "loss": 0.1898, "step": 3630 }, { "epoch": 10.348258706467663, "grad_norm": 11.057815551757812, "learning_rate": 4.403292181069959e-05, "loss": 0.1882, "step": 3640 }, { "epoch": 10.376687988628287, "grad_norm": 6.585393905639648, "learning_rate": 4.4001266223488446e-05, "loss": 0.1908, "step": 3650 }, { "epoch": 10.405117270788912, "grad_norm": 9.01979923248291, "learning_rate": 4.39696106362773e-05, "loss": 0.1929, "step": 3660 }, { "epoch": 10.433546552949538, "grad_norm": 8.77499008178711, "learning_rate": 4.393795504906616e-05, "loss": 0.1889, "step": 3670 }, { "epoch": 10.461975835110163, "grad_norm": 5.790901184082031, "learning_rate": 4.390629946185502e-05, "loss": 0.1987, "step": 3680 }, { "epoch": 10.49040511727079, "grad_norm": 6.106592178344727, "learning_rate": 4.3874643874643876e-05, "loss": 0.1919, "step": 3690 }, { "epoch": 10.518834399431414, "grad_norm": 5.474663257598877, "learning_rate": 4.384298828743274e-05, "loss": 0.1967, "step": 3700 }, { "epoch": 10.547263681592039, "grad_norm": 10.3093900680542, "learning_rate": 4.381133270022159e-05, "loss": 0.1904, "step": 3710 }, { "epoch": 10.575692963752665, "grad_norm": 6.134634971618652, "learning_rate": 4.3779677113010445e-05, "loss": 0.1857, "step": 3720 }, { "epoch": 10.60412224591329, "grad_norm": 10.577787399291992, "learning_rate": 4.3748021525799306e-05, "loss": 0.1906, "step": 3730 }, { "epoch": 10.632551528073916, "grad_norm": 11.940903663635254, "learning_rate": 4.371636593858816e-05, "loss": 0.188, "step": 3740 }, { "epoch": 10.660980810234541, "grad_norm": 10.615262985229492, "learning_rate": 4.368471035137702e-05, "loss": 0.1849, "step": 3750 }, { "epoch": 10.689410092395168, "grad_norm": 4.822948932647705, "learning_rate": 4.365305476416588e-05, "loss": 0.1888, "step": 3760 }, { "epoch": 10.717839374555792, "grad_norm": 10.014656066894531, "learning_rate": 4.3621399176954737e-05, "loss": 0.1885, "step": 3770 }, { "epoch": 10.746268656716419, "grad_norm": 5.427389144897461, "learning_rate": 4.358974358974359e-05, "loss": 0.1875, "step": 3780 }, { "epoch": 10.774697938877043, "grad_norm": 8.24125862121582, "learning_rate": 4.355808800253245e-05, "loss": 0.1904, "step": 3790 }, { "epoch": 10.803127221037668, "grad_norm": 6.464120388031006, "learning_rate": 4.3526432415321306e-05, "loss": 0.186, "step": 3800 }, { "epoch": 10.831556503198295, "grad_norm": 8.16998291015625, "learning_rate": 4.349477682811016e-05, "loss": 0.183, "step": 3810 }, { "epoch": 10.85998578535892, "grad_norm": 5.470979690551758, "learning_rate": 4.346312124089902e-05, "loss": 0.1898, "step": 3820 }, { "epoch": 10.888415067519546, "grad_norm": 4.495871543884277, "learning_rate": 4.343146565368788e-05, "loss": 0.1877, "step": 3830 }, { "epoch": 10.91684434968017, "grad_norm": 4.573151588439941, "learning_rate": 4.3399810066476736e-05, "loss": 0.1825, "step": 3840 }, { "epoch": 10.945273631840797, "grad_norm": 4.955619812011719, "learning_rate": 4.336815447926559e-05, "loss": 0.1868, "step": 3850 }, { "epoch": 10.973702914001422, "grad_norm": 6.148294925689697, "learning_rate": 4.333649889205445e-05, "loss": 0.1926, "step": 3860 }, { "epoch": 10.999289267945985, "eval_accuracy": 0.6234, "eval_loss": 0.17397905886173248, "eval_runtime": 13.4663, "eval_samples_per_second": 371.298, "eval_steps_per_second": 11.659, "step": 3869 }, { "epoch": 11.002132196162046, "grad_norm": 7.838772773742676, "learning_rate": 4.3304843304843306e-05, "loss": 0.193, "step": 3870 }, { "epoch": 11.030561478322673, "grad_norm": 5.667586326599121, "learning_rate": 4.3273187717632166e-05, "loss": 0.1909, "step": 3880 }, { "epoch": 11.058990760483297, "grad_norm": 7.794498920440674, "learning_rate": 4.324153213042102e-05, "loss": 0.1895, "step": 3890 }, { "epoch": 11.087420042643924, "grad_norm": 5.333418846130371, "learning_rate": 4.3209876543209875e-05, "loss": 0.1886, "step": 3900 }, { "epoch": 11.115849324804548, "grad_norm": 9.634596824645996, "learning_rate": 4.3178220955998736e-05, "loss": 0.1853, "step": 3910 }, { "epoch": 11.144278606965175, "grad_norm": 5.844647407531738, "learning_rate": 4.31465653687876e-05, "loss": 0.1859, "step": 3920 }, { "epoch": 11.1727078891258, "grad_norm": 4.132171154022217, "learning_rate": 4.311490978157645e-05, "loss": 0.1879, "step": 3930 }, { "epoch": 11.201137171286424, "grad_norm": 10.006366729736328, "learning_rate": 4.3083254194365305e-05, "loss": 0.1914, "step": 3940 }, { "epoch": 11.22956645344705, "grad_norm": 6.6482014656066895, "learning_rate": 4.3051598607154166e-05, "loss": 0.1885, "step": 3950 }, { "epoch": 11.257995735607675, "grad_norm": 5.791187286376953, "learning_rate": 4.301994301994302e-05, "loss": 0.1899, "step": 3960 }, { "epoch": 11.286425017768302, "grad_norm": 5.362449645996094, "learning_rate": 4.2988287432731874e-05, "loss": 0.1771, "step": 3970 }, { "epoch": 11.314854299928927, "grad_norm": 6.490601539611816, "learning_rate": 4.2956631845520735e-05, "loss": 0.1851, "step": 3980 }, { "epoch": 11.343283582089553, "grad_norm": 6.9313483238220215, "learning_rate": 4.2924976258309596e-05, "loss": 0.1893, "step": 3990 }, { "epoch": 11.371712864250178, "grad_norm": 4.41709041595459, "learning_rate": 4.289332067109845e-05, "loss": 0.1836, "step": 4000 }, { "epoch": 11.400142146410802, "grad_norm": 5.569442272186279, "learning_rate": 4.286166508388731e-05, "loss": 0.1942, "step": 4010 }, { "epoch": 11.428571428571429, "grad_norm": 5.641345500946045, "learning_rate": 4.2830009496676166e-05, "loss": 0.1817, "step": 4020 }, { "epoch": 11.457000710732054, "grad_norm": 5.313252925872803, "learning_rate": 4.279835390946502e-05, "loss": 0.1749, "step": 4030 }, { "epoch": 11.48542999289268, "grad_norm": 9.450050354003906, "learning_rate": 4.276669832225388e-05, "loss": 0.1774, "step": 4040 }, { "epoch": 11.513859275053305, "grad_norm": 5.868836879730225, "learning_rate": 4.2735042735042735e-05, "loss": 0.1757, "step": 4050 }, { "epoch": 11.542288557213931, "grad_norm": 6.607734680175781, "learning_rate": 4.270338714783159e-05, "loss": 0.1778, "step": 4060 }, { "epoch": 11.570717839374556, "grad_norm": 7.389378070831299, "learning_rate": 4.267173156062046e-05, "loss": 0.1873, "step": 4070 }, { "epoch": 11.59914712153518, "grad_norm": 6.332109451293945, "learning_rate": 4.264007597340931e-05, "loss": 0.1859, "step": 4080 }, { "epoch": 11.627576403695807, "grad_norm": 6.20521354675293, "learning_rate": 4.2608420386198165e-05, "loss": 0.1858, "step": 4090 }, { "epoch": 11.656005685856432, "grad_norm": 7.912403106689453, "learning_rate": 4.2576764798987026e-05, "loss": 0.1903, "step": 4100 }, { "epoch": 11.684434968017058, "grad_norm": 6.959258556365967, "learning_rate": 4.254510921177588e-05, "loss": 0.1786, "step": 4110 }, { "epoch": 11.712864250177683, "grad_norm": 9.900556564331055, "learning_rate": 4.2513453624564735e-05, "loss": 0.1857, "step": 4120 }, { "epoch": 11.74129353233831, "grad_norm": 6.947319507598877, "learning_rate": 4.2481798037353596e-05, "loss": 0.18, "step": 4130 }, { "epoch": 11.769722814498934, "grad_norm": 5.834961891174316, "learning_rate": 4.2450142450142457e-05, "loss": 0.1797, "step": 4140 }, { "epoch": 11.798152096659559, "grad_norm": 18.270727157592773, "learning_rate": 4.241848686293131e-05, "loss": 0.1876, "step": 4150 }, { "epoch": 11.826581378820185, "grad_norm": 9.199230194091797, "learning_rate": 4.2386831275720165e-05, "loss": 0.1895, "step": 4160 }, { "epoch": 11.85501066098081, "grad_norm": 4.743688583374023, "learning_rate": 4.2355175688509026e-05, "loss": 0.1924, "step": 4170 }, { "epoch": 11.883439943141436, "grad_norm": 4.048192977905273, "learning_rate": 4.232352010129788e-05, "loss": 0.1872, "step": 4180 }, { "epoch": 11.91186922530206, "grad_norm": 6.297144889831543, "learning_rate": 4.2291864514086734e-05, "loss": 0.1845, "step": 4190 }, { "epoch": 11.940298507462687, "grad_norm": 5.947645664215088, "learning_rate": 4.2260208926875595e-05, "loss": 0.1788, "step": 4200 }, { "epoch": 11.968727789623312, "grad_norm": 4.096541404724121, "learning_rate": 4.222855333966445e-05, "loss": 0.1824, "step": 4210 }, { "epoch": 11.997157071783937, "grad_norm": 5.694889068603516, "learning_rate": 4.219689775245331e-05, "loss": 0.1838, "step": 4220 }, { "epoch": 12.0, "eval_accuracy": 0.6448, "eval_loss": 0.166715607047081, "eval_runtime": 13.4874, "eval_samples_per_second": 370.717, "eval_steps_per_second": 11.641, "step": 4221 }, { "epoch": 12.025586353944563, "grad_norm": 13.107288360595703, "learning_rate": 4.216524216524217e-05, "loss": 0.1811, "step": 4230 }, { "epoch": 12.054015636105188, "grad_norm": 5.283908843994141, "learning_rate": 4.2133586578031025e-05, "loss": 0.178, "step": 4240 }, { "epoch": 12.082444918265814, "grad_norm": 10.282354354858398, "learning_rate": 4.210193099081988e-05, "loss": 0.181, "step": 4250 }, { "epoch": 12.110874200426439, "grad_norm": 7.403770446777344, "learning_rate": 4.207027540360874e-05, "loss": 0.1765, "step": 4260 }, { "epoch": 12.139303482587065, "grad_norm": 5.7965407371521, "learning_rate": 4.2038619816397595e-05, "loss": 0.1828, "step": 4270 }, { "epoch": 12.16773276474769, "grad_norm": 4.449316024780273, "learning_rate": 4.200696422918645e-05, "loss": 0.186, "step": 4280 }, { "epoch": 12.196162046908315, "grad_norm": 13.941222190856934, "learning_rate": 4.197530864197531e-05, "loss": 0.1763, "step": 4290 }, { "epoch": 12.224591329068941, "grad_norm": 9.081911087036133, "learning_rate": 4.194365305476417e-05, "loss": 0.187, "step": 4300 }, { "epoch": 12.253020611229566, "grad_norm": 5.47357702255249, "learning_rate": 4.1911997467553025e-05, "loss": 0.1824, "step": 4310 }, { "epoch": 12.281449893390192, "grad_norm": 10.789342880249023, "learning_rate": 4.1880341880341886e-05, "loss": 0.1826, "step": 4320 }, { "epoch": 12.309879175550817, "grad_norm": 5.950871467590332, "learning_rate": 4.184868629313074e-05, "loss": 0.1872, "step": 4330 }, { "epoch": 12.338308457711443, "grad_norm": 6.042428970336914, "learning_rate": 4.1817030705919594e-05, "loss": 0.1789, "step": 4340 }, { "epoch": 12.366737739872068, "grad_norm": 6.279260158538818, "learning_rate": 4.1785375118708455e-05, "loss": 0.1907, "step": 4350 }, { "epoch": 12.395167022032693, "grad_norm": 8.843768119812012, "learning_rate": 4.175371953149731e-05, "loss": 0.1751, "step": 4360 }, { "epoch": 12.42359630419332, "grad_norm": 7.667747974395752, "learning_rate": 4.1722063944286164e-05, "loss": 0.1753, "step": 4370 }, { "epoch": 12.452025586353944, "grad_norm": 8.441402435302734, "learning_rate": 4.169040835707503e-05, "loss": 0.1753, "step": 4380 }, { "epoch": 12.48045486851457, "grad_norm": 7.004631519317627, "learning_rate": 4.1658752769863886e-05, "loss": 0.1774, "step": 4390 }, { "epoch": 12.508884150675195, "grad_norm": 11.224427223205566, "learning_rate": 4.162709718265274e-05, "loss": 0.1787, "step": 4400 }, { "epoch": 12.537313432835822, "grad_norm": 9.65335750579834, "learning_rate": 4.15954415954416e-05, "loss": 0.1813, "step": 4410 }, { "epoch": 12.565742714996446, "grad_norm": 15.914355278015137, "learning_rate": 4.1563786008230455e-05, "loss": 0.1796, "step": 4420 }, { "epoch": 12.594171997157073, "grad_norm": 10.89742374420166, "learning_rate": 4.153213042101931e-05, "loss": 0.1832, "step": 4430 }, { "epoch": 12.622601279317697, "grad_norm": 7.901971817016602, "learning_rate": 4.150047483380817e-05, "loss": 0.1881, "step": 4440 }, { "epoch": 12.651030561478322, "grad_norm": 16.155794143676758, "learning_rate": 4.1468819246597024e-05, "loss": 0.1835, "step": 4450 }, { "epoch": 12.679459843638949, "grad_norm": 6.3575615882873535, "learning_rate": 4.1437163659385885e-05, "loss": 0.1835, "step": 4460 }, { "epoch": 12.707889125799573, "grad_norm": 5.320089340209961, "learning_rate": 4.140550807217474e-05, "loss": 0.185, "step": 4470 }, { "epoch": 12.7363184079602, "grad_norm": 7.589712142944336, "learning_rate": 4.13738524849636e-05, "loss": 0.179, "step": 4480 }, { "epoch": 12.764747690120824, "grad_norm": 7.458854675292969, "learning_rate": 4.1342196897752455e-05, "loss": 0.1834, "step": 4490 }, { "epoch": 12.79317697228145, "grad_norm": 8.258151054382324, "learning_rate": 4.131054131054131e-05, "loss": 0.1758, "step": 4500 }, { "epoch": 12.821606254442075, "grad_norm": 9.791566848754883, "learning_rate": 4.127888572333017e-05, "loss": 0.1761, "step": 4510 }, { "epoch": 12.8500355366027, "grad_norm": 7.58099365234375, "learning_rate": 4.1247230136119024e-05, "loss": 0.1775, "step": 4520 }, { "epoch": 12.878464818763327, "grad_norm": 7.711862564086914, "learning_rate": 4.1215574548907885e-05, "loss": 0.1795, "step": 4530 }, { "epoch": 12.906894100923951, "grad_norm": 16.93216896057129, "learning_rate": 4.1183918961696746e-05, "loss": 0.1813, "step": 4540 }, { "epoch": 12.935323383084578, "grad_norm": 7.012818813323975, "learning_rate": 4.11522633744856e-05, "loss": 0.1851, "step": 4550 }, { "epoch": 12.963752665245202, "grad_norm": 6.061773300170898, "learning_rate": 4.1120607787274454e-05, "loss": 0.175, "step": 4560 }, { "epoch": 12.992181947405829, "grad_norm": 10.474386215209961, "learning_rate": 4.1088952200063315e-05, "loss": 0.1822, "step": 4570 }, { "epoch": 12.997867803837954, "eval_accuracy": 0.6468, "eval_loss": 0.16294465959072113, "eval_runtime": 13.4513, "eval_samples_per_second": 371.713, "eval_steps_per_second": 11.672, "step": 4572 }, { "epoch": 13.020611229566454, "grad_norm": 5.63366174697876, "learning_rate": 4.105729661285217e-05, "loss": 0.1851, "step": 4580 }, { "epoch": 13.049040511727078, "grad_norm": 9.006646156311035, "learning_rate": 4.1025641025641023e-05, "loss": 0.1764, "step": 4590 }, { "epoch": 13.077469793887705, "grad_norm": 9.808253288269043, "learning_rate": 4.0993985438429884e-05, "loss": 0.1798, "step": 4600 }, { "epoch": 13.10589907604833, "grad_norm": 6.913196086883545, "learning_rate": 4.0962329851218745e-05, "loss": 0.186, "step": 4610 }, { "epoch": 13.134328358208956, "grad_norm": 5.747495174407959, "learning_rate": 4.09306742640076e-05, "loss": 0.1812, "step": 4620 }, { "epoch": 13.16275764036958, "grad_norm": 5.476018905639648, "learning_rate": 4.089901867679646e-05, "loss": 0.1783, "step": 4630 }, { "epoch": 13.191186922530207, "grad_norm": 5.094200611114502, "learning_rate": 4.0867363089585315e-05, "loss": 0.1837, "step": 4640 }, { "epoch": 13.219616204690832, "grad_norm": 8.331713676452637, "learning_rate": 4.083570750237417e-05, "loss": 0.1786, "step": 4650 }, { "epoch": 13.248045486851456, "grad_norm": 7.266057968139648, "learning_rate": 4.080405191516303e-05, "loss": 0.1725, "step": 4660 }, { "epoch": 13.276474769012083, "grad_norm": 5.70189094543457, "learning_rate": 4.0772396327951884e-05, "loss": 0.1732, "step": 4670 }, { "epoch": 13.304904051172707, "grad_norm": 6.469992160797119, "learning_rate": 4.074074074074074e-05, "loss": 0.1727, "step": 4680 }, { "epoch": 13.333333333333334, "grad_norm": 8.781341552734375, "learning_rate": 4.07090851535296e-05, "loss": 0.1725, "step": 4690 }, { "epoch": 13.361762615493959, "grad_norm": 4.825845718383789, "learning_rate": 4.067742956631846e-05, "loss": 0.1772, "step": 4700 }, { "epoch": 13.390191897654585, "grad_norm": 7.113617420196533, "learning_rate": 4.0645773979107314e-05, "loss": 0.1799, "step": 4710 }, { "epoch": 13.41862117981521, "grad_norm": 5.477686405181885, "learning_rate": 4.0614118391896175e-05, "loss": 0.1765, "step": 4720 }, { "epoch": 13.447050461975834, "grad_norm": 7.033078670501709, "learning_rate": 4.058246280468503e-05, "loss": 0.1741, "step": 4730 }, { "epoch": 13.47547974413646, "grad_norm": 5.321260929107666, "learning_rate": 4.0550807217473884e-05, "loss": 0.1819, "step": 4740 }, { "epoch": 13.503909026297086, "grad_norm": 7.639000415802002, "learning_rate": 4.0519151630262745e-05, "loss": 0.1795, "step": 4750 }, { "epoch": 13.532338308457712, "grad_norm": 10.588789939880371, "learning_rate": 4.04874960430516e-05, "loss": 0.1846, "step": 4760 }, { "epoch": 13.560767590618337, "grad_norm": 7.095537185668945, "learning_rate": 4.045584045584046e-05, "loss": 0.1758, "step": 4770 }, { "epoch": 13.589196872778963, "grad_norm": 7.436639785766602, "learning_rate": 4.0424184868629314e-05, "loss": 0.1816, "step": 4780 }, { "epoch": 13.617626154939588, "grad_norm": 5.618598461151123, "learning_rate": 4.0392529281418175e-05, "loss": 0.1718, "step": 4790 }, { "epoch": 13.646055437100213, "grad_norm": 9.28583812713623, "learning_rate": 4.036087369420703e-05, "loss": 0.1772, "step": 4800 }, { "epoch": 13.674484719260839, "grad_norm": 7.762270927429199, "learning_rate": 4.032921810699588e-05, "loss": 0.178, "step": 4810 }, { "epoch": 13.702914001421464, "grad_norm": 5.994172096252441, "learning_rate": 4.0297562519784744e-05, "loss": 0.1737, "step": 4820 }, { "epoch": 13.73134328358209, "grad_norm": 9.733640670776367, "learning_rate": 4.02659069325736e-05, "loss": 0.1773, "step": 4830 }, { "epoch": 13.759772565742715, "grad_norm": 5.444761753082275, "learning_rate": 4.023425134536245e-05, "loss": 0.1762, "step": 4840 }, { "epoch": 13.788201847903341, "grad_norm": 6.2805681228637695, "learning_rate": 4.020259575815132e-05, "loss": 0.1808, "step": 4850 }, { "epoch": 13.816631130063966, "grad_norm": 5.9816083908081055, "learning_rate": 4.0170940170940174e-05, "loss": 0.1786, "step": 4860 }, { "epoch": 13.84506041222459, "grad_norm": 6.074634075164795, "learning_rate": 4.013928458372903e-05, "loss": 0.1747, "step": 4870 }, { "epoch": 13.873489694385217, "grad_norm": 8.790934562683105, "learning_rate": 4.010762899651789e-05, "loss": 0.1688, "step": 4880 }, { "epoch": 13.901918976545842, "grad_norm": 9.49049186706543, "learning_rate": 4.0075973409306744e-05, "loss": 0.1731, "step": 4890 }, { "epoch": 13.930348258706468, "grad_norm": 8.490804672241211, "learning_rate": 4.00443178220956e-05, "loss": 0.1703, "step": 4900 }, { "epoch": 13.958777540867093, "grad_norm": 6.443454265594482, "learning_rate": 4.001266223488446e-05, "loss": 0.1764, "step": 4910 }, { "epoch": 13.98720682302772, "grad_norm": 6.596497058868408, "learning_rate": 3.998100664767331e-05, "loss": 0.1838, "step": 4920 }, { "epoch": 13.99857853589197, "eval_accuracy": 0.6638, "eval_loss": 0.15867580473423004, "eval_runtime": 13.4666, "eval_samples_per_second": 371.288, "eval_steps_per_second": 11.658, "step": 4924 }, { "epoch": 14.015636105188344, "grad_norm": 8.316414833068848, "learning_rate": 3.9949351060462174e-05, "loss": 0.1772, "step": 4930 }, { "epoch": 14.044065387348969, "grad_norm": 5.798473358154297, "learning_rate": 3.9917695473251035e-05, "loss": 0.1798, "step": 4940 }, { "epoch": 14.072494669509595, "grad_norm": 6.894885540008545, "learning_rate": 3.988603988603989e-05, "loss": 0.1719, "step": 4950 }, { "epoch": 14.10092395167022, "grad_norm": 4.8885273933410645, "learning_rate": 3.985438429882874e-05, "loss": 0.1719, "step": 4960 }, { "epoch": 14.129353233830846, "grad_norm": 7.582751750946045, "learning_rate": 3.9822728711617604e-05, "loss": 0.1732, "step": 4970 }, { "epoch": 14.157782515991471, "grad_norm": 7.836116790771484, "learning_rate": 3.979107312440646e-05, "loss": 0.1787, "step": 4980 }, { "epoch": 14.186211798152097, "grad_norm": 7.8729472160339355, "learning_rate": 3.975941753719531e-05, "loss": 0.1719, "step": 4990 }, { "epoch": 14.214641080312722, "grad_norm": 5.8584370613098145, "learning_rate": 3.9727761949984174e-05, "loss": 0.1668, "step": 5000 }, { "epoch": 14.243070362473347, "grad_norm": 7.428163051605225, "learning_rate": 3.9696106362773035e-05, "loss": 0.1724, "step": 5010 }, { "epoch": 14.271499644633973, "grad_norm": 10.26877498626709, "learning_rate": 3.966445077556189e-05, "loss": 0.1739, "step": 5020 }, { "epoch": 14.299928926794598, "grad_norm": 6.295298099517822, "learning_rate": 3.963279518835075e-05, "loss": 0.1754, "step": 5030 }, { "epoch": 14.328358208955224, "grad_norm": 4.642026901245117, "learning_rate": 3.9601139601139604e-05, "loss": 0.1719, "step": 5040 }, { "epoch": 14.356787491115849, "grad_norm": 4.547788143157959, "learning_rate": 3.956948401392846e-05, "loss": 0.1683, "step": 5050 }, { "epoch": 14.385216773276476, "grad_norm": 4.895547866821289, "learning_rate": 3.953782842671732e-05, "loss": 0.1728, "step": 5060 }, { "epoch": 14.4136460554371, "grad_norm": 4.180243968963623, "learning_rate": 3.950617283950617e-05, "loss": 0.1623, "step": 5070 }, { "epoch": 14.442075337597725, "grad_norm": 9.849530220031738, "learning_rate": 3.9474517252295034e-05, "loss": 0.1635, "step": 5080 }, { "epoch": 14.470504619758351, "grad_norm": 5.804750442504883, "learning_rate": 3.944286166508389e-05, "loss": 0.1766, "step": 5090 }, { "epoch": 14.498933901918976, "grad_norm": 9.91480541229248, "learning_rate": 3.941120607787275e-05, "loss": 0.1632, "step": 5100 }, { "epoch": 14.527363184079602, "grad_norm": 5.946216106414795, "learning_rate": 3.9379550490661604e-05, "loss": 0.1753, "step": 5110 }, { "epoch": 14.555792466240227, "grad_norm": 10.948956489562988, "learning_rate": 3.934789490345046e-05, "loss": 0.1701, "step": 5120 }, { "epoch": 14.584221748400854, "grad_norm": 10.421112060546875, "learning_rate": 3.931623931623932e-05, "loss": 0.1761, "step": 5130 }, { "epoch": 14.612651030561478, "grad_norm": 7.275792598724365, "learning_rate": 3.928458372902817e-05, "loss": 0.1714, "step": 5140 }, { "epoch": 14.641080312722103, "grad_norm": 9.117766380310059, "learning_rate": 3.925292814181703e-05, "loss": 0.1674, "step": 5150 }, { "epoch": 14.66950959488273, "grad_norm": 8.332450866699219, "learning_rate": 3.9221272554605895e-05, "loss": 0.1811, "step": 5160 }, { "epoch": 14.697938877043354, "grad_norm": 5.5898661613464355, "learning_rate": 3.918961696739475e-05, "loss": 0.1732, "step": 5170 }, { "epoch": 14.72636815920398, "grad_norm": 6.179837703704834, "learning_rate": 3.91579613801836e-05, "loss": 0.173, "step": 5180 }, { "epoch": 14.754797441364605, "grad_norm": 16.57663345336914, "learning_rate": 3.9126305792972464e-05, "loss": 0.1745, "step": 5190 }, { "epoch": 14.783226723525232, "grad_norm": 16.966814041137695, "learning_rate": 3.909465020576132e-05, "loss": 0.1712, "step": 5200 }, { "epoch": 14.811656005685856, "grad_norm": 9.56699275970459, "learning_rate": 3.906299461855017e-05, "loss": 0.1768, "step": 5210 }, { "epoch": 14.840085287846481, "grad_norm": 5.286874771118164, "learning_rate": 3.903133903133903e-05, "loss": 0.1849, "step": 5220 }, { "epoch": 14.868514570007108, "grad_norm": 6.400488376617432, "learning_rate": 3.899968344412789e-05, "loss": 0.1734, "step": 5230 }, { "epoch": 14.896943852167732, "grad_norm": 5.678165912628174, "learning_rate": 3.896802785691675e-05, "loss": 0.163, "step": 5240 }, { "epoch": 14.925373134328359, "grad_norm": 6.225283622741699, "learning_rate": 3.893637226970561e-05, "loss": 0.1749, "step": 5250 }, { "epoch": 14.953802416488983, "grad_norm": 12.645467758178711, "learning_rate": 3.8904716682494464e-05, "loss": 0.1699, "step": 5260 }, { "epoch": 14.98223169864961, "grad_norm": 12.196904182434082, "learning_rate": 3.887306109528332e-05, "loss": 0.1689, "step": 5270 }, { "epoch": 14.999289267945985, "eval_accuracy": 0.675, "eval_loss": 0.1562993824481964, "eval_runtime": 13.4957, "eval_samples_per_second": 370.487, "eval_steps_per_second": 11.633, "step": 5276 }, { "epoch": 15.010660980810234, "grad_norm": 5.375777721405029, "learning_rate": 3.884140550807218e-05, "loss": 0.163, "step": 5280 }, { "epoch": 15.03909026297086, "grad_norm": 6.3642120361328125, "learning_rate": 3.880974992086103e-05, "loss": 0.1665, "step": 5290 }, { "epoch": 15.067519545131486, "grad_norm": 8.05555534362793, "learning_rate": 3.877809433364989e-05, "loss": 0.1785, "step": 5300 }, { "epoch": 15.09594882729211, "grad_norm": 6.703502178192139, "learning_rate": 3.874643874643875e-05, "loss": 0.1778, "step": 5310 }, { "epoch": 15.124378109452737, "grad_norm": 14.248394012451172, "learning_rate": 3.871478315922761e-05, "loss": 0.1752, "step": 5320 }, { "epoch": 15.152807391613361, "grad_norm": 6.40488862991333, "learning_rate": 3.868312757201646e-05, "loss": 0.1761, "step": 5330 }, { "epoch": 15.181236673773988, "grad_norm": 5.260807991027832, "learning_rate": 3.8651471984805324e-05, "loss": 0.1649, "step": 5340 }, { "epoch": 15.209665955934613, "grad_norm": 5.043075084686279, "learning_rate": 3.861981639759418e-05, "loss": 0.1711, "step": 5350 }, { "epoch": 15.238095238095237, "grad_norm": 6.416396141052246, "learning_rate": 3.858816081038303e-05, "loss": 0.1694, "step": 5360 }, { "epoch": 15.266524520255864, "grad_norm": 7.821042537689209, "learning_rate": 3.8556505223171894e-05, "loss": 0.1697, "step": 5370 }, { "epoch": 15.294953802416488, "grad_norm": 4.34155797958374, "learning_rate": 3.852484963596075e-05, "loss": 0.1668, "step": 5380 }, { "epoch": 15.323383084577115, "grad_norm": 7.777837753295898, "learning_rate": 3.84931940487496e-05, "loss": 0.1738, "step": 5390 }, { "epoch": 15.35181236673774, "grad_norm": 8.838866233825684, "learning_rate": 3.846153846153846e-05, "loss": 0.1707, "step": 5400 }, { "epoch": 15.380241648898366, "grad_norm": 4.986713409423828, "learning_rate": 3.8429882874327324e-05, "loss": 0.1682, "step": 5410 }, { "epoch": 15.40867093105899, "grad_norm": 4.820565223693848, "learning_rate": 3.839822728711618e-05, "loss": 0.1747, "step": 5420 }, { "epoch": 15.437100213219615, "grad_norm": 5.614505767822266, "learning_rate": 3.836657169990503e-05, "loss": 0.1722, "step": 5430 }, { "epoch": 15.465529495380242, "grad_norm": 12.464309692382812, "learning_rate": 3.833491611269389e-05, "loss": 0.1714, "step": 5440 }, { "epoch": 15.493958777540866, "grad_norm": 9.441070556640625, "learning_rate": 3.830326052548275e-05, "loss": 0.1756, "step": 5450 }, { "epoch": 15.522388059701493, "grad_norm": 7.475626468658447, "learning_rate": 3.82716049382716e-05, "loss": 0.1835, "step": 5460 }, { "epoch": 15.550817341862118, "grad_norm": 4.62790584564209, "learning_rate": 3.823994935106047e-05, "loss": 0.1679, "step": 5470 }, { "epoch": 15.579246624022744, "grad_norm": 7.387009143829346, "learning_rate": 3.8208293763849323e-05, "loss": 0.167, "step": 5480 }, { "epoch": 15.607675906183369, "grad_norm": 17.020044326782227, "learning_rate": 3.817663817663818e-05, "loss": 0.1661, "step": 5490 }, { "epoch": 15.636105188343993, "grad_norm": 5.43551778793335, "learning_rate": 3.814498258942704e-05, "loss": 0.164, "step": 5500 }, { "epoch": 15.66453447050462, "grad_norm": 11.664446830749512, "learning_rate": 3.811332700221589e-05, "loss": 0.165, "step": 5510 }, { "epoch": 15.692963752665245, "grad_norm": 6.0515055656433105, "learning_rate": 3.808167141500475e-05, "loss": 0.1734, "step": 5520 }, { "epoch": 15.721393034825871, "grad_norm": 5.583444118499756, "learning_rate": 3.805001582779361e-05, "loss": 0.1684, "step": 5530 }, { "epoch": 15.749822316986496, "grad_norm": 9.789053916931152, "learning_rate": 3.801836024058246e-05, "loss": 0.1737, "step": 5540 }, { "epoch": 15.778251599147122, "grad_norm": 9.55753231048584, "learning_rate": 3.798670465337132e-05, "loss": 0.1723, "step": 5550 }, { "epoch": 15.806680881307747, "grad_norm": 6.521287441253662, "learning_rate": 3.7955049066160184e-05, "loss": 0.1668, "step": 5560 }, { "epoch": 15.835110163468372, "grad_norm": 7.379385471343994, "learning_rate": 3.792339347894904e-05, "loss": 0.1821, "step": 5570 }, { "epoch": 15.863539445628998, "grad_norm": 9.951786994934082, "learning_rate": 3.789173789173789e-05, "loss": 0.1647, "step": 5580 }, { "epoch": 15.891968727789623, "grad_norm": 7.365923881530762, "learning_rate": 3.786008230452675e-05, "loss": 0.163, "step": 5590 }, { "epoch": 15.92039800995025, "grad_norm": 12.983285903930664, "learning_rate": 3.782842671731561e-05, "loss": 0.1625, "step": 5600 }, { "epoch": 15.948827292110874, "grad_norm": 7.357599258422852, "learning_rate": 3.779677113010446e-05, "loss": 0.1668, "step": 5610 }, { "epoch": 15.9772565742715, "grad_norm": 9.090298652648926, "learning_rate": 3.776511554289332e-05, "loss": 0.1697, "step": 5620 }, { "epoch": 16.0, "eval_accuracy": 0.6916, "eval_loss": 0.14721454679965973, "eval_runtime": 13.5096, "eval_samples_per_second": 370.108, "eval_steps_per_second": 11.621, "step": 5628 }, { "epoch": 16.005685856432127, "grad_norm": 8.401142120361328, "learning_rate": 3.7733459955682184e-05, "loss": 0.1749, "step": 5630 }, { "epoch": 16.03411513859275, "grad_norm": 9.800179481506348, "learning_rate": 3.770180436847104e-05, "loss": 0.1628, "step": 5640 }, { "epoch": 16.062544420753376, "grad_norm": 9.622997283935547, "learning_rate": 3.767014878125989e-05, "loss": 0.1611, "step": 5650 }, { "epoch": 16.090973702914003, "grad_norm": 9.222646713256836, "learning_rate": 3.763849319404875e-05, "loss": 0.1606, "step": 5660 }, { "epoch": 16.119402985074625, "grad_norm": 5.980533599853516, "learning_rate": 3.760683760683761e-05, "loss": 0.165, "step": 5670 }, { "epoch": 16.147832267235252, "grad_norm": 11.353034019470215, "learning_rate": 3.757518201962647e-05, "loss": 0.1696, "step": 5680 }, { "epoch": 16.17626154939588, "grad_norm": 4.855453968048096, "learning_rate": 3.754352643241532e-05, "loss": 0.1746, "step": 5690 }, { "epoch": 16.204690831556505, "grad_norm": 7.198344707489014, "learning_rate": 3.7511870845204176e-05, "loss": 0.1639, "step": 5700 }, { "epoch": 16.233120113717128, "grad_norm": 6.764168739318848, "learning_rate": 3.748021525799304e-05, "loss": 0.1717, "step": 5710 }, { "epoch": 16.261549395877754, "grad_norm": 8.16052532196045, "learning_rate": 3.74485596707819e-05, "loss": 0.1633, "step": 5720 }, { "epoch": 16.28997867803838, "grad_norm": 6.895861625671387, "learning_rate": 3.741690408357075e-05, "loss": 0.1672, "step": 5730 }, { "epoch": 16.318407960199004, "grad_norm": 3.5315351486206055, "learning_rate": 3.738524849635961e-05, "loss": 0.1722, "step": 5740 }, { "epoch": 16.34683724235963, "grad_norm": 5.195951461791992, "learning_rate": 3.735359290914847e-05, "loss": 0.1713, "step": 5750 }, { "epoch": 16.375266524520256, "grad_norm": 5.453103065490723, "learning_rate": 3.732193732193732e-05, "loss": 0.1655, "step": 5760 }, { "epoch": 16.403695806680883, "grad_norm": 4.626656532287598, "learning_rate": 3.7290281734726176e-05, "loss": 0.1636, "step": 5770 }, { "epoch": 16.432125088841506, "grad_norm": 10.986313819885254, "learning_rate": 3.725862614751504e-05, "loss": 0.1647, "step": 5780 }, { "epoch": 16.460554371002132, "grad_norm": 6.051445484161377, "learning_rate": 3.72269705603039e-05, "loss": 0.1708, "step": 5790 }, { "epoch": 16.48898365316276, "grad_norm": 6.361173629760742, "learning_rate": 3.719531497309275e-05, "loss": 0.1649, "step": 5800 }, { "epoch": 16.51741293532338, "grad_norm": 12.536359786987305, "learning_rate": 3.716365938588161e-05, "loss": 0.16, "step": 5810 }, { "epoch": 16.545842217484008, "grad_norm": 5.546921253204346, "learning_rate": 3.713200379867047e-05, "loss": 0.1647, "step": 5820 }, { "epoch": 16.574271499644635, "grad_norm": 10.2316312789917, "learning_rate": 3.710034821145932e-05, "loss": 0.1684, "step": 5830 }, { "epoch": 16.60270078180526, "grad_norm": 8.55539321899414, "learning_rate": 3.706869262424818e-05, "loss": 0.1716, "step": 5840 }, { "epoch": 16.631130063965884, "grad_norm": 6.74585485458374, "learning_rate": 3.7037037037037037e-05, "loss": 0.1763, "step": 5850 }, { "epoch": 16.65955934612651, "grad_norm": 5.900564670562744, "learning_rate": 3.700538144982589e-05, "loss": 0.1677, "step": 5860 }, { "epoch": 16.687988628287137, "grad_norm": 12.721473693847656, "learning_rate": 3.697372586261476e-05, "loss": 0.164, "step": 5870 }, { "epoch": 16.71641791044776, "grad_norm": 9.620595932006836, "learning_rate": 3.694207027540361e-05, "loss": 0.1615, "step": 5880 }, { "epoch": 16.744847192608386, "grad_norm": 5.164673328399658, "learning_rate": 3.691041468819247e-05, "loss": 0.1573, "step": 5890 }, { "epoch": 16.773276474769013, "grad_norm": 15.78459358215332, "learning_rate": 3.687875910098133e-05, "loss": 0.1621, "step": 5900 }, { "epoch": 16.80170575692964, "grad_norm": 5.539624214172363, "learning_rate": 3.684710351377018e-05, "loss": 0.1604, "step": 5910 }, { "epoch": 16.830135039090262, "grad_norm": 5.900873184204102, "learning_rate": 3.6815447926559036e-05, "loss": 0.1636, "step": 5920 }, { "epoch": 16.85856432125089, "grad_norm": 7.360677719116211, "learning_rate": 3.67837923393479e-05, "loss": 0.1653, "step": 5930 }, { "epoch": 16.886993603411515, "grad_norm": 7.124042987823486, "learning_rate": 3.675213675213676e-05, "loss": 0.1642, "step": 5940 }, { "epoch": 16.915422885572138, "grad_norm": 16.17691993713379, "learning_rate": 3.672048116492561e-05, "loss": 0.1696, "step": 5950 }, { "epoch": 16.943852167732764, "grad_norm": 9.038963317871094, "learning_rate": 3.6688825577714466e-05, "loss": 0.1657, "step": 5960 }, { "epoch": 16.97228144989339, "grad_norm": 9.11174488067627, "learning_rate": 3.665716999050333e-05, "loss": 0.1643, "step": 5970 }, { "epoch": 16.997867803837952, "eval_accuracy": 0.6912, "eval_loss": 0.14354543387889862, "eval_runtime": 13.4933, "eval_samples_per_second": 370.555, "eval_steps_per_second": 11.635, "step": 5979 }, { "epoch": 17.000710732054017, "grad_norm": 7.836512088775635, "learning_rate": 3.662551440329218e-05, "loss": 0.1659, "step": 5980 }, { "epoch": 17.02914001421464, "grad_norm": 5.481104373931885, "learning_rate": 3.659385881608104e-05, "loss": 0.1601, "step": 5990 }, { "epoch": 17.057569296375267, "grad_norm": 22.114076614379883, "learning_rate": 3.65622032288699e-05, "loss": 0.1562, "step": 6000 }, { "epoch": 17.085998578535893, "grad_norm": 12.712228775024414, "learning_rate": 3.653054764165875e-05, "loss": 0.1707, "step": 6010 }, { "epoch": 17.114427860696516, "grad_norm": 5.33532190322876, "learning_rate": 3.649889205444761e-05, "loss": 0.1637, "step": 6020 }, { "epoch": 17.142857142857142, "grad_norm": 8.143196105957031, "learning_rate": 3.646723646723647e-05, "loss": 0.1607, "step": 6030 }, { "epoch": 17.17128642501777, "grad_norm": 9.09637451171875, "learning_rate": 3.643558088002533e-05, "loss": 0.1723, "step": 6040 }, { "epoch": 17.199715707178395, "grad_norm": 7.9089531898498535, "learning_rate": 3.640392529281418e-05, "loss": 0.1656, "step": 6050 }, { "epoch": 17.228144989339018, "grad_norm": 7.642179489135742, "learning_rate": 3.637226970560304e-05, "loss": 0.1723, "step": 6060 }, { "epoch": 17.256574271499645, "grad_norm": 9.775899887084961, "learning_rate": 3.6340614118391896e-05, "loss": 0.1609, "step": 6070 }, { "epoch": 17.28500355366027, "grad_norm": 7.18733549118042, "learning_rate": 3.630895853118075e-05, "loss": 0.1705, "step": 6080 }, { "epoch": 17.313432835820894, "grad_norm": 6.199199199676514, "learning_rate": 3.627730294396961e-05, "loss": 0.1652, "step": 6090 }, { "epoch": 17.34186211798152, "grad_norm": 4.586163520812988, "learning_rate": 3.624564735675847e-05, "loss": 0.1672, "step": 6100 }, { "epoch": 17.370291400142147, "grad_norm": 8.44304370880127, "learning_rate": 3.6213991769547327e-05, "loss": 0.1623, "step": 6110 }, { "epoch": 17.398720682302773, "grad_norm": 10.64468002319336, "learning_rate": 3.618233618233619e-05, "loss": 0.1607, "step": 6120 }, { "epoch": 17.427149964463396, "grad_norm": 7.362174987792969, "learning_rate": 3.615068059512504e-05, "loss": 0.1571, "step": 6130 }, { "epoch": 17.455579246624023, "grad_norm": 5.801829814910889, "learning_rate": 3.6119025007913896e-05, "loss": 0.1696, "step": 6140 }, { "epoch": 17.48400852878465, "grad_norm": 6.003584384918213, "learning_rate": 3.608736942070276e-05, "loss": 0.1561, "step": 6150 }, { "epoch": 17.512437810945272, "grad_norm": 4.654746055603027, "learning_rate": 3.605571383349161e-05, "loss": 0.1568, "step": 6160 }, { "epoch": 17.5408670931059, "grad_norm": 5.518158435821533, "learning_rate": 3.6024058246280465e-05, "loss": 0.1619, "step": 6170 }, { "epoch": 17.569296375266525, "grad_norm": 4.9060516357421875, "learning_rate": 3.599240265906933e-05, "loss": 0.1698, "step": 6180 }, { "epoch": 17.59772565742715, "grad_norm": 5.4964680671691895, "learning_rate": 3.596074707185819e-05, "loss": 0.1685, "step": 6190 }, { "epoch": 17.626154939587774, "grad_norm": 9.260560989379883, "learning_rate": 3.592909148464704e-05, "loss": 0.1608, "step": 6200 }, { "epoch": 17.6545842217484, "grad_norm": 7.691409111022949, "learning_rate": 3.58974358974359e-05, "loss": 0.1618, "step": 6210 }, { "epoch": 17.683013503909027, "grad_norm": 10.206984519958496, "learning_rate": 3.5865780310224756e-05, "loss": 0.1645, "step": 6220 }, { "epoch": 17.71144278606965, "grad_norm": 10.701088905334473, "learning_rate": 3.583412472301361e-05, "loss": 0.1546, "step": 6230 }, { "epoch": 17.739872068230277, "grad_norm": 9.652596473693848, "learning_rate": 3.580246913580247e-05, "loss": 0.1552, "step": 6240 }, { "epoch": 17.768301350390903, "grad_norm": 6.70912504196167, "learning_rate": 3.5770813548591326e-05, "loss": 0.1671, "step": 6250 }, { "epoch": 17.79673063255153, "grad_norm": 6.655432224273682, "learning_rate": 3.573915796138019e-05, "loss": 0.1639, "step": 6260 }, { "epoch": 17.825159914712152, "grad_norm": 5.79883337020874, "learning_rate": 3.570750237416904e-05, "loss": 0.1551, "step": 6270 }, { "epoch": 17.85358919687278, "grad_norm": 5.370034694671631, "learning_rate": 3.56758467869579e-05, "loss": 0.1713, "step": 6280 }, { "epoch": 17.882018479033405, "grad_norm": 5.844311237335205, "learning_rate": 3.5644191199746756e-05, "loss": 0.1573, "step": 6290 }, { "epoch": 17.91044776119403, "grad_norm": 7.6851582527160645, "learning_rate": 3.561253561253561e-05, "loss": 0.1694, "step": 6300 }, { "epoch": 17.938877043354655, "grad_norm": 7.74069881439209, "learning_rate": 3.558088002532447e-05, "loss": 0.1622, "step": 6310 }, { "epoch": 17.96730632551528, "grad_norm": 5.8436360359191895, "learning_rate": 3.5549224438113325e-05, "loss": 0.1605, "step": 6320 }, { "epoch": 17.995735607675908, "grad_norm": 5.317842960357666, "learning_rate": 3.5517568850902186e-05, "loss": 0.1655, "step": 6330 }, { "epoch": 17.99857853589197, "eval_accuracy": 0.706, "eval_loss": 0.13945943117141724, "eval_runtime": 13.4453, "eval_samples_per_second": 371.878, "eval_steps_per_second": 11.677, "step": 6331 }, { "epoch": 18.02416488983653, "grad_norm": 8.166695594787598, "learning_rate": 3.548591326369105e-05, "loss": 0.1559, "step": 6340 }, { "epoch": 18.052594171997157, "grad_norm": 5.226716995239258, "learning_rate": 3.54542576764799e-05, "loss": 0.1578, "step": 6350 }, { "epoch": 18.081023454157783, "grad_norm": 6.8258819580078125, "learning_rate": 3.5422602089268756e-05, "loss": 0.1664, "step": 6360 }, { "epoch": 18.109452736318406, "grad_norm": 5.907895088195801, "learning_rate": 3.539094650205762e-05, "loss": 0.1607, "step": 6370 }, { "epoch": 18.137882018479033, "grad_norm": 8.665287971496582, "learning_rate": 3.535929091484647e-05, "loss": 0.1607, "step": 6380 }, { "epoch": 18.16631130063966, "grad_norm": 6.130677223205566, "learning_rate": 3.5327635327635325e-05, "loss": 0.1572, "step": 6390 }, { "epoch": 18.194740582800286, "grad_norm": 5.1094818115234375, "learning_rate": 3.5295979740424186e-05, "loss": 0.1601, "step": 6400 }, { "epoch": 18.22316986496091, "grad_norm": 6.80999755859375, "learning_rate": 3.526432415321305e-05, "loss": 0.161, "step": 6410 }, { "epoch": 18.251599147121535, "grad_norm": 8.978972434997559, "learning_rate": 3.52326685660019e-05, "loss": 0.1625, "step": 6420 }, { "epoch": 18.28002842928216, "grad_norm": 13.114323616027832, "learning_rate": 3.520101297879076e-05, "loss": 0.1631, "step": 6430 }, { "epoch": 18.308457711442784, "grad_norm": 7.016110897064209, "learning_rate": 3.5169357391579616e-05, "loss": 0.158, "step": 6440 }, { "epoch": 18.33688699360341, "grad_norm": 8.39936351776123, "learning_rate": 3.513770180436847e-05, "loss": 0.1561, "step": 6450 }, { "epoch": 18.365316275764037, "grad_norm": 9.849126815795898, "learning_rate": 3.510604621715733e-05, "loss": 0.1704, "step": 6460 }, { "epoch": 18.393745557924664, "grad_norm": 7.212557315826416, "learning_rate": 3.5074390629946186e-05, "loss": 0.1629, "step": 6470 }, { "epoch": 18.422174840085287, "grad_norm": 8.727147102355957, "learning_rate": 3.504273504273504e-05, "loss": 0.1563, "step": 6480 }, { "epoch": 18.450604122245913, "grad_norm": 6.686132907867432, "learning_rate": 3.501107945552391e-05, "loss": 0.1625, "step": 6490 }, { "epoch": 18.47903340440654, "grad_norm": 6.6875901222229, "learning_rate": 3.497942386831276e-05, "loss": 0.1599, "step": 6500 }, { "epoch": 18.507462686567163, "grad_norm": 5.4074578285217285, "learning_rate": 3.4947768281101616e-05, "loss": 0.16, "step": 6510 }, { "epoch": 18.53589196872779, "grad_norm": 17.295433044433594, "learning_rate": 3.491611269389048e-05, "loss": 0.1664, "step": 6520 }, { "epoch": 18.564321250888415, "grad_norm": 5.922271251678467, "learning_rate": 3.488445710667933e-05, "loss": 0.1613, "step": 6530 }, { "epoch": 18.592750533049042, "grad_norm": 8.197579383850098, "learning_rate": 3.4852801519468185e-05, "loss": 0.1641, "step": 6540 }, { "epoch": 18.621179815209665, "grad_norm": 6.425829887390137, "learning_rate": 3.4821145932257046e-05, "loss": 0.1614, "step": 6550 }, { "epoch": 18.64960909737029, "grad_norm": 7.810327529907227, "learning_rate": 3.47894903450459e-05, "loss": 0.1538, "step": 6560 }, { "epoch": 18.678038379530918, "grad_norm": 17.985605239868164, "learning_rate": 3.475783475783476e-05, "loss": 0.1545, "step": 6570 }, { "epoch": 18.70646766169154, "grad_norm": 12.622027397155762, "learning_rate": 3.4726179170623615e-05, "loss": 0.156, "step": 6580 }, { "epoch": 18.734896943852167, "grad_norm": 7.6832733154296875, "learning_rate": 3.4694523583412476e-05, "loss": 0.164, "step": 6590 }, { "epoch": 18.763326226012794, "grad_norm": 7.1333909034729, "learning_rate": 3.466286799620133e-05, "loss": 0.1567, "step": 6600 }, { "epoch": 18.79175550817342, "grad_norm": 12.511330604553223, "learning_rate": 3.4631212408990185e-05, "loss": 0.1561, "step": 6610 }, { "epoch": 18.820184790334043, "grad_norm": 7.218026161193848, "learning_rate": 3.4599556821779046e-05, "loss": 0.1609, "step": 6620 }, { "epoch": 18.84861407249467, "grad_norm": 5.773265838623047, "learning_rate": 3.45679012345679e-05, "loss": 0.1518, "step": 6630 }, { "epoch": 18.877043354655296, "grad_norm": 5.179189205169678, "learning_rate": 3.453624564735676e-05, "loss": 0.1487, "step": 6640 }, { "epoch": 18.90547263681592, "grad_norm": 9.711522102355957, "learning_rate": 3.450459006014562e-05, "loss": 0.1562, "step": 6650 }, { "epoch": 18.933901918976545, "grad_norm": 7.569458484649658, "learning_rate": 3.4472934472934476e-05, "loss": 0.1647, "step": 6660 }, { "epoch": 18.96233120113717, "grad_norm": 8.368632316589355, "learning_rate": 3.444127888572333e-05, "loss": 0.1638, "step": 6670 }, { "epoch": 18.990760483297798, "grad_norm": 4.85873556137085, "learning_rate": 3.440962329851219e-05, "loss": 0.1555, "step": 6680 }, { "epoch": 18.999289267945983, "eval_accuracy": 0.714, "eval_loss": 0.13714179396629333, "eval_runtime": 13.5378, "eval_samples_per_second": 369.336, "eval_steps_per_second": 11.597, "step": 6683 }, { "epoch": 19.01918976545842, "grad_norm": 7.283456802368164, "learning_rate": 3.4377967711301045e-05, "loss": 0.1604, "step": 6690 }, { "epoch": 19.047619047619047, "grad_norm": 4.669328212738037, "learning_rate": 3.43463121240899e-05, "loss": 0.1554, "step": 6700 }, { "epoch": 19.076048329779674, "grad_norm": 6.214028835296631, "learning_rate": 3.431465653687876e-05, "loss": 0.1632, "step": 6710 }, { "epoch": 19.104477611940297, "grad_norm": 7.8813157081604, "learning_rate": 3.4283000949667615e-05, "loss": 0.1602, "step": 6720 }, { "epoch": 19.132906894100923, "grad_norm": 6.617958068847656, "learning_rate": 3.4251345362456476e-05, "loss": 0.1595, "step": 6730 }, { "epoch": 19.16133617626155, "grad_norm": 7.466203212738037, "learning_rate": 3.4219689775245337e-05, "loss": 0.1589, "step": 6740 }, { "epoch": 19.189765458422176, "grad_norm": 5.4316253662109375, "learning_rate": 3.418803418803419e-05, "loss": 0.1613, "step": 6750 }, { "epoch": 19.2181947405828, "grad_norm": 7.687604904174805, "learning_rate": 3.4156378600823045e-05, "loss": 0.1564, "step": 6760 }, { "epoch": 19.246624022743426, "grad_norm": 6.290378570556641, "learning_rate": 3.4124723013611906e-05, "loss": 0.1621, "step": 6770 }, { "epoch": 19.275053304904052, "grad_norm": 9.970466613769531, "learning_rate": 3.409306742640076e-05, "loss": 0.1462, "step": 6780 }, { "epoch": 19.303482587064675, "grad_norm": 6.2664690017700195, "learning_rate": 3.4061411839189614e-05, "loss": 0.1589, "step": 6790 }, { "epoch": 19.3319118692253, "grad_norm": 6.588331699371338, "learning_rate": 3.4029756251978475e-05, "loss": 0.1554, "step": 6800 }, { "epoch": 19.360341151385928, "grad_norm": 6.2913432121276855, "learning_rate": 3.3998100664767336e-05, "loss": 0.1632, "step": 6810 }, { "epoch": 19.388770433546554, "grad_norm": 9.006940841674805, "learning_rate": 3.396644507755619e-05, "loss": 0.146, "step": 6820 }, { "epoch": 19.417199715707177, "grad_norm": 5.104375839233398, "learning_rate": 3.393478949034505e-05, "loss": 0.149, "step": 6830 }, { "epoch": 19.445628997867804, "grad_norm": 6.304584980010986, "learning_rate": 3.3903133903133905e-05, "loss": 0.1514, "step": 6840 }, { "epoch": 19.47405828002843, "grad_norm": 7.113027095794678, "learning_rate": 3.387147831592276e-05, "loss": 0.1515, "step": 6850 }, { "epoch": 19.502487562189053, "grad_norm": 5.539541721343994, "learning_rate": 3.383982272871162e-05, "loss": 0.1543, "step": 6860 }, { "epoch": 19.53091684434968, "grad_norm": 10.70662784576416, "learning_rate": 3.3808167141500475e-05, "loss": 0.1547, "step": 6870 }, { "epoch": 19.559346126510306, "grad_norm": 8.089286804199219, "learning_rate": 3.3776511554289336e-05, "loss": 0.1665, "step": 6880 }, { "epoch": 19.587775408670932, "grad_norm": 8.201956748962402, "learning_rate": 3.374485596707819e-05, "loss": 0.159, "step": 6890 }, { "epoch": 19.616204690831555, "grad_norm": 11.057098388671875, "learning_rate": 3.371320037986705e-05, "loss": 0.1574, "step": 6900 }, { "epoch": 19.64463397299218, "grad_norm": 7.710545063018799, "learning_rate": 3.3681544792655905e-05, "loss": 0.1606, "step": 6910 }, { "epoch": 19.673063255152808, "grad_norm": 10.064957618713379, "learning_rate": 3.364988920544476e-05, "loss": 0.1498, "step": 6920 }, { "epoch": 19.701492537313435, "grad_norm": 6.91300630569458, "learning_rate": 3.361823361823362e-05, "loss": 0.1525, "step": 6930 }, { "epoch": 19.729921819474058, "grad_norm": 7.470318794250488, "learning_rate": 3.3586578031022474e-05, "loss": 0.157, "step": 6940 }, { "epoch": 19.758351101634684, "grad_norm": 5.2139787673950195, "learning_rate": 3.355492244381133e-05, "loss": 0.1553, "step": 6950 }, { "epoch": 19.78678038379531, "grad_norm": 6.875024795532227, "learning_rate": 3.3523266856600196e-05, "loss": 0.1636, "step": 6960 }, { "epoch": 19.815209665955933, "grad_norm": 5.392747402191162, "learning_rate": 3.349161126938905e-05, "loss": 0.1543, "step": 6970 }, { "epoch": 19.84363894811656, "grad_norm": 9.983506202697754, "learning_rate": 3.3459955682177905e-05, "loss": 0.1595, "step": 6980 }, { "epoch": 19.872068230277186, "grad_norm": 6.036851406097412, "learning_rate": 3.3428300094966766e-05, "loss": 0.1593, "step": 6990 }, { "epoch": 19.90049751243781, "grad_norm": 6.506160736083984, "learning_rate": 3.339664450775562e-05, "loss": 0.1599, "step": 7000 }, { "epoch": 19.928926794598436, "grad_norm": 5.6278767585754395, "learning_rate": 3.3364988920544474e-05, "loss": 0.1548, "step": 7010 }, { "epoch": 19.957356076759062, "grad_norm": 9.023452758789062, "learning_rate": 3.3333333333333335e-05, "loss": 0.1538, "step": 7020 }, { "epoch": 19.98578535891969, "grad_norm": 9.797062873840332, "learning_rate": 3.330167774612219e-05, "loss": 0.1577, "step": 7030 }, { "epoch": 20.0, "eval_accuracy": 0.7258, "eval_loss": 0.13210512697696686, "eval_runtime": 13.4813, "eval_samples_per_second": 370.885, "eval_steps_per_second": 11.646, "step": 7035 }, { "epoch": 20.01421464108031, "grad_norm": 5.939871311187744, "learning_rate": 3.327002215891105e-05, "loss": 0.1517, "step": 7040 }, { "epoch": 20.042643923240938, "grad_norm": 5.3057942390441895, "learning_rate": 3.323836657169991e-05, "loss": 0.1464, "step": 7050 }, { "epoch": 20.071073205401564, "grad_norm": 11.72205924987793, "learning_rate": 3.3206710984488765e-05, "loss": 0.1592, "step": 7060 }, { "epoch": 20.09950248756219, "grad_norm": 9.260968208312988, "learning_rate": 3.317505539727762e-05, "loss": 0.1542, "step": 7070 }, { "epoch": 20.127931769722814, "grad_norm": 10.511054992675781, "learning_rate": 3.314339981006648e-05, "loss": 0.1604, "step": 7080 }, { "epoch": 20.15636105188344, "grad_norm": 4.936570644378662, "learning_rate": 3.3111744222855335e-05, "loss": 0.1558, "step": 7090 }, { "epoch": 20.184790334044067, "grad_norm": 6.491855144500732, "learning_rate": 3.308008863564419e-05, "loss": 0.1502, "step": 7100 }, { "epoch": 20.21321961620469, "grad_norm": 6.715132713317871, "learning_rate": 3.304843304843305e-05, "loss": 0.1528, "step": 7110 }, { "epoch": 20.241648898365316, "grad_norm": 4.719024181365967, "learning_rate": 3.301677746122191e-05, "loss": 0.1546, "step": 7120 }, { "epoch": 20.270078180525942, "grad_norm": 4.0241169929504395, "learning_rate": 3.2985121874010765e-05, "loss": 0.1463, "step": 7130 }, { "epoch": 20.298507462686565, "grad_norm": 9.190053939819336, "learning_rate": 3.2953466286799626e-05, "loss": 0.1644, "step": 7140 }, { "epoch": 20.326936744847192, "grad_norm": 6.305832386016846, "learning_rate": 3.292181069958848e-05, "loss": 0.1578, "step": 7150 }, { "epoch": 20.35536602700782, "grad_norm": 8.951306343078613, "learning_rate": 3.2890155112377334e-05, "loss": 0.1478, "step": 7160 }, { "epoch": 20.383795309168445, "grad_norm": 8.108132362365723, "learning_rate": 3.2858499525166195e-05, "loss": 0.1497, "step": 7170 }, { "epoch": 20.412224591329068, "grad_norm": 5.111138820648193, "learning_rate": 3.282684393795505e-05, "loss": 0.1564, "step": 7180 }, { "epoch": 20.440653873489694, "grad_norm": 7.1464691162109375, "learning_rate": 3.279518835074391e-05, "loss": 0.1616, "step": 7190 }, { "epoch": 20.46908315565032, "grad_norm": 5.254659652709961, "learning_rate": 3.2763532763532764e-05, "loss": 0.1584, "step": 7200 }, { "epoch": 20.497512437810947, "grad_norm": 5.141119956970215, "learning_rate": 3.2731877176321625e-05, "loss": 0.1532, "step": 7210 }, { "epoch": 20.52594171997157, "grad_norm": 7.0433573722839355, "learning_rate": 3.270022158911048e-05, "loss": 0.1483, "step": 7220 }, { "epoch": 20.554371002132196, "grad_norm": 5.835916042327881, "learning_rate": 3.2668566001899334e-05, "loss": 0.1457, "step": 7230 }, { "epoch": 20.582800284292823, "grad_norm": 4.723277568817139, "learning_rate": 3.2636910414688195e-05, "loss": 0.1467, "step": 7240 }, { "epoch": 20.611229566453446, "grad_norm": 9.972947120666504, "learning_rate": 3.260525482747705e-05, "loss": 0.1575, "step": 7250 }, { "epoch": 20.639658848614072, "grad_norm": 9.842533111572266, "learning_rate": 3.25735992402659e-05, "loss": 0.1511, "step": 7260 }, { "epoch": 20.6680881307747, "grad_norm": 5.607415199279785, "learning_rate": 3.254194365305477e-05, "loss": 0.1584, "step": 7270 }, { "epoch": 20.696517412935325, "grad_norm": 6.633474826812744, "learning_rate": 3.2510288065843625e-05, "loss": 0.1529, "step": 7280 }, { "epoch": 20.724946695095948, "grad_norm": 5.701229095458984, "learning_rate": 3.247863247863248e-05, "loss": 0.1608, "step": 7290 }, { "epoch": 20.753375977256574, "grad_norm": 8.30053997039795, "learning_rate": 3.244697689142134e-05, "loss": 0.1563, "step": 7300 }, { "epoch": 20.7818052594172, "grad_norm": 7.414968013763428, "learning_rate": 3.2415321304210194e-05, "loss": 0.1549, "step": 7310 }, { "epoch": 20.810234541577824, "grad_norm": 9.00041675567627, "learning_rate": 3.238366571699905e-05, "loss": 0.1498, "step": 7320 }, { "epoch": 20.83866382373845, "grad_norm": 6.740293502807617, "learning_rate": 3.235201012978791e-05, "loss": 0.1503, "step": 7330 }, { "epoch": 20.867093105899077, "grad_norm": 7.314992427825928, "learning_rate": 3.2320354542576764e-05, "loss": 0.1503, "step": 7340 }, { "epoch": 20.895522388059703, "grad_norm": 9.979508399963379, "learning_rate": 3.2288698955365625e-05, "loss": 0.16, "step": 7350 }, { "epoch": 20.923951670220326, "grad_norm": 6.199654579162598, "learning_rate": 3.2257043368154486e-05, "loss": 0.1615, "step": 7360 }, { "epoch": 20.952380952380953, "grad_norm": 6.053821563720703, "learning_rate": 3.222538778094334e-05, "loss": 0.1582, "step": 7370 }, { "epoch": 20.98081023454158, "grad_norm": 7.191252708435059, "learning_rate": 3.2193732193732194e-05, "loss": 0.1575, "step": 7380 }, { "epoch": 20.997867803837952, "eval_accuracy": 0.7284, "eval_loss": 0.13182254135608673, "eval_runtime": 13.5144, "eval_samples_per_second": 369.977, "eval_steps_per_second": 11.617, "step": 7386 }, { "epoch": 21.009239516702202, "grad_norm": 5.592476844787598, "learning_rate": 3.2162076606521055e-05, "loss": 0.1531, "step": 7390 }, { "epoch": 21.03766879886283, "grad_norm": 9.563066482543945, "learning_rate": 3.213042101930991e-05, "loss": 0.1573, "step": 7400 }, { "epoch": 21.066098081023455, "grad_norm": 5.0909929275512695, "learning_rate": 3.209876543209876e-05, "loss": 0.147, "step": 7410 }, { "epoch": 21.09452736318408, "grad_norm": 6.586167335510254, "learning_rate": 3.2067109844887624e-05, "loss": 0.1595, "step": 7420 }, { "epoch": 21.122956645344704, "grad_norm": 4.968739986419678, "learning_rate": 3.2035454257676485e-05, "loss": 0.1528, "step": 7430 }, { "epoch": 21.15138592750533, "grad_norm": 5.2223005294799805, "learning_rate": 3.200379867046534e-05, "loss": 0.1475, "step": 7440 }, { "epoch": 21.179815209665957, "grad_norm": 8.273797988891602, "learning_rate": 3.1972143083254193e-05, "loss": 0.1531, "step": 7450 }, { "epoch": 21.20824449182658, "grad_norm": 11.816410064697266, "learning_rate": 3.1940487496043054e-05, "loss": 0.1508, "step": 7460 }, { "epoch": 21.236673773987206, "grad_norm": 6.004356861114502, "learning_rate": 3.190883190883191e-05, "loss": 0.1486, "step": 7470 }, { "epoch": 21.265103056147833, "grad_norm": 9.545648574829102, "learning_rate": 3.187717632162077e-05, "loss": 0.1509, "step": 7480 }, { "epoch": 21.29353233830846, "grad_norm": 7.594755172729492, "learning_rate": 3.1845520734409624e-05, "loss": 0.1485, "step": 7490 }, { "epoch": 21.321961620469082, "grad_norm": 6.20038366317749, "learning_rate": 3.181386514719848e-05, "loss": 0.151, "step": 7500 }, { "epoch": 21.35039090262971, "grad_norm": 6.651302337646484, "learning_rate": 3.178220955998734e-05, "loss": 0.1522, "step": 7510 }, { "epoch": 21.378820184790335, "grad_norm": 5.549664497375488, "learning_rate": 3.17505539727762e-05, "loss": 0.1495, "step": 7520 }, { "epoch": 21.407249466950958, "grad_norm": 13.31169605255127, "learning_rate": 3.1718898385565054e-05, "loss": 0.1567, "step": 7530 }, { "epoch": 21.435678749111585, "grad_norm": 7.653259754180908, "learning_rate": 3.168724279835391e-05, "loss": 0.1502, "step": 7540 }, { "epoch": 21.46410803127221, "grad_norm": 4.297057151794434, "learning_rate": 3.165558721114277e-05, "loss": 0.1518, "step": 7550 }, { "epoch": 21.492537313432837, "grad_norm": 8.407471656799316, "learning_rate": 3.162393162393162e-05, "loss": 0.1559, "step": 7560 }, { "epoch": 21.52096659559346, "grad_norm": 13.282374382019043, "learning_rate": 3.159227603672048e-05, "loss": 0.1495, "step": 7570 }, { "epoch": 21.549395877754087, "grad_norm": 6.2126688957214355, "learning_rate": 3.156062044950934e-05, "loss": 0.1558, "step": 7580 }, { "epoch": 21.577825159914713, "grad_norm": 6.630423069000244, "learning_rate": 3.15289648622982e-05, "loss": 0.1485, "step": 7590 }, { "epoch": 21.606254442075336, "grad_norm": 5.692834854125977, "learning_rate": 3.1497309275087054e-05, "loss": 0.1533, "step": 7600 }, { "epoch": 21.634683724235963, "grad_norm": 6.376125335693359, "learning_rate": 3.1465653687875915e-05, "loss": 0.1455, "step": 7610 }, { "epoch": 21.66311300639659, "grad_norm": 12.494769096374512, "learning_rate": 3.143399810066477e-05, "loss": 0.1397, "step": 7620 }, { "epoch": 21.691542288557216, "grad_norm": 12.940725326538086, "learning_rate": 3.140234251345362e-05, "loss": 0.1484, "step": 7630 }, { "epoch": 21.71997157071784, "grad_norm": 8.579645156860352, "learning_rate": 3.1370686926242484e-05, "loss": 0.1509, "step": 7640 }, { "epoch": 21.748400852878465, "grad_norm": 8.484445571899414, "learning_rate": 3.133903133903134e-05, "loss": 0.1599, "step": 7650 }, { "epoch": 21.77683013503909, "grad_norm": 6.595262050628662, "learning_rate": 3.13073757518202e-05, "loss": 0.1499, "step": 7660 }, { "epoch": 21.805259417199714, "grad_norm": 8.021895408630371, "learning_rate": 3.127572016460906e-05, "loss": 0.1614, "step": 7670 }, { "epoch": 21.83368869936034, "grad_norm": 14.3469877243042, "learning_rate": 3.1244064577397914e-05, "loss": 0.1463, "step": 7680 }, { "epoch": 21.862117981520967, "grad_norm": 10.049955368041992, "learning_rate": 3.121240899018677e-05, "loss": 0.1482, "step": 7690 }, { "epoch": 21.890547263681594, "grad_norm": 9.917855262756348, "learning_rate": 3.118075340297563e-05, "loss": 0.1511, "step": 7700 }, { "epoch": 21.918976545842217, "grad_norm": 10.449019432067871, "learning_rate": 3.1149097815764484e-05, "loss": 0.1467, "step": 7710 }, { "epoch": 21.947405828002843, "grad_norm": 7.205163478851318, "learning_rate": 3.111744222855334e-05, "loss": 0.147, "step": 7720 }, { "epoch": 21.97583511016347, "grad_norm": 13.378124237060547, "learning_rate": 3.10857866413422e-05, "loss": 0.141, "step": 7730 }, { "epoch": 21.99857853589197, "eval_accuracy": 0.7438, "eval_loss": 0.1228351816534996, "eval_runtime": 13.5207, "eval_samples_per_second": 369.802, "eval_steps_per_second": 11.612, "step": 7738 }, { "epoch": 22.004264392324092, "grad_norm": 8.777661323547363, "learning_rate": 3.105413105413106e-05, "loss": 0.1506, "step": 7740 }, { "epoch": 22.03269367448472, "grad_norm": 4.329195022583008, "learning_rate": 3.1022475466919914e-05, "loss": 0.1527, "step": 7750 }, { "epoch": 22.061122956645345, "grad_norm": 6.228816509246826, "learning_rate": 3.099081987970877e-05, "loss": 0.1516, "step": 7760 }, { "epoch": 22.08955223880597, "grad_norm": 5.697615623474121, "learning_rate": 3.095916429249763e-05, "loss": 0.154, "step": 7770 }, { "epoch": 22.117981520966595, "grad_norm": 6.576175212860107, "learning_rate": 3.092750870528648e-05, "loss": 0.1476, "step": 7780 }, { "epoch": 22.14641080312722, "grad_norm": 8.503117561340332, "learning_rate": 3.0895853118075344e-05, "loss": 0.1599, "step": 7790 }, { "epoch": 22.174840085287848, "grad_norm": 8.577230453491211, "learning_rate": 3.08641975308642e-05, "loss": 0.1506, "step": 7800 }, { "epoch": 22.20326936744847, "grad_norm": 11.13038444519043, "learning_rate": 3.083254194365305e-05, "loss": 0.147, "step": 7810 }, { "epoch": 22.231698649609097, "grad_norm": 7.147623538970947, "learning_rate": 3.0800886356441913e-05, "loss": 0.1485, "step": 7820 }, { "epoch": 22.260127931769723, "grad_norm": 6.999971866607666, "learning_rate": 3.0769230769230774e-05, "loss": 0.1488, "step": 7830 }, { "epoch": 22.28855721393035, "grad_norm": 6.81623649597168, "learning_rate": 3.073757518201963e-05, "loss": 0.1456, "step": 7840 }, { "epoch": 22.316986496090973, "grad_norm": 5.738211631774902, "learning_rate": 3.070591959480848e-05, "loss": 0.1495, "step": 7850 }, { "epoch": 22.3454157782516, "grad_norm": 7.0419840812683105, "learning_rate": 3.0674264007597344e-05, "loss": 0.1473, "step": 7860 }, { "epoch": 22.373845060412226, "grad_norm": 5.929579257965088, "learning_rate": 3.06426084203862e-05, "loss": 0.1508, "step": 7870 }, { "epoch": 22.40227434257285, "grad_norm": 7.021225452423096, "learning_rate": 3.061095283317505e-05, "loss": 0.148, "step": 7880 }, { "epoch": 22.430703624733475, "grad_norm": 4.966420650482178, "learning_rate": 3.057929724596391e-05, "loss": 0.1483, "step": 7890 }, { "epoch": 22.4591329068941, "grad_norm": 10.524633407592773, "learning_rate": 3.0547641658752774e-05, "loss": 0.1498, "step": 7900 }, { "epoch": 22.487562189054728, "grad_norm": 6.478786945343018, "learning_rate": 3.051598607154163e-05, "loss": 0.1512, "step": 7910 }, { "epoch": 22.51599147121535, "grad_norm": 7.513288974761963, "learning_rate": 3.0484330484330486e-05, "loss": 0.1375, "step": 7920 }, { "epoch": 22.544420753375977, "grad_norm": 7.3536200523376465, "learning_rate": 3.0452674897119343e-05, "loss": 0.1421, "step": 7930 }, { "epoch": 22.572850035536604, "grad_norm": 8.709885597229004, "learning_rate": 3.04210193099082e-05, "loss": 0.1473, "step": 7940 }, { "epoch": 22.601279317697227, "grad_norm": 8.036056518554688, "learning_rate": 3.0389363722697055e-05, "loss": 0.1535, "step": 7950 }, { "epoch": 22.629708599857853, "grad_norm": 5.516635894775391, "learning_rate": 3.0357708135485913e-05, "loss": 0.1451, "step": 7960 }, { "epoch": 22.65813788201848, "grad_norm": 9.147539138793945, "learning_rate": 3.032605254827477e-05, "loss": 0.1523, "step": 7970 }, { "epoch": 22.686567164179106, "grad_norm": 6.451848030090332, "learning_rate": 3.029439696106363e-05, "loss": 0.1582, "step": 7980 }, { "epoch": 22.71499644633973, "grad_norm": 7.640406608581543, "learning_rate": 3.026274137385249e-05, "loss": 0.1498, "step": 7990 }, { "epoch": 22.743425728500355, "grad_norm": 14.785600662231445, "learning_rate": 3.0231085786641343e-05, "loss": 0.16, "step": 8000 }, { "epoch": 22.771855010660982, "grad_norm": 5.789612770080566, "learning_rate": 3.01994301994302e-05, "loss": 0.1472, "step": 8010 }, { "epoch": 22.800284292821605, "grad_norm": 6.370020866394043, "learning_rate": 3.0167774612219058e-05, "loss": 0.1464, "step": 8020 }, { "epoch": 22.82871357498223, "grad_norm": 7.668898582458496, "learning_rate": 3.0136119025007916e-05, "loss": 0.1478, "step": 8030 }, { "epoch": 22.857142857142858, "grad_norm": 5.980592727661133, "learning_rate": 3.010446343779677e-05, "loss": 0.1447, "step": 8040 }, { "epoch": 22.885572139303484, "grad_norm": 12.745854377746582, "learning_rate": 3.0072807850585634e-05, "loss": 0.1546, "step": 8050 }, { "epoch": 22.914001421464107, "grad_norm": 7.870355606079102, "learning_rate": 3.0041152263374488e-05, "loss": 0.1483, "step": 8060 }, { "epoch": 22.942430703624733, "grad_norm": 5.292947292327881, "learning_rate": 3.0009496676163346e-05, "loss": 0.1377, "step": 8070 }, { "epoch": 22.97085998578536, "grad_norm": 6.468026161193848, "learning_rate": 2.9977841088952203e-05, "loss": 0.145, "step": 8080 }, { "epoch": 22.999289267945983, "grad_norm": 8.062458038330078, "learning_rate": 2.9946185501741058e-05, "loss": 0.151, "step": 8090 }, { "epoch": 22.999289267945983, "eval_accuracy": 0.7392, "eval_loss": 0.1260121613740921, "eval_runtime": 13.569, "eval_samples_per_second": 368.486, "eval_steps_per_second": 11.57, "step": 8090 }, { "epoch": 23.02771855010661, "grad_norm": 8.114884376525879, "learning_rate": 2.9914529914529915e-05, "loss": 0.1442, "step": 8100 }, { "epoch": 23.056147832267236, "grad_norm": 7.7791008949279785, "learning_rate": 2.9882874327318773e-05, "loss": 0.1461, "step": 8110 }, { "epoch": 23.084577114427862, "grad_norm": 6.323302268981934, "learning_rate": 2.9851218740107627e-05, "loss": 0.1485, "step": 8120 }, { "epoch": 23.113006396588485, "grad_norm": 5.80276346206665, "learning_rate": 2.981956315289649e-05, "loss": 0.1546, "step": 8130 }, { "epoch": 23.14143567874911, "grad_norm": 7.424940586090088, "learning_rate": 2.9787907565685345e-05, "loss": 0.144, "step": 8140 }, { "epoch": 23.169864960909738, "grad_norm": 7.394670486450195, "learning_rate": 2.9756251978474203e-05, "loss": 0.1464, "step": 8150 }, { "epoch": 23.19829424307036, "grad_norm": 8.30813980102539, "learning_rate": 2.972459639126306e-05, "loss": 0.1448, "step": 8160 }, { "epoch": 23.226723525230987, "grad_norm": 5.570594787597656, "learning_rate": 2.9692940804051915e-05, "loss": 0.1497, "step": 8170 }, { "epoch": 23.255152807391614, "grad_norm": 6.296761512756348, "learning_rate": 2.9661285216840772e-05, "loss": 0.1482, "step": 8180 }, { "epoch": 23.28358208955224, "grad_norm": 11.862334251403809, "learning_rate": 2.962962962962963e-05, "loss": 0.1513, "step": 8190 }, { "epoch": 23.312011371712863, "grad_norm": 12.025609016418457, "learning_rate": 2.9597974042418487e-05, "loss": 0.1524, "step": 8200 }, { "epoch": 23.34044065387349, "grad_norm": 5.601258754730225, "learning_rate": 2.956631845520735e-05, "loss": 0.1509, "step": 8210 }, { "epoch": 23.368869936034116, "grad_norm": 7.173379421234131, "learning_rate": 2.9534662867996206e-05, "loss": 0.1347, "step": 8220 }, { "epoch": 23.39729921819474, "grad_norm": 8.370221138000488, "learning_rate": 2.950300728078506e-05, "loss": 0.1526, "step": 8230 }, { "epoch": 23.425728500355365, "grad_norm": 5.201686382293701, "learning_rate": 2.9471351693573918e-05, "loss": 0.1428, "step": 8240 }, { "epoch": 23.454157782515992, "grad_norm": 8.784417152404785, "learning_rate": 2.9439696106362775e-05, "loss": 0.148, "step": 8250 }, { "epoch": 23.48258706467662, "grad_norm": 5.006414890289307, "learning_rate": 2.940804051915163e-05, "loss": 0.1462, "step": 8260 }, { "epoch": 23.51101634683724, "grad_norm": 5.660472869873047, "learning_rate": 2.9376384931940487e-05, "loss": 0.1447, "step": 8270 }, { "epoch": 23.539445628997868, "grad_norm": 5.7646331787109375, "learning_rate": 2.9344729344729345e-05, "loss": 0.1471, "step": 8280 }, { "epoch": 23.567874911158494, "grad_norm": 8.831572532653809, "learning_rate": 2.9313073757518206e-05, "loss": 0.1434, "step": 8290 }, { "epoch": 23.596304193319117, "grad_norm": 5.75462532043457, "learning_rate": 2.9281418170307063e-05, "loss": 0.154, "step": 8300 }, { "epoch": 23.624733475479744, "grad_norm": 5.988626480102539, "learning_rate": 2.9249762583095917e-05, "loss": 0.1405, "step": 8310 }, { "epoch": 23.65316275764037, "grad_norm": 7.364073276519775, "learning_rate": 2.9218106995884775e-05, "loss": 0.1498, "step": 8320 }, { "epoch": 23.681592039800996, "grad_norm": 5.987818717956543, "learning_rate": 2.9186451408673633e-05, "loss": 0.1496, "step": 8330 }, { "epoch": 23.71002132196162, "grad_norm": 6.780797004699707, "learning_rate": 2.9154795821462487e-05, "loss": 0.1362, "step": 8340 }, { "epoch": 23.738450604122246, "grad_norm": 6.569079875946045, "learning_rate": 2.9123140234251344e-05, "loss": 0.1437, "step": 8350 }, { "epoch": 23.766879886282872, "grad_norm": 5.085376262664795, "learning_rate": 2.9091484647040202e-05, "loss": 0.1535, "step": 8360 }, { "epoch": 23.795309168443495, "grad_norm": 8.613494873046875, "learning_rate": 2.9059829059829063e-05, "loss": 0.1474, "step": 8370 }, { "epoch": 23.82373845060412, "grad_norm": 12.342422485351562, "learning_rate": 2.902817347261792e-05, "loss": 0.1513, "step": 8380 }, { "epoch": 23.852167732764748, "grad_norm": 10.415434837341309, "learning_rate": 2.8996517885406778e-05, "loss": 0.1445, "step": 8390 }, { "epoch": 23.880597014925375, "grad_norm": 7.935213565826416, "learning_rate": 2.8964862298195632e-05, "loss": 0.147, "step": 8400 }, { "epoch": 23.909026297085997, "grad_norm": 6.455832004547119, "learning_rate": 2.893320671098449e-05, "loss": 0.1473, "step": 8410 }, { "epoch": 23.937455579246624, "grad_norm": 8.252398490905762, "learning_rate": 2.8901551123773347e-05, "loss": 0.1428, "step": 8420 }, { "epoch": 23.96588486140725, "grad_norm": 9.187219619750977, "learning_rate": 2.88698955365622e-05, "loss": 0.1501, "step": 8430 }, { "epoch": 23.994314143567873, "grad_norm": 6.297224998474121, "learning_rate": 2.883823994935106e-05, "loss": 0.1403, "step": 8440 }, { "epoch": 24.0, "eval_accuracy": 0.7558, "eval_loss": 0.11781904101371765, "eval_runtime": 13.5809, "eval_samples_per_second": 368.163, "eval_steps_per_second": 11.56, "step": 8442 }, { "epoch": 24.0227434257285, "grad_norm": 5.55855131149292, "learning_rate": 2.880658436213992e-05, "loss": 0.139, "step": 8450 }, { "epoch": 24.051172707889126, "grad_norm": 9.439704895019531, "learning_rate": 2.8774928774928778e-05, "loss": 0.1475, "step": 8460 }, { "epoch": 24.079601990049753, "grad_norm": 6.154767036437988, "learning_rate": 2.8743273187717635e-05, "loss": 0.1443, "step": 8470 }, { "epoch": 24.108031272210376, "grad_norm": 6.717215061187744, "learning_rate": 2.871161760050649e-05, "loss": 0.1471, "step": 8480 }, { "epoch": 24.136460554371002, "grad_norm": 6.775696754455566, "learning_rate": 2.8679962013295347e-05, "loss": 0.1315, "step": 8490 }, { "epoch": 24.16488983653163, "grad_norm": 5.769715785980225, "learning_rate": 2.8648306426084204e-05, "loss": 0.1442, "step": 8500 }, { "epoch": 24.19331911869225, "grad_norm": 9.103348731994629, "learning_rate": 2.8616650838873062e-05, "loss": 0.1405, "step": 8510 }, { "epoch": 24.221748400852878, "grad_norm": 8.164674758911133, "learning_rate": 2.8584995251661923e-05, "loss": 0.1391, "step": 8520 }, { "epoch": 24.250177683013504, "grad_norm": 7.2729716300964355, "learning_rate": 2.855333966445078e-05, "loss": 0.1441, "step": 8530 }, { "epoch": 24.27860696517413, "grad_norm": 9.869029998779297, "learning_rate": 2.8521684077239635e-05, "loss": 0.1393, "step": 8540 }, { "epoch": 24.307036247334754, "grad_norm": 11.370794296264648, "learning_rate": 2.8490028490028492e-05, "loss": 0.1469, "step": 8550 }, { "epoch": 24.33546552949538, "grad_norm": 5.766239643096924, "learning_rate": 2.845837290281735e-05, "loss": 0.1485, "step": 8560 }, { "epoch": 24.363894811656007, "grad_norm": 7.63856315612793, "learning_rate": 2.8426717315606204e-05, "loss": 0.149, "step": 8570 }, { "epoch": 24.39232409381663, "grad_norm": 6.744407653808594, "learning_rate": 2.839506172839506e-05, "loss": 0.1391, "step": 8580 }, { "epoch": 24.420753375977256, "grad_norm": 8.74738597869873, "learning_rate": 2.836340614118392e-05, "loss": 0.1452, "step": 8590 }, { "epoch": 24.449182658137882, "grad_norm": 10.62308120727539, "learning_rate": 2.833175055397278e-05, "loss": 0.1375, "step": 8600 }, { "epoch": 24.47761194029851, "grad_norm": 8.327263832092285, "learning_rate": 2.8300094966761638e-05, "loss": 0.1421, "step": 8610 }, { "epoch": 24.50604122245913, "grad_norm": 7.7247090339660645, "learning_rate": 2.8268439379550492e-05, "loss": 0.1445, "step": 8620 }, { "epoch": 24.534470504619758, "grad_norm": 8.24417781829834, "learning_rate": 2.823678379233935e-05, "loss": 0.1441, "step": 8630 }, { "epoch": 24.562899786780385, "grad_norm": 5.919760704040527, "learning_rate": 2.8205128205128207e-05, "loss": 0.1472, "step": 8640 }, { "epoch": 24.591329068941008, "grad_norm": 9.408928871154785, "learning_rate": 2.817347261791706e-05, "loss": 0.1431, "step": 8650 }, { "epoch": 24.619758351101634, "grad_norm": 6.479671478271484, "learning_rate": 2.814181703070592e-05, "loss": 0.1429, "step": 8660 }, { "epoch": 24.64818763326226, "grad_norm": 6.375315189361572, "learning_rate": 2.8110161443494776e-05, "loss": 0.1369, "step": 8670 }, { "epoch": 24.676616915422887, "grad_norm": 6.061427593231201, "learning_rate": 2.8078505856283637e-05, "loss": 0.1481, "step": 8680 }, { "epoch": 24.70504619758351, "grad_norm": 5.765936374664307, "learning_rate": 2.8046850269072495e-05, "loss": 0.1406, "step": 8690 }, { "epoch": 24.733475479744136, "grad_norm": 6.458628177642822, "learning_rate": 2.8015194681861352e-05, "loss": 0.1455, "step": 8700 }, { "epoch": 24.761904761904763, "grad_norm": 15.039815902709961, "learning_rate": 2.7983539094650207e-05, "loss": 0.1405, "step": 8710 }, { "epoch": 24.790334044065386, "grad_norm": 10.922232627868652, "learning_rate": 2.7951883507439064e-05, "loss": 0.1421, "step": 8720 }, { "epoch": 24.818763326226012, "grad_norm": 9.89081859588623, "learning_rate": 2.7920227920227922e-05, "loss": 0.1427, "step": 8730 }, { "epoch": 24.84719260838664, "grad_norm": 8.254655838012695, "learning_rate": 2.7888572333016776e-05, "loss": 0.1411, "step": 8740 }, { "epoch": 24.875621890547265, "grad_norm": 8.658320426940918, "learning_rate": 2.7856916745805633e-05, "loss": 0.1487, "step": 8750 }, { "epoch": 24.904051172707888, "grad_norm": 7.422158718109131, "learning_rate": 2.7825261158594494e-05, "loss": 0.1414, "step": 8760 }, { "epoch": 24.932480454868514, "grad_norm": 8.454310417175293, "learning_rate": 2.7793605571383352e-05, "loss": 0.145, "step": 8770 }, { "epoch": 24.96090973702914, "grad_norm": 5.144379138946533, "learning_rate": 2.776194998417221e-05, "loss": 0.1452, "step": 8780 }, { "epoch": 24.989339019189764, "grad_norm": 8.716599464416504, "learning_rate": 2.7730294396961064e-05, "loss": 0.1434, "step": 8790 }, { "epoch": 24.997867803837952, "eval_accuracy": 0.7534, "eval_loss": 0.11848505586385727, "eval_runtime": 13.588, "eval_samples_per_second": 367.971, "eval_steps_per_second": 11.554, "step": 8793 }, { "epoch": 25.01776830135039, "grad_norm": 7.571016788482666, "learning_rate": 2.769863880974992e-05, "loss": 0.1431, "step": 8800 }, { "epoch": 25.046197583511017, "grad_norm": 5.296953201293945, "learning_rate": 2.766698322253878e-05, "loss": 0.1369, "step": 8810 }, { "epoch": 25.074626865671643, "grad_norm": 10.426069259643555, "learning_rate": 2.7635327635327633e-05, "loss": 0.1448, "step": 8820 }, { "epoch": 25.103056147832266, "grad_norm": 8.120406150817871, "learning_rate": 2.760367204811649e-05, "loss": 0.1408, "step": 8830 }, { "epoch": 25.131485429992892, "grad_norm": 11.165103912353516, "learning_rate": 2.757201646090535e-05, "loss": 0.1407, "step": 8840 }, { "epoch": 25.15991471215352, "grad_norm": 6.967249870300293, "learning_rate": 2.754036087369421e-05, "loss": 0.1374, "step": 8850 }, { "epoch": 25.188343994314142, "grad_norm": 7.554841995239258, "learning_rate": 2.7508705286483067e-05, "loss": 0.141, "step": 8860 }, { "epoch": 25.21677327647477, "grad_norm": 11.249825477600098, "learning_rate": 2.7477049699271924e-05, "loss": 0.1389, "step": 8870 }, { "epoch": 25.245202558635395, "grad_norm": 16.006229400634766, "learning_rate": 2.744539411206078e-05, "loss": 0.1448, "step": 8880 }, { "epoch": 25.27363184079602, "grad_norm": 6.915517330169678, "learning_rate": 2.7413738524849636e-05, "loss": 0.1483, "step": 8890 }, { "epoch": 25.302061122956644, "grad_norm": 8.875819206237793, "learning_rate": 2.7382082937638494e-05, "loss": 0.1471, "step": 8900 }, { "epoch": 25.33049040511727, "grad_norm": 9.75496768951416, "learning_rate": 2.7350427350427355e-05, "loss": 0.1404, "step": 8910 }, { "epoch": 25.358919687277897, "grad_norm": 9.497008323669434, "learning_rate": 2.7318771763216212e-05, "loss": 0.1512, "step": 8920 }, { "epoch": 25.38734896943852, "grad_norm": 9.545600891113281, "learning_rate": 2.7287116176005066e-05, "loss": 0.1444, "step": 8930 }, { "epoch": 25.415778251599146, "grad_norm": 6.799803256988525, "learning_rate": 2.7255460588793924e-05, "loss": 0.1357, "step": 8940 }, { "epoch": 25.444207533759773, "grad_norm": 5.753367900848389, "learning_rate": 2.722380500158278e-05, "loss": 0.1421, "step": 8950 }, { "epoch": 25.4726368159204, "grad_norm": 8.674606323242188, "learning_rate": 2.7192149414371636e-05, "loss": 0.1407, "step": 8960 }, { "epoch": 25.501066098081022, "grad_norm": 7.093050003051758, "learning_rate": 2.7160493827160493e-05, "loss": 0.1331, "step": 8970 }, { "epoch": 25.52949538024165, "grad_norm": 9.291196823120117, "learning_rate": 2.712883823994935e-05, "loss": 0.1347, "step": 8980 }, { "epoch": 25.557924662402275, "grad_norm": 6.6785149574279785, "learning_rate": 2.7097182652738212e-05, "loss": 0.1362, "step": 8990 }, { "epoch": 25.5863539445629, "grad_norm": 6.92014217376709, "learning_rate": 2.706552706552707e-05, "loss": 0.1464, "step": 9000 }, { "epoch": 25.614783226723524, "grad_norm": 5.3187785148620605, "learning_rate": 2.7033871478315927e-05, "loss": 0.1419, "step": 9010 }, { "epoch": 25.64321250888415, "grad_norm": 5.818498134613037, "learning_rate": 2.700221589110478e-05, "loss": 0.14, "step": 9020 }, { "epoch": 25.671641791044777, "grad_norm": 5.697383880615234, "learning_rate": 2.697056030389364e-05, "loss": 0.1447, "step": 9030 }, { "epoch": 25.7000710732054, "grad_norm": 6.586178779602051, "learning_rate": 2.6938904716682496e-05, "loss": 0.1403, "step": 9040 }, { "epoch": 25.728500355366027, "grad_norm": 9.177045822143555, "learning_rate": 2.690724912947135e-05, "loss": 0.1423, "step": 9050 }, { "epoch": 25.756929637526653, "grad_norm": 8.418880462646484, "learning_rate": 2.6875593542260208e-05, "loss": 0.1427, "step": 9060 }, { "epoch": 25.785358919687276, "grad_norm": 8.560445785522461, "learning_rate": 2.684393795504907e-05, "loss": 0.1396, "step": 9070 }, { "epoch": 25.813788201847903, "grad_norm": 7.396243095397949, "learning_rate": 2.6812282367837927e-05, "loss": 0.141, "step": 9080 }, { "epoch": 25.84221748400853, "grad_norm": 7.770750045776367, "learning_rate": 2.6780626780626784e-05, "loss": 0.1392, "step": 9090 }, { "epoch": 25.870646766169155, "grad_norm": 4.9813008308410645, "learning_rate": 2.6748971193415638e-05, "loss": 0.1341, "step": 9100 }, { "epoch": 25.89907604832978, "grad_norm": 11.929550170898438, "learning_rate": 2.6717315606204496e-05, "loss": 0.1368, "step": 9110 }, { "epoch": 25.927505330490405, "grad_norm": 7.101933002471924, "learning_rate": 2.6685660018993353e-05, "loss": 0.1392, "step": 9120 }, { "epoch": 25.95593461265103, "grad_norm": 7.931550025939941, "learning_rate": 2.6654004431782208e-05, "loss": 0.1398, "step": 9130 }, { "epoch": 25.984363894811658, "grad_norm": 7.694642543792725, "learning_rate": 2.6622348844571065e-05, "loss": 0.1465, "step": 9140 }, { "epoch": 25.99857853589197, "eval_accuracy": 0.759, "eval_loss": 0.11621713638305664, "eval_runtime": 13.5439, "eval_samples_per_second": 369.17, "eval_steps_per_second": 11.592, "step": 9145 }, { "epoch": 26.01279317697228, "grad_norm": 5.021768569946289, "learning_rate": 2.6590693257359926e-05, "loss": 0.1343, "step": 9150 }, { "epoch": 26.041222459132907, "grad_norm": 12.402433395385742, "learning_rate": 2.6559037670148784e-05, "loss": 0.1378, "step": 9160 }, { "epoch": 26.069651741293534, "grad_norm": 7.353051662445068, "learning_rate": 2.652738208293764e-05, "loss": 0.148, "step": 9170 }, { "epoch": 26.098081023454156, "grad_norm": 7.127249240875244, "learning_rate": 2.64957264957265e-05, "loss": 0.1376, "step": 9180 }, { "epoch": 26.126510305614783, "grad_norm": 12.120996475219727, "learning_rate": 2.6464070908515353e-05, "loss": 0.1416, "step": 9190 }, { "epoch": 26.15493958777541, "grad_norm": 5.678403377532959, "learning_rate": 2.643241532130421e-05, "loss": 0.1346, "step": 9200 }, { "epoch": 26.183368869936036, "grad_norm": 5.865853786468506, "learning_rate": 2.6400759734093068e-05, "loss": 0.1424, "step": 9210 }, { "epoch": 26.21179815209666, "grad_norm": 7.526356220245361, "learning_rate": 2.6369104146881922e-05, "loss": 0.1384, "step": 9220 }, { "epoch": 26.240227434257285, "grad_norm": 5.5171799659729, "learning_rate": 2.6337448559670787e-05, "loss": 0.1416, "step": 9230 }, { "epoch": 26.26865671641791, "grad_norm": 8.74276065826416, "learning_rate": 2.630579297245964e-05, "loss": 0.1431, "step": 9240 }, { "epoch": 26.297085998578535, "grad_norm": 6.9587578773498535, "learning_rate": 2.62741373852485e-05, "loss": 0.1421, "step": 9250 }, { "epoch": 26.32551528073916, "grad_norm": 4.84282922744751, "learning_rate": 2.6242481798037356e-05, "loss": 0.1346, "step": 9260 }, { "epoch": 26.353944562899787, "grad_norm": 12.135048866271973, "learning_rate": 2.621082621082621e-05, "loss": 0.143, "step": 9270 }, { "epoch": 26.382373845060414, "grad_norm": 7.2798895835876465, "learning_rate": 2.6179170623615068e-05, "loss": 0.1449, "step": 9280 }, { "epoch": 26.410803127221037, "grad_norm": 6.308412075042725, "learning_rate": 2.6147515036403925e-05, "loss": 0.1387, "step": 9290 }, { "epoch": 26.439232409381663, "grad_norm": 8.798842430114746, "learning_rate": 2.611585944919278e-05, "loss": 0.1421, "step": 9300 }, { "epoch": 26.46766169154229, "grad_norm": 10.645768165588379, "learning_rate": 2.6084203861981644e-05, "loss": 0.1345, "step": 9310 }, { "epoch": 26.496090973702913, "grad_norm": 12.621675491333008, "learning_rate": 2.6052548274770498e-05, "loss": 0.1411, "step": 9320 }, { "epoch": 26.52452025586354, "grad_norm": 7.827025890350342, "learning_rate": 2.6020892687559356e-05, "loss": 0.1367, "step": 9330 }, { "epoch": 26.552949538024166, "grad_norm": 5.328700065612793, "learning_rate": 2.5989237100348213e-05, "loss": 0.1383, "step": 9340 }, { "epoch": 26.581378820184792, "grad_norm": 6.2277984619140625, "learning_rate": 2.595758151313707e-05, "loss": 0.1452, "step": 9350 }, { "epoch": 26.609808102345415, "grad_norm": 9.740056037902832, "learning_rate": 2.5925925925925925e-05, "loss": 0.1324, "step": 9360 }, { "epoch": 26.63823738450604, "grad_norm": 6.533791542053223, "learning_rate": 2.5894270338714782e-05, "loss": 0.1429, "step": 9370 }, { "epoch": 26.666666666666668, "grad_norm": 6.585256576538086, "learning_rate": 2.5862614751503643e-05, "loss": 0.13, "step": 9380 }, { "epoch": 26.69509594882729, "grad_norm": 7.734272003173828, "learning_rate": 2.58309591642925e-05, "loss": 0.1467, "step": 9390 }, { "epoch": 26.723525230987917, "grad_norm": 6.145429611206055, "learning_rate": 2.579930357708136e-05, "loss": 0.1321, "step": 9400 }, { "epoch": 26.751954513148544, "grad_norm": 8.636664390563965, "learning_rate": 2.5767647989870213e-05, "loss": 0.1381, "step": 9410 }, { "epoch": 26.78038379530917, "grad_norm": 3.7743451595306396, "learning_rate": 2.573599240265907e-05, "loss": 0.1384, "step": 9420 }, { "epoch": 26.808813077469793, "grad_norm": 7.461757183074951, "learning_rate": 2.5704336815447928e-05, "loss": 0.1345, "step": 9430 }, { "epoch": 26.83724235963042, "grad_norm": 7.429134368896484, "learning_rate": 2.5672681228236782e-05, "loss": 0.1323, "step": 9440 }, { "epoch": 26.865671641791046, "grad_norm": 5.44699764251709, "learning_rate": 2.564102564102564e-05, "loss": 0.1374, "step": 9450 }, { "epoch": 26.89410092395167, "grad_norm": 6.562127590179443, "learning_rate": 2.56093700538145e-05, "loss": 0.1384, "step": 9460 }, { "epoch": 26.922530206112295, "grad_norm": 7.994168281555176, "learning_rate": 2.5577714466603358e-05, "loss": 0.1497, "step": 9470 }, { "epoch": 26.95095948827292, "grad_norm": 7.851631164550781, "learning_rate": 2.5546058879392216e-05, "loss": 0.1449, "step": 9480 }, { "epoch": 26.979388770433548, "grad_norm": 12.57490348815918, "learning_rate": 2.551440329218107e-05, "loss": 0.1362, "step": 9490 }, { "epoch": 26.999289267945983, "eval_accuracy": 0.769, "eval_loss": 0.11206282675266266, "eval_runtime": 13.553, "eval_samples_per_second": 368.922, "eval_steps_per_second": 11.584, "step": 9497 }, { "epoch": 27.00781805259417, "grad_norm": 5.598655700683594, "learning_rate": 2.5482747704969927e-05, "loss": 0.1417, "step": 9500 }, { "epoch": 27.036247334754798, "grad_norm": 6.855911731719971, "learning_rate": 2.5451092117758785e-05, "loss": 0.1404, "step": 9510 }, { "epoch": 27.064676616915424, "grad_norm": 8.818585395812988, "learning_rate": 2.5419436530547643e-05, "loss": 0.1409, "step": 9520 }, { "epoch": 27.093105899076047, "grad_norm": 7.7069292068481445, "learning_rate": 2.5387780943336497e-05, "loss": 0.1361, "step": 9530 }, { "epoch": 27.121535181236673, "grad_norm": 9.583283424377441, "learning_rate": 2.535612535612536e-05, "loss": 0.1376, "step": 9540 }, { "epoch": 27.1499644633973, "grad_norm": 7.265142917633057, "learning_rate": 2.5324469768914215e-05, "loss": 0.1362, "step": 9550 }, { "epoch": 27.178393745557926, "grad_norm": 5.608309745788574, "learning_rate": 2.5292814181703073e-05, "loss": 0.1321, "step": 9560 }, { "epoch": 27.20682302771855, "grad_norm": 10.654949188232422, "learning_rate": 2.526115859449193e-05, "loss": 0.1439, "step": 9570 }, { "epoch": 27.235252309879176, "grad_norm": 5.948031902313232, "learning_rate": 2.5229503007280785e-05, "loss": 0.1378, "step": 9580 }, { "epoch": 27.263681592039802, "grad_norm": 13.630681037902832, "learning_rate": 2.5197847420069642e-05, "loss": 0.1396, "step": 9590 }, { "epoch": 27.292110874200425, "grad_norm": 12.318713188171387, "learning_rate": 2.51661918328585e-05, "loss": 0.1316, "step": 9600 }, { "epoch": 27.32054015636105, "grad_norm": 9.117362022399902, "learning_rate": 2.5134536245647354e-05, "loss": 0.1333, "step": 9610 }, { "epoch": 27.348969438521678, "grad_norm": 9.406400680541992, "learning_rate": 2.510288065843622e-05, "loss": 0.1399, "step": 9620 }, { "epoch": 27.377398720682304, "grad_norm": 6.60117244720459, "learning_rate": 2.5071225071225073e-05, "loss": 0.141, "step": 9630 }, { "epoch": 27.405828002842927, "grad_norm": 9.663880348205566, "learning_rate": 2.503956948401393e-05, "loss": 0.1462, "step": 9640 }, { "epoch": 27.434257285003554, "grad_norm": 5.250779628753662, "learning_rate": 2.5007913896802788e-05, "loss": 0.1356, "step": 9650 }, { "epoch": 27.46268656716418, "grad_norm": 8.462498664855957, "learning_rate": 2.4976258309591645e-05, "loss": 0.1287, "step": 9660 }, { "epoch": 27.491115849324803, "grad_norm": 7.328686237335205, "learning_rate": 2.49446027223805e-05, "loss": 0.1543, "step": 9670 }, { "epoch": 27.51954513148543, "grad_norm": 5.981494903564453, "learning_rate": 2.491294713516936e-05, "loss": 0.1388, "step": 9680 }, { "epoch": 27.547974413646056, "grad_norm": 6.373142242431641, "learning_rate": 2.4881291547958215e-05, "loss": 0.1384, "step": 9690 }, { "epoch": 27.576403695806682, "grad_norm": 7.539931774139404, "learning_rate": 2.4849635960747072e-05, "loss": 0.1407, "step": 9700 }, { "epoch": 27.604832977967305, "grad_norm": 7.0806732177734375, "learning_rate": 2.481798037353593e-05, "loss": 0.1383, "step": 9710 }, { "epoch": 27.633262260127932, "grad_norm": 7.427414894104004, "learning_rate": 2.4786324786324787e-05, "loss": 0.1345, "step": 9720 }, { "epoch": 27.66169154228856, "grad_norm": 9.98422622680664, "learning_rate": 2.4754669199113645e-05, "loss": 0.1328, "step": 9730 }, { "epoch": 27.69012082444918, "grad_norm": 9.237375259399414, "learning_rate": 2.4723013611902502e-05, "loss": 0.1313, "step": 9740 }, { "epoch": 27.718550106609808, "grad_norm": 7.937037944793701, "learning_rate": 2.4691358024691357e-05, "loss": 0.1433, "step": 9750 }, { "epoch": 27.746979388770434, "grad_norm": 8.625982284545898, "learning_rate": 2.4659702437480218e-05, "loss": 0.1333, "step": 9760 }, { "epoch": 27.77540867093106, "grad_norm": 6.477577209472656, "learning_rate": 2.4628046850269075e-05, "loss": 0.1357, "step": 9770 }, { "epoch": 27.803837953091683, "grad_norm": 6.752776145935059, "learning_rate": 2.459639126305793e-05, "loss": 0.1273, "step": 9780 }, { "epoch": 27.83226723525231, "grad_norm": 5.306801795959473, "learning_rate": 2.456473567584679e-05, "loss": 0.1378, "step": 9790 }, { "epoch": 27.860696517412936, "grad_norm": 6.6520538330078125, "learning_rate": 2.4533080088635644e-05, "loss": 0.142, "step": 9800 }, { "epoch": 27.88912579957356, "grad_norm": 10.990520477294922, "learning_rate": 2.4501424501424502e-05, "loss": 0.1372, "step": 9810 }, { "epoch": 27.917555081734186, "grad_norm": 11.567150115966797, "learning_rate": 2.446976891421336e-05, "loss": 0.1377, "step": 9820 }, { "epoch": 27.945984363894812, "grad_norm": 5.136601448059082, "learning_rate": 2.4438113327002217e-05, "loss": 0.1377, "step": 9830 }, { "epoch": 27.97441364605544, "grad_norm": 7.663478851318359, "learning_rate": 2.4406457739791075e-05, "loss": 0.138, "step": 9840 }, { "epoch": 28.0, "eval_accuracy": 0.769, "eval_loss": 0.10992265492677689, "eval_runtime": 13.5172, "eval_samples_per_second": 369.9, "eval_steps_per_second": 11.615, "step": 9849 }, { "epoch": 28.00284292821606, "grad_norm": 7.304515361785889, "learning_rate": 2.4374802152579932e-05, "loss": 0.1392, "step": 9850 }, { "epoch": 28.031272210376688, "grad_norm": 6.832584381103516, "learning_rate": 2.4343146565368786e-05, "loss": 0.1409, "step": 9860 }, { "epoch": 28.059701492537314, "grad_norm": 5.521937370300293, "learning_rate": 2.4311490978157647e-05, "loss": 0.1365, "step": 9870 }, { "epoch": 28.088130774697937, "grad_norm": 6.8485612869262695, "learning_rate": 2.4279835390946505e-05, "loss": 0.13, "step": 9880 }, { "epoch": 28.116560056858564, "grad_norm": 9.598737716674805, "learning_rate": 2.424817980373536e-05, "loss": 0.1412, "step": 9890 }, { "epoch": 28.14498933901919, "grad_norm": 7.1167168617248535, "learning_rate": 2.4216524216524217e-05, "loss": 0.1298, "step": 9900 }, { "epoch": 28.173418621179817, "grad_norm": 4.762835502624512, "learning_rate": 2.4184868629313078e-05, "loss": 0.1348, "step": 9910 }, { "epoch": 28.20184790334044, "grad_norm": 5.0789923667907715, "learning_rate": 2.4153213042101932e-05, "loss": 0.1328, "step": 9920 }, { "epoch": 28.230277185501066, "grad_norm": 10.540358543395996, "learning_rate": 2.412155745489079e-05, "loss": 0.1305, "step": 9930 }, { "epoch": 28.258706467661693, "grad_norm": 7.114138126373291, "learning_rate": 2.4089901867679647e-05, "loss": 0.1339, "step": 9940 }, { "epoch": 28.287135749822315, "grad_norm": 5.747593879699707, "learning_rate": 2.4058246280468505e-05, "loss": 0.1262, "step": 9950 }, { "epoch": 28.315565031982942, "grad_norm": 7.8007493019104, "learning_rate": 2.4026590693257362e-05, "loss": 0.1295, "step": 9960 }, { "epoch": 28.34399431414357, "grad_norm": 6.13392972946167, "learning_rate": 2.3994935106046216e-05, "loss": 0.1278, "step": 9970 }, { "epoch": 28.372423596304195, "grad_norm": 5.374325752258301, "learning_rate": 2.3963279518835074e-05, "loss": 0.1301, "step": 9980 }, { "epoch": 28.400852878464818, "grad_norm": 6.481910705566406, "learning_rate": 2.3931623931623935e-05, "loss": 0.1389, "step": 9990 }, { "epoch": 28.429282160625444, "grad_norm": 7.0581488609313965, "learning_rate": 2.389996834441279e-05, "loss": 0.144, "step": 10000 }, { "epoch": 28.45771144278607, "grad_norm": 9.137778282165527, "learning_rate": 2.3868312757201647e-05, "loss": 0.1387, "step": 10010 }, { "epoch": 28.486140724946694, "grad_norm": 6.143486022949219, "learning_rate": 2.3836657169990504e-05, "loss": 0.1363, "step": 10020 }, { "epoch": 28.51457000710732, "grad_norm": 7.295355319976807, "learning_rate": 2.3805001582779362e-05, "loss": 0.1382, "step": 10030 }, { "epoch": 28.542999289267946, "grad_norm": 7.998733997344971, "learning_rate": 2.377334599556822e-05, "loss": 0.1359, "step": 10040 }, { "epoch": 28.571428571428573, "grad_norm": 11.568644523620605, "learning_rate": 2.3741690408357077e-05, "loss": 0.1373, "step": 10050 }, { "epoch": 28.599857853589196, "grad_norm": 7.219127655029297, "learning_rate": 2.371003482114593e-05, "loss": 0.1317, "step": 10060 }, { "epoch": 28.628287135749822, "grad_norm": 8.487744331359863, "learning_rate": 2.3678379233934792e-05, "loss": 0.1341, "step": 10070 }, { "epoch": 28.65671641791045, "grad_norm": 10.82504940032959, "learning_rate": 2.364672364672365e-05, "loss": 0.136, "step": 10080 }, { "epoch": 28.68514570007107, "grad_norm": 5.486518859863281, "learning_rate": 2.3615068059512504e-05, "loss": 0.1386, "step": 10090 }, { "epoch": 28.713574982231698, "grad_norm": 5.786195755004883, "learning_rate": 2.358341247230136e-05, "loss": 0.1433, "step": 10100 }, { "epoch": 28.742004264392325, "grad_norm": 8.049909591674805, "learning_rate": 2.355175688509022e-05, "loss": 0.131, "step": 10110 }, { "epoch": 28.77043354655295, "grad_norm": 5.329484939575195, "learning_rate": 2.3520101297879076e-05, "loss": 0.1359, "step": 10120 }, { "epoch": 28.798862828713574, "grad_norm": 7.575248718261719, "learning_rate": 2.3488445710667934e-05, "loss": 0.1412, "step": 10130 }, { "epoch": 28.8272921108742, "grad_norm": 8.416072845458984, "learning_rate": 2.345679012345679e-05, "loss": 0.1345, "step": 10140 }, { "epoch": 28.855721393034827, "grad_norm": 8.664349555969238, "learning_rate": 2.342513453624565e-05, "loss": 0.1399, "step": 10150 }, { "epoch": 28.88415067519545, "grad_norm": 7.181797981262207, "learning_rate": 2.3393478949034507e-05, "loss": 0.1317, "step": 10160 }, { "epoch": 28.912579957356076, "grad_norm": 8.701619148254395, "learning_rate": 2.336182336182336e-05, "loss": 0.1434, "step": 10170 }, { "epoch": 28.941009239516703, "grad_norm": 7.428786277770996, "learning_rate": 2.333016777461222e-05, "loss": 0.1411, "step": 10180 }, { "epoch": 28.96943852167733, "grad_norm": 5.5267109870910645, "learning_rate": 2.329851218740108e-05, "loss": 0.1335, "step": 10190 }, { "epoch": 28.997867803837952, "grad_norm": 10.01519775390625, "learning_rate": 2.3266856600189934e-05, "loss": 0.1293, "step": 10200 }, { "epoch": 28.997867803837952, "eval_accuracy": 0.7754, "eval_loss": 0.10941459983587265, "eval_runtime": 13.5246, "eval_samples_per_second": 369.698, "eval_steps_per_second": 11.609, "step": 10200 }, { "epoch": 29.02629708599858, "grad_norm": 7.988402366638184, "learning_rate": 2.323520101297879e-05, "loss": 0.1312, "step": 10210 }, { "epoch": 29.054726368159205, "grad_norm": 6.700438976287842, "learning_rate": 2.320354542576765e-05, "loss": 0.1289, "step": 10220 }, { "epoch": 29.083155650319828, "grad_norm": 7.944076061248779, "learning_rate": 2.3171889838556506e-05, "loss": 0.1327, "step": 10230 }, { "epoch": 29.111584932480454, "grad_norm": 6.171491622924805, "learning_rate": 2.3140234251345364e-05, "loss": 0.1267, "step": 10240 }, { "epoch": 29.14001421464108, "grad_norm": 5.884680271148682, "learning_rate": 2.310857866413422e-05, "loss": 0.132, "step": 10250 }, { "epoch": 29.168443496801707, "grad_norm": 10.052933692932129, "learning_rate": 2.307692307692308e-05, "loss": 0.126, "step": 10260 }, { "epoch": 29.19687277896233, "grad_norm": 8.323927879333496, "learning_rate": 2.3045267489711937e-05, "loss": 0.1365, "step": 10270 }, { "epoch": 29.225302061122957, "grad_norm": 6.383059978485107, "learning_rate": 2.301361190250079e-05, "loss": 0.14, "step": 10280 }, { "epoch": 29.253731343283583, "grad_norm": 13.930680274963379, "learning_rate": 2.298195631528965e-05, "loss": 0.1321, "step": 10290 }, { "epoch": 29.282160625444206, "grad_norm": 5.550623893737793, "learning_rate": 2.295030072807851e-05, "loss": 0.141, "step": 10300 }, { "epoch": 29.310589907604832, "grad_norm": 4.2210917472839355, "learning_rate": 2.2918645140867364e-05, "loss": 0.1327, "step": 10310 }, { "epoch": 29.33901918976546, "grad_norm": 7.759565830230713, "learning_rate": 2.288698955365622e-05, "loss": 0.1321, "step": 10320 }, { "epoch": 29.367448471926085, "grad_norm": 7.359158992767334, "learning_rate": 2.285533396644508e-05, "loss": 0.1389, "step": 10330 }, { "epoch": 29.395877754086708, "grad_norm": 6.822604656219482, "learning_rate": 2.2823678379233936e-05, "loss": 0.1312, "step": 10340 }, { "epoch": 29.424307036247335, "grad_norm": 8.015970230102539, "learning_rate": 2.2792022792022794e-05, "loss": 0.1338, "step": 10350 }, { "epoch": 29.45273631840796, "grad_norm": 5.947789192199707, "learning_rate": 2.276036720481165e-05, "loss": 0.1394, "step": 10360 }, { "epoch": 29.481165600568584, "grad_norm": 7.061962127685547, "learning_rate": 2.2728711617600506e-05, "loss": 0.1446, "step": 10370 }, { "epoch": 29.50959488272921, "grad_norm": 10.131390571594238, "learning_rate": 2.2697056030389367e-05, "loss": 0.1334, "step": 10380 }, { "epoch": 29.538024164889837, "grad_norm": 9.207195281982422, "learning_rate": 2.2665400443178224e-05, "loss": 0.1352, "step": 10390 }, { "epoch": 29.566453447050463, "grad_norm": 5.108695983886719, "learning_rate": 2.2633744855967078e-05, "loss": 0.1318, "step": 10400 }, { "epoch": 29.594882729211086, "grad_norm": 5.167972087860107, "learning_rate": 2.2602089268755936e-05, "loss": 0.1319, "step": 10410 }, { "epoch": 29.623312011371713, "grad_norm": 6.849377632141113, "learning_rate": 2.2570433681544793e-05, "loss": 0.1358, "step": 10420 }, { "epoch": 29.65174129353234, "grad_norm": 9.979886054992676, "learning_rate": 2.253877809433365e-05, "loss": 0.1295, "step": 10430 }, { "epoch": 29.680170575692962, "grad_norm": 4.664868354797363, "learning_rate": 2.250712250712251e-05, "loss": 0.1284, "step": 10440 }, { "epoch": 29.70859985785359, "grad_norm": 6.683469772338867, "learning_rate": 2.2475466919911363e-05, "loss": 0.1336, "step": 10450 }, { "epoch": 29.737029140014215, "grad_norm": 5.911435127258301, "learning_rate": 2.2443811332700224e-05, "loss": 0.134, "step": 10460 }, { "epoch": 29.76545842217484, "grad_norm": 6.302966117858887, "learning_rate": 2.241215574548908e-05, "loss": 0.1326, "step": 10470 }, { "epoch": 29.793887704335464, "grad_norm": 6.449643611907959, "learning_rate": 2.2380500158277935e-05, "loss": 0.1445, "step": 10480 }, { "epoch": 29.82231698649609, "grad_norm": 9.889830589294434, "learning_rate": 2.2348844571066793e-05, "loss": 0.1339, "step": 10490 }, { "epoch": 29.850746268656717, "grad_norm": 5.0603108406066895, "learning_rate": 2.2317188983855654e-05, "loss": 0.1264, "step": 10500 }, { "epoch": 29.87917555081734, "grad_norm": 7.853873252868652, "learning_rate": 2.2285533396644508e-05, "loss": 0.1338, "step": 10510 }, { "epoch": 29.907604832977967, "grad_norm": 7.320250511169434, "learning_rate": 2.2253877809433366e-05, "loss": 0.135, "step": 10520 }, { "epoch": 29.936034115138593, "grad_norm": 7.760400295257568, "learning_rate": 2.2222222222222223e-05, "loss": 0.1357, "step": 10530 }, { "epoch": 29.96446339729922, "grad_norm": 10.85993766784668, "learning_rate": 2.219056663501108e-05, "loss": 0.1323, "step": 10540 }, { "epoch": 29.992892679459842, "grad_norm": 8.534313201904297, "learning_rate": 2.215891104779994e-05, "loss": 0.1273, "step": 10550 }, { "epoch": 29.99857853589197, "eval_accuracy": 0.7768, "eval_loss": 0.10909145325422287, "eval_runtime": 13.5235, "eval_samples_per_second": 369.726, "eval_steps_per_second": 11.609, "step": 10552 }, { "epoch": 30.02132196162047, "grad_norm": 7.812359809875488, "learning_rate": 2.2127255460588796e-05, "loss": 0.1283, "step": 10560 }, { "epoch": 30.049751243781095, "grad_norm": 7.074906826019287, "learning_rate": 2.209559987337765e-05, "loss": 0.1305, "step": 10570 }, { "epoch": 30.07818052594172, "grad_norm": 12.2709321975708, "learning_rate": 2.206394428616651e-05, "loss": 0.1328, "step": 10580 }, { "epoch": 30.106609808102345, "grad_norm": 12.848553657531738, "learning_rate": 2.2032288698955365e-05, "loss": 0.1294, "step": 10590 }, { "epoch": 30.13503909026297, "grad_norm": 9.683428764343262, "learning_rate": 2.2000633111744223e-05, "loss": 0.1341, "step": 10600 }, { "epoch": 30.163468372423598, "grad_norm": 7.8487324714660645, "learning_rate": 2.196897752453308e-05, "loss": 0.1347, "step": 10610 }, { "epoch": 30.19189765458422, "grad_norm": 5.906916618347168, "learning_rate": 2.1937321937321938e-05, "loss": 0.1351, "step": 10620 }, { "epoch": 30.220326936744847, "grad_norm": 8.996933937072754, "learning_rate": 2.1905666350110796e-05, "loss": 0.1321, "step": 10630 }, { "epoch": 30.248756218905474, "grad_norm": 5.676651954650879, "learning_rate": 2.1874010762899653e-05, "loss": 0.1246, "step": 10640 }, { "epoch": 30.277185501066096, "grad_norm": 4.894083023071289, "learning_rate": 2.184235517568851e-05, "loss": 0.1296, "step": 10650 }, { "epoch": 30.305614783226723, "grad_norm": 9.179611206054688, "learning_rate": 2.1810699588477368e-05, "loss": 0.1314, "step": 10660 }, { "epoch": 30.33404406538735, "grad_norm": 5.719768047332764, "learning_rate": 2.1779044001266226e-05, "loss": 0.1272, "step": 10670 }, { "epoch": 30.362473347547976, "grad_norm": 8.423073768615723, "learning_rate": 2.174738841405508e-05, "loss": 0.1351, "step": 10680 }, { "epoch": 30.3909026297086, "grad_norm": 6.499375343322754, "learning_rate": 2.171573282684394e-05, "loss": 0.1355, "step": 10690 }, { "epoch": 30.419331911869225, "grad_norm": 9.108795166015625, "learning_rate": 2.1684077239632795e-05, "loss": 0.1338, "step": 10700 }, { "epoch": 30.44776119402985, "grad_norm": 6.589269638061523, "learning_rate": 2.1652421652421653e-05, "loss": 0.1288, "step": 10710 }, { "epoch": 30.476190476190474, "grad_norm": 8.960533142089844, "learning_rate": 2.162076606521051e-05, "loss": 0.1319, "step": 10720 }, { "epoch": 30.5046197583511, "grad_norm": 4.5037455558776855, "learning_rate": 2.1589110477999368e-05, "loss": 0.1351, "step": 10730 }, { "epoch": 30.533049040511727, "grad_norm": 6.25697135925293, "learning_rate": 2.1557454890788225e-05, "loss": 0.1286, "step": 10740 }, { "epoch": 30.561478322672354, "grad_norm": 5.714256763458252, "learning_rate": 2.1525799303577083e-05, "loss": 0.1375, "step": 10750 }, { "epoch": 30.589907604832977, "grad_norm": 6.971078872680664, "learning_rate": 2.1494143716365937e-05, "loss": 0.1312, "step": 10760 }, { "epoch": 30.618336886993603, "grad_norm": 8.790769577026367, "learning_rate": 2.1462488129154798e-05, "loss": 0.1349, "step": 10770 }, { "epoch": 30.64676616915423, "grad_norm": 6.210764408111572, "learning_rate": 2.1430832541943656e-05, "loss": 0.1293, "step": 10780 }, { "epoch": 30.675195451314853, "grad_norm": 9.885531425476074, "learning_rate": 2.139917695473251e-05, "loss": 0.1378, "step": 10790 }, { "epoch": 30.70362473347548, "grad_norm": 9.312872886657715, "learning_rate": 2.1367521367521368e-05, "loss": 0.1341, "step": 10800 }, { "epoch": 30.732054015636106, "grad_norm": 8.703923225402832, "learning_rate": 2.133586578031023e-05, "loss": 0.1429, "step": 10810 }, { "epoch": 30.760483297796732, "grad_norm": 5.015939712524414, "learning_rate": 2.1304210193099083e-05, "loss": 0.1349, "step": 10820 }, { "epoch": 30.788912579957355, "grad_norm": 7.164327621459961, "learning_rate": 2.127255460588794e-05, "loss": 0.1296, "step": 10830 }, { "epoch": 30.81734186211798, "grad_norm": 9.111225128173828, "learning_rate": 2.1240899018676798e-05, "loss": 0.1256, "step": 10840 }, { "epoch": 30.845771144278608, "grad_norm": 8.632919311523438, "learning_rate": 2.1209243431465655e-05, "loss": 0.1278, "step": 10850 }, { "epoch": 30.87420042643923, "grad_norm": 10.941034317016602, "learning_rate": 2.1177587844254513e-05, "loss": 0.1334, "step": 10860 }, { "epoch": 30.902629708599857, "grad_norm": 12.321640968322754, "learning_rate": 2.1145932257043367e-05, "loss": 0.1301, "step": 10870 }, { "epoch": 30.931058990760484, "grad_norm": 6.711130619049072, "learning_rate": 2.1114276669832225e-05, "loss": 0.1346, "step": 10880 }, { "epoch": 30.95948827292111, "grad_norm": 11.876862525939941, "learning_rate": 2.1082621082621086e-05, "loss": 0.1342, "step": 10890 }, { "epoch": 30.987917555081733, "grad_norm": 5.851500034332275, "learning_rate": 2.105096549540994e-05, "loss": 0.1363, "step": 10900 }, { "epoch": 30.999289267945983, "eval_accuracy": 0.7766, "eval_loss": 0.10781557857990265, "eval_runtime": 13.5081, "eval_samples_per_second": 370.148, "eval_steps_per_second": 11.623, "step": 10904 }, { "epoch": 31.01634683724236, "grad_norm": 7.7810468673706055, "learning_rate": 2.1019309908198797e-05, "loss": 0.1408, "step": 10910 }, { "epoch": 31.044776119402986, "grad_norm": 7.81134033203125, "learning_rate": 2.0987654320987655e-05, "loss": 0.129, "step": 10920 }, { "epoch": 31.07320540156361, "grad_norm": 5.992602348327637, "learning_rate": 2.0955998733776513e-05, "loss": 0.1354, "step": 10930 }, { "epoch": 31.101634683724235, "grad_norm": 8.459920883178711, "learning_rate": 2.092434314656537e-05, "loss": 0.1343, "step": 10940 }, { "epoch": 31.13006396588486, "grad_norm": 5.661654949188232, "learning_rate": 2.0892687559354228e-05, "loss": 0.1336, "step": 10950 }, { "epoch": 31.158493248045488, "grad_norm": 7.22084379196167, "learning_rate": 2.0861031972143082e-05, "loss": 0.1297, "step": 10960 }, { "epoch": 31.18692253020611, "grad_norm": 12.312235832214355, "learning_rate": 2.0829376384931943e-05, "loss": 0.1369, "step": 10970 }, { "epoch": 31.215351812366738, "grad_norm": 5.357903957366943, "learning_rate": 2.07977207977208e-05, "loss": 0.1359, "step": 10980 }, { "epoch": 31.243781094527364, "grad_norm": 8.557554244995117, "learning_rate": 2.0766065210509655e-05, "loss": 0.1281, "step": 10990 }, { "epoch": 31.272210376687987, "grad_norm": 8.890033721923828, "learning_rate": 2.0734409623298512e-05, "loss": 0.132, "step": 11000 }, { "epoch": 31.300639658848613, "grad_norm": 11.211170196533203, "learning_rate": 2.070275403608737e-05, "loss": 0.128, "step": 11010 }, { "epoch": 31.32906894100924, "grad_norm": 5.767834186553955, "learning_rate": 2.0671098448876227e-05, "loss": 0.13, "step": 11020 }, { "epoch": 31.357498223169866, "grad_norm": 5.333038330078125, "learning_rate": 2.0639442861665085e-05, "loss": 0.1249, "step": 11030 }, { "epoch": 31.38592750533049, "grad_norm": 7.375089168548584, "learning_rate": 2.0607787274453942e-05, "loss": 0.1277, "step": 11040 }, { "epoch": 31.414356787491116, "grad_norm": 5.5436110496521, "learning_rate": 2.05761316872428e-05, "loss": 0.1292, "step": 11050 }, { "epoch": 31.442786069651742, "grad_norm": 5.241732597351074, "learning_rate": 2.0544476100031658e-05, "loss": 0.1316, "step": 11060 }, { "epoch": 31.47121535181237, "grad_norm": 8.928046226501465, "learning_rate": 2.0512820512820512e-05, "loss": 0.1227, "step": 11070 }, { "epoch": 31.49964463397299, "grad_norm": 8.119956970214844, "learning_rate": 2.0481164925609373e-05, "loss": 0.127, "step": 11080 }, { "epoch": 31.528073916133618, "grad_norm": 8.014517784118652, "learning_rate": 2.044950933839823e-05, "loss": 0.1256, "step": 11090 }, { "epoch": 31.556503198294244, "grad_norm": 14.385274887084961, "learning_rate": 2.0417853751187084e-05, "loss": 0.1348, "step": 11100 }, { "epoch": 31.584932480454867, "grad_norm": 6.57793664932251, "learning_rate": 2.0386198163975942e-05, "loss": 0.1352, "step": 11110 }, { "epoch": 31.613361762615494, "grad_norm": 8.808320999145508, "learning_rate": 2.03545425767648e-05, "loss": 0.1284, "step": 11120 }, { "epoch": 31.64179104477612, "grad_norm": 10.185745239257812, "learning_rate": 2.0322886989553657e-05, "loss": 0.1245, "step": 11130 }, { "epoch": 31.670220326936743, "grad_norm": 8.060871124267578, "learning_rate": 2.0291231402342515e-05, "loss": 0.1326, "step": 11140 }, { "epoch": 31.69864960909737, "grad_norm": 8.0587797164917, "learning_rate": 2.0259575815131372e-05, "loss": 0.1249, "step": 11150 }, { "epoch": 31.727078891257996, "grad_norm": 10.892049789428711, "learning_rate": 2.022792022792023e-05, "loss": 0.1308, "step": 11160 }, { "epoch": 31.755508173418622, "grad_norm": 11.089327812194824, "learning_rate": 2.0196264640709087e-05, "loss": 0.1302, "step": 11170 }, { "epoch": 31.783937455579245, "grad_norm": 7.030252933502197, "learning_rate": 2.016460905349794e-05, "loss": 0.1343, "step": 11180 }, { "epoch": 31.812366737739872, "grad_norm": 11.058432579040527, "learning_rate": 2.01329534662868e-05, "loss": 0.1341, "step": 11190 }, { "epoch": 31.8407960199005, "grad_norm": 9.995917320251465, "learning_rate": 2.010129787907566e-05, "loss": 0.1283, "step": 11200 }, { "epoch": 31.869225302061125, "grad_norm": 5.8510284423828125, "learning_rate": 2.0069642291864514e-05, "loss": 0.1294, "step": 11210 }, { "epoch": 31.897654584221748, "grad_norm": 6.676878929138184, "learning_rate": 2.0037986704653372e-05, "loss": 0.1281, "step": 11220 }, { "epoch": 31.926083866382374, "grad_norm": 6.233856201171875, "learning_rate": 2.000633111744223e-05, "loss": 0.1255, "step": 11230 }, { "epoch": 31.954513148543, "grad_norm": 8.033185005187988, "learning_rate": 1.9974675530231087e-05, "loss": 0.1305, "step": 11240 }, { "epoch": 31.982942430703623, "grad_norm": 5.770398139953613, "learning_rate": 1.9943019943019945e-05, "loss": 0.1293, "step": 11250 }, { "epoch": 32.0, "eval_accuracy": 0.7736, "eval_loss": 0.10908429324626923, "eval_runtime": 13.5331, "eval_samples_per_second": 369.465, "eval_steps_per_second": 11.601, "step": 11256 }, { "epoch": 32.01137171286425, "grad_norm": 7.649781227111816, "learning_rate": 1.9911364355808802e-05, "loss": 0.1295, "step": 11260 }, { "epoch": 32.039800995024876, "grad_norm": 8.251233100891113, "learning_rate": 1.9879708768597656e-05, "loss": 0.1287, "step": 11270 }, { "epoch": 32.0682302771855, "grad_norm": 8.773487091064453, "learning_rate": 1.9848053181386517e-05, "loss": 0.1212, "step": 11280 }, { "epoch": 32.09665955934613, "grad_norm": 7.8118791580200195, "learning_rate": 1.9816397594175375e-05, "loss": 0.1279, "step": 11290 }, { "epoch": 32.12508884150675, "grad_norm": 5.217939853668213, "learning_rate": 1.978474200696423e-05, "loss": 0.1292, "step": 11300 }, { "epoch": 32.153518123667375, "grad_norm": 9.422746658325195, "learning_rate": 1.9753086419753087e-05, "loss": 0.1239, "step": 11310 }, { "epoch": 32.181947405828005, "grad_norm": 14.056968688964844, "learning_rate": 1.9721430832541944e-05, "loss": 0.1322, "step": 11320 }, { "epoch": 32.21037668798863, "grad_norm": 8.5320405960083, "learning_rate": 1.9689775245330802e-05, "loss": 0.1248, "step": 11330 }, { "epoch": 32.23880597014925, "grad_norm": 7.294402599334717, "learning_rate": 1.965811965811966e-05, "loss": 0.1196, "step": 11340 }, { "epoch": 32.26723525230988, "grad_norm": 10.066523551940918, "learning_rate": 1.9626464070908514e-05, "loss": 0.1301, "step": 11350 }, { "epoch": 32.295664534470504, "grad_norm": 6.38063907623291, "learning_rate": 1.9594808483697374e-05, "loss": 0.1332, "step": 11360 }, { "epoch": 32.32409381663113, "grad_norm": 9.330846786499023, "learning_rate": 1.9563152896486232e-05, "loss": 0.1216, "step": 11370 }, { "epoch": 32.35252309879176, "grad_norm": 12.725480079650879, "learning_rate": 1.9531497309275086e-05, "loss": 0.1289, "step": 11380 }, { "epoch": 32.38095238095238, "grad_norm": 7.192099571228027, "learning_rate": 1.9499841722063944e-05, "loss": 0.1288, "step": 11390 }, { "epoch": 32.40938166311301, "grad_norm": 8.737794876098633, "learning_rate": 1.9468186134852805e-05, "loss": 0.1339, "step": 11400 }, { "epoch": 32.43781094527363, "grad_norm": 7.142094612121582, "learning_rate": 1.943653054764166e-05, "loss": 0.1279, "step": 11410 }, { "epoch": 32.466240227434255, "grad_norm": 5.805847644805908, "learning_rate": 1.9404874960430516e-05, "loss": 0.126, "step": 11420 }, { "epoch": 32.494669509594885, "grad_norm": 9.589973449707031, "learning_rate": 1.9373219373219374e-05, "loss": 0.1278, "step": 11430 }, { "epoch": 32.52309879175551, "grad_norm": 9.326807022094727, "learning_rate": 1.934156378600823e-05, "loss": 0.1262, "step": 11440 }, { "epoch": 32.55152807391613, "grad_norm": 8.244095802307129, "learning_rate": 1.930990819879709e-05, "loss": 0.127, "step": 11450 }, { "epoch": 32.57995735607676, "grad_norm": 7.786057472229004, "learning_rate": 1.9278252611585947e-05, "loss": 0.1276, "step": 11460 }, { "epoch": 32.608386638237384, "grad_norm": 6.3490166664123535, "learning_rate": 1.92465970243748e-05, "loss": 0.1334, "step": 11470 }, { "epoch": 32.63681592039801, "grad_norm": 6.347194194793701, "learning_rate": 1.9214941437163662e-05, "loss": 0.1302, "step": 11480 }, { "epoch": 32.66524520255864, "grad_norm": 6.4109697341918945, "learning_rate": 1.9183285849952516e-05, "loss": 0.1283, "step": 11490 }, { "epoch": 32.69367448471926, "grad_norm": 8.155673027038574, "learning_rate": 1.9151630262741374e-05, "loss": 0.1267, "step": 11500 }, { "epoch": 32.72210376687988, "grad_norm": 6.420925617218018, "learning_rate": 1.9119974675530235e-05, "loss": 0.1205, "step": 11510 }, { "epoch": 32.75053304904051, "grad_norm": 4.928952693939209, "learning_rate": 1.908831908831909e-05, "loss": 0.1349, "step": 11520 }, { "epoch": 32.778962331201136, "grad_norm": 11.742147445678711, "learning_rate": 1.9056663501107946e-05, "loss": 0.1267, "step": 11530 }, { "epoch": 32.807391613361766, "grad_norm": 11.09310531616211, "learning_rate": 1.9025007913896804e-05, "loss": 0.1342, "step": 11540 }, { "epoch": 32.83582089552239, "grad_norm": 4.707826614379883, "learning_rate": 1.899335232668566e-05, "loss": 0.1254, "step": 11550 }, { "epoch": 32.86425017768301, "grad_norm": 6.62393856048584, "learning_rate": 1.896169673947452e-05, "loss": 0.1235, "step": 11560 }, { "epoch": 32.89267945984364, "grad_norm": 7.414745807647705, "learning_rate": 1.8930041152263377e-05, "loss": 0.1227, "step": 11570 }, { "epoch": 32.921108742004265, "grad_norm": 6.783624172210693, "learning_rate": 1.889838556505223e-05, "loss": 0.1262, "step": 11580 }, { "epoch": 32.94953802416489, "grad_norm": 9.787161827087402, "learning_rate": 1.8866729977841092e-05, "loss": 0.1357, "step": 11590 }, { "epoch": 32.97796730632552, "grad_norm": 6.208036422729492, "learning_rate": 1.8835074390629946e-05, "loss": 0.1275, "step": 11600 }, { "epoch": 32.997867803837956, "eval_accuracy": 0.7806, "eval_loss": 0.1068153902888298, "eval_runtime": 13.4846, "eval_samples_per_second": 370.792, "eval_steps_per_second": 11.643, "step": 11607 }, { "epoch": 33.00639658848614, "grad_norm": 10.223814964294434, "learning_rate": 1.8803418803418804e-05, "loss": 0.1294, "step": 11610 }, { "epoch": 33.03482587064676, "grad_norm": 5.409425735473633, "learning_rate": 1.877176321620766e-05, "loss": 0.1232, "step": 11620 }, { "epoch": 33.06325515280739, "grad_norm": 8.091533660888672, "learning_rate": 1.874010762899652e-05, "loss": 0.1243, "step": 11630 }, { "epoch": 33.091684434968016, "grad_norm": 7.93132209777832, "learning_rate": 1.8708452041785376e-05, "loss": 0.1238, "step": 11640 }, { "epoch": 33.12011371712864, "grad_norm": 8.090452194213867, "learning_rate": 1.8676796454574234e-05, "loss": 0.1282, "step": 11650 }, { "epoch": 33.14854299928927, "grad_norm": 10.613383293151855, "learning_rate": 1.8645140867363088e-05, "loss": 0.1272, "step": 11660 }, { "epoch": 33.17697228144989, "grad_norm": 8.287062644958496, "learning_rate": 1.861348528015195e-05, "loss": 0.1299, "step": 11670 }, { "epoch": 33.20540156361052, "grad_norm": 16.724559783935547, "learning_rate": 1.8581829692940807e-05, "loss": 0.1297, "step": 11680 }, { "epoch": 33.233830845771145, "grad_norm": 9.144177436828613, "learning_rate": 1.855017410572966e-05, "loss": 0.1236, "step": 11690 }, { "epoch": 33.26226012793177, "grad_norm": 14.098179817199707, "learning_rate": 1.8518518518518518e-05, "loss": 0.131, "step": 11700 }, { "epoch": 33.2906894100924, "grad_norm": 6.204522132873535, "learning_rate": 1.848686293130738e-05, "loss": 0.1273, "step": 11710 }, { "epoch": 33.31911869225302, "grad_norm": 8.287238121032715, "learning_rate": 1.8455207344096233e-05, "loss": 0.1164, "step": 11720 }, { "epoch": 33.347547974413644, "grad_norm": 10.760403633117676, "learning_rate": 1.842355175688509e-05, "loss": 0.1309, "step": 11730 }, { "epoch": 33.375977256574274, "grad_norm": 12.639159202575684, "learning_rate": 1.839189616967395e-05, "loss": 0.123, "step": 11740 }, { "epoch": 33.4044065387349, "grad_norm": 5.38312292098999, "learning_rate": 1.8360240582462806e-05, "loss": 0.1254, "step": 11750 }, { "epoch": 33.43283582089552, "grad_norm": 7.068725109100342, "learning_rate": 1.8328584995251664e-05, "loss": 0.1281, "step": 11760 }, { "epoch": 33.46126510305615, "grad_norm": 7.018673419952393, "learning_rate": 1.829692940804052e-05, "loss": 0.1216, "step": 11770 }, { "epoch": 33.48969438521677, "grad_norm": 12.721528053283691, "learning_rate": 1.8265273820829375e-05, "loss": 0.1272, "step": 11780 }, { "epoch": 33.518123667377395, "grad_norm": 5.890460014343262, "learning_rate": 1.8233618233618236e-05, "loss": 0.1217, "step": 11790 }, { "epoch": 33.546552949538025, "grad_norm": 8.201329231262207, "learning_rate": 1.820196264640709e-05, "loss": 0.1313, "step": 11800 }, { "epoch": 33.57498223169865, "grad_norm": 7.3668060302734375, "learning_rate": 1.8170307059195948e-05, "loss": 0.1282, "step": 11810 }, { "epoch": 33.60341151385928, "grad_norm": 7.798852920532227, "learning_rate": 1.8138651471984806e-05, "loss": 0.1308, "step": 11820 }, { "epoch": 33.6318407960199, "grad_norm": 8.019193649291992, "learning_rate": 1.8106995884773663e-05, "loss": 0.1268, "step": 11830 }, { "epoch": 33.660270078180524, "grad_norm": 15.059558868408203, "learning_rate": 1.807534029756252e-05, "loss": 0.128, "step": 11840 }, { "epoch": 33.688699360341154, "grad_norm": 8.425444602966309, "learning_rate": 1.804368471035138e-05, "loss": 0.1263, "step": 11850 }, { "epoch": 33.71712864250178, "grad_norm": 7.464546203613281, "learning_rate": 1.8012029123140233e-05, "loss": 0.1252, "step": 11860 }, { "epoch": 33.7455579246624, "grad_norm": 5.460416316986084, "learning_rate": 1.7980373535929094e-05, "loss": 0.1231, "step": 11870 }, { "epoch": 33.77398720682303, "grad_norm": 6.134063720703125, "learning_rate": 1.794871794871795e-05, "loss": 0.1243, "step": 11880 }, { "epoch": 33.80241648898365, "grad_norm": 6.876859664916992, "learning_rate": 1.7917062361506805e-05, "loss": 0.1265, "step": 11890 }, { "epoch": 33.830845771144276, "grad_norm": 7.3587260246276855, "learning_rate": 1.7885406774295663e-05, "loss": 0.1276, "step": 11900 }, { "epoch": 33.859275053304906, "grad_norm": 5.446532249450684, "learning_rate": 1.785375118708452e-05, "loss": 0.1292, "step": 11910 }, { "epoch": 33.88770433546553, "grad_norm": 8.481527328491211, "learning_rate": 1.7822095599873378e-05, "loss": 0.1274, "step": 11920 }, { "epoch": 33.91613361762616, "grad_norm": 7.9660325050354, "learning_rate": 1.7790440012662236e-05, "loss": 0.1337, "step": 11930 }, { "epoch": 33.94456289978678, "grad_norm": 12.019059181213379, "learning_rate": 1.7758784425451093e-05, "loss": 0.127, "step": 11940 }, { "epoch": 33.972992181947404, "grad_norm": 8.178388595581055, "learning_rate": 1.772712883823995e-05, "loss": 0.1263, "step": 11950 }, { "epoch": 33.998578535891966, "eval_accuracy": 0.7888, "eval_loss": 0.10395967960357666, "eval_runtime": 13.4856, "eval_samples_per_second": 370.766, "eval_steps_per_second": 11.642, "step": 11959 }, { "epoch": 34.001421464108034, "grad_norm": 10.218581199645996, "learning_rate": 1.769547325102881e-05, "loss": 0.1288, "step": 11960 }, { "epoch": 34.02985074626866, "grad_norm": 8.260220527648926, "learning_rate": 1.7663817663817662e-05, "loss": 0.1242, "step": 11970 }, { "epoch": 34.05828002842928, "grad_norm": 7.601161003112793, "learning_rate": 1.7632162076606523e-05, "loss": 0.1266, "step": 11980 }, { "epoch": 34.08670931058991, "grad_norm": 7.116036415100098, "learning_rate": 1.760050648939538e-05, "loss": 0.1235, "step": 11990 }, { "epoch": 34.11513859275053, "grad_norm": 8.048095703125, "learning_rate": 1.7568850902184235e-05, "loss": 0.1308, "step": 12000 }, { "epoch": 34.143567874911156, "grad_norm": 10.806943893432617, "learning_rate": 1.7537195314973093e-05, "loss": 0.1244, "step": 12010 }, { "epoch": 34.171997157071786, "grad_norm": 5.461693286895752, "learning_rate": 1.7505539727761954e-05, "loss": 0.1274, "step": 12020 }, { "epoch": 34.20042643923241, "grad_norm": 9.05993366241455, "learning_rate": 1.7473884140550808e-05, "loss": 0.1229, "step": 12030 }, { "epoch": 34.22885572139303, "grad_norm": 10.913057327270508, "learning_rate": 1.7442228553339665e-05, "loss": 0.1199, "step": 12040 }, { "epoch": 34.25728500355366, "grad_norm": 12.048541069030762, "learning_rate": 1.7410572966128523e-05, "loss": 0.1214, "step": 12050 }, { "epoch": 34.285714285714285, "grad_norm": 9.739500999450684, "learning_rate": 1.737891737891738e-05, "loss": 0.1226, "step": 12060 }, { "epoch": 34.31414356787491, "grad_norm": 12.430303573608398, "learning_rate": 1.7347261791706238e-05, "loss": 0.1227, "step": 12070 }, { "epoch": 34.34257285003554, "grad_norm": 8.849843978881836, "learning_rate": 1.7315606204495092e-05, "loss": 0.1225, "step": 12080 }, { "epoch": 34.37100213219616, "grad_norm": 12.40982437133789, "learning_rate": 1.728395061728395e-05, "loss": 0.1193, "step": 12090 }, { "epoch": 34.39943141435679, "grad_norm": 7.470407009124756, "learning_rate": 1.725229503007281e-05, "loss": 0.1288, "step": 12100 }, { "epoch": 34.42786069651741, "grad_norm": 6.326234340667725, "learning_rate": 1.7220639442861665e-05, "loss": 0.1265, "step": 12110 }, { "epoch": 34.456289978678036, "grad_norm": 10.158736228942871, "learning_rate": 1.7188983855650523e-05, "loss": 0.1315, "step": 12120 }, { "epoch": 34.484719260838666, "grad_norm": 6.992920875549316, "learning_rate": 1.715732826843938e-05, "loss": 0.1228, "step": 12130 }, { "epoch": 34.51314854299929, "grad_norm": 10.826105117797852, "learning_rate": 1.7125672681228238e-05, "loss": 0.1206, "step": 12140 }, { "epoch": 34.54157782515991, "grad_norm": 7.328774929046631, "learning_rate": 1.7094017094017095e-05, "loss": 0.1213, "step": 12150 }, { "epoch": 34.57000710732054, "grad_norm": 9.24868106842041, "learning_rate": 1.7062361506805953e-05, "loss": 0.1261, "step": 12160 }, { "epoch": 34.598436389481165, "grad_norm": 11.653092384338379, "learning_rate": 1.7030705919594807e-05, "loss": 0.1153, "step": 12170 }, { "epoch": 34.62686567164179, "grad_norm": 5.948169708251953, "learning_rate": 1.6999050332383668e-05, "loss": 0.1227, "step": 12180 }, { "epoch": 34.65529495380242, "grad_norm": 9.58452320098877, "learning_rate": 1.6967394745172526e-05, "loss": 0.1328, "step": 12190 }, { "epoch": 34.68372423596304, "grad_norm": 7.249233722686768, "learning_rate": 1.693573915796138e-05, "loss": 0.1272, "step": 12200 }, { "epoch": 34.71215351812367, "grad_norm": 8.45887565612793, "learning_rate": 1.6904083570750237e-05, "loss": 0.1294, "step": 12210 }, { "epoch": 34.740582800284294, "grad_norm": 9.408273696899414, "learning_rate": 1.6872427983539095e-05, "loss": 0.1192, "step": 12220 }, { "epoch": 34.76901208244492, "grad_norm": 5.716788291931152, "learning_rate": 1.6840772396327953e-05, "loss": 0.1285, "step": 12230 }, { "epoch": 34.79744136460555, "grad_norm": 11.101398468017578, "learning_rate": 1.680911680911681e-05, "loss": 0.1267, "step": 12240 }, { "epoch": 34.82587064676617, "grad_norm": 6.720208644866943, "learning_rate": 1.6777461221905664e-05, "loss": 0.124, "step": 12250 }, { "epoch": 34.85429992892679, "grad_norm": 13.999639511108398, "learning_rate": 1.6745805634694525e-05, "loss": 0.1262, "step": 12260 }, { "epoch": 34.88272921108742, "grad_norm": 5.7058939933776855, "learning_rate": 1.6714150047483383e-05, "loss": 0.1158, "step": 12270 }, { "epoch": 34.911158493248045, "grad_norm": 15.318939208984375, "learning_rate": 1.6682494460272237e-05, "loss": 0.1249, "step": 12280 }, { "epoch": 34.93958777540867, "grad_norm": 5.875026702880859, "learning_rate": 1.6650838873061095e-05, "loss": 0.1225, "step": 12290 }, { "epoch": 34.9680170575693, "grad_norm": 11.43384075164795, "learning_rate": 1.6619183285849956e-05, "loss": 0.1237, "step": 12300 }, { "epoch": 34.99644633972992, "grad_norm": 7.066872596740723, "learning_rate": 1.658752769863881e-05, "loss": 0.1243, "step": 12310 }, { "epoch": 34.99928926794598, "eval_accuracy": 0.7954, "eval_loss": 0.10188879817724228, "eval_runtime": 13.4945, "eval_samples_per_second": 370.521, "eval_steps_per_second": 11.634, "step": 12311 }, { "epoch": 35.024875621890544, "grad_norm": 6.7789411544799805, "learning_rate": 1.6555872111427667e-05, "loss": 0.1229, "step": 12320 }, { "epoch": 35.053304904051174, "grad_norm": 6.959674835205078, "learning_rate": 1.6524216524216525e-05, "loss": 0.1258, "step": 12330 }, { "epoch": 35.0817341862118, "grad_norm": 5.314538955688477, "learning_rate": 1.6492560937005382e-05, "loss": 0.1213, "step": 12340 }, { "epoch": 35.11016346837243, "grad_norm": 6.146927356719971, "learning_rate": 1.646090534979424e-05, "loss": 0.1235, "step": 12350 }, { "epoch": 35.13859275053305, "grad_norm": 6.855558395385742, "learning_rate": 1.6429249762583098e-05, "loss": 0.1277, "step": 12360 }, { "epoch": 35.16702203269367, "grad_norm": 8.33514404296875, "learning_rate": 1.6397594175371955e-05, "loss": 0.1254, "step": 12370 }, { "epoch": 35.1954513148543, "grad_norm": 7.066875457763672, "learning_rate": 1.6365938588160813e-05, "loss": 0.1233, "step": 12380 }, { "epoch": 35.223880597014926, "grad_norm": 10.739639282226562, "learning_rate": 1.6334283000949667e-05, "loss": 0.1206, "step": 12390 }, { "epoch": 35.25230987917555, "grad_norm": 8.161611557006836, "learning_rate": 1.6302627413738524e-05, "loss": 0.12, "step": 12400 }, { "epoch": 35.28073916133618, "grad_norm": 6.713425636291504, "learning_rate": 1.6270971826527385e-05, "loss": 0.1278, "step": 12410 }, { "epoch": 35.3091684434968, "grad_norm": 11.15284538269043, "learning_rate": 1.623931623931624e-05, "loss": 0.1309, "step": 12420 }, { "epoch": 35.337597725657425, "grad_norm": 9.264711380004883, "learning_rate": 1.6207660652105097e-05, "loss": 0.1278, "step": 12430 }, { "epoch": 35.366027007818055, "grad_norm": 6.691350936889648, "learning_rate": 1.6176005064893955e-05, "loss": 0.1217, "step": 12440 }, { "epoch": 35.39445628997868, "grad_norm": 6.611015796661377, "learning_rate": 1.6144349477682812e-05, "loss": 0.1234, "step": 12450 }, { "epoch": 35.4228855721393, "grad_norm": 11.795673370361328, "learning_rate": 1.611269389047167e-05, "loss": 0.1292, "step": 12460 }, { "epoch": 35.45131485429993, "grad_norm": 8.052464485168457, "learning_rate": 1.6081038303260527e-05, "loss": 0.1187, "step": 12470 }, { "epoch": 35.47974413646055, "grad_norm": 8.770303726196289, "learning_rate": 1.604938271604938e-05, "loss": 0.1287, "step": 12480 }, { "epoch": 35.50817341862118, "grad_norm": 6.5024733543396, "learning_rate": 1.6017727128838243e-05, "loss": 0.1202, "step": 12490 }, { "epoch": 35.536602700781806, "grad_norm": 9.498977661132812, "learning_rate": 1.5986071541627097e-05, "loss": 0.1232, "step": 12500 }, { "epoch": 35.56503198294243, "grad_norm": 8.048347473144531, "learning_rate": 1.5954415954415954e-05, "loss": 0.1194, "step": 12510 }, { "epoch": 35.59346126510306, "grad_norm": 7.19356632232666, "learning_rate": 1.5922760367204812e-05, "loss": 0.1236, "step": 12520 }, { "epoch": 35.62189054726368, "grad_norm": 6.878899574279785, "learning_rate": 1.589110477999367e-05, "loss": 0.1215, "step": 12530 }, { "epoch": 35.650319829424305, "grad_norm": 8.4365873336792, "learning_rate": 1.5859449192782527e-05, "loss": 0.1172, "step": 12540 }, { "epoch": 35.678749111584935, "grad_norm": 5.392386436462402, "learning_rate": 1.5827793605571385e-05, "loss": 0.1256, "step": 12550 }, { "epoch": 35.70717839374556, "grad_norm": 8.587969779968262, "learning_rate": 1.579613801836024e-05, "loss": 0.1225, "step": 12560 }, { "epoch": 35.73560767590618, "grad_norm": 4.8818793296813965, "learning_rate": 1.57644824311491e-05, "loss": 0.1229, "step": 12570 }, { "epoch": 35.76403695806681, "grad_norm": 7.0683207511901855, "learning_rate": 1.5732826843937957e-05, "loss": 0.1185, "step": 12580 }, { "epoch": 35.792466240227434, "grad_norm": 7.873589038848877, "learning_rate": 1.570117125672681e-05, "loss": 0.1235, "step": 12590 }, { "epoch": 35.82089552238806, "grad_norm": 6.5173726081848145, "learning_rate": 1.566951566951567e-05, "loss": 0.1222, "step": 12600 }, { "epoch": 35.84932480454869, "grad_norm": 21.202932357788086, "learning_rate": 1.563786008230453e-05, "loss": 0.1228, "step": 12610 }, { "epoch": 35.87775408670931, "grad_norm": 9.879849433898926, "learning_rate": 1.5606204495093384e-05, "loss": 0.1218, "step": 12620 }, { "epoch": 35.90618336886994, "grad_norm": 11.343986511230469, "learning_rate": 1.5574548907882242e-05, "loss": 0.1164, "step": 12630 }, { "epoch": 35.93461265103056, "grad_norm": 8.563065528869629, "learning_rate": 1.55428933206711e-05, "loss": 0.1185, "step": 12640 }, { "epoch": 35.963041933191185, "grad_norm": 9.70135498046875, "learning_rate": 1.5511237733459957e-05, "loss": 0.1239, "step": 12650 }, { "epoch": 35.991471215351815, "grad_norm": 7.081270217895508, "learning_rate": 1.5479582146248814e-05, "loss": 0.1237, "step": 12660 }, { "epoch": 36.0, "eval_accuracy": 0.7958, "eval_loss": 0.10163594782352448, "eval_runtime": 13.4921, "eval_samples_per_second": 370.587, "eval_steps_per_second": 11.636, "step": 12663 }, { "epoch": 36.01990049751244, "grad_norm": 9.70610237121582, "learning_rate": 1.5447926559037672e-05, "loss": 0.1193, "step": 12670 }, { "epoch": 36.04832977967306, "grad_norm": 9.30388355255127, "learning_rate": 1.5416270971826526e-05, "loss": 0.1238, "step": 12680 }, { "epoch": 36.07675906183369, "grad_norm": 6.350025653839111, "learning_rate": 1.5384615384615387e-05, "loss": 0.116, "step": 12690 }, { "epoch": 36.105188343994314, "grad_norm": 9.970781326293945, "learning_rate": 1.535295979740424e-05, "loss": 0.1281, "step": 12700 }, { "epoch": 36.13361762615494, "grad_norm": 7.705214500427246, "learning_rate": 1.53213042101931e-05, "loss": 0.1226, "step": 12710 }, { "epoch": 36.16204690831557, "grad_norm": 6.123471260070801, "learning_rate": 1.5289648622981956e-05, "loss": 0.122, "step": 12720 }, { "epoch": 36.19047619047619, "grad_norm": 7.846043109893799, "learning_rate": 1.5257993035770816e-05, "loss": 0.1214, "step": 12730 }, { "epoch": 36.21890547263681, "grad_norm": 6.141413688659668, "learning_rate": 1.5226337448559672e-05, "loss": 0.1234, "step": 12740 }, { "epoch": 36.24733475479744, "grad_norm": 10.361335754394531, "learning_rate": 1.5194681861348528e-05, "loss": 0.1162, "step": 12750 }, { "epoch": 36.275764036958066, "grad_norm": 6.058375835418701, "learning_rate": 1.5163026274137385e-05, "loss": 0.1296, "step": 12760 }, { "epoch": 36.304193319118696, "grad_norm": 8.66115951538086, "learning_rate": 1.5131370686926244e-05, "loss": 0.123, "step": 12770 }, { "epoch": 36.33262260127932, "grad_norm": 11.521245002746582, "learning_rate": 1.50997150997151e-05, "loss": 0.1213, "step": 12780 }, { "epoch": 36.36105188343994, "grad_norm": 6.6705780029296875, "learning_rate": 1.5068059512503958e-05, "loss": 0.127, "step": 12790 }, { "epoch": 36.38948116560057, "grad_norm": 4.863412380218506, "learning_rate": 1.5036403925292817e-05, "loss": 0.1221, "step": 12800 }, { "epoch": 36.417910447761194, "grad_norm": 8.2689790725708, "learning_rate": 1.5004748338081673e-05, "loss": 0.1121, "step": 12810 }, { "epoch": 36.44633972992182, "grad_norm": 5.589539527893066, "learning_rate": 1.4973092750870529e-05, "loss": 0.1226, "step": 12820 }, { "epoch": 36.47476901208245, "grad_norm": 5.630880355834961, "learning_rate": 1.4941437163659386e-05, "loss": 0.1301, "step": 12830 }, { "epoch": 36.50319829424307, "grad_norm": 7.311243534088135, "learning_rate": 1.4909781576448246e-05, "loss": 0.124, "step": 12840 }, { "epoch": 36.53162757640369, "grad_norm": 8.062106132507324, "learning_rate": 1.4878125989237102e-05, "loss": 0.1261, "step": 12850 }, { "epoch": 36.56005685856432, "grad_norm": 7.385509967803955, "learning_rate": 1.4846470402025957e-05, "loss": 0.123, "step": 12860 }, { "epoch": 36.588486140724946, "grad_norm": 8.434797286987305, "learning_rate": 1.4814814814814815e-05, "loss": 0.1199, "step": 12870 }, { "epoch": 36.61691542288557, "grad_norm": 6.657288074493408, "learning_rate": 1.4783159227603674e-05, "loss": 0.1191, "step": 12880 }, { "epoch": 36.6453447050462, "grad_norm": 8.588462829589844, "learning_rate": 1.475150364039253e-05, "loss": 0.1291, "step": 12890 }, { "epoch": 36.67377398720682, "grad_norm": 7.092151165008545, "learning_rate": 1.4719848053181388e-05, "loss": 0.1238, "step": 12900 }, { "epoch": 36.70220326936745, "grad_norm": 6.532871246337891, "learning_rate": 1.4688192465970244e-05, "loss": 0.1125, "step": 12910 }, { "epoch": 36.730632551528075, "grad_norm": 8.643569946289062, "learning_rate": 1.4656536878759103e-05, "loss": 0.1279, "step": 12920 }, { "epoch": 36.7590618336887, "grad_norm": 5.067727088928223, "learning_rate": 1.4624881291547959e-05, "loss": 0.1173, "step": 12930 }, { "epoch": 36.78749111584933, "grad_norm": 7.445854663848877, "learning_rate": 1.4593225704336816e-05, "loss": 0.1279, "step": 12940 }, { "epoch": 36.81592039800995, "grad_norm": 6.321052074432373, "learning_rate": 1.4561570117125672e-05, "loss": 0.1167, "step": 12950 }, { "epoch": 36.84434968017057, "grad_norm": 5.235776424407959, "learning_rate": 1.4529914529914531e-05, "loss": 0.1163, "step": 12960 }, { "epoch": 36.8727789623312, "grad_norm": 7.57999849319458, "learning_rate": 1.4498258942703389e-05, "loss": 0.1238, "step": 12970 }, { "epoch": 36.901208244491826, "grad_norm": 7.342813491821289, "learning_rate": 1.4466603355492245e-05, "loss": 0.1209, "step": 12980 }, { "epoch": 36.92963752665245, "grad_norm": 12.947957992553711, "learning_rate": 1.44349477682811e-05, "loss": 0.1213, "step": 12990 }, { "epoch": 36.95806680881308, "grad_norm": 9.660155296325684, "learning_rate": 1.440329218106996e-05, "loss": 0.126, "step": 13000 }, { "epoch": 36.9864960909737, "grad_norm": 8.425676345825195, "learning_rate": 1.4371636593858818e-05, "loss": 0.1243, "step": 13010 }, { "epoch": 36.997867803837956, "eval_accuracy": 0.7988, "eval_loss": 0.09931226819753647, "eval_runtime": 13.4661, "eval_samples_per_second": 371.303, "eval_steps_per_second": 11.659, "step": 13014 }, { "epoch": 37.014925373134325, "grad_norm": 8.293249130249023, "learning_rate": 1.4339981006647673e-05, "loss": 0.1233, "step": 13020 }, { "epoch": 37.043354655294955, "grad_norm": 12.379276275634766, "learning_rate": 1.4308325419436531e-05, "loss": 0.1223, "step": 13030 }, { "epoch": 37.07178393745558, "grad_norm": 5.4258880615234375, "learning_rate": 1.427666983222539e-05, "loss": 0.1214, "step": 13040 }, { "epoch": 37.10021321961621, "grad_norm": 9.708759307861328, "learning_rate": 1.4245014245014246e-05, "loss": 0.1291, "step": 13050 }, { "epoch": 37.12864250177683, "grad_norm": 8.741883277893066, "learning_rate": 1.4213358657803102e-05, "loss": 0.1222, "step": 13060 }, { "epoch": 37.157071783937454, "grad_norm": 13.334722518920898, "learning_rate": 1.418170307059196e-05, "loss": 0.1252, "step": 13070 }, { "epoch": 37.185501066098084, "grad_norm": 8.051800727844238, "learning_rate": 1.4150047483380819e-05, "loss": 0.1187, "step": 13080 }, { "epoch": 37.21393034825871, "grad_norm": 8.892814636230469, "learning_rate": 1.4118391896169675e-05, "loss": 0.1226, "step": 13090 }, { "epoch": 37.24235963041933, "grad_norm": 10.279051780700684, "learning_rate": 1.408673630895853e-05, "loss": 0.1204, "step": 13100 }, { "epoch": 37.27078891257996, "grad_norm": 11.21853256225586, "learning_rate": 1.4055080721747388e-05, "loss": 0.1208, "step": 13110 }, { "epoch": 37.29921819474058, "grad_norm": 12.552212715148926, "learning_rate": 1.4023425134536247e-05, "loss": 0.1127, "step": 13120 }, { "epoch": 37.327647476901205, "grad_norm": 8.965018272399902, "learning_rate": 1.3991769547325103e-05, "loss": 0.1195, "step": 13130 }, { "epoch": 37.356076759061835, "grad_norm": 7.319093227386475, "learning_rate": 1.3960113960113961e-05, "loss": 0.121, "step": 13140 }, { "epoch": 37.38450604122246, "grad_norm": 7.837822437286377, "learning_rate": 1.3928458372902817e-05, "loss": 0.1283, "step": 13150 }, { "epoch": 37.41293532338308, "grad_norm": 11.349024772644043, "learning_rate": 1.3896802785691676e-05, "loss": 0.1159, "step": 13160 }, { "epoch": 37.44136460554371, "grad_norm": 7.202425479888916, "learning_rate": 1.3865147198480532e-05, "loss": 0.1224, "step": 13170 }, { "epoch": 37.469793887704334, "grad_norm": 6.7827653884887695, "learning_rate": 1.383349161126939e-05, "loss": 0.1177, "step": 13180 }, { "epoch": 37.498223169864964, "grad_norm": 8.464743614196777, "learning_rate": 1.3801836024058245e-05, "loss": 0.1226, "step": 13190 }, { "epoch": 37.52665245202559, "grad_norm": 7.994402885437012, "learning_rate": 1.3770180436847105e-05, "loss": 0.119, "step": 13200 }, { "epoch": 37.55508173418621, "grad_norm": 7.2415642738342285, "learning_rate": 1.3738524849635962e-05, "loss": 0.1148, "step": 13210 }, { "epoch": 37.58351101634684, "grad_norm": 8.475737571716309, "learning_rate": 1.3706869262424818e-05, "loss": 0.124, "step": 13220 }, { "epoch": 37.61194029850746, "grad_norm": 9.970819473266602, "learning_rate": 1.3675213675213677e-05, "loss": 0.1229, "step": 13230 }, { "epoch": 37.640369580668086, "grad_norm": 11.822175979614258, "learning_rate": 1.3643558088002533e-05, "loss": 0.1216, "step": 13240 }, { "epoch": 37.668798862828716, "grad_norm": 8.223102569580078, "learning_rate": 1.361190250079139e-05, "loss": 0.1216, "step": 13250 }, { "epoch": 37.69722814498934, "grad_norm": 9.171164512634277, "learning_rate": 1.3580246913580247e-05, "loss": 0.1211, "step": 13260 }, { "epoch": 37.72565742714996, "grad_norm": 8.415572166442871, "learning_rate": 1.3548591326369106e-05, "loss": 0.1254, "step": 13270 }, { "epoch": 37.75408670931059, "grad_norm": 8.263115882873535, "learning_rate": 1.3516935739157963e-05, "loss": 0.1175, "step": 13280 }, { "epoch": 37.782515991471215, "grad_norm": 7.080509662628174, "learning_rate": 1.348528015194682e-05, "loss": 0.1301, "step": 13290 }, { "epoch": 37.81094527363184, "grad_norm": 6.205903053283691, "learning_rate": 1.3453624564735675e-05, "loss": 0.1207, "step": 13300 }, { "epoch": 37.83937455579247, "grad_norm": 6.203110218048096, "learning_rate": 1.3421968977524534e-05, "loss": 0.1207, "step": 13310 }, { "epoch": 37.86780383795309, "grad_norm": 6.9992356300354, "learning_rate": 1.3390313390313392e-05, "loss": 0.1201, "step": 13320 }, { "epoch": 37.89623312011372, "grad_norm": 6.464837074279785, "learning_rate": 1.3358657803102248e-05, "loss": 0.1203, "step": 13330 }, { "epoch": 37.92466240227434, "grad_norm": 7.350111961364746, "learning_rate": 1.3327002215891104e-05, "loss": 0.1192, "step": 13340 }, { "epoch": 37.953091684434966, "grad_norm": 11.795348167419434, "learning_rate": 1.3295346628679963e-05, "loss": 0.1154, "step": 13350 }, { "epoch": 37.981520966595596, "grad_norm": 10.910676956176758, "learning_rate": 1.326369104146882e-05, "loss": 0.1194, "step": 13360 }, { "epoch": 37.998578535891966, "eval_accuracy": 0.7986, "eval_loss": 0.10112451016902924, "eval_runtime": 13.4598, "eval_samples_per_second": 371.477, "eval_steps_per_second": 11.664, "step": 13366 }, { "epoch": 38.00995024875622, "grad_norm": 11.27439022064209, "learning_rate": 1.3232035454257677e-05, "loss": 0.1283, "step": 13370 }, { "epoch": 38.03837953091684, "grad_norm": 5.813992023468018, "learning_rate": 1.3200379867046534e-05, "loss": 0.1205, "step": 13380 }, { "epoch": 38.06680881307747, "grad_norm": 10.742948532104492, "learning_rate": 1.3168724279835393e-05, "loss": 0.1234, "step": 13390 }, { "epoch": 38.095238095238095, "grad_norm": 6.894353866577148, "learning_rate": 1.313706869262425e-05, "loss": 0.1129, "step": 13400 }, { "epoch": 38.12366737739872, "grad_norm": 9.801575660705566, "learning_rate": 1.3105413105413105e-05, "loss": 0.1287, "step": 13410 }, { "epoch": 38.15209665955935, "grad_norm": 9.128540992736816, "learning_rate": 1.3073757518201963e-05, "loss": 0.1222, "step": 13420 }, { "epoch": 38.18052594171997, "grad_norm": 6.271121978759766, "learning_rate": 1.3042101930990822e-05, "loss": 0.1203, "step": 13430 }, { "epoch": 38.208955223880594, "grad_norm": 8.044878005981445, "learning_rate": 1.3010446343779678e-05, "loss": 0.1162, "step": 13440 }, { "epoch": 38.237384506041224, "grad_norm": 8.245278358459473, "learning_rate": 1.2978790756568535e-05, "loss": 0.1173, "step": 13450 }, { "epoch": 38.26581378820185, "grad_norm": 6.552023410797119, "learning_rate": 1.2947135169357391e-05, "loss": 0.1085, "step": 13460 }, { "epoch": 38.29424307036248, "grad_norm": 7.950149059295654, "learning_rate": 1.291547958214625e-05, "loss": 0.1171, "step": 13470 }, { "epoch": 38.3226723525231, "grad_norm": 11.565644264221191, "learning_rate": 1.2883823994935106e-05, "loss": 0.119, "step": 13480 }, { "epoch": 38.35110163468372, "grad_norm": 11.562865257263184, "learning_rate": 1.2852168407723964e-05, "loss": 0.1158, "step": 13490 }, { "epoch": 38.37953091684435, "grad_norm": 7.859018802642822, "learning_rate": 1.282051282051282e-05, "loss": 0.1167, "step": 13500 }, { "epoch": 38.407960199004975, "grad_norm": 9.07667350769043, "learning_rate": 1.2788857233301679e-05, "loss": 0.1213, "step": 13510 }, { "epoch": 38.4363894811656, "grad_norm": 7.337536334991455, "learning_rate": 1.2757201646090535e-05, "loss": 0.1222, "step": 13520 }, { "epoch": 38.46481876332623, "grad_norm": 7.881592750549316, "learning_rate": 1.2725546058879393e-05, "loss": 0.1221, "step": 13530 }, { "epoch": 38.49324804548685, "grad_norm": 7.943172931671143, "learning_rate": 1.2693890471668248e-05, "loss": 0.125, "step": 13540 }, { "epoch": 38.521677327647474, "grad_norm": 12.858965873718262, "learning_rate": 1.2662234884457108e-05, "loss": 0.1176, "step": 13550 }, { "epoch": 38.550106609808104, "grad_norm": 7.728805065155029, "learning_rate": 1.2630579297245965e-05, "loss": 0.1169, "step": 13560 }, { "epoch": 38.57853589196873, "grad_norm": 10.744624137878418, "learning_rate": 1.2598923710034821e-05, "loss": 0.1253, "step": 13570 }, { "epoch": 38.60696517412935, "grad_norm": 7.839132308959961, "learning_rate": 1.2567268122823677e-05, "loss": 0.1196, "step": 13580 }, { "epoch": 38.63539445628998, "grad_norm": 11.8473482131958, "learning_rate": 1.2535612535612536e-05, "loss": 0.116, "step": 13590 }, { "epoch": 38.6638237384506, "grad_norm": 5.679298400878906, "learning_rate": 1.2503956948401394e-05, "loss": 0.1212, "step": 13600 }, { "epoch": 38.69225302061123, "grad_norm": 7.21807861328125, "learning_rate": 1.247230136119025e-05, "loss": 0.1293, "step": 13610 }, { "epoch": 38.720682302771856, "grad_norm": 6.463917255401611, "learning_rate": 1.2440645773979107e-05, "loss": 0.1246, "step": 13620 }, { "epoch": 38.74911158493248, "grad_norm": 4.381994247436523, "learning_rate": 1.2408990186767965e-05, "loss": 0.1164, "step": 13630 }, { "epoch": 38.77754086709311, "grad_norm": 8.917057991027832, "learning_rate": 1.2377334599556822e-05, "loss": 0.1246, "step": 13640 }, { "epoch": 38.80597014925373, "grad_norm": 9.278229713439941, "learning_rate": 1.2345679012345678e-05, "loss": 0.1262, "step": 13650 }, { "epoch": 38.834399431414354, "grad_norm": 8.86185359954834, "learning_rate": 1.2314023425134538e-05, "loss": 0.1139, "step": 13660 }, { "epoch": 38.862828713574984, "grad_norm": 8.304885864257812, "learning_rate": 1.2282367837923395e-05, "loss": 0.1224, "step": 13670 }, { "epoch": 38.89125799573561, "grad_norm": 10.463714599609375, "learning_rate": 1.2250712250712251e-05, "loss": 0.1156, "step": 13680 }, { "epoch": 38.91968727789623, "grad_norm": 7.358211517333984, "learning_rate": 1.2219056663501109e-05, "loss": 0.1145, "step": 13690 }, { "epoch": 38.94811656005686, "grad_norm": 8.692245483398438, "learning_rate": 1.2187401076289966e-05, "loss": 0.1177, "step": 13700 }, { "epoch": 38.97654584221748, "grad_norm": 6.572288513183594, "learning_rate": 1.2155745489078824e-05, "loss": 0.1213, "step": 13710 }, { "epoch": 38.99928926794598, "eval_accuracy": 0.8064, "eval_loss": 0.09586889296770096, "eval_runtime": 13.4281, "eval_samples_per_second": 372.353, "eval_steps_per_second": 11.692, "step": 13718 }, { "epoch": 39.004975124378106, "grad_norm": 5.5057291984558105, "learning_rate": 1.212408990186768e-05, "loss": 0.1167, "step": 13720 }, { "epoch": 39.033404406538736, "grad_norm": 6.4893012046813965, "learning_rate": 1.2092434314656539e-05, "loss": 0.1201, "step": 13730 }, { "epoch": 39.06183368869936, "grad_norm": 7.525362014770508, "learning_rate": 1.2060778727445395e-05, "loss": 0.1114, "step": 13740 }, { "epoch": 39.09026297085999, "grad_norm": 7.450962066650391, "learning_rate": 1.2029123140234252e-05, "loss": 0.1096, "step": 13750 }, { "epoch": 39.11869225302061, "grad_norm": 14.59123420715332, "learning_rate": 1.1997467553023108e-05, "loss": 0.1206, "step": 13760 }, { "epoch": 39.147121535181235, "grad_norm": 10.16324520111084, "learning_rate": 1.1965811965811967e-05, "loss": 0.1158, "step": 13770 }, { "epoch": 39.175550817341865, "grad_norm": 7.15183162689209, "learning_rate": 1.1934156378600823e-05, "loss": 0.1141, "step": 13780 }, { "epoch": 39.20398009950249, "grad_norm": 5.309065818786621, "learning_rate": 1.1902500791389681e-05, "loss": 0.1255, "step": 13790 }, { "epoch": 39.23240938166311, "grad_norm": 7.341120719909668, "learning_rate": 1.1870845204178538e-05, "loss": 0.1166, "step": 13800 }, { "epoch": 39.26083866382374, "grad_norm": 9.791176795959473, "learning_rate": 1.1839189616967396e-05, "loss": 0.1126, "step": 13810 }, { "epoch": 39.28926794598436, "grad_norm": 11.525360107421875, "learning_rate": 1.1807534029756252e-05, "loss": 0.1135, "step": 13820 }, { "epoch": 39.317697228144986, "grad_norm": 6.479952335357666, "learning_rate": 1.177587844254511e-05, "loss": 0.1246, "step": 13830 }, { "epoch": 39.346126510305616, "grad_norm": 8.19922161102295, "learning_rate": 1.1744222855333967e-05, "loss": 0.1161, "step": 13840 }, { "epoch": 39.37455579246624, "grad_norm": 8.969111442565918, "learning_rate": 1.1712567268122825e-05, "loss": 0.1179, "step": 13850 }, { "epoch": 39.40298507462686, "grad_norm": 10.68674087524414, "learning_rate": 1.168091168091168e-05, "loss": 0.1197, "step": 13860 }, { "epoch": 39.43141435678749, "grad_norm": 8.477005004882812, "learning_rate": 1.164925609370054e-05, "loss": 0.1286, "step": 13870 }, { "epoch": 39.459843638948115, "grad_norm": 8.531758308410645, "learning_rate": 1.1617600506489396e-05, "loss": 0.1179, "step": 13880 }, { "epoch": 39.488272921108745, "grad_norm": 8.02099609375, "learning_rate": 1.1585944919278253e-05, "loss": 0.1185, "step": 13890 }, { "epoch": 39.51670220326937, "grad_norm": 11.112560272216797, "learning_rate": 1.155428933206711e-05, "loss": 0.121, "step": 13900 }, { "epoch": 39.54513148542999, "grad_norm": 7.824585914611816, "learning_rate": 1.1522633744855968e-05, "loss": 0.122, "step": 13910 }, { "epoch": 39.57356076759062, "grad_norm": 11.019662857055664, "learning_rate": 1.1490978157644824e-05, "loss": 0.1141, "step": 13920 }, { "epoch": 39.601990049751244, "grad_norm": 8.420326232910156, "learning_rate": 1.1459322570433682e-05, "loss": 0.1208, "step": 13930 }, { "epoch": 39.63041933191187, "grad_norm": 7.602954864501953, "learning_rate": 1.142766698322254e-05, "loss": 0.1133, "step": 13940 }, { "epoch": 39.6588486140725, "grad_norm": 9.360836029052734, "learning_rate": 1.1396011396011397e-05, "loss": 0.1199, "step": 13950 }, { "epoch": 39.68727789623312, "grad_norm": 6.975888252258301, "learning_rate": 1.1364355808800253e-05, "loss": 0.1115, "step": 13960 }, { "epoch": 39.71570717839374, "grad_norm": 10.268877029418945, "learning_rate": 1.1332700221589112e-05, "loss": 0.1285, "step": 13970 }, { "epoch": 39.74413646055437, "grad_norm": 6.8778839111328125, "learning_rate": 1.1301044634377968e-05, "loss": 0.1166, "step": 13980 }, { "epoch": 39.772565742714995, "grad_norm": 6.247370719909668, "learning_rate": 1.1269389047166825e-05, "loss": 0.115, "step": 13990 }, { "epoch": 39.80099502487562, "grad_norm": 7.199132919311523, "learning_rate": 1.1237733459955681e-05, "loss": 0.1197, "step": 14000 }, { "epoch": 39.82942430703625, "grad_norm": 6.758470058441162, "learning_rate": 1.120607787274454e-05, "loss": 0.1161, "step": 14010 }, { "epoch": 39.85785358919687, "grad_norm": 7.161981582641602, "learning_rate": 1.1174422285533397e-05, "loss": 0.1167, "step": 14020 }, { "epoch": 39.8862828713575, "grad_norm": 8.363533020019531, "learning_rate": 1.1142766698322254e-05, "loss": 0.113, "step": 14030 }, { "epoch": 39.914712153518124, "grad_norm": 7.314785480499268, "learning_rate": 1.1111111111111112e-05, "loss": 0.113, "step": 14040 }, { "epoch": 39.94314143567875, "grad_norm": 7.622596740722656, "learning_rate": 1.107945552389997e-05, "loss": 0.1124, "step": 14050 }, { "epoch": 39.97157071783938, "grad_norm": 9.534564971923828, "learning_rate": 1.1047799936688825e-05, "loss": 0.1139, "step": 14060 }, { "epoch": 40.0, "grad_norm": 11.228628158569336, "learning_rate": 1.1016144349477683e-05, "loss": 0.1155, "step": 14070 }, { "epoch": 40.0, "eval_accuracy": 0.8108, "eval_loss": 0.0941707044839859, "eval_runtime": 13.3833, "eval_samples_per_second": 373.599, "eval_steps_per_second": 11.731, "step": 14070 }, { "epoch": 40.02842928216062, "grad_norm": 6.5851640701293945, "learning_rate": 1.098448876226654e-05, "loss": 0.1137, "step": 14080 }, { "epoch": 40.05685856432125, "grad_norm": 8.729216575622559, "learning_rate": 1.0952833175055398e-05, "loss": 0.1182, "step": 14090 }, { "epoch": 40.085287846481876, "grad_norm": 8.950576782226562, "learning_rate": 1.0921177587844255e-05, "loss": 0.1158, "step": 14100 }, { "epoch": 40.1137171286425, "grad_norm": 15.473535537719727, "learning_rate": 1.0889522000633113e-05, "loss": 0.1224, "step": 14110 }, { "epoch": 40.14214641080313, "grad_norm": 9.393978118896484, "learning_rate": 1.085786641342197e-05, "loss": 0.1146, "step": 14120 }, { "epoch": 40.17057569296375, "grad_norm": 7.201521396636963, "learning_rate": 1.0826210826210826e-05, "loss": 0.1151, "step": 14130 }, { "epoch": 40.19900497512438, "grad_norm": 8.841422080993652, "learning_rate": 1.0794555238999684e-05, "loss": 0.1214, "step": 14140 }, { "epoch": 40.227434257285005, "grad_norm": 7.663669109344482, "learning_rate": 1.0762899651788542e-05, "loss": 0.1073, "step": 14150 }, { "epoch": 40.25586353944563, "grad_norm": 8.959308624267578, "learning_rate": 1.0731244064577399e-05, "loss": 0.1174, "step": 14160 }, { "epoch": 40.28429282160626, "grad_norm": 7.097794055938721, "learning_rate": 1.0699588477366255e-05, "loss": 0.1221, "step": 14170 }, { "epoch": 40.31272210376688, "grad_norm": 6.630631923675537, "learning_rate": 1.0667932890155114e-05, "loss": 0.122, "step": 14180 }, { "epoch": 40.3411513859275, "grad_norm": 11.027847290039062, "learning_rate": 1.063627730294397e-05, "loss": 0.1217, "step": 14190 }, { "epoch": 40.36958066808813, "grad_norm": 10.981993675231934, "learning_rate": 1.0604621715732828e-05, "loss": 0.1205, "step": 14200 }, { "epoch": 40.398009950248756, "grad_norm": 6.021301746368408, "learning_rate": 1.0572966128521684e-05, "loss": 0.1212, "step": 14210 }, { "epoch": 40.42643923240938, "grad_norm": 8.385786056518555, "learning_rate": 1.0541310541310543e-05, "loss": 0.1105, "step": 14220 }, { "epoch": 40.45486851457001, "grad_norm": 7.228283882141113, "learning_rate": 1.0509654954099399e-05, "loss": 0.1144, "step": 14230 }, { "epoch": 40.48329779673063, "grad_norm": 7.585718154907227, "learning_rate": 1.0477999366888256e-05, "loss": 0.1119, "step": 14240 }, { "epoch": 40.511727078891255, "grad_norm": 6.874379634857178, "learning_rate": 1.0446343779677114e-05, "loss": 0.1164, "step": 14250 }, { "epoch": 40.540156361051885, "grad_norm": 10.106830596923828, "learning_rate": 1.0414688192465971e-05, "loss": 0.1096, "step": 14260 }, { "epoch": 40.56858564321251, "grad_norm": 7.33984375, "learning_rate": 1.0383032605254827e-05, "loss": 0.1109, "step": 14270 }, { "epoch": 40.59701492537313, "grad_norm": 7.386768817901611, "learning_rate": 1.0351377018043685e-05, "loss": 0.1211, "step": 14280 }, { "epoch": 40.62544420753376, "grad_norm": 9.43076229095459, "learning_rate": 1.0319721430832542e-05, "loss": 0.1143, "step": 14290 }, { "epoch": 40.653873489694384, "grad_norm": 7.514646053314209, "learning_rate": 1.02880658436214e-05, "loss": 0.1156, "step": 14300 }, { "epoch": 40.682302771855014, "grad_norm": 7.45674467086792, "learning_rate": 1.0256410256410256e-05, "loss": 0.1124, "step": 14310 }, { "epoch": 40.71073205401564, "grad_norm": 5.387953758239746, "learning_rate": 1.0224754669199115e-05, "loss": 0.1142, "step": 14320 }, { "epoch": 40.73916133617626, "grad_norm": 5.168889999389648, "learning_rate": 1.0193099081987971e-05, "loss": 0.1183, "step": 14330 }, { "epoch": 40.76759061833689, "grad_norm": 8.3053617477417, "learning_rate": 1.0161443494776829e-05, "loss": 0.1124, "step": 14340 }, { "epoch": 40.79601990049751, "grad_norm": 10.07886791229248, "learning_rate": 1.0129787907565686e-05, "loss": 0.1134, "step": 14350 }, { "epoch": 40.824449182658135, "grad_norm": 6.34689998626709, "learning_rate": 1.0098132320354544e-05, "loss": 0.12, "step": 14360 }, { "epoch": 40.852878464818765, "grad_norm": 8.08410930633545, "learning_rate": 1.00664767331434e-05, "loss": 0.1236, "step": 14370 }, { "epoch": 40.88130774697939, "grad_norm": 5.954514503479004, "learning_rate": 1.0034821145932257e-05, "loss": 0.1057, "step": 14380 }, { "epoch": 40.90973702914001, "grad_norm": 9.783620834350586, "learning_rate": 1.0003165558721115e-05, "loss": 0.1162, "step": 14390 }, { "epoch": 40.93816631130064, "grad_norm": 8.720817565917969, "learning_rate": 9.971509971509972e-06, "loss": 0.1131, "step": 14400 }, { "epoch": 40.966595593461264, "grad_norm": 9.102250099182129, "learning_rate": 9.939854384298828e-06, "loss": 0.1069, "step": 14410 }, { "epoch": 40.995024875621894, "grad_norm": 6.666236400604248, "learning_rate": 9.908198797087687e-06, "loss": 0.1179, "step": 14420 }, { "epoch": 40.997867803837956, "eval_accuracy": 0.8072, "eval_loss": 0.09497536718845367, "eval_runtime": 13.3651, "eval_samples_per_second": 374.109, "eval_steps_per_second": 11.747, "step": 14421 }, { "epoch": 41.02345415778252, "grad_norm": 9.263477325439453, "learning_rate": 9.876543209876543e-06, "loss": 0.1103, "step": 14430 }, { "epoch": 41.05188343994314, "grad_norm": 10.252467155456543, "learning_rate": 9.844887622665401e-06, "loss": 0.1216, "step": 14440 }, { "epoch": 41.08031272210377, "grad_norm": 10.800745964050293, "learning_rate": 9.813232035454257e-06, "loss": 0.1175, "step": 14450 }, { "epoch": 41.10874200426439, "grad_norm": 7.863544940948486, "learning_rate": 9.781576448243116e-06, "loss": 0.1179, "step": 14460 }, { "epoch": 41.137171286425016, "grad_norm": 6.894473075866699, "learning_rate": 9.749920861031972e-06, "loss": 0.11, "step": 14470 }, { "epoch": 41.165600568585646, "grad_norm": 7.9413981437683105, "learning_rate": 9.71826527382083e-06, "loss": 0.116, "step": 14480 }, { "epoch": 41.19402985074627, "grad_norm": 5.897985458374023, "learning_rate": 9.686609686609687e-06, "loss": 0.1073, "step": 14490 }, { "epoch": 41.22245913290689, "grad_norm": 8.500201225280762, "learning_rate": 9.654954099398545e-06, "loss": 0.1173, "step": 14500 }, { "epoch": 41.25088841506752, "grad_norm": 6.6483378410339355, "learning_rate": 9.6232985121874e-06, "loss": 0.1119, "step": 14510 }, { "epoch": 41.279317697228144, "grad_norm": 6.9826226234436035, "learning_rate": 9.591642924976258e-06, "loss": 0.1206, "step": 14520 }, { "epoch": 41.30774697938877, "grad_norm": 8.78371524810791, "learning_rate": 9.559987337765117e-06, "loss": 0.1163, "step": 14530 }, { "epoch": 41.3361762615494, "grad_norm": 8.918118476867676, "learning_rate": 9.528331750553973e-06, "loss": 0.1102, "step": 14540 }, { "epoch": 41.36460554371002, "grad_norm": 14.389333724975586, "learning_rate": 9.49667616334283e-06, "loss": 0.1198, "step": 14550 }, { "epoch": 41.39303482587065, "grad_norm": 6.26442289352417, "learning_rate": 9.465020576131688e-06, "loss": 0.1218, "step": 14560 }, { "epoch": 41.42146410803127, "grad_norm": 7.8187456130981445, "learning_rate": 9.433364988920546e-06, "loss": 0.1273, "step": 14570 }, { "epoch": 41.449893390191896, "grad_norm": 10.892502784729004, "learning_rate": 9.401709401709402e-06, "loss": 0.1133, "step": 14580 }, { "epoch": 41.478322672352526, "grad_norm": 7.710583209991455, "learning_rate": 9.37005381449826e-06, "loss": 0.1161, "step": 14590 }, { "epoch": 41.50675195451315, "grad_norm": 7.214125633239746, "learning_rate": 9.338398227287117e-06, "loss": 0.1215, "step": 14600 }, { "epoch": 41.53518123667377, "grad_norm": 7.554646015167236, "learning_rate": 9.306742640075974e-06, "loss": 0.1214, "step": 14610 }, { "epoch": 41.5636105188344, "grad_norm": 6.459896087646484, "learning_rate": 9.27508705286483e-06, "loss": 0.1172, "step": 14620 }, { "epoch": 41.592039800995025, "grad_norm": 7.760246753692627, "learning_rate": 9.24343146565369e-06, "loss": 0.1152, "step": 14630 }, { "epoch": 41.62046908315565, "grad_norm": 9.48446273803711, "learning_rate": 9.211775878442545e-06, "loss": 0.1161, "step": 14640 }, { "epoch": 41.64889836531628, "grad_norm": 6.437201499938965, "learning_rate": 9.180120291231403e-06, "loss": 0.115, "step": 14650 }, { "epoch": 41.6773276474769, "grad_norm": 10.522683143615723, "learning_rate": 9.14846470402026e-06, "loss": 0.1199, "step": 14660 }, { "epoch": 41.70575692963752, "grad_norm": 8.922218322753906, "learning_rate": 9.116809116809118e-06, "loss": 0.1139, "step": 14670 }, { "epoch": 41.73418621179815, "grad_norm": 4.189915657043457, "learning_rate": 9.085153529597974e-06, "loss": 0.112, "step": 14680 }, { "epoch": 41.762615493958776, "grad_norm": 11.384885787963867, "learning_rate": 9.053497942386832e-06, "loss": 0.1121, "step": 14690 }, { "epoch": 41.791044776119406, "grad_norm": 9.044368743896484, "learning_rate": 9.02184235517569e-06, "loss": 0.1136, "step": 14700 }, { "epoch": 41.81947405828003, "grad_norm": 4.6811203956604, "learning_rate": 8.990186767964547e-06, "loss": 0.1134, "step": 14710 }, { "epoch": 41.84790334044065, "grad_norm": 9.034210205078125, "learning_rate": 8.958531180753403e-06, "loss": 0.1184, "step": 14720 }, { "epoch": 41.87633262260128, "grad_norm": 8.152844429016113, "learning_rate": 8.92687559354226e-06, "loss": 0.1172, "step": 14730 }, { "epoch": 41.904761904761905, "grad_norm": 6.419579029083252, "learning_rate": 8.895220006331118e-06, "loss": 0.1175, "step": 14740 }, { "epoch": 41.93319118692253, "grad_norm": 5.592881679534912, "learning_rate": 8.863564419119975e-06, "loss": 0.1077, "step": 14750 }, { "epoch": 41.96162046908316, "grad_norm": 9.540054321289062, "learning_rate": 8.831908831908831e-06, "loss": 0.1144, "step": 14760 }, { "epoch": 41.99004975124378, "grad_norm": 7.7718610763549805, "learning_rate": 8.80025324469769e-06, "loss": 0.1057, "step": 14770 }, { "epoch": 41.998578535891966, "eval_accuracy": 0.8166, "eval_loss": 0.0924171730875969, "eval_runtime": 13.3955, "eval_samples_per_second": 373.259, "eval_steps_per_second": 11.72, "step": 14773 }, { "epoch": 42.018479033404404, "grad_norm": 9.009836196899414, "learning_rate": 8.768597657486546e-06, "loss": 0.1099, "step": 14780 }, { "epoch": 42.046908315565034, "grad_norm": 10.051161766052246, "learning_rate": 8.736942070275404e-06, "loss": 0.1081, "step": 14790 }, { "epoch": 42.07533759772566, "grad_norm": 6.737943649291992, "learning_rate": 8.705286483064262e-06, "loss": 0.1127, "step": 14800 }, { "epoch": 42.10376687988628, "grad_norm": 7.190810680389404, "learning_rate": 8.673630895853119e-06, "loss": 0.1155, "step": 14810 }, { "epoch": 42.13219616204691, "grad_norm": 11.324483871459961, "learning_rate": 8.641975308641975e-06, "loss": 0.1147, "step": 14820 }, { "epoch": 42.16062544420753, "grad_norm": 6.89904260635376, "learning_rate": 8.610319721430833e-06, "loss": 0.1208, "step": 14830 }, { "epoch": 42.18905472636816, "grad_norm": 8.286101341247559, "learning_rate": 8.57866413421969e-06, "loss": 0.1132, "step": 14840 }, { "epoch": 42.217484008528785, "grad_norm": 8.320545196533203, "learning_rate": 8.547008547008548e-06, "loss": 0.1254, "step": 14850 }, { "epoch": 42.24591329068941, "grad_norm": 11.75759220123291, "learning_rate": 8.515352959797404e-06, "loss": 0.1186, "step": 14860 }, { "epoch": 42.27434257285004, "grad_norm": 7.68834924697876, "learning_rate": 8.483697372586263e-06, "loss": 0.1159, "step": 14870 }, { "epoch": 42.30277185501066, "grad_norm": 11.214583396911621, "learning_rate": 8.452041785375119e-06, "loss": 0.1043, "step": 14880 }, { "epoch": 42.331201137171284, "grad_norm": 17.57444953918457, "learning_rate": 8.420386198163976e-06, "loss": 0.1115, "step": 14890 }, { "epoch": 42.359630419331914, "grad_norm": 9.51223373413086, "learning_rate": 8.388730610952832e-06, "loss": 0.1055, "step": 14900 }, { "epoch": 42.38805970149254, "grad_norm": 7.176620006561279, "learning_rate": 8.357075023741691e-06, "loss": 0.1194, "step": 14910 }, { "epoch": 42.41648898365316, "grad_norm": 7.583780765533447, "learning_rate": 8.325419436530547e-06, "loss": 0.1137, "step": 14920 }, { "epoch": 42.44491826581379, "grad_norm": 7.420557022094727, "learning_rate": 8.293763849319405e-06, "loss": 0.1131, "step": 14930 }, { "epoch": 42.47334754797441, "grad_norm": 6.603710174560547, "learning_rate": 8.262108262108262e-06, "loss": 0.1195, "step": 14940 }, { "epoch": 42.501776830135036, "grad_norm": 6.232205390930176, "learning_rate": 8.23045267489712e-06, "loss": 0.1092, "step": 14950 }, { "epoch": 42.530206112295666, "grad_norm": 7.759372234344482, "learning_rate": 8.198797087685978e-06, "loss": 0.1192, "step": 14960 }, { "epoch": 42.55863539445629, "grad_norm": 6.483234882354736, "learning_rate": 8.167141500474833e-06, "loss": 0.1133, "step": 14970 }, { "epoch": 42.58706467661692, "grad_norm": 6.6437859535217285, "learning_rate": 8.135485913263693e-06, "loss": 0.1196, "step": 14980 }, { "epoch": 42.61549395877754, "grad_norm": 9.488911628723145, "learning_rate": 8.103830326052549e-06, "loss": 0.1117, "step": 14990 }, { "epoch": 42.643923240938165, "grad_norm": 9.877264976501465, "learning_rate": 8.072174738841406e-06, "loss": 0.1129, "step": 15000 }, { "epoch": 42.672352523098795, "grad_norm": 7.255493640899658, "learning_rate": 8.040519151630264e-06, "loss": 0.1142, "step": 15010 }, { "epoch": 42.70078180525942, "grad_norm": 7.835541725158691, "learning_rate": 8.008863564419121e-06, "loss": 0.1128, "step": 15020 }, { "epoch": 42.72921108742004, "grad_norm": 7.54576301574707, "learning_rate": 7.977207977207977e-06, "loss": 0.1146, "step": 15030 }, { "epoch": 42.75764036958067, "grad_norm": 8.6882905960083, "learning_rate": 7.945552389996835e-06, "loss": 0.1122, "step": 15040 }, { "epoch": 42.78606965174129, "grad_norm": 8.508428573608398, "learning_rate": 7.913896802785692e-06, "loss": 0.109, "step": 15050 }, { "epoch": 42.814498933901916, "grad_norm": 6.899832248687744, "learning_rate": 7.88224121557455e-06, "loss": 0.11, "step": 15060 }, { "epoch": 42.842928216062546, "grad_norm": 8.702173233032227, "learning_rate": 7.850585628363406e-06, "loss": 0.1162, "step": 15070 }, { "epoch": 42.87135749822317, "grad_norm": 9.671116828918457, "learning_rate": 7.818930041152265e-06, "loss": 0.1175, "step": 15080 }, { "epoch": 42.89978678038379, "grad_norm": 8.765652656555176, "learning_rate": 7.787274453941121e-06, "loss": 0.1061, "step": 15090 }, { "epoch": 42.92821606254442, "grad_norm": 8.824734687805176, "learning_rate": 7.755618866729978e-06, "loss": 0.1153, "step": 15100 }, { "epoch": 42.956645344705045, "grad_norm": 7.316644191741943, "learning_rate": 7.723963279518836e-06, "loss": 0.1177, "step": 15110 }, { "epoch": 42.985074626865675, "grad_norm": 5.55697774887085, "learning_rate": 7.692307692307694e-06, "loss": 0.1042, "step": 15120 }, { "epoch": 42.99928926794598, "eval_accuracy": 0.8152, "eval_loss": 0.09235040098428726, "eval_runtime": 13.4568, "eval_samples_per_second": 371.56, "eval_steps_per_second": 11.667, "step": 15125 }, { "epoch": 43.0135039090263, "grad_norm": 11.253617286682129, "learning_rate": 7.66065210509655e-06, "loss": 0.1153, "step": 15130 }, { "epoch": 43.04193319118692, "grad_norm": 9.907164573669434, "learning_rate": 7.628996517885408e-06, "loss": 0.1122, "step": 15140 }, { "epoch": 43.07036247334755, "grad_norm": 6.975913047790527, "learning_rate": 7.597340930674264e-06, "loss": 0.1169, "step": 15150 }, { "epoch": 43.098791755508174, "grad_norm": 7.245847702026367, "learning_rate": 7.565685343463122e-06, "loss": 0.1087, "step": 15160 }, { "epoch": 43.1272210376688, "grad_norm": 7.689225196838379, "learning_rate": 7.534029756251979e-06, "loss": 0.1126, "step": 15170 }, { "epoch": 43.15565031982943, "grad_norm": 10.346821784973145, "learning_rate": 7.5023741690408365e-06, "loss": 0.1077, "step": 15180 }, { "epoch": 43.18407960199005, "grad_norm": 9.350476264953613, "learning_rate": 7.470718581829693e-06, "loss": 0.1065, "step": 15190 }, { "epoch": 43.21250888415067, "grad_norm": 8.799906730651855, "learning_rate": 7.439062994618551e-06, "loss": 0.1095, "step": 15200 }, { "epoch": 43.2409381663113, "grad_norm": 6.088229656219482, "learning_rate": 7.4074074074074075e-06, "loss": 0.1107, "step": 15210 }, { "epoch": 43.269367448471925, "grad_norm": 9.523571014404297, "learning_rate": 7.375751820196265e-06, "loss": 0.1107, "step": 15220 }, { "epoch": 43.29779673063255, "grad_norm": 7.971775531768799, "learning_rate": 7.344096232985122e-06, "loss": 0.1145, "step": 15230 }, { "epoch": 43.32622601279318, "grad_norm": 8.00693416595459, "learning_rate": 7.312440645773979e-06, "loss": 0.1136, "step": 15240 }, { "epoch": 43.3546552949538, "grad_norm": 10.808344841003418, "learning_rate": 7.280785058562836e-06, "loss": 0.1188, "step": 15250 }, { "epoch": 43.38308457711443, "grad_norm": 8.63160228729248, "learning_rate": 7.2491294713516945e-06, "loss": 0.1166, "step": 15260 }, { "epoch": 43.411513859275054, "grad_norm": 5.847464084625244, "learning_rate": 7.21747388414055e-06, "loss": 0.1069, "step": 15270 }, { "epoch": 43.43994314143568, "grad_norm": 6.7100677490234375, "learning_rate": 7.185818296929409e-06, "loss": 0.1118, "step": 15280 }, { "epoch": 43.46837242359631, "grad_norm": 7.616508483886719, "learning_rate": 7.1541627097182655e-06, "loss": 0.113, "step": 15290 }, { "epoch": 43.49680170575693, "grad_norm": 5.855709552764893, "learning_rate": 7.122507122507123e-06, "loss": 0.1078, "step": 15300 }, { "epoch": 43.52523098791755, "grad_norm": 8.829164505004883, "learning_rate": 7.09085153529598e-06, "loss": 0.1116, "step": 15310 }, { "epoch": 43.55366027007818, "grad_norm": 7.1706318855285645, "learning_rate": 7.059195948084837e-06, "loss": 0.1188, "step": 15320 }, { "epoch": 43.582089552238806, "grad_norm": 6.537577152252197, "learning_rate": 7.027540360873694e-06, "loss": 0.1077, "step": 15330 }, { "epoch": 43.61051883439943, "grad_norm": 7.335690021514893, "learning_rate": 6.995884773662552e-06, "loss": 0.1143, "step": 15340 }, { "epoch": 43.63894811656006, "grad_norm": 7.9423418045043945, "learning_rate": 6.964229186451408e-06, "loss": 0.1036, "step": 15350 }, { "epoch": 43.66737739872068, "grad_norm": 9.425837516784668, "learning_rate": 6.932573599240266e-06, "loss": 0.1066, "step": 15360 }, { "epoch": 43.695806680881304, "grad_norm": 6.1898722648620605, "learning_rate": 6.900918012029123e-06, "loss": 0.1164, "step": 15370 }, { "epoch": 43.724235963041934, "grad_norm": 8.25695514678955, "learning_rate": 6.869262424817981e-06, "loss": 0.1091, "step": 15380 }, { "epoch": 43.75266524520256, "grad_norm": 9.368672370910645, "learning_rate": 6.837606837606839e-06, "loss": 0.1124, "step": 15390 }, { "epoch": 43.78109452736319, "grad_norm": 10.280566215515137, "learning_rate": 6.805951250395695e-06, "loss": 0.1092, "step": 15400 }, { "epoch": 43.80952380952381, "grad_norm": 8.319000244140625, "learning_rate": 6.774295663184553e-06, "loss": 0.1151, "step": 15410 }, { "epoch": 43.83795309168443, "grad_norm": 5.234455585479736, "learning_rate": 6.74264007597341e-06, "loss": 0.1114, "step": 15420 }, { "epoch": 43.86638237384506, "grad_norm": 7.575740814208984, "learning_rate": 6.710984488762267e-06, "loss": 0.1097, "step": 15430 }, { "epoch": 43.894811656005686, "grad_norm": 8.142337799072266, "learning_rate": 6.679328901551124e-06, "loss": 0.1036, "step": 15440 }, { "epoch": 43.92324093816631, "grad_norm": 7.75861930847168, "learning_rate": 6.6476733143399815e-06, "loss": 0.113, "step": 15450 }, { "epoch": 43.95167022032694, "grad_norm": 10.706192970275879, "learning_rate": 6.616017727128838e-06, "loss": 0.1169, "step": 15460 }, { "epoch": 43.98009950248756, "grad_norm": 9.635318756103516, "learning_rate": 6.584362139917697e-06, "loss": 0.1151, "step": 15470 }, { "epoch": 44.0, "eval_accuracy": 0.8132, "eval_loss": 0.09280282258987427, "eval_runtime": 13.5091, "eval_samples_per_second": 370.12, "eval_steps_per_second": 11.622, "step": 15477 }, { "epoch": 44.008528784648185, "grad_norm": 4.650729656219482, "learning_rate": 6.5527065527065525e-06, "loss": 0.1074, "step": 15480 }, { "epoch": 44.036958066808815, "grad_norm": 9.15291690826416, "learning_rate": 6.521050965495411e-06, "loss": 0.1121, "step": 15490 }, { "epoch": 44.06538734896944, "grad_norm": 8.391106605529785, "learning_rate": 6.489395378284268e-06, "loss": 0.1153, "step": 15500 }, { "epoch": 44.09381663113006, "grad_norm": 7.03884744644165, "learning_rate": 6.457739791073125e-06, "loss": 0.1079, "step": 15510 }, { "epoch": 44.12224591329069, "grad_norm": 6.90254020690918, "learning_rate": 6.426084203861982e-06, "loss": 0.1057, "step": 15520 }, { "epoch": 44.15067519545131, "grad_norm": 8.857616424560547, "learning_rate": 6.3944286166508395e-06, "loss": 0.117, "step": 15530 }, { "epoch": 44.17910447761194, "grad_norm": 7.565794944763184, "learning_rate": 6.362773029439696e-06, "loss": 0.1172, "step": 15540 }, { "epoch": 44.207533759772566, "grad_norm": 11.730203628540039, "learning_rate": 6.331117442228554e-06, "loss": 0.1122, "step": 15550 }, { "epoch": 44.23596304193319, "grad_norm": 8.397017478942871, "learning_rate": 6.2994618550174106e-06, "loss": 0.1137, "step": 15560 }, { "epoch": 44.26439232409382, "grad_norm": 9.356100082397461, "learning_rate": 6.267806267806268e-06, "loss": 0.1054, "step": 15570 }, { "epoch": 44.29282160625444, "grad_norm": 8.829228401184082, "learning_rate": 6.236150680595125e-06, "loss": 0.1086, "step": 15580 }, { "epoch": 44.321250888415065, "grad_norm": 9.162749290466309, "learning_rate": 6.204495093383982e-06, "loss": 0.1142, "step": 15590 }, { "epoch": 44.349680170575695, "grad_norm": 8.511028289794922, "learning_rate": 6.172839506172839e-06, "loss": 0.1107, "step": 15600 }, { "epoch": 44.37810945273632, "grad_norm": 17.1426944732666, "learning_rate": 6.1411839189616976e-06, "loss": 0.1064, "step": 15610 }, { "epoch": 44.40653873489694, "grad_norm": 4.927452564239502, "learning_rate": 6.109528331750554e-06, "loss": 0.1155, "step": 15620 }, { "epoch": 44.43496801705757, "grad_norm": 6.737484455108643, "learning_rate": 6.077872744539412e-06, "loss": 0.1115, "step": 15630 }, { "epoch": 44.463397299218194, "grad_norm": 7.3624982833862305, "learning_rate": 6.046217157328269e-06, "loss": 0.108, "step": 15640 }, { "epoch": 44.49182658137882, "grad_norm": 7.903335094451904, "learning_rate": 6.014561570117126e-06, "loss": 0.1064, "step": 15650 }, { "epoch": 44.52025586353945, "grad_norm": 10.368175506591797, "learning_rate": 5.982905982905984e-06, "loss": 0.1159, "step": 15660 }, { "epoch": 44.54868514570007, "grad_norm": 8.306153297424316, "learning_rate": 5.9512503956948404e-06, "loss": 0.1093, "step": 15670 }, { "epoch": 44.5771144278607, "grad_norm": 7.658620357513428, "learning_rate": 5.919594808483698e-06, "loss": 0.1188, "step": 15680 }, { "epoch": 44.60554371002132, "grad_norm": 6.1418375968933105, "learning_rate": 5.887939221272555e-06, "loss": 0.1095, "step": 15690 }, { "epoch": 44.633972992181945, "grad_norm": 8.000725746154785, "learning_rate": 5.856283634061412e-06, "loss": 0.1122, "step": 15700 }, { "epoch": 44.662402274342575, "grad_norm": 11.74553394317627, "learning_rate": 5.82462804685027e-06, "loss": 0.1072, "step": 15710 }, { "epoch": 44.6908315565032, "grad_norm": 6.933135509490967, "learning_rate": 5.792972459639127e-06, "loss": 0.1164, "step": 15720 }, { "epoch": 44.71926083866382, "grad_norm": 6.430202007293701, "learning_rate": 5.761316872427984e-06, "loss": 0.1162, "step": 15730 }, { "epoch": 44.74769012082445, "grad_norm": 9.141294479370117, "learning_rate": 5.729661285216841e-06, "loss": 0.1068, "step": 15740 }, { "epoch": 44.776119402985074, "grad_norm": 9.290877342224121, "learning_rate": 5.6980056980056985e-06, "loss": 0.1178, "step": 15750 }, { "epoch": 44.8045486851457, "grad_norm": 6.567460060119629, "learning_rate": 5.666350110794556e-06, "loss": 0.1083, "step": 15760 }, { "epoch": 44.83297796730633, "grad_norm": 13.561402320861816, "learning_rate": 5.634694523583413e-06, "loss": 0.1126, "step": 15770 }, { "epoch": 44.86140724946695, "grad_norm": 6.169399738311768, "learning_rate": 5.60303893637227e-06, "loss": 0.1071, "step": 15780 }, { "epoch": 44.88983653162757, "grad_norm": 6.180117607116699, "learning_rate": 5.571383349161127e-06, "loss": 0.1036, "step": 15790 }, { "epoch": 44.9182658137882, "grad_norm": 5.523869514465332, "learning_rate": 5.539727761949985e-06, "loss": 0.1037, "step": 15800 }, { "epoch": 44.946695095948826, "grad_norm": 7.544118881225586, "learning_rate": 5.508072174738841e-06, "loss": 0.1094, "step": 15810 }, { "epoch": 44.975124378109456, "grad_norm": 7.1479387283325195, "learning_rate": 5.476416587527699e-06, "loss": 0.1122, "step": 15820 }, { "epoch": 44.997867803837956, "eval_accuracy": 0.8146, "eval_loss": 0.09199415147304535, "eval_runtime": 13.3661, "eval_samples_per_second": 374.081, "eval_steps_per_second": 11.746, "step": 15828 }, { "epoch": 45.00355366027008, "grad_norm": 6.648639678955078, "learning_rate": 5.4447610003165565e-06, "loss": 0.1063, "step": 15830 }, { "epoch": 45.0319829424307, "grad_norm": 9.599860191345215, "learning_rate": 5.413105413105413e-06, "loss": 0.1077, "step": 15840 }, { "epoch": 45.06041222459133, "grad_norm": 8.63837718963623, "learning_rate": 5.381449825894271e-06, "loss": 0.1121, "step": 15850 }, { "epoch": 45.088841506751955, "grad_norm": 6.7223334312438965, "learning_rate": 5.3497942386831275e-06, "loss": 0.1097, "step": 15860 }, { "epoch": 45.11727078891258, "grad_norm": 5.2021942138671875, "learning_rate": 5.318138651471985e-06, "loss": 0.1155, "step": 15870 }, { "epoch": 45.14570007107321, "grad_norm": 6.177606105804443, "learning_rate": 5.286483064260842e-06, "loss": 0.1111, "step": 15880 }, { "epoch": 45.17412935323383, "grad_norm": 8.523933410644531, "learning_rate": 5.254827477049699e-06, "loss": 0.1099, "step": 15890 }, { "epoch": 45.20255863539445, "grad_norm": 9.170498847961426, "learning_rate": 5.223171889838557e-06, "loss": 0.1093, "step": 15900 }, { "epoch": 45.23098791755508, "grad_norm": 10.456220626831055, "learning_rate": 5.191516302627414e-06, "loss": 0.1077, "step": 15910 }, { "epoch": 45.259417199715706, "grad_norm": 7.951976299285889, "learning_rate": 5.159860715416271e-06, "loss": 0.1068, "step": 15920 }, { "epoch": 45.28784648187633, "grad_norm": 11.156434059143066, "learning_rate": 5.128205128205128e-06, "loss": 0.1104, "step": 15930 }, { "epoch": 45.31627576403696, "grad_norm": 7.797679424285889, "learning_rate": 5.0965495409939855e-06, "loss": 0.1066, "step": 15940 }, { "epoch": 45.34470504619758, "grad_norm": 9.258740425109863, "learning_rate": 5.064893953782843e-06, "loss": 0.1052, "step": 15950 }, { "epoch": 45.37313432835821, "grad_norm": 10.555813789367676, "learning_rate": 5.0332383665717e-06, "loss": 0.1101, "step": 15960 }, { "epoch": 45.401563610518835, "grad_norm": 11.744818687438965, "learning_rate": 5.001582779360557e-06, "loss": 0.1115, "step": 15970 }, { "epoch": 45.42999289267946, "grad_norm": 8.023004531860352, "learning_rate": 4.969927192149414e-06, "loss": 0.1124, "step": 15980 }, { "epoch": 45.45842217484009, "grad_norm": 12.692828178405762, "learning_rate": 4.938271604938272e-06, "loss": 0.1136, "step": 15990 }, { "epoch": 45.48685145700071, "grad_norm": 11.077971458435059, "learning_rate": 4.906616017727128e-06, "loss": 0.1022, "step": 16000 }, { "epoch": 45.515280739161334, "grad_norm": 5.671931266784668, "learning_rate": 4.874960430515986e-06, "loss": 0.1099, "step": 16010 }, { "epoch": 45.543710021321964, "grad_norm": 10.317317962646484, "learning_rate": 4.8433048433048435e-06, "loss": 0.1151, "step": 16020 }, { "epoch": 45.57213930348259, "grad_norm": 6.596101760864258, "learning_rate": 4.8116492560937e-06, "loss": 0.1113, "step": 16030 }, { "epoch": 45.60056858564321, "grad_norm": 7.351507663726807, "learning_rate": 4.779993668882559e-06, "loss": 0.1065, "step": 16040 }, { "epoch": 45.62899786780384, "grad_norm": 7.493065357208252, "learning_rate": 4.748338081671415e-06, "loss": 0.1194, "step": 16050 }, { "epoch": 45.65742714996446, "grad_norm": 6.896857261657715, "learning_rate": 4.716682494460273e-06, "loss": 0.1053, "step": 16060 }, { "epoch": 45.68585643212509, "grad_norm": 8.050280570983887, "learning_rate": 4.68502690724913e-06, "loss": 0.1097, "step": 16070 }, { "epoch": 45.714285714285715, "grad_norm": 5.967408657073975, "learning_rate": 4.653371320037987e-06, "loss": 0.1178, "step": 16080 }, { "epoch": 45.74271499644634, "grad_norm": 7.249273300170898, "learning_rate": 4.621715732826845e-06, "loss": 0.1048, "step": 16090 }, { "epoch": 45.77114427860697, "grad_norm": 16.740827560424805, "learning_rate": 4.5900601456157015e-06, "loss": 0.1096, "step": 16100 }, { "epoch": 45.79957356076759, "grad_norm": 10.733427047729492, "learning_rate": 4.558404558404559e-06, "loss": 0.1113, "step": 16110 }, { "epoch": 45.828002842928214, "grad_norm": 9.221504211425781, "learning_rate": 4.526748971193416e-06, "loss": 0.1134, "step": 16120 }, { "epoch": 45.856432125088844, "grad_norm": 7.773632526397705, "learning_rate": 4.495093383982273e-06, "loss": 0.1095, "step": 16130 }, { "epoch": 45.88486140724947, "grad_norm": 10.043051719665527, "learning_rate": 4.46343779677113e-06, "loss": 0.111, "step": 16140 }, { "epoch": 45.91329068941009, "grad_norm": 9.371420860290527, "learning_rate": 4.431782209559988e-06, "loss": 0.1083, "step": 16150 }, { "epoch": 45.94171997157072, "grad_norm": 10.984370231628418, "learning_rate": 4.400126622348845e-06, "loss": 0.1087, "step": 16160 }, { "epoch": 45.97014925373134, "grad_norm": 7.1741943359375, "learning_rate": 4.368471035137702e-06, "loss": 0.1079, "step": 16170 }, { "epoch": 45.998578535891966, "grad_norm": 7.4503397941589355, "learning_rate": 4.3368154479265595e-06, "loss": 0.11, "step": 16180 }, { "epoch": 45.998578535891966, "eval_accuracy": 0.8152, "eval_loss": 0.0905674546957016, "eval_runtime": 13.2948, "eval_samples_per_second": 376.087, "eval_steps_per_second": 11.809, "step": 16180 }, { "epoch": 46.027007818052596, "grad_norm": 6.4230804443359375, "learning_rate": 4.305159860715416e-06, "loss": 0.1071, "step": 16190 }, { "epoch": 46.05543710021322, "grad_norm": 6.9457855224609375, "learning_rate": 4.273504273504274e-06, "loss": 0.107, "step": 16200 }, { "epoch": 46.08386638237385, "grad_norm": 14.256475448608398, "learning_rate": 4.241848686293131e-06, "loss": 0.1151, "step": 16210 }, { "epoch": 46.11229566453447, "grad_norm": 10.412154197692871, "learning_rate": 4.210193099081988e-06, "loss": 0.1164, "step": 16220 }, { "epoch": 46.140724946695094, "grad_norm": 8.378962516784668, "learning_rate": 4.178537511870846e-06, "loss": 0.1051, "step": 16230 }, { "epoch": 46.169154228855724, "grad_norm": 7.868841171264648, "learning_rate": 4.1468819246597024e-06, "loss": 0.1071, "step": 16240 }, { "epoch": 46.19758351101635, "grad_norm": 10.3138427734375, "learning_rate": 4.11522633744856e-06, "loss": 0.1155, "step": 16250 }, { "epoch": 46.22601279317697, "grad_norm": 7.106865882873535, "learning_rate": 4.083570750237417e-06, "loss": 0.1066, "step": 16260 }, { "epoch": 46.2544420753376, "grad_norm": 5.522883415222168, "learning_rate": 4.051915163026274e-06, "loss": 0.1046, "step": 16270 }, { "epoch": 46.28287135749822, "grad_norm": 8.359395027160645, "learning_rate": 4.020259575815132e-06, "loss": 0.1078, "step": 16280 }, { "epoch": 46.311300639658846, "grad_norm": 9.841255187988281, "learning_rate": 3.988603988603989e-06, "loss": 0.1049, "step": 16290 }, { "epoch": 46.339729921819476, "grad_norm": 6.074944972991943, "learning_rate": 3.956948401392846e-06, "loss": 0.1149, "step": 16300 }, { "epoch": 46.3681592039801, "grad_norm": 6.688263893127441, "learning_rate": 3.925292814181703e-06, "loss": 0.1161, "step": 16310 }, { "epoch": 46.39658848614072, "grad_norm": 12.920149803161621, "learning_rate": 3.8936372269705604e-06, "loss": 0.106, "step": 16320 }, { "epoch": 46.42501776830135, "grad_norm": 9.634818077087402, "learning_rate": 3.861981639759418e-06, "loss": 0.1116, "step": 16330 }, { "epoch": 46.453447050461975, "grad_norm": 8.39775276184082, "learning_rate": 3.830326052548275e-06, "loss": 0.1087, "step": 16340 }, { "epoch": 46.481876332622605, "grad_norm": 5.796605587005615, "learning_rate": 3.798670465337132e-06, "loss": 0.107, "step": 16350 }, { "epoch": 46.51030561478323, "grad_norm": 7.1116108894348145, "learning_rate": 3.7670148781259894e-06, "loss": 0.1144, "step": 16360 }, { "epoch": 46.53873489694385, "grad_norm": 6.552640914916992, "learning_rate": 3.7353592909148466e-06, "loss": 0.1128, "step": 16370 }, { "epoch": 46.56716417910448, "grad_norm": 8.284194946289062, "learning_rate": 3.7037037037037037e-06, "loss": 0.1132, "step": 16380 }, { "epoch": 46.5955934612651, "grad_norm": 8.98961067199707, "learning_rate": 3.672048116492561e-06, "loss": 0.1069, "step": 16390 }, { "epoch": 46.624022743425726, "grad_norm": 7.5320725440979, "learning_rate": 3.640392529281418e-06, "loss": 0.1101, "step": 16400 }, { "epoch": 46.652452025586356, "grad_norm": 12.350693702697754, "learning_rate": 3.608736942070275e-06, "loss": 0.1152, "step": 16410 }, { "epoch": 46.68088130774698, "grad_norm": 8.265623092651367, "learning_rate": 3.5770813548591327e-06, "loss": 0.1096, "step": 16420 }, { "epoch": 46.7093105899076, "grad_norm": 5.840739727020264, "learning_rate": 3.54542576764799e-06, "loss": 0.1046, "step": 16430 }, { "epoch": 46.73773987206823, "grad_norm": 7.326878547668457, "learning_rate": 3.513770180436847e-06, "loss": 0.1081, "step": 16440 }, { "epoch": 46.766169154228855, "grad_norm": 9.621678352355957, "learning_rate": 3.482114593225704e-06, "loss": 0.1019, "step": 16450 }, { "epoch": 46.79459843638948, "grad_norm": 9.967592239379883, "learning_rate": 3.4504590060145613e-06, "loss": 0.1043, "step": 16460 }, { "epoch": 46.82302771855011, "grad_norm": 7.968399524688721, "learning_rate": 3.4188034188034193e-06, "loss": 0.1112, "step": 16470 }, { "epoch": 46.85145700071073, "grad_norm": 9.030489921569824, "learning_rate": 3.3871478315922765e-06, "loss": 0.1016, "step": 16480 }, { "epoch": 46.87988628287136, "grad_norm": 8.156976699829102, "learning_rate": 3.3554922443811336e-06, "loss": 0.1041, "step": 16490 }, { "epoch": 46.908315565031984, "grad_norm": 7.740533828735352, "learning_rate": 3.3238366571699908e-06, "loss": 0.109, "step": 16500 }, { "epoch": 46.93674484719261, "grad_norm": 7.072854518890381, "learning_rate": 3.2921810699588483e-06, "loss": 0.1084, "step": 16510 }, { "epoch": 46.96517412935324, "grad_norm": 9.169633865356445, "learning_rate": 3.2605254827477055e-06, "loss": 0.112, "step": 16520 }, { "epoch": 46.99360341151386, "grad_norm": 11.556053161621094, "learning_rate": 3.2288698955365626e-06, "loss": 0.1096, "step": 16530 }, { "epoch": 46.99928926794598, "eval_accuracy": 0.82, "eval_loss": 0.08944196254014969, "eval_runtime": 13.3004, "eval_samples_per_second": 375.93, "eval_steps_per_second": 11.804, "step": 16532 }, { "epoch": 47.02203269367448, "grad_norm": 8.405094146728516, "learning_rate": 3.1972143083254198e-06, "loss": 0.1084, "step": 16540 }, { "epoch": 47.05046197583511, "grad_norm": 9.417608261108398, "learning_rate": 3.165558721114277e-06, "loss": 0.1064, "step": 16550 }, { "epoch": 47.078891257995735, "grad_norm": 8.933280944824219, "learning_rate": 3.133903133903134e-06, "loss": 0.1118, "step": 16560 }, { "epoch": 47.10732054015636, "grad_norm": 6.217892169952393, "learning_rate": 3.102247546691991e-06, "loss": 0.1102, "step": 16570 }, { "epoch": 47.13574982231699, "grad_norm": 7.039885520935059, "learning_rate": 3.0705919594808488e-06, "loss": 0.1103, "step": 16580 }, { "epoch": 47.16417910447761, "grad_norm": 12.953085899353027, "learning_rate": 3.038936372269706e-06, "loss": 0.1038, "step": 16590 }, { "epoch": 47.192608386638234, "grad_norm": 8.13263988494873, "learning_rate": 3.007280785058563e-06, "loss": 0.1131, "step": 16600 }, { "epoch": 47.221037668798864, "grad_norm": 8.566105842590332, "learning_rate": 2.9756251978474202e-06, "loss": 0.105, "step": 16610 }, { "epoch": 47.24946695095949, "grad_norm": 12.150867462158203, "learning_rate": 2.9439696106362774e-06, "loss": 0.1116, "step": 16620 }, { "epoch": 47.27789623312012, "grad_norm": 7.589836120605469, "learning_rate": 2.912314023425135e-06, "loss": 0.1058, "step": 16630 }, { "epoch": 47.30632551528074, "grad_norm": 8.349220275878906, "learning_rate": 2.880658436213992e-06, "loss": 0.1129, "step": 16640 }, { "epoch": 47.33475479744136, "grad_norm": 6.825118064880371, "learning_rate": 2.8490028490028492e-06, "loss": 0.1069, "step": 16650 }, { "epoch": 47.36318407960199, "grad_norm": 7.773575305938721, "learning_rate": 2.8173472617917064e-06, "loss": 0.1036, "step": 16660 }, { "epoch": 47.391613361762616, "grad_norm": 9.481576919555664, "learning_rate": 2.7856916745805635e-06, "loss": 0.1098, "step": 16670 }, { "epoch": 47.42004264392324, "grad_norm": 9.69212532043457, "learning_rate": 2.7540360873694207e-06, "loss": 0.1046, "step": 16680 }, { "epoch": 47.44847192608387, "grad_norm": 6.644554138183594, "learning_rate": 2.7223805001582782e-06, "loss": 0.1129, "step": 16690 }, { "epoch": 47.47690120824449, "grad_norm": 10.201190948486328, "learning_rate": 2.6907249129471354e-06, "loss": 0.1072, "step": 16700 }, { "epoch": 47.505330490405115, "grad_norm": 13.64123249053955, "learning_rate": 2.6590693257359925e-06, "loss": 0.1078, "step": 16710 }, { "epoch": 47.533759772565745, "grad_norm": 9.386454582214355, "learning_rate": 2.6274137385248497e-06, "loss": 0.1119, "step": 16720 }, { "epoch": 47.56218905472637, "grad_norm": 8.377169609069824, "learning_rate": 2.595758151313707e-06, "loss": 0.1091, "step": 16730 }, { "epoch": 47.59061833688699, "grad_norm": 6.280450820922852, "learning_rate": 2.564102564102564e-06, "loss": 0.1015, "step": 16740 }, { "epoch": 47.61904761904762, "grad_norm": 10.000690460205078, "learning_rate": 2.5324469768914215e-06, "loss": 0.1159, "step": 16750 }, { "epoch": 47.64747690120824, "grad_norm": 7.786238193511963, "learning_rate": 2.5007913896802787e-06, "loss": 0.1086, "step": 16760 }, { "epoch": 47.67590618336887, "grad_norm": 10.610147476196289, "learning_rate": 2.469135802469136e-06, "loss": 0.1102, "step": 16770 }, { "epoch": 47.704335465529496, "grad_norm": 9.869780540466309, "learning_rate": 2.437480215257993e-06, "loss": 0.1072, "step": 16780 }, { "epoch": 47.73276474769012, "grad_norm": 9.20813274383545, "learning_rate": 2.40582462804685e-06, "loss": 0.1027, "step": 16790 }, { "epoch": 47.76119402985075, "grad_norm": 10.896819114685059, "learning_rate": 2.3741690408357077e-06, "loss": 0.1061, "step": 16800 }, { "epoch": 47.78962331201137, "grad_norm": 7.118082046508789, "learning_rate": 2.342513453624565e-06, "loss": 0.1031, "step": 16810 }, { "epoch": 47.818052594171995, "grad_norm": 12.446362495422363, "learning_rate": 2.3108578664134224e-06, "loss": 0.1092, "step": 16820 }, { "epoch": 47.846481876332625, "grad_norm": 8.77364444732666, "learning_rate": 2.2792022792022796e-06, "loss": 0.1046, "step": 16830 }, { "epoch": 47.87491115849325, "grad_norm": 12.357502937316895, "learning_rate": 2.2475466919911367e-06, "loss": 0.1058, "step": 16840 }, { "epoch": 47.90334044065387, "grad_norm": 8.686717987060547, "learning_rate": 2.215891104779994e-06, "loss": 0.1068, "step": 16850 }, { "epoch": 47.9317697228145, "grad_norm": 11.524517059326172, "learning_rate": 2.184235517568851e-06, "loss": 0.1025, "step": 16860 }, { "epoch": 47.960199004975124, "grad_norm": 8.148204803466797, "learning_rate": 2.152579930357708e-06, "loss": 0.1101, "step": 16870 }, { "epoch": 47.98862828713575, "grad_norm": 9.529987335205078, "learning_rate": 2.1209243431465657e-06, "loss": 0.1082, "step": 16880 }, { "epoch": 48.0, "eval_accuracy": 0.821, "eval_loss": 0.08850264549255371, "eval_runtime": 13.3098, "eval_samples_per_second": 375.663, "eval_steps_per_second": 11.796, "step": 16884 }, { "epoch": 48.01705756929638, "grad_norm": 4.896656036376953, "learning_rate": 2.089268755935423e-06, "loss": 0.1078, "step": 16890 }, { "epoch": 48.045486851457, "grad_norm": 7.254157066345215, "learning_rate": 2.05761316872428e-06, "loss": 0.1097, "step": 16900 }, { "epoch": 48.07391613361763, "grad_norm": 6.3104400634765625, "learning_rate": 2.025957581513137e-06, "loss": 0.1051, "step": 16910 }, { "epoch": 48.10234541577825, "grad_norm": 9.964967727661133, "learning_rate": 1.9943019943019943e-06, "loss": 0.1068, "step": 16920 }, { "epoch": 48.130774697938875, "grad_norm": 8.124360084533691, "learning_rate": 1.9626464070908514e-06, "loss": 0.1046, "step": 16930 }, { "epoch": 48.159203980099505, "grad_norm": 7.244774341583252, "learning_rate": 1.930990819879709e-06, "loss": 0.1092, "step": 16940 }, { "epoch": 48.18763326226013, "grad_norm": 10.533236503601074, "learning_rate": 1.899335232668566e-06, "loss": 0.104, "step": 16950 }, { "epoch": 48.21606254442075, "grad_norm": 7.789974212646484, "learning_rate": 1.8676796454574233e-06, "loss": 0.1025, "step": 16960 }, { "epoch": 48.24449182658138, "grad_norm": 7.2511162757873535, "learning_rate": 1.8360240582462804e-06, "loss": 0.1044, "step": 16970 }, { "epoch": 48.272921108742004, "grad_norm": 9.249357223510742, "learning_rate": 1.8043684710351376e-06, "loss": 0.1108, "step": 16980 }, { "epoch": 48.30135039090263, "grad_norm": 10.286603927612305, "learning_rate": 1.772712883823995e-06, "loss": 0.1123, "step": 16990 }, { "epoch": 48.32977967306326, "grad_norm": 7.938315391540527, "learning_rate": 1.741057296612852e-06, "loss": 0.1093, "step": 17000 }, { "epoch": 48.35820895522388, "grad_norm": 7.06292200088501, "learning_rate": 1.7094017094017097e-06, "loss": 0.1102, "step": 17010 }, { "epoch": 48.3866382373845, "grad_norm": 7.245699882507324, "learning_rate": 1.6777461221905668e-06, "loss": 0.1053, "step": 17020 }, { "epoch": 48.41506751954513, "grad_norm": 9.67615032196045, "learning_rate": 1.6460905349794242e-06, "loss": 0.1028, "step": 17030 }, { "epoch": 48.443496801705756, "grad_norm": 10.36846923828125, "learning_rate": 1.6144349477682813e-06, "loss": 0.1053, "step": 17040 }, { "epoch": 48.471926083866386, "grad_norm": 7.186389923095703, "learning_rate": 1.5827793605571385e-06, "loss": 0.1089, "step": 17050 }, { "epoch": 48.50035536602701, "grad_norm": 13.724836349487305, "learning_rate": 1.5511237733459956e-06, "loss": 0.1047, "step": 17060 }, { "epoch": 48.52878464818763, "grad_norm": 10.096785545349121, "learning_rate": 1.519468186134853e-06, "loss": 0.1067, "step": 17070 }, { "epoch": 48.55721393034826, "grad_norm": 8.404655456542969, "learning_rate": 1.4878125989237101e-06, "loss": 0.0998, "step": 17080 }, { "epoch": 48.585643212508884, "grad_norm": 13.89511775970459, "learning_rate": 1.4561570117125675e-06, "loss": 0.1136, "step": 17090 }, { "epoch": 48.61407249466951, "grad_norm": 6.510432720184326, "learning_rate": 1.4245014245014246e-06, "loss": 0.1168, "step": 17100 }, { "epoch": 48.64250177683014, "grad_norm": 8.050429344177246, "learning_rate": 1.3928458372902818e-06, "loss": 0.1057, "step": 17110 }, { "epoch": 48.67093105899076, "grad_norm": 6.839679718017578, "learning_rate": 1.3611902500791391e-06, "loss": 0.1065, "step": 17120 }, { "epoch": 48.69936034115138, "grad_norm": 9.393758773803711, "learning_rate": 1.3295346628679963e-06, "loss": 0.1062, "step": 17130 }, { "epoch": 48.72778962331201, "grad_norm": 9.305793762207031, "learning_rate": 1.2978790756568534e-06, "loss": 0.1141, "step": 17140 }, { "epoch": 48.756218905472636, "grad_norm": 8.807723999023438, "learning_rate": 1.2662234884457108e-06, "loss": 0.0999, "step": 17150 }, { "epoch": 48.78464818763326, "grad_norm": 7.961617469787598, "learning_rate": 1.234567901234568e-06, "loss": 0.1041, "step": 17160 }, { "epoch": 48.81307746979389, "grad_norm": 5.597405910491943, "learning_rate": 1.202912314023425e-06, "loss": 0.1067, "step": 17170 }, { "epoch": 48.84150675195451, "grad_norm": 12.654012680053711, "learning_rate": 1.1712567268122824e-06, "loss": 0.1044, "step": 17180 }, { "epoch": 48.86993603411514, "grad_norm": 7.29843282699585, "learning_rate": 1.1396011396011398e-06, "loss": 0.1048, "step": 17190 }, { "epoch": 48.898365316275765, "grad_norm": 7.380970478057861, "learning_rate": 1.107945552389997e-06, "loss": 0.104, "step": 17200 }, { "epoch": 48.92679459843639, "grad_norm": 7.261307716369629, "learning_rate": 1.076289965178854e-06, "loss": 0.1105, "step": 17210 }, { "epoch": 48.95522388059702, "grad_norm": 8.641499519348145, "learning_rate": 1.0446343779677114e-06, "loss": 0.108, "step": 17220 }, { "epoch": 48.98365316275764, "grad_norm": 9.360930442810059, "learning_rate": 1.0129787907565686e-06, "loss": 0.108, "step": 17230 }, { "epoch": 48.997867803837956, "eval_accuracy": 0.8204, "eval_loss": 0.08857225626707077, "eval_runtime": 13.2857, "eval_samples_per_second": 376.343, "eval_steps_per_second": 11.817, "step": 17235 }, { "epoch": 49.01208244491826, "grad_norm": 9.049455642700195, "learning_rate": 9.813232035454257e-07, "loss": 0.1016, "step": 17240 }, { "epoch": 49.04051172707889, "grad_norm": 10.978199005126953, "learning_rate": 9.49667616334283e-07, "loss": 0.112, "step": 17250 }, { "epoch": 49.068941009239516, "grad_norm": 11.224713325500488, "learning_rate": 9.180120291231402e-07, "loss": 0.1054, "step": 17260 }, { "epoch": 49.09737029140014, "grad_norm": 9.726310729980469, "learning_rate": 8.863564419119975e-07, "loss": 0.1131, "step": 17270 }, { "epoch": 49.12579957356077, "grad_norm": 6.501070022583008, "learning_rate": 8.547008547008548e-07, "loss": 0.112, "step": 17280 }, { "epoch": 49.15422885572139, "grad_norm": 8.350793838500977, "learning_rate": 8.230452674897121e-07, "loss": 0.1058, "step": 17290 }, { "epoch": 49.182658137882015, "grad_norm": 9.776291847229004, "learning_rate": 7.913896802785692e-07, "loss": 0.109, "step": 17300 }, { "epoch": 49.211087420042645, "grad_norm": 7.640761375427246, "learning_rate": 7.597340930674265e-07, "loss": 0.1125, "step": 17310 }, { "epoch": 49.23951670220327, "grad_norm": 6.587189197540283, "learning_rate": 7.280785058562837e-07, "loss": 0.1042, "step": 17320 }, { "epoch": 49.2679459843639, "grad_norm": 7.57374906539917, "learning_rate": 6.964229186451409e-07, "loss": 0.1069, "step": 17330 }, { "epoch": 49.29637526652452, "grad_norm": 8.757404327392578, "learning_rate": 6.647673314339981e-07, "loss": 0.1107, "step": 17340 }, { "epoch": 49.324804548685144, "grad_norm": 8.916653633117676, "learning_rate": 6.331117442228554e-07, "loss": 0.1079, "step": 17350 }, { "epoch": 49.353233830845774, "grad_norm": 6.668952465057373, "learning_rate": 6.014561570117125e-07, "loss": 0.0973, "step": 17360 }, { "epoch": 49.3816631130064, "grad_norm": 6.317824363708496, "learning_rate": 5.698005698005699e-07, "loss": 0.1022, "step": 17370 }, { "epoch": 49.41009239516702, "grad_norm": 7.2825846672058105, "learning_rate": 5.38144982589427e-07, "loss": 0.1083, "step": 17380 }, { "epoch": 49.43852167732765, "grad_norm": 6.981980323791504, "learning_rate": 5.064893953782843e-07, "loss": 0.1048, "step": 17390 }, { "epoch": 49.46695095948827, "grad_norm": 11.234599113464355, "learning_rate": 4.748338081671415e-07, "loss": 0.1065, "step": 17400 }, { "epoch": 49.495380241648895, "grad_norm": 9.936217308044434, "learning_rate": 4.4317822095599874e-07, "loss": 0.1056, "step": 17410 }, { "epoch": 49.523809523809526, "grad_norm": 8.55952262878418, "learning_rate": 4.1152263374485604e-07, "loss": 0.1086, "step": 17420 }, { "epoch": 49.55223880597015, "grad_norm": 6.774092674255371, "learning_rate": 3.7986704653371324e-07, "loss": 0.1053, "step": 17430 }, { "epoch": 49.58066808813077, "grad_norm": 8.369775772094727, "learning_rate": 3.4821145932257044e-07, "loss": 0.1064, "step": 17440 }, { "epoch": 49.6090973702914, "grad_norm": 13.491937637329102, "learning_rate": 3.165558721114277e-07, "loss": 0.1048, "step": 17450 }, { "epoch": 49.637526652452024, "grad_norm": 16.046428680419922, "learning_rate": 2.8490028490028494e-07, "loss": 0.1052, "step": 17460 }, { "epoch": 49.665955934612654, "grad_norm": 10.546481132507324, "learning_rate": 2.5324469768914214e-07, "loss": 0.1014, "step": 17470 }, { "epoch": 49.69438521677328, "grad_norm": 9.7838134765625, "learning_rate": 2.2158911047799937e-07, "loss": 0.1089, "step": 17480 }, { "epoch": 49.7228144989339, "grad_norm": 8.329480171203613, "learning_rate": 1.8993352326685662e-07, "loss": 0.1068, "step": 17490 }, { "epoch": 49.75124378109453, "grad_norm": 6.636852741241455, "learning_rate": 1.5827793605571385e-07, "loss": 0.1092, "step": 17500 }, { "epoch": 49.77967306325515, "grad_norm": 7.376643657684326, "learning_rate": 1.2662234884457107e-07, "loss": 0.1043, "step": 17510 }, { "epoch": 49.808102345415776, "grad_norm": 8.541463851928711, "learning_rate": 9.496676163342831e-08, "loss": 0.1056, "step": 17520 }, { "epoch": 49.836531627576406, "grad_norm": 15.69192123413086, "learning_rate": 6.331117442228554e-08, "loss": 0.1063, "step": 17530 }, { "epoch": 49.86496090973703, "grad_norm": 10.664430618286133, "learning_rate": 3.165558721114277e-08, "loss": 0.1, "step": 17540 }, { "epoch": 49.89339019189765, "grad_norm": 8.656536102294922, "learning_rate": 0.0, "loss": 0.112, "step": 17550 }, { "epoch": 49.89339019189765, "eval_accuracy": 0.8232, "eval_loss": 0.088263601064682, "eval_runtime": 13.4785, "eval_samples_per_second": 370.962, "eval_steps_per_second": 11.648, "step": 17550 }, { "epoch": 49.89339019189765, "step": 17550, "total_flos": 5.581973812939673e+19, "train_loss": 0.15767387228813606, "train_runtime": 14602.4763, "train_samples_per_second": 154.083, "train_steps_per_second": 1.202 } ], "logging_steps": 10, "max_steps": 17550, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.581973812939673e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }