diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5923 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.11786454262354065, + "eval_steps": 200, + "global_step": 19000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00015508492450465875, + "grad_norm": 0.12764382362365723, + "learning_rate": 0.0015, + "loss": 3.062, + "step": 25 + }, + { + "epoch": 0.0003101698490093175, + "grad_norm": 0.08861421793699265, + "learning_rate": 0.0015, + "loss": 3.0523, + "step": 50 + }, + { + "epoch": 0.00046525477351397625, + "grad_norm": 0.10059793293476105, + "learning_rate": 0.0015, + "loss": 3.0271, + "step": 75 + }, + { + "epoch": 0.000620339698018635, + "grad_norm": 0.09730365872383118, + "learning_rate": 0.0015, + "loss": 3.0421, + "step": 100 + }, + { + "epoch": 0.0007754246225232938, + "grad_norm": 0.15407200157642365, + "learning_rate": 0.0015, + "loss": 2.9894, + "step": 125 + }, + { + "epoch": 0.0009305095470279525, + "grad_norm": 0.12250959873199463, + "learning_rate": 0.0015, + "loss": 3.0055, + "step": 150 + }, + { + "epoch": 0.0010855944715326112, + "grad_norm": 0.08540652692317963, + "learning_rate": 0.0015, + "loss": 3.0025, + "step": 175 + }, + { + "epoch": 0.00124067939603727, + "grad_norm": 0.1479829102754593, + "learning_rate": 0.0015, + "loss": 2.9881, + "step": 200 + }, + { + "epoch": 0.00124067939603727, + "eval_loss": 4.852784156799316, + "perplexity": 128.09652709960938, + "step": 200 + }, + { + "epoch": 0.0013957643205419288, + "grad_norm": 0.1036139577627182, + "learning_rate": 0.0015, + "loss": 2.9609, + "step": 225 + }, + { + "epoch": 0.0015508492450465876, + "grad_norm": 0.10382606089115143, + "learning_rate": 0.0015, + "loss": 2.9771, + "step": 250 + }, + { + "epoch": 0.0017059341695512462, + "grad_norm": 0.08648105710744858, + "learning_rate": 0.0015, + "loss": 2.9522, + "step": 275 + }, + { + "epoch": 0.001861019094055905, + "grad_norm": 0.08675844967365265, + "learning_rate": 0.0015, + "loss": 2.9833, + "step": 300 + }, + { + "epoch": 0.0020161040185605636, + "grad_norm": 0.1417882740497589, + "learning_rate": 0.0015, + "loss": 2.9626, + "step": 325 + }, + { + "epoch": 0.0021711889430652224, + "grad_norm": 0.09860406816005707, + "learning_rate": 0.0015, + "loss": 2.9515, + "step": 350 + }, + { + "epoch": 0.002326273867569881, + "grad_norm": 0.11757214367389679, + "learning_rate": 0.0015, + "loss": 2.9523, + "step": 375 + }, + { + "epoch": 0.00248135879207454, + "grad_norm": 0.11415340006351471, + "learning_rate": 0.0015, + "loss": 2.9579, + "step": 400 + }, + { + "epoch": 0.00248135879207454, + "eval_loss": 4.8426313400268555, + "perplexity": 126.80257415771484, + "step": 400 + }, + { + "epoch": 0.002636443716579199, + "grad_norm": 0.10692940652370453, + "learning_rate": 0.0015, + "loss": 2.9273, + "step": 425 + }, + { + "epoch": 0.0027915286410838576, + "grad_norm": 0.12780559062957764, + "learning_rate": 0.0015, + "loss": 2.9577, + "step": 450 + }, + { + "epoch": 0.0029466135655885164, + "grad_norm": 0.21147418022155762, + "learning_rate": 0.0015, + "loss": 2.9118, + "step": 475 + }, + { + "epoch": 0.003101698490093175, + "grad_norm": 0.13209331035614014, + "learning_rate": 0.0015, + "loss": 2.9584, + "step": 500 + }, + { + "epoch": 0.0032567834145978336, + "grad_norm": 0.13230836391448975, + "learning_rate": 0.0015, + "loss": 2.9621, + "step": 525 + }, + { + "epoch": 0.0034118683391024924, + "grad_norm": 0.11265246570110321, + "learning_rate": 0.0015, + "loss": 2.941, + "step": 550 + }, + { + "epoch": 0.003566953263607151, + "grad_norm": 0.10484226047992706, + "learning_rate": 0.0015, + "loss": 2.9311, + "step": 575 + }, + { + "epoch": 0.00372203818811181, + "grad_norm": 0.13941314816474915, + "learning_rate": 0.0015, + "loss": 2.9741, + "step": 600 + }, + { + "epoch": 0.00372203818811181, + "eval_loss": 4.831629276275635, + "perplexity": 125.41513061523438, + "step": 600 + }, + { + "epoch": 0.0038771231126164688, + "grad_norm": 0.0885343998670578, + "learning_rate": 0.0015, + "loss": 2.944, + "step": 625 + }, + { + "epoch": 0.004032208037121127, + "grad_norm": 0.093564473092556, + "learning_rate": 0.0015, + "loss": 2.9673, + "step": 650 + }, + { + "epoch": 0.004187292961625786, + "grad_norm": 0.15350665152072906, + "learning_rate": 0.0015, + "loss": 2.9314, + "step": 675 + }, + { + "epoch": 0.004342377886130445, + "grad_norm": 0.11337901651859283, + "learning_rate": 0.0015, + "loss": 2.97, + "step": 700 + }, + { + "epoch": 0.004497462810635104, + "grad_norm": 0.13508272171020508, + "learning_rate": 0.0015, + "loss": 2.9121, + "step": 725 + }, + { + "epoch": 0.004652547735139762, + "grad_norm": 0.10049441456794739, + "learning_rate": 0.0015, + "loss": 2.9572, + "step": 750 + }, + { + "epoch": 0.004807632659644422, + "grad_norm": 0.1017594188451767, + "learning_rate": 0.0015, + "loss": 2.9207, + "step": 775 + }, + { + "epoch": 0.00496271758414908, + "grad_norm": 0.09874167293310165, + "learning_rate": 0.0015, + "loss": 2.9258, + "step": 800 + }, + { + "epoch": 0.00496271758414908, + "eval_loss": 4.783432960510254, + "perplexity": 119.51393127441406, + "step": 800 + }, + { + "epoch": 0.005117802508653739, + "grad_norm": 0.09769408404827118, + "learning_rate": 0.0015, + "loss": 2.9606, + "step": 825 + }, + { + "epoch": 0.005272887433158398, + "grad_norm": 0.11946038156747818, + "learning_rate": 0.0015, + "loss": 2.889, + "step": 850 + }, + { + "epoch": 0.005427972357663056, + "grad_norm": 0.12191672623157501, + "learning_rate": 0.0015, + "loss": 2.9094, + "step": 875 + }, + { + "epoch": 0.005583057282167715, + "grad_norm": 0.09349209070205688, + "learning_rate": 0.0015, + "loss": 2.9242, + "step": 900 + }, + { + "epoch": 0.0057381422066723736, + "grad_norm": 0.07793531566858292, + "learning_rate": 0.0015, + "loss": 2.9692, + "step": 925 + }, + { + "epoch": 0.005893227131177033, + "grad_norm": 0.1276599019765854, + "learning_rate": 0.0015, + "loss": 2.9339, + "step": 950 + }, + { + "epoch": 0.006048312055681691, + "grad_norm": 0.11083021759986877, + "learning_rate": 0.0015, + "loss": 2.9251, + "step": 975 + }, + { + "epoch": 0.00620339698018635, + "grad_norm": 0.13207702338695526, + "learning_rate": 0.0015, + "loss": 2.8567, + "step": 1000 + }, + { + "epoch": 0.00620339698018635, + "eval_loss": 4.790068626403809, + "perplexity": 120.30962371826172, + "step": 1000 + }, + { + "epoch": 0.006358481904691009, + "grad_norm": 0.20453479886054993, + "learning_rate": 0.0015, + "loss": 2.9127, + "step": 1025 + }, + { + "epoch": 0.006513566829195667, + "grad_norm": 0.12530989944934845, + "learning_rate": 0.0015, + "loss": 2.9147, + "step": 1050 + }, + { + "epoch": 0.006668651753700326, + "grad_norm": 0.11520997434854507, + "learning_rate": 0.0015, + "loss": 2.936, + "step": 1075 + }, + { + "epoch": 0.006823736678204985, + "grad_norm": 0.09191219508647919, + "learning_rate": 0.0015, + "loss": 2.9115, + "step": 1100 + }, + { + "epoch": 0.006978821602709644, + "grad_norm": 0.07251202315092087, + "learning_rate": 0.0015, + "loss": 2.9154, + "step": 1125 + }, + { + "epoch": 0.007133906527214302, + "grad_norm": 0.10054546594619751, + "learning_rate": 0.0015, + "loss": 2.8924, + "step": 1150 + }, + { + "epoch": 0.007288991451718962, + "grad_norm": 0.1192697063088417, + "learning_rate": 0.0015, + "loss": 2.957, + "step": 1175 + }, + { + "epoch": 0.00744407637622362, + "grad_norm": 0.14840476214885712, + "learning_rate": 0.0015, + "loss": 2.895, + "step": 1200 + }, + { + "epoch": 0.00744407637622362, + "eval_loss": 4.770949363708496, + "perplexity": 118.03124237060547, + "step": 1200 + }, + { + "epoch": 0.007599161300728279, + "grad_norm": 0.11221906542778015, + "learning_rate": 0.0015, + "loss": 2.9131, + "step": 1225 + }, + { + "epoch": 0.0077542462252329376, + "grad_norm": 0.11528974026441574, + "learning_rate": 0.0015, + "loss": 2.8783, + "step": 1250 + }, + { + "epoch": 0.007909331149737596, + "grad_norm": 0.0807015597820282, + "learning_rate": 0.0015, + "loss": 2.91, + "step": 1275 + }, + { + "epoch": 0.008064416074242254, + "grad_norm": 0.1435490846633911, + "learning_rate": 0.0015, + "loss": 2.9198, + "step": 1300 + }, + { + "epoch": 0.008219500998746914, + "grad_norm": 0.11956608295440674, + "learning_rate": 0.0015, + "loss": 2.8771, + "step": 1325 + }, + { + "epoch": 0.008374585923251573, + "grad_norm": 0.10362117737531662, + "learning_rate": 0.0015, + "loss": 2.8913, + "step": 1350 + }, + { + "epoch": 0.008529670847756231, + "grad_norm": 0.07132004201412201, + "learning_rate": 0.0015, + "loss": 2.946, + "step": 1375 + }, + { + "epoch": 0.00868475577226089, + "grad_norm": 0.08756817877292633, + "learning_rate": 0.0015, + "loss": 2.9015, + "step": 1400 + }, + { + "epoch": 0.00868475577226089, + "eval_loss": 4.769084453582764, + "perplexity": 117.81133270263672, + "step": 1400 + }, + { + "epoch": 0.00883984069676555, + "grad_norm": 0.18067917227745056, + "learning_rate": 0.0015, + "loss": 2.8887, + "step": 1425 + }, + { + "epoch": 0.008994925621270208, + "grad_norm": 0.09742950648069382, + "learning_rate": 0.0015, + "loss": 2.8834, + "step": 1450 + }, + { + "epoch": 0.009150010545774866, + "grad_norm": 0.09857803583145142, + "learning_rate": 0.0015, + "loss": 2.8856, + "step": 1475 + }, + { + "epoch": 0.009305095470279525, + "grad_norm": 0.17605328559875488, + "learning_rate": 0.0015, + "loss": 2.9238, + "step": 1500 + }, + { + "epoch": 0.009460180394784183, + "grad_norm": 0.08441105484962463, + "learning_rate": 0.0015, + "loss": 2.8605, + "step": 1525 + }, + { + "epoch": 0.009615265319288843, + "grad_norm": 0.15339621901512146, + "learning_rate": 0.0015, + "loss": 2.9421, + "step": 1550 + }, + { + "epoch": 0.009770350243793502, + "grad_norm": 0.21426236629486084, + "learning_rate": 0.0015, + "loss": 2.8899, + "step": 1575 + }, + { + "epoch": 0.00992543516829816, + "grad_norm": 0.16503557562828064, + "learning_rate": 0.0015, + "loss": 2.878, + "step": 1600 + }, + { + "epoch": 0.00992543516829816, + "eval_loss": 4.774999618530273, + "perplexity": 118.51026916503906, + "step": 1600 + }, + { + "epoch": 0.010080520092802818, + "grad_norm": 0.11398541182279587, + "learning_rate": 0.0015, + "loss": 2.866, + "step": 1625 + }, + { + "epoch": 0.010235605017307478, + "grad_norm": 0.16510234773159027, + "learning_rate": 0.0015, + "loss": 2.8936, + "step": 1650 + }, + { + "epoch": 0.010390689941812137, + "grad_norm": 0.08827799558639526, + "learning_rate": 0.0015, + "loss": 2.8789, + "step": 1675 + }, + { + "epoch": 0.010545774866316795, + "grad_norm": 0.12703286111354828, + "learning_rate": 0.0015, + "loss": 2.9104, + "step": 1700 + }, + { + "epoch": 0.010700859790821454, + "grad_norm": 0.10185768455266953, + "learning_rate": 0.0015, + "loss": 2.8389, + "step": 1725 + }, + { + "epoch": 0.010855944715326112, + "grad_norm": 0.13076236844062805, + "learning_rate": 0.0015, + "loss": 2.8603, + "step": 1750 + }, + { + "epoch": 0.011011029639830772, + "grad_norm": 0.08955707401037216, + "learning_rate": 0.0015, + "loss": 2.8283, + "step": 1775 + }, + { + "epoch": 0.01116611456433543, + "grad_norm": 0.07163148373365402, + "learning_rate": 0.0015, + "loss": 2.8852, + "step": 1800 + }, + { + "epoch": 0.01116611456433543, + "eval_loss": 4.75281286239624, + "perplexity": 115.90986633300781, + "step": 1800 + }, + { + "epoch": 0.011321199488840089, + "grad_norm": 0.09710580855607986, + "learning_rate": 0.0015, + "loss": 2.8573, + "step": 1825 + }, + { + "epoch": 0.011476284413344747, + "grad_norm": 0.11669810861349106, + "learning_rate": 0.0015, + "loss": 2.8674, + "step": 1850 + }, + { + "epoch": 0.011631369337849405, + "grad_norm": 0.11174403876066208, + "learning_rate": 0.0015, + "loss": 2.9121, + "step": 1875 + }, + { + "epoch": 0.011786454262354066, + "grad_norm": 0.09547118842601776, + "learning_rate": 0.0015, + "loss": 2.9033, + "step": 1900 + }, + { + "epoch": 0.011941539186858724, + "grad_norm": 0.09878171980381012, + "learning_rate": 0.0015, + "loss": 2.8738, + "step": 1925 + }, + { + "epoch": 0.012096624111363382, + "grad_norm": 0.09479096531867981, + "learning_rate": 0.0015, + "loss": 2.8775, + "step": 1950 + }, + { + "epoch": 0.01225170903586804, + "grad_norm": 0.12434259057044983, + "learning_rate": 0.0015, + "loss": 2.8452, + "step": 1975 + }, + { + "epoch": 0.0124067939603727, + "grad_norm": 0.09166444838047028, + "learning_rate": 0.0015, + "loss": 2.8546, + "step": 2000 + }, + { + "epoch": 0.0124067939603727, + "eval_loss": 4.748600482940674, + "perplexity": 115.42263793945312, + "step": 2000 + }, + { + "epoch": 0.01256187888487736, + "grad_norm": 0.07793508470058441, + "learning_rate": 0.0015, + "loss": 2.8306, + "step": 2025 + }, + { + "epoch": 0.012716963809382018, + "grad_norm": 0.1670406609773636, + "learning_rate": 0.0015, + "loss": 2.863, + "step": 2050 + }, + { + "epoch": 0.012872048733886676, + "grad_norm": 0.20754718780517578, + "learning_rate": 0.0015, + "loss": 2.8871, + "step": 2075 + }, + { + "epoch": 0.013027133658391334, + "grad_norm": 0.14225496351718903, + "learning_rate": 0.0015, + "loss": 2.8498, + "step": 2100 + }, + { + "epoch": 0.013182218582895994, + "grad_norm": 0.11809197813272476, + "learning_rate": 0.0015, + "loss": 2.8206, + "step": 2125 + }, + { + "epoch": 0.013337303507400653, + "grad_norm": 0.09541622549295425, + "learning_rate": 0.0015, + "loss": 2.8585, + "step": 2150 + }, + { + "epoch": 0.013492388431905311, + "grad_norm": 0.1115843802690506, + "learning_rate": 0.0015, + "loss": 2.8533, + "step": 2175 + }, + { + "epoch": 0.01364747335640997, + "grad_norm": 0.08517899364233017, + "learning_rate": 0.0015, + "loss": 2.8477, + "step": 2200 + }, + { + "epoch": 0.01364747335640997, + "eval_loss": 4.753279685974121, + "perplexity": 115.9639892578125, + "step": 2200 + }, + { + "epoch": 0.01380255828091463, + "grad_norm": 0.13083544373512268, + "learning_rate": 0.0015, + "loss": 2.8518, + "step": 2225 + }, + { + "epoch": 0.013957643205419288, + "grad_norm": 0.07403870671987534, + "learning_rate": 0.0015, + "loss": 2.8685, + "step": 2250 + }, + { + "epoch": 0.014112728129923946, + "grad_norm": 0.16436311602592468, + "learning_rate": 0.0015, + "loss": 2.8601, + "step": 2275 + }, + { + "epoch": 0.014267813054428605, + "grad_norm": 0.12990187108516693, + "learning_rate": 0.0015, + "loss": 2.8332, + "step": 2300 + }, + { + "epoch": 0.014422897978933263, + "grad_norm": 0.0897112786769867, + "learning_rate": 0.0015, + "loss": 2.8578, + "step": 2325 + }, + { + "epoch": 0.014577982903437923, + "grad_norm": 0.10096879303455353, + "learning_rate": 0.0015, + "loss": 2.802, + "step": 2350 + }, + { + "epoch": 0.014733067827942582, + "grad_norm": 0.0850217416882515, + "learning_rate": 0.0015, + "loss": 2.8529, + "step": 2375 + }, + { + "epoch": 0.01488815275244724, + "grad_norm": 0.11395123600959778, + "learning_rate": 0.0015, + "loss": 2.8655, + "step": 2400 + }, + { + "epoch": 0.01488815275244724, + "eval_loss": 4.743602275848389, + "perplexity": 114.84716796875, + "step": 2400 + }, + { + "epoch": 0.015043237676951898, + "grad_norm": 0.1590801179409027, + "learning_rate": 0.0015, + "loss": 2.8227, + "step": 2425 + }, + { + "epoch": 0.015198322601456558, + "grad_norm": 0.16819922626018524, + "learning_rate": 0.0015, + "loss": 2.8551, + "step": 2450 + }, + { + "epoch": 0.015353407525961217, + "grad_norm": 0.15390118956565857, + "learning_rate": 0.0015, + "loss": 2.8691, + "step": 2475 + }, + { + "epoch": 0.015508492450465875, + "grad_norm": 0.10976951569318771, + "learning_rate": 0.0015, + "loss": 2.8615, + "step": 2500 + }, + { + "epoch": 0.015663577374970535, + "grad_norm": 0.09539350867271423, + "learning_rate": 0.0015, + "loss": 2.7755, + "step": 2525 + }, + { + "epoch": 0.015818662299475192, + "grad_norm": 0.09798863530158997, + "learning_rate": 0.0015, + "loss": 2.7675, + "step": 2550 + }, + { + "epoch": 0.015973747223979852, + "grad_norm": 0.10233014822006226, + "learning_rate": 0.0015, + "loss": 2.7905, + "step": 2575 + }, + { + "epoch": 0.01612883214848451, + "grad_norm": 0.09607812017202377, + "learning_rate": 0.0015, + "loss": 2.779, + "step": 2600 + }, + { + "epoch": 0.01612883214848451, + "eval_loss": 4.757762432098389, + "perplexity": 116.48499298095703, + "step": 2600 + }, + { + "epoch": 0.01628391707298917, + "grad_norm": 0.09782920032739639, + "learning_rate": 0.0015, + "loss": 2.8455, + "step": 2625 + }, + { + "epoch": 0.01643900199749383, + "grad_norm": 0.08443335443735123, + "learning_rate": 0.0015, + "loss": 2.8537, + "step": 2650 + }, + { + "epoch": 0.016594086921998485, + "grad_norm": 0.1567981094121933, + "learning_rate": 0.0015, + "loss": 2.8334, + "step": 2675 + }, + { + "epoch": 0.016749171846503146, + "grad_norm": 0.1279255449771881, + "learning_rate": 0.0015, + "loss": 2.8733, + "step": 2700 + }, + { + "epoch": 0.016904256771007802, + "grad_norm": 0.09086953848600388, + "learning_rate": 0.0015, + "loss": 2.7992, + "step": 2725 + }, + { + "epoch": 0.017059341695512462, + "grad_norm": 0.15084481239318848, + "learning_rate": 0.0015, + "loss": 2.7891, + "step": 2750 + }, + { + "epoch": 0.017214426620017122, + "grad_norm": 0.1059018149971962, + "learning_rate": 0.0015, + "loss": 2.8088, + "step": 2775 + }, + { + "epoch": 0.01736951154452178, + "grad_norm": 0.08803548663854599, + "learning_rate": 0.0015, + "loss": 2.817, + "step": 2800 + }, + { + "epoch": 0.01736951154452178, + "eval_loss": 4.730724334716797, + "perplexity": 113.37765502929688, + "step": 2800 + }, + { + "epoch": 0.01752459646902644, + "grad_norm": 0.0954984724521637, + "learning_rate": 0.0015, + "loss": 2.8528, + "step": 2825 + }, + { + "epoch": 0.0176796813935311, + "grad_norm": 0.14015914499759674, + "learning_rate": 0.0015, + "loss": 2.8131, + "step": 2850 + }, + { + "epoch": 0.017834766318035756, + "grad_norm": 0.07908599078655243, + "learning_rate": 0.0015, + "loss": 2.8371, + "step": 2875 + }, + { + "epoch": 0.017989851242540416, + "grad_norm": 0.14578266441822052, + "learning_rate": 0.0015, + "loss": 2.8033, + "step": 2900 + }, + { + "epoch": 0.018144936167045073, + "grad_norm": 0.10059946030378342, + "learning_rate": 0.0015, + "loss": 2.8165, + "step": 2925 + }, + { + "epoch": 0.018300021091549733, + "grad_norm": 0.10238490998744965, + "learning_rate": 0.0015, + "loss": 2.7739, + "step": 2950 + }, + { + "epoch": 0.018455106016054393, + "grad_norm": 0.12706336379051208, + "learning_rate": 0.0015, + "loss": 2.8018, + "step": 2975 + }, + { + "epoch": 0.01861019094055905, + "grad_norm": 0.1252700239419937, + "learning_rate": 0.0015, + "loss": 2.8155, + "step": 3000 + }, + { + "epoch": 0.01861019094055905, + "eval_loss": 4.707705020904541, + "perplexity": 110.79759216308594, + "step": 3000 + }, + { + "epoch": 0.01876527586506371, + "grad_norm": 0.13322588801383972, + "learning_rate": 0.0015, + "loss": 2.8201, + "step": 3025 + }, + { + "epoch": 0.018920360789568366, + "grad_norm": 0.14152252674102783, + "learning_rate": 0.0015, + "loss": 2.7942, + "step": 3050 + }, + { + "epoch": 0.019075445714073026, + "grad_norm": 0.1276037096977234, + "learning_rate": 0.0015, + "loss": 2.8065, + "step": 3075 + }, + { + "epoch": 0.019230530638577686, + "grad_norm": 0.11600831896066666, + "learning_rate": 0.0015, + "loss": 2.8335, + "step": 3100 + }, + { + "epoch": 0.019385615563082343, + "grad_norm": 0.11985427141189575, + "learning_rate": 0.0015, + "loss": 2.7993, + "step": 3125 + }, + { + "epoch": 0.019540700487587003, + "grad_norm": 0.11630894988775253, + "learning_rate": 0.0015, + "loss": 2.7838, + "step": 3150 + }, + { + "epoch": 0.01969578541209166, + "grad_norm": 0.08493560552597046, + "learning_rate": 0.0015, + "loss": 2.7884, + "step": 3175 + }, + { + "epoch": 0.01985087033659632, + "grad_norm": 0.12671016156673431, + "learning_rate": 0.0015, + "loss": 2.7763, + "step": 3200 + }, + { + "epoch": 0.01985087033659632, + "eval_loss": 4.7127766609191895, + "perplexity": 111.3609390258789, + "step": 3200 + }, + { + "epoch": 0.02000595526110098, + "grad_norm": 0.10381816327571869, + "learning_rate": 0.0015, + "loss": 2.7849, + "step": 3225 + }, + { + "epoch": 0.020161040185605637, + "grad_norm": 0.12319795787334442, + "learning_rate": 0.0015, + "loss": 2.8325, + "step": 3250 + }, + { + "epoch": 0.020316125110110297, + "grad_norm": 0.11378122121095657, + "learning_rate": 0.0015, + "loss": 2.7609, + "step": 3275 + }, + { + "epoch": 0.020471210034614957, + "grad_norm": 0.08910433948040009, + "learning_rate": 0.0015, + "loss": 2.7886, + "step": 3300 + }, + { + "epoch": 0.020626294959119613, + "grad_norm": 0.11803348362445831, + "learning_rate": 0.0015, + "loss": 2.7716, + "step": 3325 + }, + { + "epoch": 0.020781379883624274, + "grad_norm": 0.10203807801008224, + "learning_rate": 0.0015, + "loss": 2.778, + "step": 3350 + }, + { + "epoch": 0.02093646480812893, + "grad_norm": 0.07175683230161667, + "learning_rate": 0.0015, + "loss": 2.7844, + "step": 3375 + }, + { + "epoch": 0.02109154973263359, + "grad_norm": 0.1556989699602127, + "learning_rate": 0.0015, + "loss": 2.748, + "step": 3400 + }, + { + "epoch": 0.02109154973263359, + "eval_loss": 4.711516857147217, + "perplexity": 111.22074127197266, + "step": 3400 + }, + { + "epoch": 0.02124663465713825, + "grad_norm": 0.11983326822519302, + "learning_rate": 0.0015, + "loss": 2.7747, + "step": 3425 + }, + { + "epoch": 0.021401719581642907, + "grad_norm": 0.09098344296216965, + "learning_rate": 0.0015, + "loss": 2.7609, + "step": 3450 + }, + { + "epoch": 0.021556804506147567, + "grad_norm": 0.1238594651222229, + "learning_rate": 0.0015, + "loss": 2.7849, + "step": 3475 + }, + { + "epoch": 0.021711889430652224, + "grad_norm": 0.10654041916131973, + "learning_rate": 0.0015, + "loss": 2.7742, + "step": 3500 + }, + { + "epoch": 0.021866974355156884, + "grad_norm": 0.12955708801746368, + "learning_rate": 0.0015, + "loss": 2.7302, + "step": 3525 + }, + { + "epoch": 0.022022059279661544, + "grad_norm": 0.0945751890540123, + "learning_rate": 0.0015, + "loss": 2.7366, + "step": 3550 + }, + { + "epoch": 0.0221771442041662, + "grad_norm": 0.11322261393070221, + "learning_rate": 0.0015, + "loss": 2.7307, + "step": 3575 + }, + { + "epoch": 0.02233222912867086, + "grad_norm": 0.14438313245773315, + "learning_rate": 0.0015, + "loss": 2.741, + "step": 3600 + }, + { + "epoch": 0.02233222912867086, + "eval_loss": 4.7056427001953125, + "perplexity": 110.56932830810547, + "step": 3600 + }, + { + "epoch": 0.022487314053175517, + "grad_norm": 0.12101957201957703, + "learning_rate": 0.0015, + "loss": 2.7699, + "step": 3625 + }, + { + "epoch": 0.022642398977680177, + "grad_norm": 0.13060438632965088, + "learning_rate": 0.0015, + "loss": 2.7534, + "step": 3650 + }, + { + "epoch": 0.022797483902184838, + "grad_norm": 0.18028861284255981, + "learning_rate": 0.0015, + "loss": 2.7716, + "step": 3675 + }, + { + "epoch": 0.022952568826689494, + "grad_norm": 0.2551407217979431, + "learning_rate": 0.0015, + "loss": 2.7505, + "step": 3700 + }, + { + "epoch": 0.023107653751194154, + "grad_norm": 0.14461354911327362, + "learning_rate": 0.0015, + "loss": 2.762, + "step": 3725 + }, + { + "epoch": 0.02326273867569881, + "grad_norm": 0.08960037678480148, + "learning_rate": 0.0015, + "loss": 2.7752, + "step": 3750 + }, + { + "epoch": 0.02341782360020347, + "grad_norm": 0.12423495948314667, + "learning_rate": 0.0015, + "loss": 2.7649, + "step": 3775 + }, + { + "epoch": 0.02357290852470813, + "grad_norm": 0.11889061331748962, + "learning_rate": 0.0015, + "loss": 2.7465, + "step": 3800 + }, + { + "epoch": 0.02357290852470813, + "eval_loss": 4.709405422210693, + "perplexity": 110.98615264892578, + "step": 3800 + }, + { + "epoch": 0.023727993449212788, + "grad_norm": 0.1310662031173706, + "learning_rate": 0.0015, + "loss": 2.7739, + "step": 3825 + }, + { + "epoch": 0.023883078373717448, + "grad_norm": 0.10841766744852066, + "learning_rate": 0.0015, + "loss": 2.7558, + "step": 3850 + }, + { + "epoch": 0.024038163298222108, + "grad_norm": 0.11951743066310883, + "learning_rate": 0.0015, + "loss": 2.7574, + "step": 3875 + }, + { + "epoch": 0.024193248222726765, + "grad_norm": 0.10914873331785202, + "learning_rate": 0.0015, + "loss": 2.7593, + "step": 3900 + }, + { + "epoch": 0.024348333147231425, + "grad_norm": 0.12661431729793549, + "learning_rate": 0.0015, + "loss": 2.7405, + "step": 3925 + }, + { + "epoch": 0.02450341807173608, + "grad_norm": 0.09351510554552078, + "learning_rate": 0.0015, + "loss": 2.7614, + "step": 3950 + }, + { + "epoch": 0.02465850299624074, + "grad_norm": 0.10916408896446228, + "learning_rate": 0.0015, + "loss": 2.7348, + "step": 3975 + }, + { + "epoch": 0.0248135879207454, + "grad_norm": 0.1506185084581375, + "learning_rate": 0.0015, + "loss": 2.7465, + "step": 4000 + }, + { + "epoch": 0.0248135879207454, + "eval_loss": 4.691644191741943, + "perplexity": 109.03230285644531, + "step": 4000 + }, + { + "epoch": 0.024968672845250058, + "grad_norm": 0.16664201021194458, + "learning_rate": 0.0015, + "loss": 2.7099, + "step": 4025 + }, + { + "epoch": 0.02512375776975472, + "grad_norm": 0.08793428540229797, + "learning_rate": 0.0015, + "loss": 2.7062, + "step": 4050 + }, + { + "epoch": 0.025278842694259375, + "grad_norm": 0.10746140778064728, + "learning_rate": 0.0015, + "loss": 2.7013, + "step": 4075 + }, + { + "epoch": 0.025433927618764035, + "grad_norm": 0.14466698467731476, + "learning_rate": 0.0015, + "loss": 2.7366, + "step": 4100 + }, + { + "epoch": 0.025589012543268695, + "grad_norm": 0.12191653996706009, + "learning_rate": 0.0015, + "loss": 2.7042, + "step": 4125 + }, + { + "epoch": 0.025744097467773352, + "grad_norm": 0.10167489945888519, + "learning_rate": 0.0015, + "loss": 2.7215, + "step": 4150 + }, + { + "epoch": 0.025899182392278012, + "grad_norm": 0.11334148049354553, + "learning_rate": 0.0015, + "loss": 2.7365, + "step": 4175 + }, + { + "epoch": 0.02605426731678267, + "grad_norm": 0.09303794056177139, + "learning_rate": 0.0015, + "loss": 2.7471, + "step": 4200 + }, + { + "epoch": 0.02605426731678267, + "eval_loss": 4.692121505737305, + "perplexity": 109.08435821533203, + "step": 4200 + }, + { + "epoch": 0.02620935224128733, + "grad_norm": 0.09444712847471237, + "learning_rate": 0.0015, + "loss": 2.6965, + "step": 4225 + }, + { + "epoch": 0.02636443716579199, + "grad_norm": 0.09560113400220871, + "learning_rate": 0.0015, + "loss": 2.7186, + "step": 4250 + }, + { + "epoch": 0.026519522090296645, + "grad_norm": 0.10814715176820755, + "learning_rate": 0.0015, + "loss": 2.7, + "step": 4275 + }, + { + "epoch": 0.026674607014801305, + "grad_norm": 0.12008251994848251, + "learning_rate": 0.0015, + "loss": 2.6827, + "step": 4300 + }, + { + "epoch": 0.026829691939305966, + "grad_norm": 0.13892072439193726, + "learning_rate": 0.0015, + "loss": 2.7481, + "step": 4325 + }, + { + "epoch": 0.026984776863810622, + "grad_norm": 0.10116352885961533, + "learning_rate": 0.0015, + "loss": 2.6839, + "step": 4350 + }, + { + "epoch": 0.027139861788315282, + "grad_norm": 0.2541595697402954, + "learning_rate": 0.0015, + "loss": 2.6987, + "step": 4375 + }, + { + "epoch": 0.02729494671281994, + "grad_norm": 0.11070574074983597, + "learning_rate": 0.0015, + "loss": 2.7102, + "step": 4400 + }, + { + "epoch": 0.02729494671281994, + "eval_loss": 4.702114105224609, + "perplexity": 110.17985534667969, + "step": 4400 + }, + { + "epoch": 0.0274500316373246, + "grad_norm": 0.09290622174739838, + "learning_rate": 0.0015, + "loss": 2.744, + "step": 4425 + }, + { + "epoch": 0.02760511656182926, + "grad_norm": 0.09867129474878311, + "learning_rate": 0.0015, + "loss": 2.6979, + "step": 4450 + }, + { + "epoch": 0.027760201486333916, + "grad_norm": 0.08975850045681, + "learning_rate": 0.0015, + "loss": 2.7346, + "step": 4475 + }, + { + "epoch": 0.027915286410838576, + "grad_norm": 0.1251811683177948, + "learning_rate": 0.0015, + "loss": 2.6901, + "step": 4500 + }, + { + "epoch": 0.028070371335343233, + "grad_norm": 0.10718528181314468, + "learning_rate": 0.0015, + "loss": 2.6584, + "step": 4525 + }, + { + "epoch": 0.028225456259847893, + "grad_norm": 0.1920158714056015, + "learning_rate": 0.0015, + "loss": 2.6776, + "step": 4550 + }, + { + "epoch": 0.028380541184352553, + "grad_norm": 0.11409153789281845, + "learning_rate": 0.0015, + "loss": 2.7052, + "step": 4575 + }, + { + "epoch": 0.02853562610885721, + "grad_norm": 0.12506772577762604, + "learning_rate": 0.0015, + "loss": 2.6954, + "step": 4600 + }, + { + "epoch": 0.02853562610885721, + "eval_loss": 4.685390949249268, + "perplexity": 108.35262298583984, + "step": 4600 + }, + { + "epoch": 0.02869071103336187, + "grad_norm": 0.1093166172504425, + "learning_rate": 0.0015, + "loss": 2.7257, + "step": 4625 + }, + { + "epoch": 0.028845795957866526, + "grad_norm": 0.16628532111644745, + "learning_rate": 0.0015, + "loss": 2.6782, + "step": 4650 + }, + { + "epoch": 0.029000880882371186, + "grad_norm": 0.1638079136610031, + "learning_rate": 0.0015, + "loss": 2.6884, + "step": 4675 + }, + { + "epoch": 0.029155965806875846, + "grad_norm": 0.11411619931459427, + "learning_rate": 0.0015, + "loss": 2.7054, + "step": 4700 + }, + { + "epoch": 0.029311050731380503, + "grad_norm": 0.09292814135551453, + "learning_rate": 0.0015, + "loss": 2.6826, + "step": 4725 + }, + { + "epoch": 0.029466135655885163, + "grad_norm": 0.09136354923248291, + "learning_rate": 0.0015, + "loss": 2.6936, + "step": 4750 + }, + { + "epoch": 0.029621220580389823, + "grad_norm": 0.1188502386212349, + "learning_rate": 0.0015, + "loss": 2.6466, + "step": 4775 + }, + { + "epoch": 0.02977630550489448, + "grad_norm": 0.09645655751228333, + "learning_rate": 0.0015, + "loss": 2.6092, + "step": 4800 + }, + { + "epoch": 0.02977630550489448, + "eval_loss": 4.683995723724365, + "perplexity": 108.20155334472656, + "step": 4800 + }, + { + "epoch": 0.02993139042939914, + "grad_norm": 0.17193672060966492, + "learning_rate": 0.0015, + "loss": 2.6916, + "step": 4825 + }, + { + "epoch": 0.030086475353903797, + "grad_norm": 0.14866988360881805, + "learning_rate": 0.0015, + "loss": 2.6776, + "step": 4850 + }, + { + "epoch": 0.030241560278408457, + "grad_norm": 0.10588869452476501, + "learning_rate": 0.0015, + "loss": 2.6773, + "step": 4875 + }, + { + "epoch": 0.030396645202913117, + "grad_norm": 0.12059559673070908, + "learning_rate": 0.0015, + "loss": 2.639, + "step": 4900 + }, + { + "epoch": 0.030551730127417773, + "grad_norm": 0.13296598196029663, + "learning_rate": 0.0015, + "loss": 2.6359, + "step": 4925 + }, + { + "epoch": 0.030706815051922434, + "grad_norm": 0.12300167232751846, + "learning_rate": 0.0015, + "loss": 2.668, + "step": 4950 + }, + { + "epoch": 0.03086189997642709, + "grad_norm": 0.15900522470474243, + "learning_rate": 0.0015, + "loss": 2.6252, + "step": 4975 + }, + { + "epoch": 0.03101698490093175, + "grad_norm": 0.138090580701828, + "learning_rate": 0.0015, + "loss": 2.659, + "step": 5000 + }, + { + "epoch": 0.03101698490093175, + "eval_loss": 4.688181400299072, + "perplexity": 108.65540313720703, + "step": 5000 + }, + { + "epoch": 0.03117206982543641, + "grad_norm": 0.13720737397670746, + "learning_rate": 0.0015, + "loss": 2.6096, + "step": 5025 + }, + { + "epoch": 0.03132715474994107, + "grad_norm": 0.13671600818634033, + "learning_rate": 0.0015, + "loss": 2.647, + "step": 5050 + }, + { + "epoch": 0.031482239674445724, + "grad_norm": 0.12611277401447296, + "learning_rate": 0.0015, + "loss": 2.639, + "step": 5075 + }, + { + "epoch": 0.031637324598950384, + "grad_norm": 0.12045291066169739, + "learning_rate": 0.0015, + "loss": 2.663, + "step": 5100 + }, + { + "epoch": 0.031792409523455044, + "grad_norm": 0.10857657343149185, + "learning_rate": 0.0015, + "loss": 2.6677, + "step": 5125 + }, + { + "epoch": 0.031947494447959704, + "grad_norm": 0.12052007764577866, + "learning_rate": 0.0015, + "loss": 2.6508, + "step": 5150 + }, + { + "epoch": 0.032102579372464364, + "grad_norm": 0.10999467223882675, + "learning_rate": 0.0015, + "loss": 2.661, + "step": 5175 + }, + { + "epoch": 0.03225766429696902, + "grad_norm": 0.11075185984373093, + "learning_rate": 0.0015, + "loss": 2.6645, + "step": 5200 + }, + { + "epoch": 0.03225766429696902, + "eval_loss": 4.706582546234131, + "perplexity": 110.67329406738281, + "step": 5200 + }, + { + "epoch": 0.03241274922147368, + "grad_norm": 0.09703061729669571, + "learning_rate": 0.0015, + "loss": 2.6109, + "step": 5225 + }, + { + "epoch": 0.03256783414597834, + "grad_norm": 0.13556119799613953, + "learning_rate": 0.0015, + "loss": 2.6621, + "step": 5250 + }, + { + "epoch": 0.032722919070483, + "grad_norm": 0.09178316593170166, + "learning_rate": 0.0015, + "loss": 2.6263, + "step": 5275 + }, + { + "epoch": 0.03287800399498766, + "grad_norm": 0.10839138180017471, + "learning_rate": 0.0015, + "loss": 2.5999, + "step": 5300 + }, + { + "epoch": 0.03303308891949231, + "grad_norm": 0.12049377709627151, + "learning_rate": 0.0015, + "loss": 2.6085, + "step": 5325 + }, + { + "epoch": 0.03318817384399697, + "grad_norm": 0.15260230004787445, + "learning_rate": 0.0015, + "loss": 2.664, + "step": 5350 + }, + { + "epoch": 0.03334325876850163, + "grad_norm": 0.12393297255039215, + "learning_rate": 0.0015, + "loss": 2.6234, + "step": 5375 + }, + { + "epoch": 0.03349834369300629, + "grad_norm": 0.1284521073102951, + "learning_rate": 0.0015, + "loss": 2.5624, + "step": 5400 + }, + { + "epoch": 0.03349834369300629, + "eval_loss": 4.696901321411133, + "perplexity": 109.60700988769531, + "step": 5400 + }, + { + "epoch": 0.03365342861751095, + "grad_norm": 0.18052247166633606, + "learning_rate": 0.0015, + "loss": 2.5779, + "step": 5425 + }, + { + "epoch": 0.033808513542015604, + "grad_norm": 0.11775010824203491, + "learning_rate": 0.0015, + "loss": 2.6167, + "step": 5450 + }, + { + "epoch": 0.033963598466520264, + "grad_norm": 0.13769109547138214, + "learning_rate": 0.0015, + "loss": 2.6117, + "step": 5475 + }, + { + "epoch": 0.034118683391024925, + "grad_norm": 0.09634970873594284, + "learning_rate": 0.0015, + "loss": 2.613, + "step": 5500 + }, + { + "epoch": 0.034273768315529585, + "grad_norm": 0.14692488312721252, + "learning_rate": 0.0015, + "loss": 2.6176, + "step": 5525 + }, + { + "epoch": 0.034428853240034245, + "grad_norm": 0.21920783817768097, + "learning_rate": 0.0015, + "loss": 2.6196, + "step": 5550 + }, + { + "epoch": 0.034583938164538905, + "grad_norm": 0.1033003106713295, + "learning_rate": 0.0015, + "loss": 2.5872, + "step": 5575 + }, + { + "epoch": 0.03473902308904356, + "grad_norm": 0.09867612272500992, + "learning_rate": 0.0015, + "loss": 2.5782, + "step": 5600 + }, + { + "epoch": 0.03473902308904356, + "eval_loss": 4.704063892364502, + "perplexity": 110.3948974609375, + "step": 5600 + }, + { + "epoch": 0.03489410801354822, + "grad_norm": 0.1032184287905693, + "learning_rate": 0.0015, + "loss": 2.6187, + "step": 5625 + }, + { + "epoch": 0.03504919293805288, + "grad_norm": 0.12661318480968475, + "learning_rate": 0.0015, + "loss": 2.5805, + "step": 5650 + }, + { + "epoch": 0.03520427786255754, + "grad_norm": 0.28772449493408203, + "learning_rate": 0.0015, + "loss": 2.7518, + "step": 5675 + }, + { + "epoch": 0.0353593627870622, + "grad_norm": 0.10005131363868713, + "learning_rate": 0.0015, + "loss": 2.8556, + "step": 5700 + }, + { + "epoch": 0.03551444771156685, + "grad_norm": 0.10379570722579956, + "learning_rate": 0.0015, + "loss": 2.8648, + "step": 5725 + }, + { + "epoch": 0.03566953263607151, + "grad_norm": 0.08921229094266891, + "learning_rate": 0.0015, + "loss": 2.8421, + "step": 5750 + }, + { + "epoch": 0.03582461756057617, + "grad_norm": 0.15366144478321075, + "learning_rate": 0.0015, + "loss": 2.8162, + "step": 5775 + }, + { + "epoch": 0.03597970248508083, + "grad_norm": 0.12743431329727173, + "learning_rate": 0.0015, + "loss": 2.8635, + "step": 5800 + }, + { + "epoch": 0.03597970248508083, + "eval_loss": 4.674878120422363, + "perplexity": 107.21949768066406, + "step": 5800 + }, + { + "epoch": 0.03613478740958549, + "grad_norm": 0.08773666620254517, + "learning_rate": 0.0015, + "loss": 2.8787, + "step": 5825 + }, + { + "epoch": 0.036289872334090145, + "grad_norm": 0.11721781641244888, + "learning_rate": 0.0015, + "loss": 2.853, + "step": 5850 + }, + { + "epoch": 0.036444957258594805, + "grad_norm": 0.09957700222730637, + "learning_rate": 0.0015, + "loss": 2.8163, + "step": 5875 + }, + { + "epoch": 0.036600042183099465, + "grad_norm": 0.09999966621398926, + "learning_rate": 0.0015, + "loss": 2.8206, + "step": 5900 + }, + { + "epoch": 0.036755127107604126, + "grad_norm": 0.09899301081895828, + "learning_rate": 0.0015, + "loss": 2.8378, + "step": 5925 + }, + { + "epoch": 0.036910212032108786, + "grad_norm": 0.09676779061555862, + "learning_rate": 0.0015, + "loss": 2.8385, + "step": 5950 + }, + { + "epoch": 0.03706529695661344, + "grad_norm": 0.14397811889648438, + "learning_rate": 0.0015, + "loss": 2.8639, + "step": 5975 + }, + { + "epoch": 0.0372203818811181, + "grad_norm": 0.08991026133298874, + "learning_rate": 0.0015, + "loss": 2.862, + "step": 6000 + }, + { + "epoch": 0.0372203818811181, + "eval_loss": 4.649503707885742, + "perplexity": 104.53309631347656, + "step": 6000 + }, + { + "epoch": 0.03737546680562276, + "grad_norm": 0.11916879564523697, + "learning_rate": 0.0015, + "loss": 2.8336, + "step": 6025 + }, + { + "epoch": 0.03753055173012742, + "grad_norm": 0.1533547192811966, + "learning_rate": 0.0015, + "loss": 2.8154, + "step": 6050 + }, + { + "epoch": 0.03768563665463208, + "grad_norm": 0.10416785627603531, + "learning_rate": 0.0015, + "loss": 2.8073, + "step": 6075 + }, + { + "epoch": 0.03784072157913673, + "grad_norm": 0.1307593733072281, + "learning_rate": 0.0015, + "loss": 2.8227, + "step": 6100 + }, + { + "epoch": 0.03799580650364139, + "grad_norm": 0.11226139962673187, + "learning_rate": 0.0015, + "loss": 2.8316, + "step": 6125 + }, + { + "epoch": 0.03815089142814605, + "grad_norm": 0.12050950527191162, + "learning_rate": 0.0015, + "loss": 2.8636, + "step": 6150 + }, + { + "epoch": 0.03830597635265071, + "grad_norm": 0.14836955070495605, + "learning_rate": 0.0015, + "loss": 2.8433, + "step": 6175 + }, + { + "epoch": 0.03846106127715537, + "grad_norm": 0.1240909993648529, + "learning_rate": 0.0015, + "loss": 2.885, + "step": 6200 + }, + { + "epoch": 0.03846106127715537, + "eval_loss": 4.652696132659912, + "perplexity": 104.86734008789062, + "step": 6200 + }, + { + "epoch": 0.038616146201660026, + "grad_norm": 0.09549515694379807, + "learning_rate": 0.0015, + "loss": 2.822, + "step": 6225 + }, + { + "epoch": 0.038771231126164686, + "grad_norm": 0.1386450082063675, + "learning_rate": 0.0015, + "loss": 2.8455, + "step": 6250 + }, + { + "epoch": 0.038926316050669346, + "grad_norm": 0.10233025252819061, + "learning_rate": 0.0015, + "loss": 2.834, + "step": 6275 + }, + { + "epoch": 0.039081400975174006, + "grad_norm": 0.09776704013347626, + "learning_rate": 0.0015, + "loss": 2.8114, + "step": 6300 + }, + { + "epoch": 0.039236485899678666, + "grad_norm": 0.09631351381540298, + "learning_rate": 0.0015, + "loss": 2.8107, + "step": 6325 + }, + { + "epoch": 0.03939157082418332, + "grad_norm": 0.08424117416143417, + "learning_rate": 0.0015, + "loss": 2.8373, + "step": 6350 + }, + { + "epoch": 0.03954665574868798, + "grad_norm": 0.14171521365642548, + "learning_rate": 0.0015, + "loss": 2.8394, + "step": 6375 + }, + { + "epoch": 0.03970174067319264, + "grad_norm": 0.11349046230316162, + "learning_rate": 0.0015, + "loss": 2.8131, + "step": 6400 + }, + { + "epoch": 0.03970174067319264, + "eval_loss": 4.652514934539795, + "perplexity": 104.84834289550781, + "step": 6400 + }, + { + "epoch": 0.0398568255976973, + "grad_norm": 0.09066054224967957, + "learning_rate": 0.0015, + "loss": 2.8758, + "step": 6425 + }, + { + "epoch": 0.04001191052220196, + "grad_norm": 0.09391192346811295, + "learning_rate": 0.0015, + "loss": 2.826, + "step": 6450 + }, + { + "epoch": 0.04016699544670661, + "grad_norm": 0.17412593960762024, + "learning_rate": 0.0015, + "loss": 2.8487, + "step": 6475 + }, + { + "epoch": 0.04032208037121127, + "grad_norm": 0.17672564089298248, + "learning_rate": 0.0015, + "loss": 2.8441, + "step": 6500 + }, + { + "epoch": 0.04047716529571593, + "grad_norm": 0.11427825689315796, + "learning_rate": 0.0015, + "loss": 2.8843, + "step": 6525 + }, + { + "epoch": 0.04063225022022059, + "grad_norm": 0.13745597004890442, + "learning_rate": 0.0015, + "loss": 2.8458, + "step": 6550 + }, + { + "epoch": 0.040787335144725254, + "grad_norm": 0.12339327484369278, + "learning_rate": 0.0015, + "loss": 2.8299, + "step": 6575 + }, + { + "epoch": 0.040942420069229914, + "grad_norm": 0.11045660078525543, + "learning_rate": 0.0015, + "loss": 2.8504, + "step": 6600 + }, + { + "epoch": 0.040942420069229914, + "eval_loss": 4.645139217376709, + "perplexity": 104.0778579711914, + "step": 6600 + }, + { + "epoch": 0.04109750499373457, + "grad_norm": 0.14822149276733398, + "learning_rate": 0.0015, + "loss": 2.8438, + "step": 6625 + }, + { + "epoch": 0.04125258991823923, + "grad_norm": 0.09271769225597382, + "learning_rate": 0.0015, + "loss": 2.8195, + "step": 6650 + }, + { + "epoch": 0.04140767484274389, + "grad_norm": 0.12357133626937866, + "learning_rate": 0.0015, + "loss": 2.8434, + "step": 6675 + }, + { + "epoch": 0.04156275976724855, + "grad_norm": 0.12669824063777924, + "learning_rate": 0.0015, + "loss": 2.8262, + "step": 6700 + }, + { + "epoch": 0.04171784469175321, + "grad_norm": 0.10409893840551376, + "learning_rate": 0.0015, + "loss": 2.8164, + "step": 6725 + }, + { + "epoch": 0.04187292961625786, + "grad_norm": 0.10687699913978577, + "learning_rate": 0.0015, + "loss": 2.83, + "step": 6750 + }, + { + "epoch": 0.04202801454076252, + "grad_norm": 0.09924216568470001, + "learning_rate": 0.0015, + "loss": 2.8415, + "step": 6775 + }, + { + "epoch": 0.04218309946526718, + "grad_norm": 0.11719833314418793, + "learning_rate": 0.0015, + "loss": 2.8368, + "step": 6800 + }, + { + "epoch": 0.04218309946526718, + "eval_loss": 4.673882484436035, + "perplexity": 107.11280059814453, + "step": 6800 + }, + { + "epoch": 0.04233818438977184, + "grad_norm": 0.10162920504808426, + "learning_rate": 0.0015, + "loss": 2.8285, + "step": 6825 + }, + { + "epoch": 0.0424932693142765, + "grad_norm": 0.10563603043556213, + "learning_rate": 0.0015, + "loss": 2.809, + "step": 6850 + }, + { + "epoch": 0.042648354238781154, + "grad_norm": 0.079631008207798, + "learning_rate": 0.0015, + "loss": 2.8362, + "step": 6875 + }, + { + "epoch": 0.042803439163285814, + "grad_norm": 0.11915802210569382, + "learning_rate": 0.0015, + "loss": 2.8211, + "step": 6900 + }, + { + "epoch": 0.042958524087790474, + "grad_norm": 0.13783864676952362, + "learning_rate": 0.0015, + "loss": 2.8403, + "step": 6925 + }, + { + "epoch": 0.043113609012295134, + "grad_norm": 0.17333541810512543, + "learning_rate": 0.0015, + "loss": 2.8699, + "step": 6950 + }, + { + "epoch": 0.043268693936799794, + "grad_norm": 0.10923554003238678, + "learning_rate": 0.0015, + "loss": 2.8016, + "step": 6975 + }, + { + "epoch": 0.04342377886130445, + "grad_norm": 0.10525023192167282, + "learning_rate": 0.0015, + "loss": 2.8302, + "step": 7000 + }, + { + "epoch": 0.04342377886130445, + "eval_loss": 4.660215854644775, + "perplexity": 105.65888977050781, + "step": 7000 + }, + { + "epoch": 0.04357886378580911, + "grad_norm": 0.10499420017004013, + "learning_rate": 0.0015, + "loss": 2.8215, + "step": 7025 + }, + { + "epoch": 0.04373394871031377, + "grad_norm": 0.09560755640268326, + "learning_rate": 0.0015, + "loss": 2.8279, + "step": 7050 + }, + { + "epoch": 0.04388903363481843, + "grad_norm": 0.10454019159078598, + "learning_rate": 0.0015, + "loss": 2.8161, + "step": 7075 + }, + { + "epoch": 0.04404411855932309, + "grad_norm": 0.0982690081000328, + "learning_rate": 0.0015, + "loss": 2.7895, + "step": 7100 + }, + { + "epoch": 0.04419920348382774, + "grad_norm": 0.10405784100294113, + "learning_rate": 0.0015, + "loss": 2.7945, + "step": 7125 + }, + { + "epoch": 0.0443542884083324, + "grad_norm": 0.09310988336801529, + "learning_rate": 0.0015, + "loss": 2.8535, + "step": 7150 + }, + { + "epoch": 0.04450937333283706, + "grad_norm": 0.1031995639204979, + "learning_rate": 0.0015, + "loss": 2.8298, + "step": 7175 + }, + { + "epoch": 0.04466445825734172, + "grad_norm": 0.09206147491931915, + "learning_rate": 0.0015, + "loss": 2.794, + "step": 7200 + }, + { + "epoch": 0.04466445825734172, + "eval_loss": 4.642621994018555, + "perplexity": 103.81619262695312, + "step": 7200 + }, + { + "epoch": 0.04481954318184638, + "grad_norm": 0.1051359549164772, + "learning_rate": 0.0015, + "loss": 2.7996, + "step": 7225 + }, + { + "epoch": 0.044974628106351035, + "grad_norm": 0.12941063940525055, + "learning_rate": 0.0015, + "loss": 2.792, + "step": 7250 + }, + { + "epoch": 0.045129713030855695, + "grad_norm": 0.09297281503677368, + "learning_rate": 0.0015, + "loss": 2.7847, + "step": 7275 + }, + { + "epoch": 0.045284797955360355, + "grad_norm": 0.11114951968193054, + "learning_rate": 0.0015, + "loss": 2.8164, + "step": 7300 + }, + { + "epoch": 0.045439882879865015, + "grad_norm": 0.08519440144300461, + "learning_rate": 0.0015, + "loss": 2.8053, + "step": 7325 + }, + { + "epoch": 0.045594967804369675, + "grad_norm": 0.11148552596569061, + "learning_rate": 0.0015, + "loss": 2.7871, + "step": 7350 + }, + { + "epoch": 0.04575005272887433, + "grad_norm": 0.136012002825737, + "learning_rate": 0.0015, + "loss": 2.8457, + "step": 7375 + }, + { + "epoch": 0.04590513765337899, + "grad_norm": 0.1037759929895401, + "learning_rate": 0.0015, + "loss": 2.748, + "step": 7400 + }, + { + "epoch": 0.04590513765337899, + "eval_loss": 4.631537437438965, + "perplexity": 102.67179107666016, + "step": 7400 + }, + { + "epoch": 0.04606022257788365, + "grad_norm": 0.11162275820970535, + "learning_rate": 0.0015, + "loss": 2.8044, + "step": 7425 + }, + { + "epoch": 0.04621530750238831, + "grad_norm": 0.11309058219194412, + "learning_rate": 0.0015, + "loss": 2.8198, + "step": 7450 + }, + { + "epoch": 0.04637039242689297, + "grad_norm": 0.09359199553728104, + "learning_rate": 0.0015, + "loss": 2.8302, + "step": 7475 + }, + { + "epoch": 0.04652547735139762, + "grad_norm": 0.09513767808675766, + "learning_rate": 0.0015, + "loss": 2.8325, + "step": 7500 + }, + { + "epoch": 0.04668056227590228, + "grad_norm": 0.08243551850318909, + "learning_rate": 0.0015, + "loss": 2.7925, + "step": 7525 + }, + { + "epoch": 0.04683564720040694, + "grad_norm": 0.08001349121332169, + "learning_rate": 0.0015, + "loss": 2.8406, + "step": 7550 + }, + { + "epoch": 0.0469907321249116, + "grad_norm": 0.11749595403671265, + "learning_rate": 0.0015, + "loss": 2.7762, + "step": 7575 + }, + { + "epoch": 0.04714581704941626, + "grad_norm": 0.15697765350341797, + "learning_rate": 0.0015, + "loss": 2.8137, + "step": 7600 + }, + { + "epoch": 0.04714581704941626, + "eval_loss": 4.643322467803955, + "perplexity": 103.8889389038086, + "step": 7600 + }, + { + "epoch": 0.04730090197392092, + "grad_norm": 0.1004658117890358, + "learning_rate": 0.0015, + "loss": 2.7787, + "step": 7625 + }, + { + "epoch": 0.047455986898425576, + "grad_norm": 0.11577022075653076, + "learning_rate": 0.0015, + "loss": 2.806, + "step": 7650 + }, + { + "epoch": 0.047611071822930236, + "grad_norm": 0.10791046917438507, + "learning_rate": 0.0015, + "loss": 2.7637, + "step": 7675 + }, + { + "epoch": 0.047766156747434896, + "grad_norm": 0.09490654617547989, + "learning_rate": 0.0015, + "loss": 2.8187, + "step": 7700 + }, + { + "epoch": 0.047921241671939556, + "grad_norm": 0.10448817163705826, + "learning_rate": 0.0015, + "loss": 2.8335, + "step": 7725 + }, + { + "epoch": 0.048076326596444216, + "grad_norm": 0.10800398141145706, + "learning_rate": 0.0015, + "loss": 2.8138, + "step": 7750 + }, + { + "epoch": 0.04823141152094887, + "grad_norm": 0.10268035531044006, + "learning_rate": 0.0015, + "loss": 2.8074, + "step": 7775 + }, + { + "epoch": 0.04838649644545353, + "grad_norm": 0.145925372838974, + "learning_rate": 0.0015, + "loss": 2.8161, + "step": 7800 + }, + { + "epoch": 0.04838649644545353, + "eval_loss": 4.628528118133545, + "perplexity": 102.36328887939453, + "step": 7800 + }, + { + "epoch": 0.04854158136995819, + "grad_norm": 0.1422831267118454, + "learning_rate": 0.0015, + "loss": 2.8179, + "step": 7825 + }, + { + "epoch": 0.04869666629446285, + "grad_norm": 0.10019826889038086, + "learning_rate": 0.0015, + "loss": 2.8228, + "step": 7850 + }, + { + "epoch": 0.04885175121896751, + "grad_norm": 0.12028387933969498, + "learning_rate": 0.0015, + "loss": 2.8359, + "step": 7875 + }, + { + "epoch": 0.04900683614347216, + "grad_norm": 0.08171118795871735, + "learning_rate": 0.0015, + "loss": 2.7829, + "step": 7900 + }, + { + "epoch": 0.04916192106797682, + "grad_norm": 0.138522207736969, + "learning_rate": 0.0015, + "loss": 2.7992, + "step": 7925 + }, + { + "epoch": 0.04931700599248148, + "grad_norm": 0.10419227927923203, + "learning_rate": 0.0015, + "loss": 2.8097, + "step": 7950 + }, + { + "epoch": 0.04947209091698614, + "grad_norm": 0.1020691841840744, + "learning_rate": 0.0015, + "loss": 2.8152, + "step": 7975 + }, + { + "epoch": 0.0496271758414908, + "grad_norm": 0.12423787266016006, + "learning_rate": 0.0015, + "loss": 2.7966, + "step": 8000 + }, + { + "epoch": 0.0496271758414908, + "eval_loss": 4.6273722648620605, + "perplexity": 102.24504089355469, + "step": 8000 + }, + { + "epoch": 0.049782260765995456, + "grad_norm": 0.15230977535247803, + "learning_rate": 0.0015, + "loss": 2.7575, + "step": 8025 + }, + { + "epoch": 0.049937345690500116, + "grad_norm": 0.12649676203727722, + "learning_rate": 0.0015, + "loss": 2.7897, + "step": 8050 + }, + { + "epoch": 0.05009243061500478, + "grad_norm": 0.11257271468639374, + "learning_rate": 0.0015, + "loss": 2.8115, + "step": 8075 + }, + { + "epoch": 0.05024751553950944, + "grad_norm": 0.09349871426820755, + "learning_rate": 0.0015, + "loss": 2.8041, + "step": 8100 + }, + { + "epoch": 0.0504026004640141, + "grad_norm": 0.14108401536941528, + "learning_rate": 0.0015, + "loss": 2.7772, + "step": 8125 + }, + { + "epoch": 0.05055768538851875, + "grad_norm": 0.17286863923072815, + "learning_rate": 0.0015, + "loss": 2.8197, + "step": 8150 + }, + { + "epoch": 0.05071277031302341, + "grad_norm": 0.10759209096431732, + "learning_rate": 0.0015, + "loss": 2.8396, + "step": 8175 + }, + { + "epoch": 0.05086785523752807, + "grad_norm": 0.10236554592847824, + "learning_rate": 0.0015, + "loss": 2.8175, + "step": 8200 + }, + { + "epoch": 0.05086785523752807, + "eval_loss": 4.610519886016846, + "perplexity": 100.5363998413086, + "step": 8200 + }, + { + "epoch": 0.05102294016203273, + "grad_norm": 0.12348885089159012, + "learning_rate": 0.0015, + "loss": 2.8139, + "step": 8225 + }, + { + "epoch": 0.05117802508653739, + "grad_norm": 0.10251584649085999, + "learning_rate": 0.0015, + "loss": 2.8436, + "step": 8250 + }, + { + "epoch": 0.051333110011042044, + "grad_norm": 0.10069389641284943, + "learning_rate": 0.0015, + "loss": 2.8409, + "step": 8275 + }, + { + "epoch": 0.051488194935546704, + "grad_norm": 0.1546829789876938, + "learning_rate": 0.0015, + "loss": 2.8199, + "step": 8300 + }, + { + "epoch": 0.051643279860051364, + "grad_norm": 0.10704527795314789, + "learning_rate": 0.0015, + "loss": 2.7721, + "step": 8325 + }, + { + "epoch": 0.051798364784556024, + "grad_norm": 0.12251198291778564, + "learning_rate": 0.0015, + "loss": 2.8175, + "step": 8350 + }, + { + "epoch": 0.051953449709060684, + "grad_norm": 0.11113474518060684, + "learning_rate": 0.0015, + "loss": 2.8085, + "step": 8375 + }, + { + "epoch": 0.05210853463356534, + "grad_norm": 0.1341187059879303, + "learning_rate": 0.0015, + "loss": 2.8169, + "step": 8400 + }, + { + "epoch": 0.05210853463356534, + "eval_loss": 4.610434532165527, + "perplexity": 100.52782440185547, + "step": 8400 + }, + { + "epoch": 0.05226361955807, + "grad_norm": 0.16195224225521088, + "learning_rate": 0.0015, + "loss": 2.8266, + "step": 8425 + }, + { + "epoch": 0.05241870448257466, + "grad_norm": 0.1637653261423111, + "learning_rate": 0.0015, + "loss": 2.8106, + "step": 8450 + }, + { + "epoch": 0.05257378940707932, + "grad_norm": 0.10014921426773071, + "learning_rate": 0.0015, + "loss": 2.8103, + "step": 8475 + }, + { + "epoch": 0.05272887433158398, + "grad_norm": 0.11419603228569031, + "learning_rate": 0.0015, + "loss": 2.7965, + "step": 8500 + }, + { + "epoch": 0.05288395925608863, + "grad_norm": 0.08137035369873047, + "learning_rate": 0.0015, + "loss": 2.7802, + "step": 8525 + }, + { + "epoch": 0.05303904418059329, + "grad_norm": 0.08078640699386597, + "learning_rate": 0.0015, + "loss": 2.7819, + "step": 8550 + }, + { + "epoch": 0.05319412910509795, + "grad_norm": 0.13133442401885986, + "learning_rate": 0.0015, + "loss": 2.83, + "step": 8575 + }, + { + "epoch": 0.05334921402960261, + "grad_norm": 0.08819993585348129, + "learning_rate": 0.0015, + "loss": 2.833, + "step": 8600 + }, + { + "epoch": 0.05334921402960261, + "eval_loss": 4.603670120239258, + "perplexity": 99.85010528564453, + "step": 8600 + }, + { + "epoch": 0.05350429895410727, + "grad_norm": 0.14662431180477142, + "learning_rate": 0.0015, + "loss": 2.8201, + "step": 8625 + }, + { + "epoch": 0.05365938387861193, + "grad_norm": 0.10400764644145966, + "learning_rate": 0.0015, + "loss": 2.7944, + "step": 8650 + }, + { + "epoch": 0.053814468803116584, + "grad_norm": 0.2790142297744751, + "learning_rate": 0.0015, + "loss": 2.8307, + "step": 8675 + }, + { + "epoch": 0.053969553727621244, + "grad_norm": 0.13645683228969574, + "learning_rate": 0.0015, + "loss": 2.7904, + "step": 8700 + }, + { + "epoch": 0.054124638652125905, + "grad_norm": 0.09604925662279129, + "learning_rate": 0.0015, + "loss": 2.76, + "step": 8725 + }, + { + "epoch": 0.054279723576630565, + "grad_norm": 0.07631650567054749, + "learning_rate": 0.0015, + "loss": 2.7955, + "step": 8750 + }, + { + "epoch": 0.054434808501135225, + "grad_norm": 0.13132531940937042, + "learning_rate": 0.0015, + "loss": 2.8308, + "step": 8775 + }, + { + "epoch": 0.05458989342563988, + "grad_norm": 0.08334681391716003, + "learning_rate": 0.0015, + "loss": 2.755, + "step": 8800 + }, + { + "epoch": 0.05458989342563988, + "eval_loss": 4.597860336303711, + "perplexity": 99.27168273925781, + "step": 8800 + }, + { + "epoch": 0.05474497835014454, + "grad_norm": 0.10585317760705948, + "learning_rate": 0.0015, + "loss": 2.7708, + "step": 8825 + }, + { + "epoch": 0.0549000632746492, + "grad_norm": 0.08953095227479935, + "learning_rate": 0.0015, + "loss": 2.7622, + "step": 8850 + }, + { + "epoch": 0.05505514819915386, + "grad_norm": 0.10430523008108139, + "learning_rate": 0.0015, + "loss": 2.8255, + "step": 8875 + }, + { + "epoch": 0.05521023312365852, + "grad_norm": 0.08961856365203857, + "learning_rate": 0.0015, + "loss": 2.7835, + "step": 8900 + }, + { + "epoch": 0.05536531804816317, + "grad_norm": 0.13602201640605927, + "learning_rate": 0.0015, + "loss": 2.813, + "step": 8925 + }, + { + "epoch": 0.05552040297266783, + "grad_norm": 0.1858643889427185, + "learning_rate": 0.0015, + "loss": 2.8296, + "step": 8950 + }, + { + "epoch": 0.05567548789717249, + "grad_norm": 0.12873806059360504, + "learning_rate": 0.0015, + "loss": 2.7669, + "step": 8975 + }, + { + "epoch": 0.05583057282167715, + "grad_norm": 0.09891733527183533, + "learning_rate": 0.0015, + "loss": 2.7829, + "step": 9000 + }, + { + "epoch": 0.05583057282167715, + "eval_loss": 4.606179714202881, + "perplexity": 100.10100555419922, + "step": 9000 + }, + { + "epoch": 0.05598565774618181, + "grad_norm": 0.1619413048028946, + "learning_rate": 0.0015, + "loss": 2.7885, + "step": 9025 + }, + { + "epoch": 0.056140742670686465, + "grad_norm": 0.1223379522562027, + "learning_rate": 0.0015, + "loss": 2.7829, + "step": 9050 + }, + { + "epoch": 0.056295827595191125, + "grad_norm": 0.10872245579957962, + "learning_rate": 0.0015, + "loss": 2.7962, + "step": 9075 + }, + { + "epoch": 0.056450912519695785, + "grad_norm": 0.11461862176656723, + "learning_rate": 0.0015, + "loss": 2.7476, + "step": 9100 + }, + { + "epoch": 0.056605997444200445, + "grad_norm": 0.08933119475841522, + "learning_rate": 0.0015, + "loss": 2.7745, + "step": 9125 + }, + { + "epoch": 0.056761082368705106, + "grad_norm": 0.12911683320999146, + "learning_rate": 0.0015, + "loss": 2.8029, + "step": 9150 + }, + { + "epoch": 0.05691616729320976, + "grad_norm": 0.13963252305984497, + "learning_rate": 0.0015, + "loss": 2.7931, + "step": 9175 + }, + { + "epoch": 0.05707125221771442, + "grad_norm": 0.13462606072425842, + "learning_rate": 0.0015, + "loss": 2.7771, + "step": 9200 + }, + { + "epoch": 0.05707125221771442, + "eval_loss": 4.619841575622559, + "perplexity": 101.47795104980469, + "step": 9200 + }, + { + "epoch": 0.05722633714221908, + "grad_norm": 0.12551379203796387, + "learning_rate": 0.0015, + "loss": 2.7934, + "step": 9225 + }, + { + "epoch": 0.05738142206672374, + "grad_norm": 0.12379872798919678, + "learning_rate": 0.0015, + "loss": 2.7882, + "step": 9250 + }, + { + "epoch": 0.0575365069912284, + "grad_norm": 0.0940781831741333, + "learning_rate": 0.0015, + "loss": 2.7658, + "step": 9275 + }, + { + "epoch": 0.05769159191573305, + "grad_norm": 0.14165829122066498, + "learning_rate": 0.0015, + "loss": 2.7973, + "step": 9300 + }, + { + "epoch": 0.05784667684023771, + "grad_norm": 0.10727201402187347, + "learning_rate": 0.0015, + "loss": 2.815, + "step": 9325 + }, + { + "epoch": 0.05800176176474237, + "grad_norm": 0.1628653109073639, + "learning_rate": 0.0015, + "loss": 2.7854, + "step": 9350 + }, + { + "epoch": 0.05815684668924703, + "grad_norm": 0.09925588220357895, + "learning_rate": 0.0015, + "loss": 2.7578, + "step": 9375 + }, + { + "epoch": 0.05831193161375169, + "grad_norm": 0.1587476134300232, + "learning_rate": 0.0015, + "loss": 2.7296, + "step": 9400 + }, + { + "epoch": 0.05831193161375169, + "eval_loss": 4.604221343994141, + "perplexity": 99.90515899658203, + "step": 9400 + }, + { + "epoch": 0.058467016538256346, + "grad_norm": 0.10519708693027496, + "learning_rate": 0.0015, + "loss": 2.7712, + "step": 9425 + }, + { + "epoch": 0.058622101462761006, + "grad_norm": 0.10321429371833801, + "learning_rate": 0.0015, + "loss": 2.7281, + "step": 9450 + }, + { + "epoch": 0.058777186387265666, + "grad_norm": 0.20060209929943085, + "learning_rate": 0.0015, + "loss": 2.807, + "step": 9475 + }, + { + "epoch": 0.058932271311770326, + "grad_norm": 0.10847010463476181, + "learning_rate": 0.0015, + "loss": 2.8078, + "step": 9500 + }, + { + "epoch": 0.059087356236274986, + "grad_norm": 0.11248752474784851, + "learning_rate": 0.0015, + "loss": 2.796, + "step": 9525 + }, + { + "epoch": 0.059242441160779646, + "grad_norm": 0.13171915709972382, + "learning_rate": 0.0015, + "loss": 2.7658, + "step": 9550 + }, + { + "epoch": 0.0593975260852843, + "grad_norm": 0.12041529268026352, + "learning_rate": 0.0015, + "loss": 2.7507, + "step": 9575 + }, + { + "epoch": 0.05955261100978896, + "grad_norm": 0.11275593191385269, + "learning_rate": 0.0015, + "loss": 2.8022, + "step": 9600 + }, + { + "epoch": 0.05955261100978896, + "eval_loss": 4.5886077880859375, + "perplexity": 98.3573989868164, + "step": 9600 + }, + { + "epoch": 0.05970769593429362, + "grad_norm": 0.1715971678495407, + "learning_rate": 0.0015, + "loss": 2.8003, + "step": 9625 + }, + { + "epoch": 0.05986278085879828, + "grad_norm": 0.1223614364862442, + "learning_rate": 0.0015, + "loss": 2.8012, + "step": 9650 + }, + { + "epoch": 0.06001786578330294, + "grad_norm": 0.114704430103302, + "learning_rate": 0.0015, + "loss": 2.7963, + "step": 9675 + }, + { + "epoch": 0.06017295070780759, + "grad_norm": 0.10282139480113983, + "learning_rate": 0.0015, + "loss": 2.7965, + "step": 9700 + }, + { + "epoch": 0.06032803563231225, + "grad_norm": 0.10494767129421234, + "learning_rate": 0.0015, + "loss": 2.7698, + "step": 9725 + }, + { + "epoch": 0.06048312055681691, + "grad_norm": 0.0908605083823204, + "learning_rate": 0.0015, + "loss": 2.749, + "step": 9750 + }, + { + "epoch": 0.06063820548132157, + "grad_norm": 0.0847998857498169, + "learning_rate": 0.0015, + "loss": 2.838, + "step": 9775 + }, + { + "epoch": 0.060793290405826234, + "grad_norm": 0.24615754187107086, + "learning_rate": 0.0015, + "loss": 2.8117, + "step": 9800 + }, + { + "epoch": 0.060793290405826234, + "eval_loss": 4.593789100646973, + "perplexity": 98.86833953857422, + "step": 9800 + }, + { + "epoch": 0.06094837533033089, + "grad_norm": 0.0959208682179451, + "learning_rate": 0.0015, + "loss": 2.7845, + "step": 9825 + }, + { + "epoch": 0.06110346025483555, + "grad_norm": 0.09963307529687881, + "learning_rate": 0.0015, + "loss": 2.8296, + "step": 9850 + }, + { + "epoch": 0.06125854517934021, + "grad_norm": 0.1115136444568634, + "learning_rate": 0.0015, + "loss": 2.7586, + "step": 9875 + }, + { + "epoch": 0.06141363010384487, + "grad_norm": 0.13883067667484283, + "learning_rate": 0.0015, + "loss": 2.7978, + "step": 9900 + }, + { + "epoch": 0.06156871502834953, + "grad_norm": 0.2048570066690445, + "learning_rate": 0.0015, + "loss": 2.8397, + "step": 9925 + }, + { + "epoch": 0.06172379995285418, + "grad_norm": 0.1306881606578827, + "learning_rate": 0.0015, + "loss": 2.8084, + "step": 9950 + }, + { + "epoch": 0.06187888487735884, + "grad_norm": 0.18285603821277618, + "learning_rate": 0.0015, + "loss": 2.7989, + "step": 9975 + }, + { + "epoch": 0.0620339698018635, + "grad_norm": 0.1109723299741745, + "learning_rate": 0.0015, + "loss": 2.8064, + "step": 10000 + }, + { + "epoch": 0.0620339698018635, + "eval_loss": 4.5877556800842285, + "perplexity": 98.27362823486328, + "step": 10000 + }, + { + "epoch": 0.06218905472636816, + "grad_norm": 0.12350066751241684, + "learning_rate": 0.0015, + "loss": 2.7684, + "step": 10025 + }, + { + "epoch": 0.06234413965087282, + "grad_norm": 0.11565285176038742, + "learning_rate": 0.0015, + "loss": 2.7748, + "step": 10050 + }, + { + "epoch": 0.062499224575377474, + "grad_norm": 0.1117839589715004, + "learning_rate": 0.0015, + "loss": 2.8044, + "step": 10075 + }, + { + "epoch": 0.06265430949988214, + "grad_norm": 0.1102209985256195, + "learning_rate": 0.0015, + "loss": 2.7844, + "step": 10100 + }, + { + "epoch": 0.0628093944243868, + "grad_norm": 0.10270575433969498, + "learning_rate": 0.0015, + "loss": 2.7685, + "step": 10125 + }, + { + "epoch": 0.06296447934889145, + "grad_norm": 0.09842963516712189, + "learning_rate": 0.0015, + "loss": 2.8048, + "step": 10150 + }, + { + "epoch": 0.06311956427339611, + "grad_norm": 0.10446088761091232, + "learning_rate": 0.0015, + "loss": 2.8051, + "step": 10175 + }, + { + "epoch": 0.06327464919790077, + "grad_norm": 0.14759957790374756, + "learning_rate": 0.0015, + "loss": 2.8089, + "step": 10200 + }, + { + "epoch": 0.06327464919790077, + "eval_loss": 4.588883399963379, + "perplexity": 98.38451385498047, + "step": 10200 + }, + { + "epoch": 0.06342973412240543, + "grad_norm": 0.12910906970500946, + "learning_rate": 0.0015, + "loss": 2.8193, + "step": 10225 + }, + { + "epoch": 0.06358481904691009, + "grad_norm": 0.13095402717590332, + "learning_rate": 0.0015, + "loss": 2.7509, + "step": 10250 + }, + { + "epoch": 0.06373990397141474, + "grad_norm": 0.16069594025611877, + "learning_rate": 0.0015, + "loss": 2.7911, + "step": 10275 + }, + { + "epoch": 0.06389498889591941, + "grad_norm": 0.08322907984256744, + "learning_rate": 0.0015, + "loss": 2.8025, + "step": 10300 + }, + { + "epoch": 0.06405007382042406, + "grad_norm": 0.2328927367925644, + "learning_rate": 0.0015, + "loss": 2.7863, + "step": 10325 + }, + { + "epoch": 0.06420515874492873, + "grad_norm": 0.09172859787940979, + "learning_rate": 0.0015, + "loss": 2.8101, + "step": 10350 + }, + { + "epoch": 0.06436024366943338, + "grad_norm": 0.13464473187923431, + "learning_rate": 0.0015, + "loss": 2.7718, + "step": 10375 + }, + { + "epoch": 0.06451532859393803, + "grad_norm": 0.1284090131521225, + "learning_rate": 0.0015, + "loss": 2.7667, + "step": 10400 + }, + { + "epoch": 0.06451532859393803, + "eval_loss": 4.59510612487793, + "perplexity": 98.99864196777344, + "step": 10400 + }, + { + "epoch": 0.0646704135184427, + "grad_norm": 0.13565704226493835, + "learning_rate": 0.0015, + "loss": 2.7552, + "step": 10425 + }, + { + "epoch": 0.06482549844294735, + "grad_norm": 0.1089024469256401, + "learning_rate": 0.0015, + "loss": 2.7838, + "step": 10450 + }, + { + "epoch": 0.06498058336745202, + "grad_norm": 0.11035135388374329, + "learning_rate": 0.0015, + "loss": 2.7986, + "step": 10475 + }, + { + "epoch": 0.06513566829195667, + "grad_norm": 0.08107917010784149, + "learning_rate": 0.0015, + "loss": 2.7791, + "step": 10500 + }, + { + "epoch": 0.06529075321646133, + "grad_norm": 0.10200012475252151, + "learning_rate": 0.0015, + "loss": 2.7636, + "step": 10525 + }, + { + "epoch": 0.065445838140966, + "grad_norm": 0.08427785336971283, + "learning_rate": 0.0015, + "loss": 2.794, + "step": 10550 + }, + { + "epoch": 0.06560092306547065, + "grad_norm": 0.10828018933534622, + "learning_rate": 0.0015, + "loss": 2.7778, + "step": 10575 + }, + { + "epoch": 0.06575600798997532, + "grad_norm": 0.12101134657859802, + "learning_rate": 0.0015, + "loss": 2.7469, + "step": 10600 + }, + { + "epoch": 0.06575600798997532, + "eval_loss": 4.597805500030518, + "perplexity": 99.2662353515625, + "step": 10600 + }, + { + "epoch": 0.06591109291447997, + "grad_norm": 0.11220554262399673, + "learning_rate": 0.0015, + "loss": 2.7294, + "step": 10625 + }, + { + "epoch": 0.06606617783898462, + "grad_norm": 0.13899332284927368, + "learning_rate": 0.0015, + "loss": 2.763, + "step": 10650 + }, + { + "epoch": 0.06622126276348929, + "grad_norm": 0.11773937195539474, + "learning_rate": 0.0015, + "loss": 2.7866, + "step": 10675 + }, + { + "epoch": 0.06637634768799394, + "grad_norm": 0.11059702187776566, + "learning_rate": 0.0015, + "loss": 2.8076, + "step": 10700 + }, + { + "epoch": 0.06653143261249861, + "grad_norm": 0.1251254379749298, + "learning_rate": 0.0015, + "loss": 2.7674, + "step": 10725 + }, + { + "epoch": 0.06668651753700326, + "grad_norm": 0.12195979803800583, + "learning_rate": 0.0015, + "loss": 2.768, + "step": 10750 + }, + { + "epoch": 0.06684160246150792, + "grad_norm": 0.1487302929162979, + "learning_rate": 0.0015, + "loss": 2.762, + "step": 10775 + }, + { + "epoch": 0.06699668738601258, + "grad_norm": 0.1315547525882721, + "learning_rate": 0.0015, + "loss": 2.7348, + "step": 10800 + }, + { + "epoch": 0.06699668738601258, + "eval_loss": 4.566490650177002, + "perplexity": 96.20589447021484, + "step": 10800 + }, + { + "epoch": 0.06715177231051724, + "grad_norm": 0.13864025473594666, + "learning_rate": 0.0015, + "loss": 2.7517, + "step": 10825 + }, + { + "epoch": 0.0673068572350219, + "grad_norm": 0.08808566629886627, + "learning_rate": 0.0015, + "loss": 2.7718, + "step": 10850 + }, + { + "epoch": 0.06746194215952656, + "grad_norm": 0.115321584045887, + "learning_rate": 0.0015, + "loss": 2.7007, + "step": 10875 + }, + { + "epoch": 0.06761702708403121, + "grad_norm": 0.10276370495557785, + "learning_rate": 0.0015, + "loss": 2.7692, + "step": 10900 + }, + { + "epoch": 0.06777211200853588, + "grad_norm": 0.09534792602062225, + "learning_rate": 0.0015, + "loss": 2.8186, + "step": 10925 + }, + { + "epoch": 0.06792719693304053, + "grad_norm": 0.14239507913589478, + "learning_rate": 0.0015, + "loss": 2.7801, + "step": 10950 + }, + { + "epoch": 0.0680822818575452, + "grad_norm": 0.11848737299442291, + "learning_rate": 0.0015, + "loss": 2.7394, + "step": 10975 + }, + { + "epoch": 0.06823736678204985, + "grad_norm": 0.09367898106575012, + "learning_rate": 0.0015, + "loss": 2.8043, + "step": 11000 + }, + { + "epoch": 0.06823736678204985, + "eval_loss": 4.5800089836120605, + "perplexity": 97.51527404785156, + "step": 11000 + }, + { + "epoch": 0.0683924517065545, + "grad_norm": 0.1494915634393692, + "learning_rate": 0.0015, + "loss": 2.7841, + "step": 11025 + }, + { + "epoch": 0.06854753663105917, + "grad_norm": 0.09982737898826599, + "learning_rate": 0.0015, + "loss": 2.7933, + "step": 11050 + }, + { + "epoch": 0.06870262155556382, + "grad_norm": 0.12379477173089981, + "learning_rate": 0.0015, + "loss": 2.7419, + "step": 11075 + }, + { + "epoch": 0.06885770648006849, + "grad_norm": 0.11405149102210999, + "learning_rate": 0.0015, + "loss": 2.763, + "step": 11100 + }, + { + "epoch": 0.06901279140457314, + "grad_norm": 0.09574620425701141, + "learning_rate": 0.0015, + "loss": 2.7961, + "step": 11125 + }, + { + "epoch": 0.06916787632907781, + "grad_norm": 0.2947874963283539, + "learning_rate": 0.0015, + "loss": 2.789, + "step": 11150 + }, + { + "epoch": 0.06932296125358246, + "grad_norm": 0.09219149500131607, + "learning_rate": 0.0015, + "loss": 2.7951, + "step": 11175 + }, + { + "epoch": 0.06947804617808712, + "grad_norm": 0.11840498447418213, + "learning_rate": 0.0015, + "loss": 2.7717, + "step": 11200 + }, + { + "epoch": 0.06947804617808712, + "eval_loss": 4.564184188842773, + "perplexity": 95.98426055908203, + "step": 11200 + }, + { + "epoch": 0.06963313110259178, + "grad_norm": 0.09422053396701813, + "learning_rate": 0.0015, + "loss": 2.7976, + "step": 11225 + }, + { + "epoch": 0.06978821602709644, + "grad_norm": 0.11220031976699829, + "learning_rate": 0.0015, + "loss": 2.7634, + "step": 11250 + }, + { + "epoch": 0.0699433009516011, + "grad_norm": 0.10228817909955978, + "learning_rate": 0.0015, + "loss": 2.7256, + "step": 11275 + }, + { + "epoch": 0.07009838587610576, + "grad_norm": 0.0929483100771904, + "learning_rate": 0.0015, + "loss": 2.8005, + "step": 11300 + }, + { + "epoch": 0.07025347080061041, + "grad_norm": 0.11491668224334717, + "learning_rate": 0.0015, + "loss": 2.7504, + "step": 11325 + }, + { + "epoch": 0.07040855572511508, + "grad_norm": 0.15256111323833466, + "learning_rate": 0.0015, + "loss": 2.7609, + "step": 11350 + }, + { + "epoch": 0.07056364064961973, + "grad_norm": 0.11576159298419952, + "learning_rate": 0.0015, + "loss": 2.7742, + "step": 11375 + }, + { + "epoch": 0.0707187255741244, + "grad_norm": 0.08809765428304672, + "learning_rate": 0.0015, + "loss": 2.7891, + "step": 11400 + }, + { + "epoch": 0.0707187255741244, + "eval_loss": 4.568883895874023, + "perplexity": 96.43641662597656, + "step": 11400 + }, + { + "epoch": 0.07087381049862905, + "grad_norm": 0.08563827723264694, + "learning_rate": 0.0015, + "loss": 2.8066, + "step": 11425 + }, + { + "epoch": 0.0710288954231337, + "grad_norm": 0.18896931409835815, + "learning_rate": 0.0015, + "loss": 2.8055, + "step": 11450 + }, + { + "epoch": 0.07118398034763837, + "grad_norm": 0.13940319418907166, + "learning_rate": 0.0015, + "loss": 2.7766, + "step": 11475 + }, + { + "epoch": 0.07133906527214302, + "grad_norm": 0.09737322479486465, + "learning_rate": 0.0015, + "loss": 2.7945, + "step": 11500 + }, + { + "epoch": 0.07149415019664769, + "grad_norm": 0.11357785761356354, + "learning_rate": 0.0015, + "loss": 2.7799, + "step": 11525 + }, + { + "epoch": 0.07164923512115234, + "grad_norm": 0.10513681918382645, + "learning_rate": 0.0015, + "loss": 2.7627, + "step": 11550 + }, + { + "epoch": 0.071804320045657, + "grad_norm": 0.1434682458639145, + "learning_rate": 0.0015, + "loss": 2.8055, + "step": 11575 + }, + { + "epoch": 0.07195940497016166, + "grad_norm": 0.10169105976819992, + "learning_rate": 0.0015, + "loss": 2.7832, + "step": 11600 + }, + { + "epoch": 0.07195940497016166, + "eval_loss": 4.560365676879883, + "perplexity": 95.61843872070312, + "step": 11600 + }, + { + "epoch": 0.07211448989466632, + "grad_norm": 0.1385478526353836, + "learning_rate": 0.0015, + "loss": 2.7548, + "step": 11625 + }, + { + "epoch": 0.07226957481917098, + "grad_norm": 0.1300746351480484, + "learning_rate": 0.0015, + "loss": 2.7553, + "step": 11650 + }, + { + "epoch": 0.07242465974367564, + "grad_norm": 0.11596991866827011, + "learning_rate": 0.0015, + "loss": 2.8095, + "step": 11675 + }, + { + "epoch": 0.07257974466818029, + "grad_norm": 0.11611347645521164, + "learning_rate": 0.0015, + "loss": 2.76, + "step": 11700 + }, + { + "epoch": 0.07273482959268496, + "grad_norm": 0.11249697953462601, + "learning_rate": 0.0015, + "loss": 2.7827, + "step": 11725 + }, + { + "epoch": 0.07288991451718961, + "grad_norm": 0.1243973895907402, + "learning_rate": 0.0015, + "loss": 2.7754, + "step": 11750 + }, + { + "epoch": 0.07304499944169428, + "grad_norm": 0.08843350410461426, + "learning_rate": 0.0015, + "loss": 2.8079, + "step": 11775 + }, + { + "epoch": 0.07320008436619893, + "grad_norm": 0.09881053864955902, + "learning_rate": 0.0015, + "loss": 2.7961, + "step": 11800 + }, + { + "epoch": 0.07320008436619893, + "eval_loss": 4.567913055419922, + "perplexity": 96.34283447265625, + "step": 11800 + }, + { + "epoch": 0.07335516929070358, + "grad_norm": 0.08978071063756943, + "learning_rate": 0.0015, + "loss": 2.7786, + "step": 11825 + }, + { + "epoch": 0.07351025421520825, + "grad_norm": 0.1376107782125473, + "learning_rate": 0.0015, + "loss": 2.7931, + "step": 11850 + }, + { + "epoch": 0.0736653391397129, + "grad_norm": 0.09934777021408081, + "learning_rate": 0.0015, + "loss": 2.7787, + "step": 11875 + }, + { + "epoch": 0.07382042406421757, + "grad_norm": 0.17031100392341614, + "learning_rate": 0.0015, + "loss": 2.7997, + "step": 11900 + }, + { + "epoch": 0.07397550898872222, + "grad_norm": 0.13974526524543762, + "learning_rate": 0.0015, + "loss": 2.7975, + "step": 11925 + }, + { + "epoch": 0.07413059391322688, + "grad_norm": 0.12611718475818634, + "learning_rate": 0.0015, + "loss": 2.792, + "step": 11950 + }, + { + "epoch": 0.07428567883773154, + "grad_norm": 0.15177124738693237, + "learning_rate": 0.0015, + "loss": 2.7904, + "step": 11975 + }, + { + "epoch": 0.0744407637622362, + "grad_norm": 0.1411113739013672, + "learning_rate": 0.0015, + "loss": 2.7677, + "step": 12000 + }, + { + "epoch": 0.0744407637622362, + "eval_loss": 4.5571770668029785, + "perplexity": 95.31403350830078, + "step": 12000 + }, + { + "epoch": 0.07459584868674086, + "grad_norm": 0.08981940150260925, + "learning_rate": 0.0015, + "loss": 2.7765, + "step": 12025 + }, + { + "epoch": 0.07475093361124552, + "grad_norm": 0.09796686470508575, + "learning_rate": 0.0015, + "loss": 2.7503, + "step": 12050 + }, + { + "epoch": 0.07490601853575017, + "grad_norm": 0.1125386580824852, + "learning_rate": 0.0015, + "loss": 2.7263, + "step": 12075 + }, + { + "epoch": 0.07506110346025484, + "grad_norm": 0.11394508183002472, + "learning_rate": 0.0015, + "loss": 2.7855, + "step": 12100 + }, + { + "epoch": 0.07521618838475949, + "grad_norm": 0.11744117736816406, + "learning_rate": 0.0015, + "loss": 2.7698, + "step": 12125 + }, + { + "epoch": 0.07537127330926416, + "grad_norm": 0.17264704406261444, + "learning_rate": 0.0015, + "loss": 2.7592, + "step": 12150 + }, + { + "epoch": 0.07552635823376881, + "grad_norm": 0.10691671818494797, + "learning_rate": 0.0015, + "loss": 2.7519, + "step": 12175 + }, + { + "epoch": 0.07568144315827346, + "grad_norm": 0.1205432191491127, + "learning_rate": 0.0015, + "loss": 2.7676, + "step": 12200 + }, + { + "epoch": 0.07568144315827346, + "eval_loss": 4.544521808624268, + "perplexity": 94.11540985107422, + "step": 12200 + }, + { + "epoch": 0.07583652808277813, + "grad_norm": 0.1253867894411087, + "learning_rate": 0.0015, + "loss": 2.7698, + "step": 12225 + }, + { + "epoch": 0.07599161300728279, + "grad_norm": 0.1450471729040146, + "learning_rate": 0.0015, + "loss": 2.77, + "step": 12250 + }, + { + "epoch": 0.07614669793178745, + "grad_norm": 0.17055222392082214, + "learning_rate": 0.0015, + "loss": 2.7352, + "step": 12275 + }, + { + "epoch": 0.0763017828562921, + "grad_norm": 0.10687011480331421, + "learning_rate": 0.0015, + "loss": 2.7988, + "step": 12300 + }, + { + "epoch": 0.07645686778079676, + "grad_norm": 0.15520496666431427, + "learning_rate": 0.0015, + "loss": 2.7828, + "step": 12325 + }, + { + "epoch": 0.07661195270530143, + "grad_norm": 0.09279755502939224, + "learning_rate": 0.0015, + "loss": 2.7222, + "step": 12350 + }, + { + "epoch": 0.07676703762980608, + "grad_norm": 0.18024928867816925, + "learning_rate": 0.0015, + "loss": 2.7555, + "step": 12375 + }, + { + "epoch": 0.07692212255431075, + "grad_norm": 0.13292630016803741, + "learning_rate": 0.0015, + "loss": 2.733, + "step": 12400 + }, + { + "epoch": 0.07692212255431075, + "eval_loss": 4.538700103759766, + "perplexity": 93.569091796875, + "step": 12400 + }, + { + "epoch": 0.0770772074788154, + "grad_norm": 0.09353446960449219, + "learning_rate": 0.0015, + "loss": 2.7768, + "step": 12425 + }, + { + "epoch": 0.07723229240332005, + "grad_norm": 0.0946316123008728, + "learning_rate": 0.0015, + "loss": 2.7321, + "step": 12450 + }, + { + "epoch": 0.07738737732782472, + "grad_norm": 0.11109050363302231, + "learning_rate": 0.0015, + "loss": 2.7607, + "step": 12475 + }, + { + "epoch": 0.07754246225232937, + "grad_norm": 0.10057735443115234, + "learning_rate": 0.0015, + "loss": 2.7707, + "step": 12500 + }, + { + "epoch": 0.07769754717683404, + "grad_norm": 0.1466909795999527, + "learning_rate": 0.0015, + "loss": 2.7434, + "step": 12525 + }, + { + "epoch": 0.07785263210133869, + "grad_norm": 0.09831534326076508, + "learning_rate": 0.0015, + "loss": 2.7858, + "step": 12550 + }, + { + "epoch": 0.07800771702584335, + "grad_norm": 0.13202817738056183, + "learning_rate": 0.0015, + "loss": 2.7884, + "step": 12575 + }, + { + "epoch": 0.07816280195034801, + "grad_norm": 0.10797799378633499, + "learning_rate": 0.0015, + "loss": 2.7788, + "step": 12600 + }, + { + "epoch": 0.07816280195034801, + "eval_loss": 4.5452494621276855, + "perplexity": 94.18392181396484, + "step": 12600 + }, + { + "epoch": 0.07831788687485267, + "grad_norm": 0.10239394754171371, + "learning_rate": 0.0015, + "loss": 2.7803, + "step": 12625 + }, + { + "epoch": 0.07847297179935733, + "grad_norm": 0.10468672215938568, + "learning_rate": 0.0015, + "loss": 2.7449, + "step": 12650 + }, + { + "epoch": 0.07862805672386199, + "grad_norm": 0.13691146671772003, + "learning_rate": 0.0015, + "loss": 2.7837, + "step": 12675 + }, + { + "epoch": 0.07878314164836664, + "grad_norm": 0.16976097226142883, + "learning_rate": 0.0015, + "loss": 2.7557, + "step": 12700 + }, + { + "epoch": 0.0789382265728713, + "grad_norm": 0.09623986482620239, + "learning_rate": 0.0015, + "loss": 2.7576, + "step": 12725 + }, + { + "epoch": 0.07909331149737596, + "grad_norm": 0.11203131079673767, + "learning_rate": 0.0015, + "loss": 2.7846, + "step": 12750 + }, + { + "epoch": 0.07924839642188063, + "grad_norm": 0.12257611751556396, + "learning_rate": 0.0015, + "loss": 2.8015, + "step": 12775 + }, + { + "epoch": 0.07940348134638528, + "grad_norm": 0.08369628340005875, + "learning_rate": 0.0015, + "loss": 2.7616, + "step": 12800 + }, + { + "epoch": 0.07940348134638528, + "eval_loss": 4.548933506011963, + "perplexity": 94.53153991699219, + "step": 12800 + }, + { + "epoch": 0.07955856627088993, + "grad_norm": 0.12149519473314285, + "learning_rate": 0.0015, + "loss": 2.7651, + "step": 12825 + }, + { + "epoch": 0.0797136511953946, + "grad_norm": 0.09911686927080154, + "learning_rate": 0.0015, + "loss": 2.7964, + "step": 12850 + }, + { + "epoch": 0.07986873611989925, + "grad_norm": 0.09883631020784378, + "learning_rate": 0.0015, + "loss": 2.7461, + "step": 12875 + }, + { + "epoch": 0.08002382104440392, + "grad_norm": 0.08828576654195786, + "learning_rate": 0.0015, + "loss": 2.7735, + "step": 12900 + }, + { + "epoch": 0.08017890596890857, + "grad_norm": 0.18119321763515472, + "learning_rate": 0.0015, + "loss": 2.7863, + "step": 12925 + }, + { + "epoch": 0.08033399089341323, + "grad_norm": 0.09123501181602478, + "learning_rate": 0.0015, + "loss": 2.7559, + "step": 12950 + }, + { + "epoch": 0.0804890758179179, + "grad_norm": 0.18334759771823883, + "learning_rate": 0.0015, + "loss": 2.7357, + "step": 12975 + }, + { + "epoch": 0.08064416074242255, + "grad_norm": 0.08934136480093002, + "learning_rate": 0.0015, + "loss": 2.8003, + "step": 13000 + }, + { + "epoch": 0.08064416074242255, + "eval_loss": 4.537932395935059, + "perplexity": 93.49728393554688, + "step": 13000 + }, + { + "epoch": 0.08079924566692721, + "grad_norm": 0.117793008685112, + "learning_rate": 0.0015, + "loss": 2.738, + "step": 13025 + }, + { + "epoch": 0.08095433059143187, + "grad_norm": 0.1012151837348938, + "learning_rate": 0.0015, + "loss": 2.767, + "step": 13050 + }, + { + "epoch": 0.08110941551593653, + "grad_norm": 0.1099851131439209, + "learning_rate": 0.0015, + "loss": 2.7899, + "step": 13075 + }, + { + "epoch": 0.08126450044044119, + "grad_norm": 0.105575330555439, + "learning_rate": 0.0015, + "loss": 2.7857, + "step": 13100 + }, + { + "epoch": 0.08141958536494584, + "grad_norm": 0.11926279962062836, + "learning_rate": 0.0015, + "loss": 2.7821, + "step": 13125 + }, + { + "epoch": 0.08157467028945051, + "grad_norm": 0.1669924259185791, + "learning_rate": 0.0015, + "loss": 2.7673, + "step": 13150 + }, + { + "epoch": 0.08172975521395516, + "grad_norm": 0.11445988714694977, + "learning_rate": 0.0015, + "loss": 2.8081, + "step": 13175 + }, + { + "epoch": 0.08188484013845983, + "grad_norm": 0.09700124710798264, + "learning_rate": 0.0015, + "loss": 2.7841, + "step": 13200 + }, + { + "epoch": 0.08188484013845983, + "eval_loss": 4.540359973907471, + "perplexity": 93.72453308105469, + "step": 13200 + }, + { + "epoch": 0.08203992506296448, + "grad_norm": 0.11112058907747269, + "learning_rate": 0.0015, + "loss": 2.7471, + "step": 13225 + }, + { + "epoch": 0.08219500998746913, + "grad_norm": 0.17890195548534393, + "learning_rate": 0.0015, + "loss": 2.7898, + "step": 13250 + }, + { + "epoch": 0.0823500949119738, + "grad_norm": 0.12197751551866531, + "learning_rate": 0.0015, + "loss": 2.7328, + "step": 13275 + }, + { + "epoch": 0.08250517983647845, + "grad_norm": 0.11677111685276031, + "learning_rate": 0.0015, + "loss": 2.7849, + "step": 13300 + }, + { + "epoch": 0.08266026476098312, + "grad_norm": 0.15514017641544342, + "learning_rate": 0.0015, + "loss": 2.7561, + "step": 13325 + }, + { + "epoch": 0.08281534968548777, + "grad_norm": 0.10389192402362823, + "learning_rate": 0.0015, + "loss": 2.7611, + "step": 13350 + }, + { + "epoch": 0.08297043460999243, + "grad_norm": 0.10176412016153336, + "learning_rate": 0.0015, + "loss": 2.7793, + "step": 13375 + }, + { + "epoch": 0.0831255195344971, + "grad_norm": 0.1043052077293396, + "learning_rate": 0.0015, + "loss": 2.7375, + "step": 13400 + }, + { + "epoch": 0.0831255195344971, + "eval_loss": 4.5388336181640625, + "perplexity": 93.58158111572266, + "step": 13400 + }, + { + "epoch": 0.08328060445900175, + "grad_norm": 0.08918718248605728, + "learning_rate": 0.0015, + "loss": 2.7465, + "step": 13425 + }, + { + "epoch": 0.08343568938350641, + "grad_norm": 0.10008233785629272, + "learning_rate": 0.0015, + "loss": 2.7776, + "step": 13450 + }, + { + "epoch": 0.08359077430801107, + "grad_norm": 0.10228800773620605, + "learning_rate": 0.0015, + "loss": 2.756, + "step": 13475 + }, + { + "epoch": 0.08374585923251572, + "grad_norm": 0.0868915542960167, + "learning_rate": 0.0015, + "loss": 2.7556, + "step": 13500 + }, + { + "epoch": 0.08390094415702039, + "grad_norm": 0.11076166480779648, + "learning_rate": 0.0015, + "loss": 2.6975, + "step": 13525 + }, + { + "epoch": 0.08405602908152504, + "grad_norm": 0.13617128133773804, + "learning_rate": 0.0015, + "loss": 2.7643, + "step": 13550 + }, + { + "epoch": 0.08421111400602971, + "grad_norm": 0.15346932411193848, + "learning_rate": 0.0015, + "loss": 2.7966, + "step": 13575 + }, + { + "epoch": 0.08436619893053436, + "grad_norm": 0.17080894112586975, + "learning_rate": 0.0015, + "loss": 2.7636, + "step": 13600 + }, + { + "epoch": 0.08436619893053436, + "eval_loss": 4.513378620147705, + "perplexity": 91.22953033447266, + "step": 13600 + }, + { + "epoch": 0.08452128385503901, + "grad_norm": 0.11548548936843872, + "learning_rate": 0.0015, + "loss": 2.7729, + "step": 13625 + }, + { + "epoch": 0.08467636877954368, + "grad_norm": 0.14650912582874298, + "learning_rate": 0.0015, + "loss": 2.7063, + "step": 13650 + }, + { + "epoch": 0.08483145370404833, + "grad_norm": 0.09750749915838242, + "learning_rate": 0.0015, + "loss": 2.7648, + "step": 13675 + }, + { + "epoch": 0.084986538628553, + "grad_norm": 0.18051239848136902, + "learning_rate": 0.0015, + "loss": 2.754, + "step": 13700 + }, + { + "epoch": 0.08514162355305765, + "grad_norm": 0.21637938916683197, + "learning_rate": 0.0015, + "loss": 2.7529, + "step": 13725 + }, + { + "epoch": 0.08529670847756231, + "grad_norm": 0.10037226974964142, + "learning_rate": 0.0015, + "loss": 2.7638, + "step": 13750 + }, + { + "epoch": 0.08545179340206698, + "grad_norm": 0.1033267229795456, + "learning_rate": 0.0015, + "loss": 2.7713, + "step": 13775 + }, + { + "epoch": 0.08560687832657163, + "grad_norm": 0.09179462492465973, + "learning_rate": 0.0015, + "loss": 2.8278, + "step": 13800 + }, + { + "epoch": 0.08560687832657163, + "eval_loss": 4.508410453796387, + "perplexity": 90.77741241455078, + "step": 13800 + }, + { + "epoch": 0.0857619632510763, + "grad_norm": 0.09874552488327026, + "learning_rate": 0.0015, + "loss": 2.7544, + "step": 13825 + }, + { + "epoch": 0.08591704817558095, + "grad_norm": 0.17807777225971222, + "learning_rate": 0.0015, + "loss": 2.7401, + "step": 13850 + }, + { + "epoch": 0.0860721331000856, + "grad_norm": 0.14388497173786163, + "learning_rate": 0.0015, + "loss": 2.7879, + "step": 13875 + }, + { + "epoch": 0.08622721802459027, + "grad_norm": 0.13081450760364532, + "learning_rate": 0.0015, + "loss": 2.7162, + "step": 13900 + }, + { + "epoch": 0.08638230294909492, + "grad_norm": 0.15077342092990875, + "learning_rate": 0.0015, + "loss": 2.757, + "step": 13925 + }, + { + "epoch": 0.08653738787359959, + "grad_norm": 0.11368410289287567, + "learning_rate": 0.0015, + "loss": 2.7546, + "step": 13950 + }, + { + "epoch": 0.08669247279810424, + "grad_norm": 0.16447153687477112, + "learning_rate": 0.0015, + "loss": 2.7371, + "step": 13975 + }, + { + "epoch": 0.0868475577226089, + "grad_norm": 0.20563559234142303, + "learning_rate": 0.0015, + "loss": 2.7474, + "step": 14000 + }, + { + "epoch": 0.0868475577226089, + "eval_loss": 4.525671005249023, + "perplexity": 92.35787963867188, + "step": 14000 + }, + { + "epoch": 0.08700264264711356, + "grad_norm": 0.10695035755634308, + "learning_rate": 0.0015, + "loss": 2.7565, + "step": 14025 + }, + { + "epoch": 0.08715772757161822, + "grad_norm": 0.12368099391460419, + "learning_rate": 0.0015, + "loss": 2.784, + "step": 14050 + }, + { + "epoch": 0.08731281249612288, + "grad_norm": 0.11491699516773224, + "learning_rate": 0.0015, + "loss": 2.7477, + "step": 14075 + }, + { + "epoch": 0.08746789742062754, + "grad_norm": 0.10570378601551056, + "learning_rate": 0.0015, + "loss": 2.7575, + "step": 14100 + }, + { + "epoch": 0.08762298234513219, + "grad_norm": 0.09137633442878723, + "learning_rate": 0.0015, + "loss": 2.7517, + "step": 14125 + }, + { + "epoch": 0.08777806726963686, + "grad_norm": 0.09999803453683853, + "learning_rate": 0.0015, + "loss": 2.7446, + "step": 14150 + }, + { + "epoch": 0.08793315219414151, + "grad_norm": 0.15709616243839264, + "learning_rate": 0.0015, + "loss": 2.7606, + "step": 14175 + }, + { + "epoch": 0.08808823711864618, + "grad_norm": 0.10327859222888947, + "learning_rate": 0.0015, + "loss": 2.7441, + "step": 14200 + }, + { + "epoch": 0.08808823711864618, + "eval_loss": 4.521189212799072, + "perplexity": 91.94487762451172, + "step": 14200 + }, + { + "epoch": 0.08824332204315083, + "grad_norm": 0.1964125633239746, + "learning_rate": 0.0015, + "loss": 2.7109, + "step": 14225 + }, + { + "epoch": 0.08839840696765548, + "grad_norm": 0.12792247533798218, + "learning_rate": 0.0015, + "loss": 2.7401, + "step": 14250 + }, + { + "epoch": 0.08855349189216015, + "grad_norm": 0.17532923817634583, + "learning_rate": 0.0015, + "loss": 2.7609, + "step": 14275 + }, + { + "epoch": 0.0887085768166648, + "grad_norm": 0.096143439412117, + "learning_rate": 0.0015, + "loss": 2.7749, + "step": 14300 + }, + { + "epoch": 0.08886366174116947, + "grad_norm": 0.12778601050376892, + "learning_rate": 0.0015, + "loss": 2.6981, + "step": 14325 + }, + { + "epoch": 0.08901874666567412, + "grad_norm": 0.1130848377943039, + "learning_rate": 0.0015, + "loss": 2.7255, + "step": 14350 + }, + { + "epoch": 0.08917383159017878, + "grad_norm": 0.0818464607000351, + "learning_rate": 0.0015, + "loss": 2.7223, + "step": 14375 + }, + { + "epoch": 0.08932891651468344, + "grad_norm": 0.10516222566366196, + "learning_rate": 0.0015, + "loss": 2.7672, + "step": 14400 + }, + { + "epoch": 0.08932891651468344, + "eval_loss": 4.524067401885986, + "perplexity": 92.20989227294922, + "step": 14400 + }, + { + "epoch": 0.0894840014391881, + "grad_norm": 0.08912840485572815, + "learning_rate": 0.0015, + "loss": 2.7349, + "step": 14425 + }, + { + "epoch": 0.08963908636369276, + "grad_norm": 0.11931388080120087, + "learning_rate": 0.0015, + "loss": 2.7326, + "step": 14450 + }, + { + "epoch": 0.08979417128819742, + "grad_norm": 0.12271756678819656, + "learning_rate": 0.0015, + "loss": 2.7327, + "step": 14475 + }, + { + "epoch": 0.08994925621270207, + "grad_norm": 0.1567191183567047, + "learning_rate": 0.0015, + "loss": 2.7573, + "step": 14500 + }, + { + "epoch": 0.09010434113720674, + "grad_norm": 0.1841791719198227, + "learning_rate": 0.0015, + "loss": 2.7582, + "step": 14525 + }, + { + "epoch": 0.09025942606171139, + "grad_norm": 0.12743189930915833, + "learning_rate": 0.0015, + "loss": 2.8061, + "step": 14550 + }, + { + "epoch": 0.09041451098621606, + "grad_norm": 0.11932828277349472, + "learning_rate": 0.0015, + "loss": 2.7447, + "step": 14575 + }, + { + "epoch": 0.09056959591072071, + "grad_norm": 0.18284690380096436, + "learning_rate": 0.0015, + "loss": 2.7436, + "step": 14600 + }, + { + "epoch": 0.09056959591072071, + "eval_loss": 4.515897750854492, + "perplexity": 91.45964050292969, + "step": 14600 + }, + { + "epoch": 0.09072468083522536, + "grad_norm": 0.17987670004367828, + "learning_rate": 0.0015, + "loss": 2.7831, + "step": 14625 + }, + { + "epoch": 0.09087976575973003, + "grad_norm": 0.10992395132780075, + "learning_rate": 0.0015, + "loss": 2.7516, + "step": 14650 + }, + { + "epoch": 0.09103485068423468, + "grad_norm": 0.09343726187944412, + "learning_rate": 0.0015, + "loss": 2.7475, + "step": 14675 + }, + { + "epoch": 0.09118993560873935, + "grad_norm": 0.10370751470327377, + "learning_rate": 0.0015, + "loss": 2.7518, + "step": 14700 + }, + { + "epoch": 0.091345020533244, + "grad_norm": 0.11190348863601685, + "learning_rate": 0.0015, + "loss": 2.7482, + "step": 14725 + }, + { + "epoch": 0.09150010545774866, + "grad_norm": 0.12450053542852402, + "learning_rate": 0.0015, + "loss": 2.7726, + "step": 14750 + }, + { + "epoch": 0.09165519038225332, + "grad_norm": 0.11882703006267548, + "learning_rate": 0.0015, + "loss": 2.7318, + "step": 14775 + }, + { + "epoch": 0.09181027530675798, + "grad_norm": 0.1315181404352188, + "learning_rate": 0.0015, + "loss": 2.757, + "step": 14800 + }, + { + "epoch": 0.09181027530675798, + "eval_loss": 4.521557807922363, + "perplexity": 91.97877502441406, + "step": 14800 + }, + { + "epoch": 0.09196536023126264, + "grad_norm": 0.18574784696102142, + "learning_rate": 0.0015, + "loss": 2.7353, + "step": 14825 + }, + { + "epoch": 0.0921204451557673, + "grad_norm": 0.17665444314479828, + "learning_rate": 0.0015, + "loss": 2.7687, + "step": 14850 + }, + { + "epoch": 0.09227553008027195, + "grad_norm": 0.12507860362529755, + "learning_rate": 0.0015, + "loss": 2.7386, + "step": 14875 + }, + { + "epoch": 0.09243061500477662, + "grad_norm": 0.10472691059112549, + "learning_rate": 0.0015, + "loss": 2.7716, + "step": 14900 + }, + { + "epoch": 0.09258569992928127, + "grad_norm": 0.10282575339078903, + "learning_rate": 0.0015, + "loss": 2.7312, + "step": 14925 + }, + { + "epoch": 0.09274078485378594, + "grad_norm": 0.12706094980239868, + "learning_rate": 0.0015, + "loss": 2.7995, + "step": 14950 + }, + { + "epoch": 0.09289586977829059, + "grad_norm": 0.15283973515033722, + "learning_rate": 0.0015, + "loss": 2.7313, + "step": 14975 + }, + { + "epoch": 0.09305095470279524, + "grad_norm": 0.12476324290037155, + "learning_rate": 0.0015, + "loss": 2.7727, + "step": 15000 + }, + { + "epoch": 0.09305095470279524, + "eval_loss": 4.547565937042236, + "perplexity": 94.40234375, + "step": 15000 + }, + { + "epoch": 0.09320603962729991, + "grad_norm": 0.12369734048843384, + "learning_rate": 0.0015, + "loss": 2.7565, + "step": 15025 + }, + { + "epoch": 0.09336112455180456, + "grad_norm": 0.1322038471698761, + "learning_rate": 0.0015, + "loss": 2.7588, + "step": 15050 + }, + { + "epoch": 0.09351620947630923, + "grad_norm": 0.0926559790968895, + "learning_rate": 0.0015, + "loss": 2.7393, + "step": 15075 + }, + { + "epoch": 0.09367129440081388, + "grad_norm": 0.17404210567474365, + "learning_rate": 0.0015, + "loss": 2.723, + "step": 15100 + }, + { + "epoch": 0.09382637932531855, + "grad_norm": 0.10326647758483887, + "learning_rate": 0.0015, + "loss": 2.7853, + "step": 15125 + }, + { + "epoch": 0.0939814642498232, + "grad_norm": 0.13869203627109528, + "learning_rate": 0.0015, + "loss": 2.7535, + "step": 15150 + }, + { + "epoch": 0.09413654917432786, + "grad_norm": 0.14325955510139465, + "learning_rate": 0.0015, + "loss": 2.7597, + "step": 15175 + }, + { + "epoch": 0.09429163409883252, + "grad_norm": 0.11783768236637115, + "learning_rate": 0.0015, + "loss": 2.7524, + "step": 15200 + }, + { + "epoch": 0.09429163409883252, + "eval_loss": 4.5251593589782715, + "perplexity": 92.31063842773438, + "step": 15200 + }, + { + "epoch": 0.09444671902333718, + "grad_norm": 0.12261676043272018, + "learning_rate": 0.0015, + "loss": 2.7279, + "step": 15225 + }, + { + "epoch": 0.09460180394784184, + "grad_norm": 0.09966279566287994, + "learning_rate": 0.0015, + "loss": 2.8119, + "step": 15250 + }, + { + "epoch": 0.0947568888723465, + "grad_norm": 0.1052974984049797, + "learning_rate": 0.0015, + "loss": 2.7392, + "step": 15275 + }, + { + "epoch": 0.09491197379685115, + "grad_norm": 0.11074663698673248, + "learning_rate": 0.0015, + "loss": 2.7319, + "step": 15300 + }, + { + "epoch": 0.09506705872135582, + "grad_norm": 0.09762706607580185, + "learning_rate": 0.0015, + "loss": 2.7806, + "step": 15325 + }, + { + "epoch": 0.09522214364586047, + "grad_norm": 0.08552476018667221, + "learning_rate": 0.0015, + "loss": 2.7351, + "step": 15350 + }, + { + "epoch": 0.09537722857036514, + "grad_norm": 0.13211695849895477, + "learning_rate": 0.0015, + "loss": 2.7667, + "step": 15375 + }, + { + "epoch": 0.09553231349486979, + "grad_norm": 0.12074939906597137, + "learning_rate": 0.0015, + "loss": 2.7614, + "step": 15400 + }, + { + "epoch": 0.09553231349486979, + "eval_loss": 4.53213357925415, + "perplexity": 92.95668029785156, + "step": 15400 + }, + { + "epoch": 0.09568739841937444, + "grad_norm": 0.11755666136741638, + "learning_rate": 0.0015, + "loss": 2.7101, + "step": 15425 + }, + { + "epoch": 0.09584248334387911, + "grad_norm": 0.10476246476173401, + "learning_rate": 0.0015, + "loss": 2.7391, + "step": 15450 + }, + { + "epoch": 0.09599756826838376, + "grad_norm": 0.10921350121498108, + "learning_rate": 0.0015, + "loss": 2.7423, + "step": 15475 + }, + { + "epoch": 0.09615265319288843, + "grad_norm": 0.11517275124788284, + "learning_rate": 0.0015, + "loss": 2.7374, + "step": 15500 + }, + { + "epoch": 0.09630773811739309, + "grad_norm": 0.10500945895910263, + "learning_rate": 0.0015, + "loss": 2.73, + "step": 15525 + }, + { + "epoch": 0.09646282304189774, + "grad_norm": 0.0962584912776947, + "learning_rate": 0.0015, + "loss": 2.7597, + "step": 15550 + }, + { + "epoch": 0.0966179079664024, + "grad_norm": 0.1273050308227539, + "learning_rate": 0.0015, + "loss": 2.7306, + "step": 15575 + }, + { + "epoch": 0.09677299289090706, + "grad_norm": 0.11249135434627533, + "learning_rate": 0.0015, + "loss": 2.7859, + "step": 15600 + }, + { + "epoch": 0.09677299289090706, + "eval_loss": 4.537318706512451, + "perplexity": 93.43992614746094, + "step": 15600 + }, + { + "epoch": 0.09692807781541173, + "grad_norm": 0.19111056625843048, + "learning_rate": 0.0015, + "loss": 2.7386, + "step": 15625 + }, + { + "epoch": 0.09708316273991638, + "grad_norm": 0.10486472398042679, + "learning_rate": 0.0015, + "loss": 2.7462, + "step": 15650 + }, + { + "epoch": 0.09723824766442103, + "grad_norm": 0.1453208327293396, + "learning_rate": 0.0015, + "loss": 2.762, + "step": 15675 + }, + { + "epoch": 0.0973933325889257, + "grad_norm": 0.08459452539682388, + "learning_rate": 0.0015, + "loss": 2.7353, + "step": 15700 + }, + { + "epoch": 0.09754841751343035, + "grad_norm": 0.11150529980659485, + "learning_rate": 0.0015, + "loss": 2.7617, + "step": 15725 + }, + { + "epoch": 0.09770350243793502, + "grad_norm": 0.11301703006029129, + "learning_rate": 0.0015, + "loss": 2.7623, + "step": 15750 + }, + { + "epoch": 0.09785858736243967, + "grad_norm": 0.16564789414405823, + "learning_rate": 0.0015, + "loss": 2.7315, + "step": 15775 + }, + { + "epoch": 0.09801367228694433, + "grad_norm": 0.08968822658061981, + "learning_rate": 0.0015, + "loss": 2.7842, + "step": 15800 + }, + { + "epoch": 0.09801367228694433, + "eval_loss": 4.528219223022461, + "perplexity": 92.5935287475586, + "step": 15800 + }, + { + "epoch": 0.09816875721144899, + "grad_norm": 0.1233256533741951, + "learning_rate": 0.0015, + "loss": 2.7584, + "step": 15825 + }, + { + "epoch": 0.09832384213595365, + "grad_norm": 0.18926863372325897, + "learning_rate": 0.0015, + "loss": 2.7651, + "step": 15850 + }, + { + "epoch": 0.09847892706045831, + "grad_norm": 0.0912550836801529, + "learning_rate": 0.0015, + "loss": 2.7551, + "step": 15875 + }, + { + "epoch": 0.09863401198496297, + "grad_norm": 0.1443813592195511, + "learning_rate": 0.0015, + "loss": 2.7378, + "step": 15900 + }, + { + "epoch": 0.09878909690946762, + "grad_norm": 0.11620072275400162, + "learning_rate": 0.0015, + "loss": 2.7706, + "step": 15925 + }, + { + "epoch": 0.09894418183397229, + "grad_norm": 0.10275860130786896, + "learning_rate": 0.0015, + "loss": 2.7502, + "step": 15950 + }, + { + "epoch": 0.09909926675847694, + "grad_norm": 0.1417694240808487, + "learning_rate": 0.0015, + "loss": 2.706, + "step": 15975 + }, + { + "epoch": 0.0992543516829816, + "grad_norm": 0.1121877133846283, + "learning_rate": 0.0015, + "loss": 2.7537, + "step": 16000 + }, + { + "epoch": 0.0992543516829816, + "eval_loss": 4.520648956298828, + "perplexity": 91.89521789550781, + "step": 16000 + }, + { + "epoch": 0.09940943660748626, + "grad_norm": 0.10022582858800888, + "learning_rate": 0.0015, + "loss": 2.7213, + "step": 16025 + }, + { + "epoch": 0.09956452153199091, + "grad_norm": 0.09722616523504257, + "learning_rate": 0.0015, + "loss": 2.7437, + "step": 16050 + }, + { + "epoch": 0.09971960645649558, + "grad_norm": 0.11053729802370071, + "learning_rate": 0.0015, + "loss": 2.7495, + "step": 16075 + }, + { + "epoch": 0.09987469138100023, + "grad_norm": 0.10231011360883713, + "learning_rate": 0.0015, + "loss": 2.7505, + "step": 16100 + }, + { + "epoch": 0.1000297763055049, + "grad_norm": 0.135975643992424, + "learning_rate": 0.0015, + "loss": 2.7487, + "step": 16125 + }, + { + "epoch": 0.10018486123000955, + "grad_norm": 0.11350739002227783, + "learning_rate": 0.0015, + "loss": 2.7484, + "step": 16150 + }, + { + "epoch": 0.1003399461545142, + "grad_norm": 0.10639143735170364, + "learning_rate": 0.0015, + "loss": 2.7429, + "step": 16175 + }, + { + "epoch": 0.10049503107901887, + "grad_norm": 0.09016221761703491, + "learning_rate": 0.0015, + "loss": 2.7891, + "step": 16200 + }, + { + "epoch": 0.10049503107901887, + "eval_loss": 4.5112504959106445, + "perplexity": 91.03558349609375, + "step": 16200 + }, + { + "epoch": 0.10065011600352353, + "grad_norm": 0.11324500292539597, + "learning_rate": 0.0015, + "loss": 2.7678, + "step": 16225 + }, + { + "epoch": 0.1008052009280282, + "grad_norm": 0.13268886506557465, + "learning_rate": 0.0015, + "loss": 2.723, + "step": 16250 + }, + { + "epoch": 0.10096028585253285, + "grad_norm": 0.11448831856250763, + "learning_rate": 0.0015, + "loss": 2.7328, + "step": 16275 + }, + { + "epoch": 0.1011153707770375, + "grad_norm": 0.10799309611320496, + "learning_rate": 0.0015, + "loss": 2.7478, + "step": 16300 + }, + { + "epoch": 0.10127045570154217, + "grad_norm": 0.19559204578399658, + "learning_rate": 0.0015, + "loss": 2.7606, + "step": 16325 + }, + { + "epoch": 0.10142554062604682, + "grad_norm": 0.14151975512504578, + "learning_rate": 0.0015, + "loss": 2.7279, + "step": 16350 + }, + { + "epoch": 0.10158062555055149, + "grad_norm": 0.10044725239276886, + "learning_rate": 0.0015, + "loss": 2.7609, + "step": 16375 + }, + { + "epoch": 0.10173571047505614, + "grad_norm": 0.10686340183019638, + "learning_rate": 0.0015, + "loss": 2.7295, + "step": 16400 + }, + { + "epoch": 0.10173571047505614, + "eval_loss": 4.521287441253662, + "perplexity": 91.95391082763672, + "step": 16400 + }, + { + "epoch": 0.1018907953995608, + "grad_norm": 0.1561044305562973, + "learning_rate": 0.0015, + "loss": 2.7769, + "step": 16425 + }, + { + "epoch": 0.10204588032406546, + "grad_norm": 0.12182148545980453, + "learning_rate": 0.0015, + "loss": 2.757, + "step": 16450 + }, + { + "epoch": 0.10220096524857011, + "grad_norm": 0.20665724575519562, + "learning_rate": 0.0015, + "loss": 2.7349, + "step": 16475 + }, + { + "epoch": 0.10235605017307478, + "grad_norm": 0.09160878509283066, + "learning_rate": 0.0015, + "loss": 2.7393, + "step": 16500 + }, + { + "epoch": 0.10251113509757943, + "grad_norm": 0.16651533544063568, + "learning_rate": 0.0015, + "loss": 2.7441, + "step": 16525 + }, + { + "epoch": 0.10266622002208409, + "grad_norm": 0.09358719736337662, + "learning_rate": 0.0015, + "loss": 2.7297, + "step": 16550 + }, + { + "epoch": 0.10282130494658875, + "grad_norm": 0.20277003943920135, + "learning_rate": 0.0015, + "loss": 2.7506, + "step": 16575 + }, + { + "epoch": 0.10297638987109341, + "grad_norm": 0.13382607698440552, + "learning_rate": 0.0015, + "loss": 2.7924, + "step": 16600 + }, + { + "epoch": 0.10297638987109341, + "eval_loss": 4.525242328643799, + "perplexity": 92.31829833984375, + "step": 16600 + }, + { + "epoch": 0.10313147479559807, + "grad_norm": 0.09686290472745895, + "learning_rate": 0.0015, + "loss": 2.7417, + "step": 16625 + }, + { + "epoch": 0.10328655972010273, + "grad_norm": 0.11446567624807358, + "learning_rate": 0.0015, + "loss": 2.7582, + "step": 16650 + }, + { + "epoch": 0.10344164464460738, + "grad_norm": 0.15948985517024994, + "learning_rate": 0.0015, + "loss": 2.7254, + "step": 16675 + }, + { + "epoch": 0.10359672956911205, + "grad_norm": 0.1254827231168747, + "learning_rate": 0.0015, + "loss": 2.7515, + "step": 16700 + }, + { + "epoch": 0.1037518144936167, + "grad_norm": 0.11295375972986221, + "learning_rate": 0.0015, + "loss": 2.7058, + "step": 16725 + }, + { + "epoch": 0.10390689941812137, + "grad_norm": 0.10659389197826385, + "learning_rate": 0.0015, + "loss": 2.7281, + "step": 16750 + }, + { + "epoch": 0.10406198434262602, + "grad_norm": 0.1045156791806221, + "learning_rate": 0.0015, + "loss": 2.7131, + "step": 16775 + }, + { + "epoch": 0.10421706926713067, + "grad_norm": 0.13835974037647247, + "learning_rate": 0.0015, + "loss": 2.744, + "step": 16800 + }, + { + "epoch": 0.10421706926713067, + "eval_loss": 4.507747650146484, + "perplexity": 90.7172622680664, + "step": 16800 + }, + { + "epoch": 0.10437215419163534, + "grad_norm": 0.19872727990150452, + "learning_rate": 0.0015, + "loss": 2.7642, + "step": 16825 + }, + { + "epoch": 0.10452723911614, + "grad_norm": 0.13754956424236298, + "learning_rate": 0.0015, + "loss": 2.7652, + "step": 16850 + }, + { + "epoch": 0.10468232404064466, + "grad_norm": 0.1451335996389389, + "learning_rate": 0.0015, + "loss": 2.7561, + "step": 16875 + }, + { + "epoch": 0.10483740896514931, + "grad_norm": 0.16750144958496094, + "learning_rate": 0.0015, + "loss": 2.7206, + "step": 16900 + }, + { + "epoch": 0.10499249388965397, + "grad_norm": 0.12020619958639145, + "learning_rate": 0.0015, + "loss": 2.699, + "step": 16925 + }, + { + "epoch": 0.10514757881415863, + "grad_norm": 0.16792155802249908, + "learning_rate": 0.0015, + "loss": 2.8062, + "step": 16950 + }, + { + "epoch": 0.10530266373866329, + "grad_norm": 0.11066465824842453, + "learning_rate": 0.0015, + "loss": 2.6968, + "step": 16975 + }, + { + "epoch": 0.10545774866316796, + "grad_norm": 0.11885298788547516, + "learning_rate": 0.0015, + "loss": 2.7699, + "step": 17000 + }, + { + "epoch": 0.10545774866316796, + "eval_loss": 4.524214744567871, + "perplexity": 92.22348022460938, + "step": 17000 + }, + { + "epoch": 0.10561283358767261, + "grad_norm": 0.1298653483390808, + "learning_rate": 0.0015, + "loss": 2.7199, + "step": 17025 + }, + { + "epoch": 0.10576791851217726, + "grad_norm": 0.11387672275304794, + "learning_rate": 0.0015, + "loss": 2.7528, + "step": 17050 + }, + { + "epoch": 0.10592300343668193, + "grad_norm": 0.09852533042430878, + "learning_rate": 0.0015, + "loss": 2.7277, + "step": 17075 + }, + { + "epoch": 0.10607808836118658, + "grad_norm": 0.11046476662158966, + "learning_rate": 0.0015, + "loss": 2.722, + "step": 17100 + }, + { + "epoch": 0.10623317328569125, + "grad_norm": 0.11632421612739563, + "learning_rate": 0.0015, + "loss": 2.726, + "step": 17125 + }, + { + "epoch": 0.1063882582101959, + "grad_norm": 0.11760540306568146, + "learning_rate": 0.0015, + "loss": 2.7267, + "step": 17150 + }, + { + "epoch": 0.10654334313470057, + "grad_norm": 0.12264183163642883, + "learning_rate": 0.0015, + "loss": 2.8037, + "step": 17175 + }, + { + "epoch": 0.10669842805920522, + "grad_norm": 0.15346336364746094, + "learning_rate": 0.0015, + "loss": 2.7668, + "step": 17200 + }, + { + "epoch": 0.10669842805920522, + "eval_loss": 4.503612995147705, + "perplexity": 90.34294891357422, + "step": 17200 + }, + { + "epoch": 0.10685351298370988, + "grad_norm": 0.10642746090888977, + "learning_rate": 0.0015, + "loss": 2.7295, + "step": 17225 + }, + { + "epoch": 0.10700859790821454, + "grad_norm": 0.10965430736541748, + "learning_rate": 0.0015, + "loss": 2.7113, + "step": 17250 + }, + { + "epoch": 0.1071636828327192, + "grad_norm": 0.09912869334220886, + "learning_rate": 0.0015, + "loss": 2.7353, + "step": 17275 + }, + { + "epoch": 0.10731876775722386, + "grad_norm": 0.14111942052841187, + "learning_rate": 0.0015, + "loss": 2.7064, + "step": 17300 + }, + { + "epoch": 0.10747385268172852, + "grad_norm": 0.11583065241575241, + "learning_rate": 0.0015, + "loss": 2.722, + "step": 17325 + }, + { + "epoch": 0.10762893760623317, + "grad_norm": 0.09374859184026718, + "learning_rate": 0.0015, + "loss": 2.6964, + "step": 17350 + }, + { + "epoch": 0.10778402253073784, + "grad_norm": 0.11704573035240173, + "learning_rate": 0.0015, + "loss": 2.7518, + "step": 17375 + }, + { + "epoch": 0.10793910745524249, + "grad_norm": 0.13960668444633484, + "learning_rate": 0.0015, + "loss": 2.7373, + "step": 17400 + }, + { + "epoch": 0.10793910745524249, + "eval_loss": 4.514464378356934, + "perplexity": 91.3286361694336, + "step": 17400 + }, + { + "epoch": 0.10809419237974716, + "grad_norm": 0.1006089448928833, + "learning_rate": 0.0015, + "loss": 2.7199, + "step": 17425 + }, + { + "epoch": 0.10824927730425181, + "grad_norm": 0.14851173758506775, + "learning_rate": 0.0015, + "loss": 2.7202, + "step": 17450 + }, + { + "epoch": 0.10840436222875646, + "grad_norm": 0.11992091685533524, + "learning_rate": 0.0015, + "loss": 2.6932, + "step": 17475 + }, + { + "epoch": 0.10855944715326113, + "grad_norm": 0.12420158833265305, + "learning_rate": 0.0015, + "loss": 2.7395, + "step": 17500 + }, + { + "epoch": 0.10871453207776578, + "grad_norm": 0.09945713728666306, + "learning_rate": 0.0015, + "loss": 2.7323, + "step": 17525 + }, + { + "epoch": 0.10886961700227045, + "grad_norm": 0.13007710874080658, + "learning_rate": 0.0015, + "loss": 2.7438, + "step": 17550 + }, + { + "epoch": 0.1090247019267751, + "grad_norm": 0.10875315964221954, + "learning_rate": 0.0015, + "loss": 2.7656, + "step": 17575 + }, + { + "epoch": 0.10917978685127976, + "grad_norm": 0.1075393334031105, + "learning_rate": 0.0015, + "loss": 2.7174, + "step": 17600 + }, + { + "epoch": 0.10917978685127976, + "eval_loss": 4.4858293533325195, + "perplexity": 88.75052642822266, + "step": 17600 + }, + { + "epoch": 0.10933487177578442, + "grad_norm": 0.16400013864040375, + "learning_rate": 0.0015, + "loss": 2.7389, + "step": 17625 + }, + { + "epoch": 0.10948995670028908, + "grad_norm": 0.1368722766637802, + "learning_rate": 0.0015, + "loss": 2.7198, + "step": 17650 + }, + { + "epoch": 0.10964504162479374, + "grad_norm": 0.23104597628116608, + "learning_rate": 0.0015, + "loss": 2.7346, + "step": 17675 + }, + { + "epoch": 0.1098001265492984, + "grad_norm": 0.12463794648647308, + "learning_rate": 0.0015, + "loss": 2.691, + "step": 17700 + }, + { + "epoch": 0.10995521147380305, + "grad_norm": 0.19538962841033936, + "learning_rate": 0.0015, + "loss": 2.6917, + "step": 17725 + }, + { + "epoch": 0.11011029639830772, + "grad_norm": 0.12000603973865509, + "learning_rate": 0.0015, + "loss": 2.7431, + "step": 17750 + }, + { + "epoch": 0.11026538132281237, + "grad_norm": 0.15090298652648926, + "learning_rate": 0.0015, + "loss": 2.7493, + "step": 17775 + }, + { + "epoch": 0.11042046624731704, + "grad_norm": 0.13190440833568573, + "learning_rate": 0.0015, + "loss": 2.7582, + "step": 17800 + }, + { + "epoch": 0.11042046624731704, + "eval_loss": 4.493134021759033, + "perplexity": 89.40119171142578, + "step": 17800 + }, + { + "epoch": 0.11057555117182169, + "grad_norm": 0.12455850094556808, + "learning_rate": 0.0015, + "loss": 2.7574, + "step": 17825 + }, + { + "epoch": 0.11073063609632634, + "grad_norm": 0.14911110699176788, + "learning_rate": 0.0015, + "loss": 2.7285, + "step": 17850 + }, + { + "epoch": 0.11088572102083101, + "grad_norm": 0.16008728742599487, + "learning_rate": 0.0015, + "loss": 2.733, + "step": 17875 + }, + { + "epoch": 0.11104080594533566, + "grad_norm": 0.1668420433998108, + "learning_rate": 0.0015, + "loss": 2.7259, + "step": 17900 + }, + { + "epoch": 0.11119589086984033, + "grad_norm": 0.11736566573381424, + "learning_rate": 0.0015, + "loss": 2.7682, + "step": 17925 + }, + { + "epoch": 0.11135097579434498, + "grad_norm": 0.11538700759410858, + "learning_rate": 0.0015, + "loss": 2.7656, + "step": 17950 + }, + { + "epoch": 0.11150606071884964, + "grad_norm": 0.09440570324659348, + "learning_rate": 0.0015, + "loss": 2.7517, + "step": 17975 + }, + { + "epoch": 0.1116611456433543, + "grad_norm": 0.20621652901172638, + "learning_rate": 0.0015, + "loss": 2.7292, + "step": 18000 + }, + { + "epoch": 0.1116611456433543, + "eval_loss": 4.493429183959961, + "perplexity": 89.42758178710938, + "step": 18000 + }, + { + "epoch": 0.11181623056785896, + "grad_norm": 0.12027841061353683, + "learning_rate": 0.0015, + "loss": 2.7049, + "step": 18025 + }, + { + "epoch": 0.11197131549236362, + "grad_norm": 0.08760379254817963, + "learning_rate": 0.0015, + "loss": 2.7291, + "step": 18050 + }, + { + "epoch": 0.11212640041686828, + "grad_norm": 0.1251729428768158, + "learning_rate": 0.0015, + "loss": 2.7149, + "step": 18075 + }, + { + "epoch": 0.11228148534137293, + "grad_norm": 0.10340214520692825, + "learning_rate": 0.0015, + "loss": 2.7437, + "step": 18100 + }, + { + "epoch": 0.1124365702658776, + "grad_norm": 0.10546920448541641, + "learning_rate": 0.0015, + "loss": 2.7656, + "step": 18125 + }, + { + "epoch": 0.11259165519038225, + "grad_norm": 0.12438227981328964, + "learning_rate": 0.0015, + "loss": 2.7171, + "step": 18150 + }, + { + "epoch": 0.11274674011488692, + "grad_norm": 0.14557534456253052, + "learning_rate": 0.0015, + "loss": 2.7395, + "step": 18175 + }, + { + "epoch": 0.11290182503939157, + "grad_norm": 0.13714823126792908, + "learning_rate": 0.0015, + "loss": 2.7066, + "step": 18200 + }, + { + "epoch": 0.11290182503939157, + "eval_loss": 4.4876604080200195, + "perplexity": 88.9131851196289, + "step": 18200 + }, + { + "epoch": 0.11305690996389622, + "grad_norm": 0.12662547826766968, + "learning_rate": 0.0015, + "loss": 2.6665, + "step": 18225 + }, + { + "epoch": 0.11321199488840089, + "grad_norm": 0.10047092288732529, + "learning_rate": 0.0015, + "loss": 2.7332, + "step": 18250 + }, + { + "epoch": 0.11336707981290554, + "grad_norm": 0.11126455664634705, + "learning_rate": 0.0015, + "loss": 2.7154, + "step": 18275 + }, + { + "epoch": 0.11352216473741021, + "grad_norm": 0.10023871064186096, + "learning_rate": 0.0015, + "loss": 2.7007, + "step": 18300 + }, + { + "epoch": 0.11367724966191486, + "grad_norm": 0.11821885406970978, + "learning_rate": 0.0015, + "loss": 2.7081, + "step": 18325 + }, + { + "epoch": 0.11383233458641952, + "grad_norm": 0.1216677874326706, + "learning_rate": 0.0015, + "loss": 2.74, + "step": 18350 + }, + { + "epoch": 0.11398741951092418, + "grad_norm": 0.1125161275267601, + "learning_rate": 0.0015, + "loss": 2.733, + "step": 18375 + }, + { + "epoch": 0.11414250443542884, + "grad_norm": 0.18253153562545776, + "learning_rate": 0.0015, + "loss": 2.7085, + "step": 18400 + }, + { + "epoch": 0.11414250443542884, + "eval_loss": 4.501376628875732, + "perplexity": 90.1411361694336, + "step": 18400 + }, + { + "epoch": 0.1142975893599335, + "grad_norm": 0.13288918137550354, + "learning_rate": 0.0015, + "loss": 2.7033, + "step": 18425 + }, + { + "epoch": 0.11445267428443816, + "grad_norm": 0.1069432720541954, + "learning_rate": 0.0015, + "loss": 2.7063, + "step": 18450 + }, + { + "epoch": 0.11460775920894281, + "grad_norm": 0.1035354733467102, + "learning_rate": 0.0015, + "loss": 2.7174, + "step": 18475 + }, + { + "epoch": 0.11476284413344748, + "grad_norm": 0.1121230348944664, + "learning_rate": 0.0015, + "loss": 2.7, + "step": 18500 + }, + { + "epoch": 0.11491792905795213, + "grad_norm": 0.13324719667434692, + "learning_rate": 0.0015, + "loss": 2.7423, + "step": 18525 + }, + { + "epoch": 0.1150730139824568, + "grad_norm": 0.0891190841794014, + "learning_rate": 0.0015, + "loss": 2.7418, + "step": 18550 + }, + { + "epoch": 0.11522809890696145, + "grad_norm": 0.10579492896795273, + "learning_rate": 0.0015, + "loss": 2.7321, + "step": 18575 + }, + { + "epoch": 0.1153831838314661, + "grad_norm": 0.1010003387928009, + "learning_rate": 0.0015, + "loss": 2.7071, + "step": 18600 + }, + { + "epoch": 0.1153831838314661, + "eval_loss": 4.508904933929443, + "perplexity": 90.82231140136719, + "step": 18600 + }, + { + "epoch": 0.11553826875597077, + "grad_norm": 0.1599242389202118, + "learning_rate": 0.0015, + "loss": 2.7222, + "step": 18625 + }, + { + "epoch": 0.11569335368047542, + "grad_norm": 0.09344537556171417, + "learning_rate": 0.0015, + "loss": 2.7424, + "step": 18650 + }, + { + "epoch": 0.11584843860498009, + "grad_norm": 0.13959461450576782, + "learning_rate": 0.0015, + "loss": 2.7584, + "step": 18675 + }, + { + "epoch": 0.11600352352948474, + "grad_norm": 0.11661764234304428, + "learning_rate": 0.0015, + "loss": 2.7363, + "step": 18700 + }, + { + "epoch": 0.1161586084539894, + "grad_norm": 0.11968798190355301, + "learning_rate": 0.0015, + "loss": 2.7314, + "step": 18725 + }, + { + "epoch": 0.11631369337849407, + "grad_norm": 0.22232107818126678, + "learning_rate": 0.0015, + "loss": 2.6992, + "step": 18750 + }, + { + "epoch": 0.11646877830299872, + "grad_norm": 0.1387198567390442, + "learning_rate": 0.0015, + "loss": 2.7001, + "step": 18775 + }, + { + "epoch": 0.11662386322750339, + "grad_norm": 0.17059509456157684, + "learning_rate": 0.0015, + "loss": 2.7002, + "step": 18800 + }, + { + "epoch": 0.11662386322750339, + "eval_loss": 4.516000270843506, + "perplexity": 91.4690170288086, + "step": 18800 + }, + { + "epoch": 0.11677894815200804, + "grad_norm": 0.10877668112516403, + "learning_rate": 0.0015, + "loss": 2.7171, + "step": 18825 + }, + { + "epoch": 0.11693403307651269, + "grad_norm": 0.11746638268232346, + "learning_rate": 0.0015, + "loss": 2.7006, + "step": 18850 + }, + { + "epoch": 0.11708911800101736, + "grad_norm": 0.17617632448673248, + "learning_rate": 0.0015, + "loss": 2.7427, + "step": 18875 + }, + { + "epoch": 0.11724420292552201, + "grad_norm": 0.09788820147514343, + "learning_rate": 0.0015, + "loss": 2.7507, + "step": 18900 + }, + { + "epoch": 0.11739928785002668, + "grad_norm": 0.1285056471824646, + "learning_rate": 0.0015, + "loss": 2.7386, + "step": 18925 + }, + { + "epoch": 0.11755437277453133, + "grad_norm": 0.11705992370843887, + "learning_rate": 0.0015, + "loss": 2.7234, + "step": 18950 + }, + { + "epoch": 0.11770945769903599, + "grad_norm": 0.09166467934846878, + "learning_rate": 0.0015, + "loss": 2.7825, + "step": 18975 + }, + { + "epoch": 0.11786454262354065, + "grad_norm": 0.11318054795265198, + "learning_rate": 0.0015, + "loss": 2.778, + "step": 19000 + }, + { + "epoch": 0.11786454262354065, + "eval_loss": 4.499363422393799, + "perplexity": 89.95984649658203, + "step": 19000 + } + ], + "logging_steps": 25, + "max_steps": 161202, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": true, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 60, + "trial_name": null, + "trial_params": null +}