diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,5923 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.11786454262354065,
+  "eval_steps": 200,
+  "global_step": 19000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.00015508492450465875,
+      "grad_norm": 0.12764382362365723,
+      "learning_rate": 0.0015,
+      "loss": 3.062,
+      "step": 25
+    },
+    {
+      "epoch": 0.0003101698490093175,
+      "grad_norm": 0.08861421793699265,
+      "learning_rate": 0.0015,
+      "loss": 3.0523,
+      "step": 50
+    },
+    {
+      "epoch": 0.00046525477351397625,
+      "grad_norm": 0.10059793293476105,
+      "learning_rate": 0.0015,
+      "loss": 3.0271,
+      "step": 75
+    },
+    {
+      "epoch": 0.000620339698018635,
+      "grad_norm": 0.09730365872383118,
+      "learning_rate": 0.0015,
+      "loss": 3.0421,
+      "step": 100
+    },
+    {
+      "epoch": 0.0007754246225232938,
+      "grad_norm": 0.15407200157642365,
+      "learning_rate": 0.0015,
+      "loss": 2.9894,
+      "step": 125
+    },
+    {
+      "epoch": 0.0009305095470279525,
+      "grad_norm": 0.12250959873199463,
+      "learning_rate": 0.0015,
+      "loss": 3.0055,
+      "step": 150
+    },
+    {
+      "epoch": 0.0010855944715326112,
+      "grad_norm": 0.08540652692317963,
+      "learning_rate": 0.0015,
+      "loss": 3.0025,
+      "step": 175
+    },
+    {
+      "epoch": 0.00124067939603727,
+      "grad_norm": 0.1479829102754593,
+      "learning_rate": 0.0015,
+      "loss": 2.9881,
+      "step": 200
+    },
+    {
+      "epoch": 0.00124067939603727,
+      "eval_loss": 4.852784156799316,
+      "perplexity": 128.09652709960938,
+      "step": 200
+    },
+    {
+      "epoch": 0.0013957643205419288,
+      "grad_norm": 0.1036139577627182,
+      "learning_rate": 0.0015,
+      "loss": 2.9609,
+      "step": 225
+    },
+    {
+      "epoch": 0.0015508492450465876,
+      "grad_norm": 0.10382606089115143,
+      "learning_rate": 0.0015,
+      "loss": 2.9771,
+      "step": 250
+    },
+    {
+      "epoch": 0.0017059341695512462,
+      "grad_norm": 0.08648105710744858,
+      "learning_rate": 0.0015,
+      "loss": 2.9522,
+      "step": 275
+    },
+    {
+      "epoch": 0.001861019094055905,
+      "grad_norm": 0.08675844967365265,
+      "learning_rate": 0.0015,
+      "loss": 2.9833,
+      "step": 300
+    },
+    {
+      "epoch": 0.0020161040185605636,
+      "grad_norm": 0.1417882740497589,
+      "learning_rate": 0.0015,
+      "loss": 2.9626,
+      "step": 325
+    },
+    {
+      "epoch": 0.0021711889430652224,
+      "grad_norm": 0.09860406816005707,
+      "learning_rate": 0.0015,
+      "loss": 2.9515,
+      "step": 350
+    },
+    {
+      "epoch": 0.002326273867569881,
+      "grad_norm": 0.11757214367389679,
+      "learning_rate": 0.0015,
+      "loss": 2.9523,
+      "step": 375
+    },
+    {
+      "epoch": 0.00248135879207454,
+      "grad_norm": 0.11415340006351471,
+      "learning_rate": 0.0015,
+      "loss": 2.9579,
+      "step": 400
+    },
+    {
+      "epoch": 0.00248135879207454,
+      "eval_loss": 4.8426313400268555,
+      "perplexity": 126.80257415771484,
+      "step": 400
+    },
+    {
+      "epoch": 0.002636443716579199,
+      "grad_norm": 0.10692940652370453,
+      "learning_rate": 0.0015,
+      "loss": 2.9273,
+      "step": 425
+    },
+    {
+      "epoch": 0.0027915286410838576,
+      "grad_norm": 0.12780559062957764,
+      "learning_rate": 0.0015,
+      "loss": 2.9577,
+      "step": 450
+    },
+    {
+      "epoch": 0.0029466135655885164,
+      "grad_norm": 0.21147418022155762,
+      "learning_rate": 0.0015,
+      "loss": 2.9118,
+      "step": 475
+    },
+    {
+      "epoch": 0.003101698490093175,
+      "grad_norm": 0.13209331035614014,
+      "learning_rate": 0.0015,
+      "loss": 2.9584,
+      "step": 500
+    },
+    {
+      "epoch": 0.0032567834145978336,
+      "grad_norm": 0.13230836391448975,
+      "learning_rate": 0.0015,
+      "loss": 2.9621,
+      "step": 525
+    },
+    {
+      "epoch": 0.0034118683391024924,
+      "grad_norm": 0.11265246570110321,
+      "learning_rate": 0.0015,
+      "loss": 2.941,
+      "step": 550
+    },
+    {
+      "epoch": 0.003566953263607151,
+      "grad_norm": 0.10484226047992706,
+      "learning_rate": 0.0015,
+      "loss": 2.9311,
+      "step": 575
+    },
+    {
+      "epoch": 0.00372203818811181,
+      "grad_norm": 0.13941314816474915,
+      "learning_rate": 0.0015,
+      "loss": 2.9741,
+      "step": 600
+    },
+    {
+      "epoch": 0.00372203818811181,
+      "eval_loss": 4.831629276275635,
+      "perplexity": 125.41513061523438,
+      "step": 600
+    },
+    {
+      "epoch": 0.0038771231126164688,
+      "grad_norm": 0.0885343998670578,
+      "learning_rate": 0.0015,
+      "loss": 2.944,
+      "step": 625
+    },
+    {
+      "epoch": 0.004032208037121127,
+      "grad_norm": 0.093564473092556,
+      "learning_rate": 0.0015,
+      "loss": 2.9673,
+      "step": 650
+    },
+    {
+      "epoch": 0.004187292961625786,
+      "grad_norm": 0.15350665152072906,
+      "learning_rate": 0.0015,
+      "loss": 2.9314,
+      "step": 675
+    },
+    {
+      "epoch": 0.004342377886130445,
+      "grad_norm": 0.11337901651859283,
+      "learning_rate": 0.0015,
+      "loss": 2.97,
+      "step": 700
+    },
+    {
+      "epoch": 0.004497462810635104,
+      "grad_norm": 0.13508272171020508,
+      "learning_rate": 0.0015,
+      "loss": 2.9121,
+      "step": 725
+    },
+    {
+      "epoch": 0.004652547735139762,
+      "grad_norm": 0.10049441456794739,
+      "learning_rate": 0.0015,
+      "loss": 2.9572,
+      "step": 750
+    },
+    {
+      "epoch": 0.004807632659644422,
+      "grad_norm": 0.1017594188451767,
+      "learning_rate": 0.0015,
+      "loss": 2.9207,
+      "step": 775
+    },
+    {
+      "epoch": 0.00496271758414908,
+      "grad_norm": 0.09874167293310165,
+      "learning_rate": 0.0015,
+      "loss": 2.9258,
+      "step": 800
+    },
+    {
+      "epoch": 0.00496271758414908,
+      "eval_loss": 4.783432960510254,
+      "perplexity": 119.51393127441406,
+      "step": 800
+    },
+    {
+      "epoch": 0.005117802508653739,
+      "grad_norm": 0.09769408404827118,
+      "learning_rate": 0.0015,
+      "loss": 2.9606,
+      "step": 825
+    },
+    {
+      "epoch": 0.005272887433158398,
+      "grad_norm": 0.11946038156747818,
+      "learning_rate": 0.0015,
+      "loss": 2.889,
+      "step": 850
+    },
+    {
+      "epoch": 0.005427972357663056,
+      "grad_norm": 0.12191672623157501,
+      "learning_rate": 0.0015,
+      "loss": 2.9094,
+      "step": 875
+    },
+    {
+      "epoch": 0.005583057282167715,
+      "grad_norm": 0.09349209070205688,
+      "learning_rate": 0.0015,
+      "loss": 2.9242,
+      "step": 900
+    },
+    {
+      "epoch": 0.0057381422066723736,
+      "grad_norm": 0.07793531566858292,
+      "learning_rate": 0.0015,
+      "loss": 2.9692,
+      "step": 925
+    },
+    {
+      "epoch": 0.005893227131177033,
+      "grad_norm": 0.1276599019765854,
+      "learning_rate": 0.0015,
+      "loss": 2.9339,
+      "step": 950
+    },
+    {
+      "epoch": 0.006048312055681691,
+      "grad_norm": 0.11083021759986877,
+      "learning_rate": 0.0015,
+      "loss": 2.9251,
+      "step": 975
+    },
+    {
+      "epoch": 0.00620339698018635,
+      "grad_norm": 0.13207702338695526,
+      "learning_rate": 0.0015,
+      "loss": 2.8567,
+      "step": 1000
+    },
+    {
+      "epoch": 0.00620339698018635,
+      "eval_loss": 4.790068626403809,
+      "perplexity": 120.30962371826172,
+      "step": 1000
+    },
+    {
+      "epoch": 0.006358481904691009,
+      "grad_norm": 0.20453479886054993,
+      "learning_rate": 0.0015,
+      "loss": 2.9127,
+      "step": 1025
+    },
+    {
+      "epoch": 0.006513566829195667,
+      "grad_norm": 0.12530989944934845,
+      "learning_rate": 0.0015,
+      "loss": 2.9147,
+      "step": 1050
+    },
+    {
+      "epoch": 0.006668651753700326,
+      "grad_norm": 0.11520997434854507,
+      "learning_rate": 0.0015,
+      "loss": 2.936,
+      "step": 1075
+    },
+    {
+      "epoch": 0.006823736678204985,
+      "grad_norm": 0.09191219508647919,
+      "learning_rate": 0.0015,
+      "loss": 2.9115,
+      "step": 1100
+    },
+    {
+      "epoch": 0.006978821602709644,
+      "grad_norm": 0.07251202315092087,
+      "learning_rate": 0.0015,
+      "loss": 2.9154,
+      "step": 1125
+    },
+    {
+      "epoch": 0.007133906527214302,
+      "grad_norm": 0.10054546594619751,
+      "learning_rate": 0.0015,
+      "loss": 2.8924,
+      "step": 1150
+    },
+    {
+      "epoch": 0.007288991451718962,
+      "grad_norm": 0.1192697063088417,
+      "learning_rate": 0.0015,
+      "loss": 2.957,
+      "step": 1175
+    },
+    {
+      "epoch": 0.00744407637622362,
+      "grad_norm": 0.14840476214885712,
+      "learning_rate": 0.0015,
+      "loss": 2.895,
+      "step": 1200
+    },
+    {
+      "epoch": 0.00744407637622362,
+      "eval_loss": 4.770949363708496,
+      "perplexity": 118.03124237060547,
+      "step": 1200
+    },
+    {
+      "epoch": 0.007599161300728279,
+      "grad_norm": 0.11221906542778015,
+      "learning_rate": 0.0015,
+      "loss": 2.9131,
+      "step": 1225
+    },
+    {
+      "epoch": 0.0077542462252329376,
+      "grad_norm": 0.11528974026441574,
+      "learning_rate": 0.0015,
+      "loss": 2.8783,
+      "step": 1250
+    },
+    {
+      "epoch": 0.007909331149737596,
+      "grad_norm": 0.0807015597820282,
+      "learning_rate": 0.0015,
+      "loss": 2.91,
+      "step": 1275
+    },
+    {
+      "epoch": 0.008064416074242254,
+      "grad_norm": 0.1435490846633911,
+      "learning_rate": 0.0015,
+      "loss": 2.9198,
+      "step": 1300
+    },
+    {
+      "epoch": 0.008219500998746914,
+      "grad_norm": 0.11956608295440674,
+      "learning_rate": 0.0015,
+      "loss": 2.8771,
+      "step": 1325
+    },
+    {
+      "epoch": 0.008374585923251573,
+      "grad_norm": 0.10362117737531662,
+      "learning_rate": 0.0015,
+      "loss": 2.8913,
+      "step": 1350
+    },
+    {
+      "epoch": 0.008529670847756231,
+      "grad_norm": 0.07132004201412201,
+      "learning_rate": 0.0015,
+      "loss": 2.946,
+      "step": 1375
+    },
+    {
+      "epoch": 0.00868475577226089,
+      "grad_norm": 0.08756817877292633,
+      "learning_rate": 0.0015,
+      "loss": 2.9015,
+      "step": 1400
+    },
+    {
+      "epoch": 0.00868475577226089,
+      "eval_loss": 4.769084453582764,
+      "perplexity": 117.81133270263672,
+      "step": 1400
+    },
+    {
+      "epoch": 0.00883984069676555,
+      "grad_norm": 0.18067917227745056,
+      "learning_rate": 0.0015,
+      "loss": 2.8887,
+      "step": 1425
+    },
+    {
+      "epoch": 0.008994925621270208,
+      "grad_norm": 0.09742950648069382,
+      "learning_rate": 0.0015,
+      "loss": 2.8834,
+      "step": 1450
+    },
+    {
+      "epoch": 0.009150010545774866,
+      "grad_norm": 0.09857803583145142,
+      "learning_rate": 0.0015,
+      "loss": 2.8856,
+      "step": 1475
+    },
+    {
+      "epoch": 0.009305095470279525,
+      "grad_norm": 0.17605328559875488,
+      "learning_rate": 0.0015,
+      "loss": 2.9238,
+      "step": 1500
+    },
+    {
+      "epoch": 0.009460180394784183,
+      "grad_norm": 0.08441105484962463,
+      "learning_rate": 0.0015,
+      "loss": 2.8605,
+      "step": 1525
+    },
+    {
+      "epoch": 0.009615265319288843,
+      "grad_norm": 0.15339621901512146,
+      "learning_rate": 0.0015,
+      "loss": 2.9421,
+      "step": 1550
+    },
+    {
+      "epoch": 0.009770350243793502,
+      "grad_norm": 0.21426236629486084,
+      "learning_rate": 0.0015,
+      "loss": 2.8899,
+      "step": 1575
+    },
+    {
+      "epoch": 0.00992543516829816,
+      "grad_norm": 0.16503557562828064,
+      "learning_rate": 0.0015,
+      "loss": 2.878,
+      "step": 1600
+    },
+    {
+      "epoch": 0.00992543516829816,
+      "eval_loss": 4.774999618530273,
+      "perplexity": 118.51026916503906,
+      "step": 1600
+    },
+    {
+      "epoch": 0.010080520092802818,
+      "grad_norm": 0.11398541182279587,
+      "learning_rate": 0.0015,
+      "loss": 2.866,
+      "step": 1625
+    },
+    {
+      "epoch": 0.010235605017307478,
+      "grad_norm": 0.16510234773159027,
+      "learning_rate": 0.0015,
+      "loss": 2.8936,
+      "step": 1650
+    },
+    {
+      "epoch": 0.010390689941812137,
+      "grad_norm": 0.08827799558639526,
+      "learning_rate": 0.0015,
+      "loss": 2.8789,
+      "step": 1675
+    },
+    {
+      "epoch": 0.010545774866316795,
+      "grad_norm": 0.12703286111354828,
+      "learning_rate": 0.0015,
+      "loss": 2.9104,
+      "step": 1700
+    },
+    {
+      "epoch": 0.010700859790821454,
+      "grad_norm": 0.10185768455266953,
+      "learning_rate": 0.0015,
+      "loss": 2.8389,
+      "step": 1725
+    },
+    {
+      "epoch": 0.010855944715326112,
+      "grad_norm": 0.13076236844062805,
+      "learning_rate": 0.0015,
+      "loss": 2.8603,
+      "step": 1750
+    },
+    {
+      "epoch": 0.011011029639830772,
+      "grad_norm": 0.08955707401037216,
+      "learning_rate": 0.0015,
+      "loss": 2.8283,
+      "step": 1775
+    },
+    {
+      "epoch": 0.01116611456433543,
+      "grad_norm": 0.07163148373365402,
+      "learning_rate": 0.0015,
+      "loss": 2.8852,
+      "step": 1800
+    },
+    {
+      "epoch": 0.01116611456433543,
+      "eval_loss": 4.75281286239624,
+      "perplexity": 115.90986633300781,
+      "step": 1800
+    },
+    {
+      "epoch": 0.011321199488840089,
+      "grad_norm": 0.09710580855607986,
+      "learning_rate": 0.0015,
+      "loss": 2.8573,
+      "step": 1825
+    },
+    {
+      "epoch": 0.011476284413344747,
+      "grad_norm": 0.11669810861349106,
+      "learning_rate": 0.0015,
+      "loss": 2.8674,
+      "step": 1850
+    },
+    {
+      "epoch": 0.011631369337849405,
+      "grad_norm": 0.11174403876066208,
+      "learning_rate": 0.0015,
+      "loss": 2.9121,
+      "step": 1875
+    },
+    {
+      "epoch": 0.011786454262354066,
+      "grad_norm": 0.09547118842601776,
+      "learning_rate": 0.0015,
+      "loss": 2.9033,
+      "step": 1900
+    },
+    {
+      "epoch": 0.011941539186858724,
+      "grad_norm": 0.09878171980381012,
+      "learning_rate": 0.0015,
+      "loss": 2.8738,
+      "step": 1925
+    },
+    {
+      "epoch": 0.012096624111363382,
+      "grad_norm": 0.09479096531867981,
+      "learning_rate": 0.0015,
+      "loss": 2.8775,
+      "step": 1950
+    },
+    {
+      "epoch": 0.01225170903586804,
+      "grad_norm": 0.12434259057044983,
+      "learning_rate": 0.0015,
+      "loss": 2.8452,
+      "step": 1975
+    },
+    {
+      "epoch": 0.0124067939603727,
+      "grad_norm": 0.09166444838047028,
+      "learning_rate": 0.0015,
+      "loss": 2.8546,
+      "step": 2000
+    },
+    {
+      "epoch": 0.0124067939603727,
+      "eval_loss": 4.748600482940674,
+      "perplexity": 115.42263793945312,
+      "step": 2000
+    },
+    {
+      "epoch": 0.01256187888487736,
+      "grad_norm": 0.07793508470058441,
+      "learning_rate": 0.0015,
+      "loss": 2.8306,
+      "step": 2025
+    },
+    {
+      "epoch": 0.012716963809382018,
+      "grad_norm": 0.1670406609773636,
+      "learning_rate": 0.0015,
+      "loss": 2.863,
+      "step": 2050
+    },
+    {
+      "epoch": 0.012872048733886676,
+      "grad_norm": 0.20754718780517578,
+      "learning_rate": 0.0015,
+      "loss": 2.8871,
+      "step": 2075
+    },
+    {
+      "epoch": 0.013027133658391334,
+      "grad_norm": 0.14225496351718903,
+      "learning_rate": 0.0015,
+      "loss": 2.8498,
+      "step": 2100
+    },
+    {
+      "epoch": 0.013182218582895994,
+      "grad_norm": 0.11809197813272476,
+      "learning_rate": 0.0015,
+      "loss": 2.8206,
+      "step": 2125
+    },
+    {
+      "epoch": 0.013337303507400653,
+      "grad_norm": 0.09541622549295425,
+      "learning_rate": 0.0015,
+      "loss": 2.8585,
+      "step": 2150
+    },
+    {
+      "epoch": 0.013492388431905311,
+      "grad_norm": 0.1115843802690506,
+      "learning_rate": 0.0015,
+      "loss": 2.8533,
+      "step": 2175
+    },
+    {
+      "epoch": 0.01364747335640997,
+      "grad_norm": 0.08517899364233017,
+      "learning_rate": 0.0015,
+      "loss": 2.8477,
+      "step": 2200
+    },
+    {
+      "epoch": 0.01364747335640997,
+      "eval_loss": 4.753279685974121,
+      "perplexity": 115.9639892578125,
+      "step": 2200
+    },
+    {
+      "epoch": 0.01380255828091463,
+      "grad_norm": 0.13083544373512268,
+      "learning_rate": 0.0015,
+      "loss": 2.8518,
+      "step": 2225
+    },
+    {
+      "epoch": 0.013957643205419288,
+      "grad_norm": 0.07403870671987534,
+      "learning_rate": 0.0015,
+      "loss": 2.8685,
+      "step": 2250
+    },
+    {
+      "epoch": 0.014112728129923946,
+      "grad_norm": 0.16436311602592468,
+      "learning_rate": 0.0015,
+      "loss": 2.8601,
+      "step": 2275
+    },
+    {
+      "epoch": 0.014267813054428605,
+      "grad_norm": 0.12990187108516693,
+      "learning_rate": 0.0015,
+      "loss": 2.8332,
+      "step": 2300
+    },
+    {
+      "epoch": 0.014422897978933263,
+      "grad_norm": 0.0897112786769867,
+      "learning_rate": 0.0015,
+      "loss": 2.8578,
+      "step": 2325
+    },
+    {
+      "epoch": 0.014577982903437923,
+      "grad_norm": 0.10096879303455353,
+      "learning_rate": 0.0015,
+      "loss": 2.802,
+      "step": 2350
+    },
+    {
+      "epoch": 0.014733067827942582,
+      "grad_norm": 0.0850217416882515,
+      "learning_rate": 0.0015,
+      "loss": 2.8529,
+      "step": 2375
+    },
+    {
+      "epoch": 0.01488815275244724,
+      "grad_norm": 0.11395123600959778,
+      "learning_rate": 0.0015,
+      "loss": 2.8655,
+      "step": 2400
+    },
+    {
+      "epoch": 0.01488815275244724,
+      "eval_loss": 4.743602275848389,
+      "perplexity": 114.84716796875,
+      "step": 2400
+    },
+    {
+      "epoch": 0.015043237676951898,
+      "grad_norm": 0.1590801179409027,
+      "learning_rate": 0.0015,
+      "loss": 2.8227,
+      "step": 2425
+    },
+    {
+      "epoch": 0.015198322601456558,
+      "grad_norm": 0.16819922626018524,
+      "learning_rate": 0.0015,
+      "loss": 2.8551,
+      "step": 2450
+    },
+    {
+      "epoch": 0.015353407525961217,
+      "grad_norm": 0.15390118956565857,
+      "learning_rate": 0.0015,
+      "loss": 2.8691,
+      "step": 2475
+    },
+    {
+      "epoch": 0.015508492450465875,
+      "grad_norm": 0.10976951569318771,
+      "learning_rate": 0.0015,
+      "loss": 2.8615,
+      "step": 2500
+    },
+    {
+      "epoch": 0.015663577374970535,
+      "grad_norm": 0.09539350867271423,
+      "learning_rate": 0.0015,
+      "loss": 2.7755,
+      "step": 2525
+    },
+    {
+      "epoch": 0.015818662299475192,
+      "grad_norm": 0.09798863530158997,
+      "learning_rate": 0.0015,
+      "loss": 2.7675,
+      "step": 2550
+    },
+    {
+      "epoch": 0.015973747223979852,
+      "grad_norm": 0.10233014822006226,
+      "learning_rate": 0.0015,
+      "loss": 2.7905,
+      "step": 2575
+    },
+    {
+      "epoch": 0.01612883214848451,
+      "grad_norm": 0.09607812017202377,
+      "learning_rate": 0.0015,
+      "loss": 2.779,
+      "step": 2600
+    },
+    {
+      "epoch": 0.01612883214848451,
+      "eval_loss": 4.757762432098389,
+      "perplexity": 116.48499298095703,
+      "step": 2600
+    },
+    {
+      "epoch": 0.01628391707298917,
+      "grad_norm": 0.09782920032739639,
+      "learning_rate": 0.0015,
+      "loss": 2.8455,
+      "step": 2625
+    },
+    {
+      "epoch": 0.01643900199749383,
+      "grad_norm": 0.08443335443735123,
+      "learning_rate": 0.0015,
+      "loss": 2.8537,
+      "step": 2650
+    },
+    {
+      "epoch": 0.016594086921998485,
+      "grad_norm": 0.1567981094121933,
+      "learning_rate": 0.0015,
+      "loss": 2.8334,
+      "step": 2675
+    },
+    {
+      "epoch": 0.016749171846503146,
+      "grad_norm": 0.1279255449771881,
+      "learning_rate": 0.0015,
+      "loss": 2.8733,
+      "step": 2700
+    },
+    {
+      "epoch": 0.016904256771007802,
+      "grad_norm": 0.09086953848600388,
+      "learning_rate": 0.0015,
+      "loss": 2.7992,
+      "step": 2725
+    },
+    {
+      "epoch": 0.017059341695512462,
+      "grad_norm": 0.15084481239318848,
+      "learning_rate": 0.0015,
+      "loss": 2.7891,
+      "step": 2750
+    },
+    {
+      "epoch": 0.017214426620017122,
+      "grad_norm": 0.1059018149971962,
+      "learning_rate": 0.0015,
+      "loss": 2.8088,
+      "step": 2775
+    },
+    {
+      "epoch": 0.01736951154452178,
+      "grad_norm": 0.08803548663854599,
+      "learning_rate": 0.0015,
+      "loss": 2.817,
+      "step": 2800
+    },
+    {
+      "epoch": 0.01736951154452178,
+      "eval_loss": 4.730724334716797,
+      "perplexity": 113.37765502929688,
+      "step": 2800
+    },
+    {
+      "epoch": 0.01752459646902644,
+      "grad_norm": 0.0954984724521637,
+      "learning_rate": 0.0015,
+      "loss": 2.8528,
+      "step": 2825
+    },
+    {
+      "epoch": 0.0176796813935311,
+      "grad_norm": 0.14015914499759674,
+      "learning_rate": 0.0015,
+      "loss": 2.8131,
+      "step": 2850
+    },
+    {
+      "epoch": 0.017834766318035756,
+      "grad_norm": 0.07908599078655243,
+      "learning_rate": 0.0015,
+      "loss": 2.8371,
+      "step": 2875
+    },
+    {
+      "epoch": 0.017989851242540416,
+      "grad_norm": 0.14578266441822052,
+      "learning_rate": 0.0015,
+      "loss": 2.8033,
+      "step": 2900
+    },
+    {
+      "epoch": 0.018144936167045073,
+      "grad_norm": 0.10059946030378342,
+      "learning_rate": 0.0015,
+      "loss": 2.8165,
+      "step": 2925
+    },
+    {
+      "epoch": 0.018300021091549733,
+      "grad_norm": 0.10238490998744965,
+      "learning_rate": 0.0015,
+      "loss": 2.7739,
+      "step": 2950
+    },
+    {
+      "epoch": 0.018455106016054393,
+      "grad_norm": 0.12706336379051208,
+      "learning_rate": 0.0015,
+      "loss": 2.8018,
+      "step": 2975
+    },
+    {
+      "epoch": 0.01861019094055905,
+      "grad_norm": 0.1252700239419937,
+      "learning_rate": 0.0015,
+      "loss": 2.8155,
+      "step": 3000
+    },
+    {
+      "epoch": 0.01861019094055905,
+      "eval_loss": 4.707705020904541,
+      "perplexity": 110.79759216308594,
+      "step": 3000
+    },
+    {
+      "epoch": 0.01876527586506371,
+      "grad_norm": 0.13322588801383972,
+      "learning_rate": 0.0015,
+      "loss": 2.8201,
+      "step": 3025
+    },
+    {
+      "epoch": 0.018920360789568366,
+      "grad_norm": 0.14152252674102783,
+      "learning_rate": 0.0015,
+      "loss": 2.7942,
+      "step": 3050
+    },
+    {
+      "epoch": 0.019075445714073026,
+      "grad_norm": 0.1276037096977234,
+      "learning_rate": 0.0015,
+      "loss": 2.8065,
+      "step": 3075
+    },
+    {
+      "epoch": 0.019230530638577686,
+      "grad_norm": 0.11600831896066666,
+      "learning_rate": 0.0015,
+      "loss": 2.8335,
+      "step": 3100
+    },
+    {
+      "epoch": 0.019385615563082343,
+      "grad_norm": 0.11985427141189575,
+      "learning_rate": 0.0015,
+      "loss": 2.7993,
+      "step": 3125
+    },
+    {
+      "epoch": 0.019540700487587003,
+      "grad_norm": 0.11630894988775253,
+      "learning_rate": 0.0015,
+      "loss": 2.7838,
+      "step": 3150
+    },
+    {
+      "epoch": 0.01969578541209166,
+      "grad_norm": 0.08493560552597046,
+      "learning_rate": 0.0015,
+      "loss": 2.7884,
+      "step": 3175
+    },
+    {
+      "epoch": 0.01985087033659632,
+      "grad_norm": 0.12671016156673431,
+      "learning_rate": 0.0015,
+      "loss": 2.7763,
+      "step": 3200
+    },
+    {
+      "epoch": 0.01985087033659632,
+      "eval_loss": 4.7127766609191895,
+      "perplexity": 111.3609390258789,
+      "step": 3200
+    },
+    {
+      "epoch": 0.02000595526110098,
+      "grad_norm": 0.10381816327571869,
+      "learning_rate": 0.0015,
+      "loss": 2.7849,
+      "step": 3225
+    },
+    {
+      "epoch": 0.020161040185605637,
+      "grad_norm": 0.12319795787334442,
+      "learning_rate": 0.0015,
+      "loss": 2.8325,
+      "step": 3250
+    },
+    {
+      "epoch": 0.020316125110110297,
+      "grad_norm": 0.11378122121095657,
+      "learning_rate": 0.0015,
+      "loss": 2.7609,
+      "step": 3275
+    },
+    {
+      "epoch": 0.020471210034614957,
+      "grad_norm": 0.08910433948040009,
+      "learning_rate": 0.0015,
+      "loss": 2.7886,
+      "step": 3300
+    },
+    {
+      "epoch": 0.020626294959119613,
+      "grad_norm": 0.11803348362445831,
+      "learning_rate": 0.0015,
+      "loss": 2.7716,
+      "step": 3325
+    },
+    {
+      "epoch": 0.020781379883624274,
+      "grad_norm": 0.10203807801008224,
+      "learning_rate": 0.0015,
+      "loss": 2.778,
+      "step": 3350
+    },
+    {
+      "epoch": 0.02093646480812893,
+      "grad_norm": 0.07175683230161667,
+      "learning_rate": 0.0015,
+      "loss": 2.7844,
+      "step": 3375
+    },
+    {
+      "epoch": 0.02109154973263359,
+      "grad_norm": 0.1556989699602127,
+      "learning_rate": 0.0015,
+      "loss": 2.748,
+      "step": 3400
+    },
+    {
+      "epoch": 0.02109154973263359,
+      "eval_loss": 4.711516857147217,
+      "perplexity": 111.22074127197266,
+      "step": 3400
+    },
+    {
+      "epoch": 0.02124663465713825,
+      "grad_norm": 0.11983326822519302,
+      "learning_rate": 0.0015,
+      "loss": 2.7747,
+      "step": 3425
+    },
+    {
+      "epoch": 0.021401719581642907,
+      "grad_norm": 0.09098344296216965,
+      "learning_rate": 0.0015,
+      "loss": 2.7609,
+      "step": 3450
+    },
+    {
+      "epoch": 0.021556804506147567,
+      "grad_norm": 0.1238594651222229,
+      "learning_rate": 0.0015,
+      "loss": 2.7849,
+      "step": 3475
+    },
+    {
+      "epoch": 0.021711889430652224,
+      "grad_norm": 0.10654041916131973,
+      "learning_rate": 0.0015,
+      "loss": 2.7742,
+      "step": 3500
+    },
+    {
+      "epoch": 0.021866974355156884,
+      "grad_norm": 0.12955708801746368,
+      "learning_rate": 0.0015,
+      "loss": 2.7302,
+      "step": 3525
+    },
+    {
+      "epoch": 0.022022059279661544,
+      "grad_norm": 0.0945751890540123,
+      "learning_rate": 0.0015,
+      "loss": 2.7366,
+      "step": 3550
+    },
+    {
+      "epoch": 0.0221771442041662,
+      "grad_norm": 0.11322261393070221,
+      "learning_rate": 0.0015,
+      "loss": 2.7307,
+      "step": 3575
+    },
+    {
+      "epoch": 0.02233222912867086,
+      "grad_norm": 0.14438313245773315,
+      "learning_rate": 0.0015,
+      "loss": 2.741,
+      "step": 3600
+    },
+    {
+      "epoch": 0.02233222912867086,
+      "eval_loss": 4.7056427001953125,
+      "perplexity": 110.56932830810547,
+      "step": 3600
+    },
+    {
+      "epoch": 0.022487314053175517,
+      "grad_norm": 0.12101957201957703,
+      "learning_rate": 0.0015,
+      "loss": 2.7699,
+      "step": 3625
+    },
+    {
+      "epoch": 0.022642398977680177,
+      "grad_norm": 0.13060438632965088,
+      "learning_rate": 0.0015,
+      "loss": 2.7534,
+      "step": 3650
+    },
+    {
+      "epoch": 0.022797483902184838,
+      "grad_norm": 0.18028861284255981,
+      "learning_rate": 0.0015,
+      "loss": 2.7716,
+      "step": 3675
+    },
+    {
+      "epoch": 0.022952568826689494,
+      "grad_norm": 0.2551407217979431,
+      "learning_rate": 0.0015,
+      "loss": 2.7505,
+      "step": 3700
+    },
+    {
+      "epoch": 0.023107653751194154,
+      "grad_norm": 0.14461354911327362,
+      "learning_rate": 0.0015,
+      "loss": 2.762,
+      "step": 3725
+    },
+    {
+      "epoch": 0.02326273867569881,
+      "grad_norm": 0.08960037678480148,
+      "learning_rate": 0.0015,
+      "loss": 2.7752,
+      "step": 3750
+    },
+    {
+      "epoch": 0.02341782360020347,
+      "grad_norm": 0.12423495948314667,
+      "learning_rate": 0.0015,
+      "loss": 2.7649,
+      "step": 3775
+    },
+    {
+      "epoch": 0.02357290852470813,
+      "grad_norm": 0.11889061331748962,
+      "learning_rate": 0.0015,
+      "loss": 2.7465,
+      "step": 3800
+    },
+    {
+      "epoch": 0.02357290852470813,
+      "eval_loss": 4.709405422210693,
+      "perplexity": 110.98615264892578,
+      "step": 3800
+    },
+    {
+      "epoch": 0.023727993449212788,
+      "grad_norm": 0.1310662031173706,
+      "learning_rate": 0.0015,
+      "loss": 2.7739,
+      "step": 3825
+    },
+    {
+      "epoch": 0.023883078373717448,
+      "grad_norm": 0.10841766744852066,
+      "learning_rate": 0.0015,
+      "loss": 2.7558,
+      "step": 3850
+    },
+    {
+      "epoch": 0.024038163298222108,
+      "grad_norm": 0.11951743066310883,
+      "learning_rate": 0.0015,
+      "loss": 2.7574,
+      "step": 3875
+    },
+    {
+      "epoch": 0.024193248222726765,
+      "grad_norm": 0.10914873331785202,
+      "learning_rate": 0.0015,
+      "loss": 2.7593,
+      "step": 3900
+    },
+    {
+      "epoch": 0.024348333147231425,
+      "grad_norm": 0.12661431729793549,
+      "learning_rate": 0.0015,
+      "loss": 2.7405,
+      "step": 3925
+    },
+    {
+      "epoch": 0.02450341807173608,
+      "grad_norm": 0.09351510554552078,
+      "learning_rate": 0.0015,
+      "loss": 2.7614,
+      "step": 3950
+    },
+    {
+      "epoch": 0.02465850299624074,
+      "grad_norm": 0.10916408896446228,
+      "learning_rate": 0.0015,
+      "loss": 2.7348,
+      "step": 3975
+    },
+    {
+      "epoch": 0.0248135879207454,
+      "grad_norm": 0.1506185084581375,
+      "learning_rate": 0.0015,
+      "loss": 2.7465,
+      "step": 4000
+    },
+    {
+      "epoch": 0.0248135879207454,
+      "eval_loss": 4.691644191741943,
+      "perplexity": 109.03230285644531,
+      "step": 4000
+    },
+    {
+      "epoch": 0.024968672845250058,
+      "grad_norm": 0.16664201021194458,
+      "learning_rate": 0.0015,
+      "loss": 2.7099,
+      "step": 4025
+    },
+    {
+      "epoch": 0.02512375776975472,
+      "grad_norm": 0.08793428540229797,
+      "learning_rate": 0.0015,
+      "loss": 2.7062,
+      "step": 4050
+    },
+    {
+      "epoch": 0.025278842694259375,
+      "grad_norm": 0.10746140778064728,
+      "learning_rate": 0.0015,
+      "loss": 2.7013,
+      "step": 4075
+    },
+    {
+      "epoch": 0.025433927618764035,
+      "grad_norm": 0.14466698467731476,
+      "learning_rate": 0.0015,
+      "loss": 2.7366,
+      "step": 4100
+    },
+    {
+      "epoch": 0.025589012543268695,
+      "grad_norm": 0.12191653996706009,
+      "learning_rate": 0.0015,
+      "loss": 2.7042,
+      "step": 4125
+    },
+    {
+      "epoch": 0.025744097467773352,
+      "grad_norm": 0.10167489945888519,
+      "learning_rate": 0.0015,
+      "loss": 2.7215,
+      "step": 4150
+    },
+    {
+      "epoch": 0.025899182392278012,
+      "grad_norm": 0.11334148049354553,
+      "learning_rate": 0.0015,
+      "loss": 2.7365,
+      "step": 4175
+    },
+    {
+      "epoch": 0.02605426731678267,
+      "grad_norm": 0.09303794056177139,
+      "learning_rate": 0.0015,
+      "loss": 2.7471,
+      "step": 4200
+    },
+    {
+      "epoch": 0.02605426731678267,
+      "eval_loss": 4.692121505737305,
+      "perplexity": 109.08435821533203,
+      "step": 4200
+    },
+    {
+      "epoch": 0.02620935224128733,
+      "grad_norm": 0.09444712847471237,
+      "learning_rate": 0.0015,
+      "loss": 2.6965,
+      "step": 4225
+    },
+    {
+      "epoch": 0.02636443716579199,
+      "grad_norm": 0.09560113400220871,
+      "learning_rate": 0.0015,
+      "loss": 2.7186,
+      "step": 4250
+    },
+    {
+      "epoch": 0.026519522090296645,
+      "grad_norm": 0.10814715176820755,
+      "learning_rate": 0.0015,
+      "loss": 2.7,
+      "step": 4275
+    },
+    {
+      "epoch": 0.026674607014801305,
+      "grad_norm": 0.12008251994848251,
+      "learning_rate": 0.0015,
+      "loss": 2.6827,
+      "step": 4300
+    },
+    {
+      "epoch": 0.026829691939305966,
+      "grad_norm": 0.13892072439193726,
+      "learning_rate": 0.0015,
+      "loss": 2.7481,
+      "step": 4325
+    },
+    {
+      "epoch": 0.026984776863810622,
+      "grad_norm": 0.10116352885961533,
+      "learning_rate": 0.0015,
+      "loss": 2.6839,
+      "step": 4350
+    },
+    {
+      "epoch": 0.027139861788315282,
+      "grad_norm": 0.2541595697402954,
+      "learning_rate": 0.0015,
+      "loss": 2.6987,
+      "step": 4375
+    },
+    {
+      "epoch": 0.02729494671281994,
+      "grad_norm": 0.11070574074983597,
+      "learning_rate": 0.0015,
+      "loss": 2.7102,
+      "step": 4400
+    },
+    {
+      "epoch": 0.02729494671281994,
+      "eval_loss": 4.702114105224609,
+      "perplexity": 110.17985534667969,
+      "step": 4400
+    },
+    {
+      "epoch": 0.0274500316373246,
+      "grad_norm": 0.09290622174739838,
+      "learning_rate": 0.0015,
+      "loss": 2.744,
+      "step": 4425
+    },
+    {
+      "epoch": 0.02760511656182926,
+      "grad_norm": 0.09867129474878311,
+      "learning_rate": 0.0015,
+      "loss": 2.6979,
+      "step": 4450
+    },
+    {
+      "epoch": 0.027760201486333916,
+      "grad_norm": 0.08975850045681,
+      "learning_rate": 0.0015,
+      "loss": 2.7346,
+      "step": 4475
+    },
+    {
+      "epoch": 0.027915286410838576,
+      "grad_norm": 0.1251811683177948,
+      "learning_rate": 0.0015,
+      "loss": 2.6901,
+      "step": 4500
+    },
+    {
+      "epoch": 0.028070371335343233,
+      "grad_norm": 0.10718528181314468,
+      "learning_rate": 0.0015,
+      "loss": 2.6584,
+      "step": 4525
+    },
+    {
+      "epoch": 0.028225456259847893,
+      "grad_norm": 0.1920158714056015,
+      "learning_rate": 0.0015,
+      "loss": 2.6776,
+      "step": 4550
+    },
+    {
+      "epoch": 0.028380541184352553,
+      "grad_norm": 0.11409153789281845,
+      "learning_rate": 0.0015,
+      "loss": 2.7052,
+      "step": 4575
+    },
+    {
+      "epoch": 0.02853562610885721,
+      "grad_norm": 0.12506772577762604,
+      "learning_rate": 0.0015,
+      "loss": 2.6954,
+      "step": 4600
+    },
+    {
+      "epoch": 0.02853562610885721,
+      "eval_loss": 4.685390949249268,
+      "perplexity": 108.35262298583984,
+      "step": 4600
+    },
+    {
+      "epoch": 0.02869071103336187,
+      "grad_norm": 0.1093166172504425,
+      "learning_rate": 0.0015,
+      "loss": 2.7257,
+      "step": 4625
+    },
+    {
+      "epoch": 0.028845795957866526,
+      "grad_norm": 0.16628532111644745,
+      "learning_rate": 0.0015,
+      "loss": 2.6782,
+      "step": 4650
+    },
+    {
+      "epoch": 0.029000880882371186,
+      "grad_norm": 0.1638079136610031,
+      "learning_rate": 0.0015,
+      "loss": 2.6884,
+      "step": 4675
+    },
+    {
+      "epoch": 0.029155965806875846,
+      "grad_norm": 0.11411619931459427,
+      "learning_rate": 0.0015,
+      "loss": 2.7054,
+      "step": 4700
+    },
+    {
+      "epoch": 0.029311050731380503,
+      "grad_norm": 0.09292814135551453,
+      "learning_rate": 0.0015,
+      "loss": 2.6826,
+      "step": 4725
+    },
+    {
+      "epoch": 0.029466135655885163,
+      "grad_norm": 0.09136354923248291,
+      "learning_rate": 0.0015,
+      "loss": 2.6936,
+      "step": 4750
+    },
+    {
+      "epoch": 0.029621220580389823,
+      "grad_norm": 0.1188502386212349,
+      "learning_rate": 0.0015,
+      "loss": 2.6466,
+      "step": 4775
+    },
+    {
+      "epoch": 0.02977630550489448,
+      "grad_norm": 0.09645655751228333,
+      "learning_rate": 0.0015,
+      "loss": 2.6092,
+      "step": 4800
+    },
+    {
+      "epoch": 0.02977630550489448,
+      "eval_loss": 4.683995723724365,
+      "perplexity": 108.20155334472656,
+      "step": 4800
+    },
+    {
+      "epoch": 0.02993139042939914,
+      "grad_norm": 0.17193672060966492,
+      "learning_rate": 0.0015,
+      "loss": 2.6916,
+      "step": 4825
+    },
+    {
+      "epoch": 0.030086475353903797,
+      "grad_norm": 0.14866988360881805,
+      "learning_rate": 0.0015,
+      "loss": 2.6776,
+      "step": 4850
+    },
+    {
+      "epoch": 0.030241560278408457,
+      "grad_norm": 0.10588869452476501,
+      "learning_rate": 0.0015,
+      "loss": 2.6773,
+      "step": 4875
+    },
+    {
+      "epoch": 0.030396645202913117,
+      "grad_norm": 0.12059559673070908,
+      "learning_rate": 0.0015,
+      "loss": 2.639,
+      "step": 4900
+    },
+    {
+      "epoch": 0.030551730127417773,
+      "grad_norm": 0.13296598196029663,
+      "learning_rate": 0.0015,
+      "loss": 2.6359,
+      "step": 4925
+    },
+    {
+      "epoch": 0.030706815051922434,
+      "grad_norm": 0.12300167232751846,
+      "learning_rate": 0.0015,
+      "loss": 2.668,
+      "step": 4950
+    },
+    {
+      "epoch": 0.03086189997642709,
+      "grad_norm": 0.15900522470474243,
+      "learning_rate": 0.0015,
+      "loss": 2.6252,
+      "step": 4975
+    },
+    {
+      "epoch": 0.03101698490093175,
+      "grad_norm": 0.138090580701828,
+      "learning_rate": 0.0015,
+      "loss": 2.659,
+      "step": 5000
+    },
+    {
+      "epoch": 0.03101698490093175,
+      "eval_loss": 4.688181400299072,
+      "perplexity": 108.65540313720703,
+      "step": 5000
+    },
+    {
+      "epoch": 0.03117206982543641,
+      "grad_norm": 0.13720737397670746,
+      "learning_rate": 0.0015,
+      "loss": 2.6096,
+      "step": 5025
+    },
+    {
+      "epoch": 0.03132715474994107,
+      "grad_norm": 0.13671600818634033,
+      "learning_rate": 0.0015,
+      "loss": 2.647,
+      "step": 5050
+    },
+    {
+      "epoch": 0.031482239674445724,
+      "grad_norm": 0.12611277401447296,
+      "learning_rate": 0.0015,
+      "loss": 2.639,
+      "step": 5075
+    },
+    {
+      "epoch": 0.031637324598950384,
+      "grad_norm": 0.12045291066169739,
+      "learning_rate": 0.0015,
+      "loss": 2.663,
+      "step": 5100
+    },
+    {
+      "epoch": 0.031792409523455044,
+      "grad_norm": 0.10857657343149185,
+      "learning_rate": 0.0015,
+      "loss": 2.6677,
+      "step": 5125
+    },
+    {
+      "epoch": 0.031947494447959704,
+      "grad_norm": 0.12052007764577866,
+      "learning_rate": 0.0015,
+      "loss": 2.6508,
+      "step": 5150
+    },
+    {
+      "epoch": 0.032102579372464364,
+      "grad_norm": 0.10999467223882675,
+      "learning_rate": 0.0015,
+      "loss": 2.661,
+      "step": 5175
+    },
+    {
+      "epoch": 0.03225766429696902,
+      "grad_norm": 0.11075185984373093,
+      "learning_rate": 0.0015,
+      "loss": 2.6645,
+      "step": 5200
+    },
+    {
+      "epoch": 0.03225766429696902,
+      "eval_loss": 4.706582546234131,
+      "perplexity": 110.67329406738281,
+      "step": 5200
+    },
+    {
+      "epoch": 0.03241274922147368,
+      "grad_norm": 0.09703061729669571,
+      "learning_rate": 0.0015,
+      "loss": 2.6109,
+      "step": 5225
+    },
+    {
+      "epoch": 0.03256783414597834,
+      "grad_norm": 0.13556119799613953,
+      "learning_rate": 0.0015,
+      "loss": 2.6621,
+      "step": 5250
+    },
+    {
+      "epoch": 0.032722919070483,
+      "grad_norm": 0.09178316593170166,
+      "learning_rate": 0.0015,
+      "loss": 2.6263,
+      "step": 5275
+    },
+    {
+      "epoch": 0.03287800399498766,
+      "grad_norm": 0.10839138180017471,
+      "learning_rate": 0.0015,
+      "loss": 2.5999,
+      "step": 5300
+    },
+    {
+      "epoch": 0.03303308891949231,
+      "grad_norm": 0.12049377709627151,
+      "learning_rate": 0.0015,
+      "loss": 2.6085,
+      "step": 5325
+    },
+    {
+      "epoch": 0.03318817384399697,
+      "grad_norm": 0.15260230004787445,
+      "learning_rate": 0.0015,
+      "loss": 2.664,
+      "step": 5350
+    },
+    {
+      "epoch": 0.03334325876850163,
+      "grad_norm": 0.12393297255039215,
+      "learning_rate": 0.0015,
+      "loss": 2.6234,
+      "step": 5375
+    },
+    {
+      "epoch": 0.03349834369300629,
+      "grad_norm": 0.1284521073102951,
+      "learning_rate": 0.0015,
+      "loss": 2.5624,
+      "step": 5400
+    },
+    {
+      "epoch": 0.03349834369300629,
+      "eval_loss": 4.696901321411133,
+      "perplexity": 109.60700988769531,
+      "step": 5400
+    },
+    {
+      "epoch": 0.03365342861751095,
+      "grad_norm": 0.18052247166633606,
+      "learning_rate": 0.0015,
+      "loss": 2.5779,
+      "step": 5425
+    },
+    {
+      "epoch": 0.033808513542015604,
+      "grad_norm": 0.11775010824203491,
+      "learning_rate": 0.0015,
+      "loss": 2.6167,
+      "step": 5450
+    },
+    {
+      "epoch": 0.033963598466520264,
+      "grad_norm": 0.13769109547138214,
+      "learning_rate": 0.0015,
+      "loss": 2.6117,
+      "step": 5475
+    },
+    {
+      "epoch": 0.034118683391024925,
+      "grad_norm": 0.09634970873594284,
+      "learning_rate": 0.0015,
+      "loss": 2.613,
+      "step": 5500
+    },
+    {
+      "epoch": 0.034273768315529585,
+      "grad_norm": 0.14692488312721252,
+      "learning_rate": 0.0015,
+      "loss": 2.6176,
+      "step": 5525
+    },
+    {
+      "epoch": 0.034428853240034245,
+      "grad_norm": 0.21920783817768097,
+      "learning_rate": 0.0015,
+      "loss": 2.6196,
+      "step": 5550
+    },
+    {
+      "epoch": 0.034583938164538905,
+      "grad_norm": 0.1033003106713295,
+      "learning_rate": 0.0015,
+      "loss": 2.5872,
+      "step": 5575
+    },
+    {
+      "epoch": 0.03473902308904356,
+      "grad_norm": 0.09867612272500992,
+      "learning_rate": 0.0015,
+      "loss": 2.5782,
+      "step": 5600
+    },
+    {
+      "epoch": 0.03473902308904356,
+      "eval_loss": 4.704063892364502,
+      "perplexity": 110.3948974609375,
+      "step": 5600
+    },
+    {
+      "epoch": 0.03489410801354822,
+      "grad_norm": 0.1032184287905693,
+      "learning_rate": 0.0015,
+      "loss": 2.6187,
+      "step": 5625
+    },
+    {
+      "epoch": 0.03504919293805288,
+      "grad_norm": 0.12661318480968475,
+      "learning_rate": 0.0015,
+      "loss": 2.5805,
+      "step": 5650
+    },
+    {
+      "epoch": 0.03520427786255754,
+      "grad_norm": 0.28772449493408203,
+      "learning_rate": 0.0015,
+      "loss": 2.7518,
+      "step": 5675
+    },
+    {
+      "epoch": 0.0353593627870622,
+      "grad_norm": 0.10005131363868713,
+      "learning_rate": 0.0015,
+      "loss": 2.8556,
+      "step": 5700
+    },
+    {
+      "epoch": 0.03551444771156685,
+      "grad_norm": 0.10379570722579956,
+      "learning_rate": 0.0015,
+      "loss": 2.8648,
+      "step": 5725
+    },
+    {
+      "epoch": 0.03566953263607151,
+      "grad_norm": 0.08921229094266891,
+      "learning_rate": 0.0015,
+      "loss": 2.8421,
+      "step": 5750
+    },
+    {
+      "epoch": 0.03582461756057617,
+      "grad_norm": 0.15366144478321075,
+      "learning_rate": 0.0015,
+      "loss": 2.8162,
+      "step": 5775
+    },
+    {
+      "epoch": 0.03597970248508083,
+      "grad_norm": 0.12743431329727173,
+      "learning_rate": 0.0015,
+      "loss": 2.8635,
+      "step": 5800
+    },
+    {
+      "epoch": 0.03597970248508083,
+      "eval_loss": 4.674878120422363,
+      "perplexity": 107.21949768066406,
+      "step": 5800
+    },
+    {
+      "epoch": 0.03613478740958549,
+      "grad_norm": 0.08773666620254517,
+      "learning_rate": 0.0015,
+      "loss": 2.8787,
+      "step": 5825
+    },
+    {
+      "epoch": 0.036289872334090145,
+      "grad_norm": 0.11721781641244888,
+      "learning_rate": 0.0015,
+      "loss": 2.853,
+      "step": 5850
+    },
+    {
+      "epoch": 0.036444957258594805,
+      "grad_norm": 0.09957700222730637,
+      "learning_rate": 0.0015,
+      "loss": 2.8163,
+      "step": 5875
+    },
+    {
+      "epoch": 0.036600042183099465,
+      "grad_norm": 0.09999966621398926,
+      "learning_rate": 0.0015,
+      "loss": 2.8206,
+      "step": 5900
+    },
+    {
+      "epoch": 0.036755127107604126,
+      "grad_norm": 0.09899301081895828,
+      "learning_rate": 0.0015,
+      "loss": 2.8378,
+      "step": 5925
+    },
+    {
+      "epoch": 0.036910212032108786,
+      "grad_norm": 0.09676779061555862,
+      "learning_rate": 0.0015,
+      "loss": 2.8385,
+      "step": 5950
+    },
+    {
+      "epoch": 0.03706529695661344,
+      "grad_norm": 0.14397811889648438,
+      "learning_rate": 0.0015,
+      "loss": 2.8639,
+      "step": 5975
+    },
+    {
+      "epoch": 0.0372203818811181,
+      "grad_norm": 0.08991026133298874,
+      "learning_rate": 0.0015,
+      "loss": 2.862,
+      "step": 6000
+    },
+    {
+      "epoch": 0.0372203818811181,
+      "eval_loss": 4.649503707885742,
+      "perplexity": 104.53309631347656,
+      "step": 6000
+    },
+    {
+      "epoch": 0.03737546680562276,
+      "grad_norm": 0.11916879564523697,
+      "learning_rate": 0.0015,
+      "loss": 2.8336,
+      "step": 6025
+    },
+    {
+      "epoch": 0.03753055173012742,
+      "grad_norm": 0.1533547192811966,
+      "learning_rate": 0.0015,
+      "loss": 2.8154,
+      "step": 6050
+    },
+    {
+      "epoch": 0.03768563665463208,
+      "grad_norm": 0.10416785627603531,
+      "learning_rate": 0.0015,
+      "loss": 2.8073,
+      "step": 6075
+    },
+    {
+      "epoch": 0.03784072157913673,
+      "grad_norm": 0.1307593733072281,
+      "learning_rate": 0.0015,
+      "loss": 2.8227,
+      "step": 6100
+    },
+    {
+      "epoch": 0.03799580650364139,
+      "grad_norm": 0.11226139962673187,
+      "learning_rate": 0.0015,
+      "loss": 2.8316,
+      "step": 6125
+    },
+    {
+      "epoch": 0.03815089142814605,
+      "grad_norm": 0.12050950527191162,
+      "learning_rate": 0.0015,
+      "loss": 2.8636,
+      "step": 6150
+    },
+    {
+      "epoch": 0.03830597635265071,
+      "grad_norm": 0.14836955070495605,
+      "learning_rate": 0.0015,
+      "loss": 2.8433,
+      "step": 6175
+    },
+    {
+      "epoch": 0.03846106127715537,
+      "grad_norm": 0.1240909993648529,
+      "learning_rate": 0.0015,
+      "loss": 2.885,
+      "step": 6200
+    },
+    {
+      "epoch": 0.03846106127715537,
+      "eval_loss": 4.652696132659912,
+      "perplexity": 104.86734008789062,
+      "step": 6200
+    },
+    {
+      "epoch": 0.038616146201660026,
+      "grad_norm": 0.09549515694379807,
+      "learning_rate": 0.0015,
+      "loss": 2.822,
+      "step": 6225
+    },
+    {
+      "epoch": 0.038771231126164686,
+      "grad_norm": 0.1386450082063675,
+      "learning_rate": 0.0015,
+      "loss": 2.8455,
+      "step": 6250
+    },
+    {
+      "epoch": 0.038926316050669346,
+      "grad_norm": 0.10233025252819061,
+      "learning_rate": 0.0015,
+      "loss": 2.834,
+      "step": 6275
+    },
+    {
+      "epoch": 0.039081400975174006,
+      "grad_norm": 0.09776704013347626,
+      "learning_rate": 0.0015,
+      "loss": 2.8114,
+      "step": 6300
+    },
+    {
+      "epoch": 0.039236485899678666,
+      "grad_norm": 0.09631351381540298,
+      "learning_rate": 0.0015,
+      "loss": 2.8107,
+      "step": 6325
+    },
+    {
+      "epoch": 0.03939157082418332,
+      "grad_norm": 0.08424117416143417,
+      "learning_rate": 0.0015,
+      "loss": 2.8373,
+      "step": 6350
+    },
+    {
+      "epoch": 0.03954665574868798,
+      "grad_norm": 0.14171521365642548,
+      "learning_rate": 0.0015,
+      "loss": 2.8394,
+      "step": 6375
+    },
+    {
+      "epoch": 0.03970174067319264,
+      "grad_norm": 0.11349046230316162,
+      "learning_rate": 0.0015,
+      "loss": 2.8131,
+      "step": 6400
+    },
+    {
+      "epoch": 0.03970174067319264,
+      "eval_loss": 4.652514934539795,
+      "perplexity": 104.84834289550781,
+      "step": 6400
+    },
+    {
+      "epoch": 0.0398568255976973,
+      "grad_norm": 0.09066054224967957,
+      "learning_rate": 0.0015,
+      "loss": 2.8758,
+      "step": 6425
+    },
+    {
+      "epoch": 0.04001191052220196,
+      "grad_norm": 0.09391192346811295,
+      "learning_rate": 0.0015,
+      "loss": 2.826,
+      "step": 6450
+    },
+    {
+      "epoch": 0.04016699544670661,
+      "grad_norm": 0.17412593960762024,
+      "learning_rate": 0.0015,
+      "loss": 2.8487,
+      "step": 6475
+    },
+    {
+      "epoch": 0.04032208037121127,
+      "grad_norm": 0.17672564089298248,
+      "learning_rate": 0.0015,
+      "loss": 2.8441,
+      "step": 6500
+    },
+    {
+      "epoch": 0.04047716529571593,
+      "grad_norm": 0.11427825689315796,
+      "learning_rate": 0.0015,
+      "loss": 2.8843,
+      "step": 6525
+    },
+    {
+      "epoch": 0.04063225022022059,
+      "grad_norm": 0.13745597004890442,
+      "learning_rate": 0.0015,
+      "loss": 2.8458,
+      "step": 6550
+    },
+    {
+      "epoch": 0.040787335144725254,
+      "grad_norm": 0.12339327484369278,
+      "learning_rate": 0.0015,
+      "loss": 2.8299,
+      "step": 6575
+    },
+    {
+      "epoch": 0.040942420069229914,
+      "grad_norm": 0.11045660078525543,
+      "learning_rate": 0.0015,
+      "loss": 2.8504,
+      "step": 6600
+    },
+    {
+      "epoch": 0.040942420069229914,
+      "eval_loss": 4.645139217376709,
+      "perplexity": 104.0778579711914,
+      "step": 6600
+    },
+    {
+      "epoch": 0.04109750499373457,
+      "grad_norm": 0.14822149276733398,
+      "learning_rate": 0.0015,
+      "loss": 2.8438,
+      "step": 6625
+    },
+    {
+      "epoch": 0.04125258991823923,
+      "grad_norm": 0.09271769225597382,
+      "learning_rate": 0.0015,
+      "loss": 2.8195,
+      "step": 6650
+    },
+    {
+      "epoch": 0.04140767484274389,
+      "grad_norm": 0.12357133626937866,
+      "learning_rate": 0.0015,
+      "loss": 2.8434,
+      "step": 6675
+    },
+    {
+      "epoch": 0.04156275976724855,
+      "grad_norm": 0.12669824063777924,
+      "learning_rate": 0.0015,
+      "loss": 2.8262,
+      "step": 6700
+    },
+    {
+      "epoch": 0.04171784469175321,
+      "grad_norm": 0.10409893840551376,
+      "learning_rate": 0.0015,
+      "loss": 2.8164,
+      "step": 6725
+    },
+    {
+      "epoch": 0.04187292961625786,
+      "grad_norm": 0.10687699913978577,
+      "learning_rate": 0.0015,
+      "loss": 2.83,
+      "step": 6750
+    },
+    {
+      "epoch": 0.04202801454076252,
+      "grad_norm": 0.09924216568470001,
+      "learning_rate": 0.0015,
+      "loss": 2.8415,
+      "step": 6775
+    },
+    {
+      "epoch": 0.04218309946526718,
+      "grad_norm": 0.11719833314418793,
+      "learning_rate": 0.0015,
+      "loss": 2.8368,
+      "step": 6800
+    },
+    {
+      "epoch": 0.04218309946526718,
+      "eval_loss": 4.673882484436035,
+      "perplexity": 107.11280059814453,
+      "step": 6800
+    },
+    {
+      "epoch": 0.04233818438977184,
+      "grad_norm": 0.10162920504808426,
+      "learning_rate": 0.0015,
+      "loss": 2.8285,
+      "step": 6825
+    },
+    {
+      "epoch": 0.0424932693142765,
+      "grad_norm": 0.10563603043556213,
+      "learning_rate": 0.0015,
+      "loss": 2.809,
+      "step": 6850
+    },
+    {
+      "epoch": 0.042648354238781154,
+      "grad_norm": 0.079631008207798,
+      "learning_rate": 0.0015,
+      "loss": 2.8362,
+      "step": 6875
+    },
+    {
+      "epoch": 0.042803439163285814,
+      "grad_norm": 0.11915802210569382,
+      "learning_rate": 0.0015,
+      "loss": 2.8211,
+      "step": 6900
+    },
+    {
+      "epoch": 0.042958524087790474,
+      "grad_norm": 0.13783864676952362,
+      "learning_rate": 0.0015,
+      "loss": 2.8403,
+      "step": 6925
+    },
+    {
+      "epoch": 0.043113609012295134,
+      "grad_norm": 0.17333541810512543,
+      "learning_rate": 0.0015,
+      "loss": 2.8699,
+      "step": 6950
+    },
+    {
+      "epoch": 0.043268693936799794,
+      "grad_norm": 0.10923554003238678,
+      "learning_rate": 0.0015,
+      "loss": 2.8016,
+      "step": 6975
+    },
+    {
+      "epoch": 0.04342377886130445,
+      "grad_norm": 0.10525023192167282,
+      "learning_rate": 0.0015,
+      "loss": 2.8302,
+      "step": 7000
+    },
+    {
+      "epoch": 0.04342377886130445,
+      "eval_loss": 4.660215854644775,
+      "perplexity": 105.65888977050781,
+      "step": 7000
+    },
+    {
+      "epoch": 0.04357886378580911,
+      "grad_norm": 0.10499420017004013,
+      "learning_rate": 0.0015,
+      "loss": 2.8215,
+      "step": 7025
+    },
+    {
+      "epoch": 0.04373394871031377,
+      "grad_norm": 0.09560755640268326,
+      "learning_rate": 0.0015,
+      "loss": 2.8279,
+      "step": 7050
+    },
+    {
+      "epoch": 0.04388903363481843,
+      "grad_norm": 0.10454019159078598,
+      "learning_rate": 0.0015,
+      "loss": 2.8161,
+      "step": 7075
+    },
+    {
+      "epoch": 0.04404411855932309,
+      "grad_norm": 0.0982690081000328,
+      "learning_rate": 0.0015,
+      "loss": 2.7895,
+      "step": 7100
+    },
+    {
+      "epoch": 0.04419920348382774,
+      "grad_norm": 0.10405784100294113,
+      "learning_rate": 0.0015,
+      "loss": 2.7945,
+      "step": 7125
+    },
+    {
+      "epoch": 0.0443542884083324,
+      "grad_norm": 0.09310988336801529,
+      "learning_rate": 0.0015,
+      "loss": 2.8535,
+      "step": 7150
+    },
+    {
+      "epoch": 0.04450937333283706,
+      "grad_norm": 0.1031995639204979,
+      "learning_rate": 0.0015,
+      "loss": 2.8298,
+      "step": 7175
+    },
+    {
+      "epoch": 0.04466445825734172,
+      "grad_norm": 0.09206147491931915,
+      "learning_rate": 0.0015,
+      "loss": 2.794,
+      "step": 7200
+    },
+    {
+      "epoch": 0.04466445825734172,
+      "eval_loss": 4.642621994018555,
+      "perplexity": 103.81619262695312,
+      "step": 7200
+    },
+    {
+      "epoch": 0.04481954318184638,
+      "grad_norm": 0.1051359549164772,
+      "learning_rate": 0.0015,
+      "loss": 2.7996,
+      "step": 7225
+    },
+    {
+      "epoch": 0.044974628106351035,
+      "grad_norm": 0.12941063940525055,
+      "learning_rate": 0.0015,
+      "loss": 2.792,
+      "step": 7250
+    },
+    {
+      "epoch": 0.045129713030855695,
+      "grad_norm": 0.09297281503677368,
+      "learning_rate": 0.0015,
+      "loss": 2.7847,
+      "step": 7275
+    },
+    {
+      "epoch": 0.045284797955360355,
+      "grad_norm": 0.11114951968193054,
+      "learning_rate": 0.0015,
+      "loss": 2.8164,
+      "step": 7300
+    },
+    {
+      "epoch": 0.045439882879865015,
+      "grad_norm": 0.08519440144300461,
+      "learning_rate": 0.0015,
+      "loss": 2.8053,
+      "step": 7325
+    },
+    {
+      "epoch": 0.045594967804369675,
+      "grad_norm": 0.11148552596569061,
+      "learning_rate": 0.0015,
+      "loss": 2.7871,
+      "step": 7350
+    },
+    {
+      "epoch": 0.04575005272887433,
+      "grad_norm": 0.136012002825737,
+      "learning_rate": 0.0015,
+      "loss": 2.8457,
+      "step": 7375
+    },
+    {
+      "epoch": 0.04590513765337899,
+      "grad_norm": 0.1037759929895401,
+      "learning_rate": 0.0015,
+      "loss": 2.748,
+      "step": 7400
+    },
+    {
+      "epoch": 0.04590513765337899,
+      "eval_loss": 4.631537437438965,
+      "perplexity": 102.67179107666016,
+      "step": 7400
+    },
+    {
+      "epoch": 0.04606022257788365,
+      "grad_norm": 0.11162275820970535,
+      "learning_rate": 0.0015,
+      "loss": 2.8044,
+      "step": 7425
+    },
+    {
+      "epoch": 0.04621530750238831,
+      "grad_norm": 0.11309058219194412,
+      "learning_rate": 0.0015,
+      "loss": 2.8198,
+      "step": 7450
+    },
+    {
+      "epoch": 0.04637039242689297,
+      "grad_norm": 0.09359199553728104,
+      "learning_rate": 0.0015,
+      "loss": 2.8302,
+      "step": 7475
+    },
+    {
+      "epoch": 0.04652547735139762,
+      "grad_norm": 0.09513767808675766,
+      "learning_rate": 0.0015,
+      "loss": 2.8325,
+      "step": 7500
+    },
+    {
+      "epoch": 0.04668056227590228,
+      "grad_norm": 0.08243551850318909,
+      "learning_rate": 0.0015,
+      "loss": 2.7925,
+      "step": 7525
+    },
+    {
+      "epoch": 0.04683564720040694,
+      "grad_norm": 0.08001349121332169,
+      "learning_rate": 0.0015,
+      "loss": 2.8406,
+      "step": 7550
+    },
+    {
+      "epoch": 0.0469907321249116,
+      "grad_norm": 0.11749595403671265,
+      "learning_rate": 0.0015,
+      "loss": 2.7762,
+      "step": 7575
+    },
+    {
+      "epoch": 0.04714581704941626,
+      "grad_norm": 0.15697765350341797,
+      "learning_rate": 0.0015,
+      "loss": 2.8137,
+      "step": 7600
+    },
+    {
+      "epoch": 0.04714581704941626,
+      "eval_loss": 4.643322467803955,
+      "perplexity": 103.8889389038086,
+      "step": 7600
+    },
+    {
+      "epoch": 0.04730090197392092,
+      "grad_norm": 0.1004658117890358,
+      "learning_rate": 0.0015,
+      "loss": 2.7787,
+      "step": 7625
+    },
+    {
+      "epoch": 0.047455986898425576,
+      "grad_norm": 0.11577022075653076,
+      "learning_rate": 0.0015,
+      "loss": 2.806,
+      "step": 7650
+    },
+    {
+      "epoch": 0.047611071822930236,
+      "grad_norm": 0.10791046917438507,
+      "learning_rate": 0.0015,
+      "loss": 2.7637,
+      "step": 7675
+    },
+    {
+      "epoch": 0.047766156747434896,
+      "grad_norm": 0.09490654617547989,
+      "learning_rate": 0.0015,
+      "loss": 2.8187,
+      "step": 7700
+    },
+    {
+      "epoch": 0.047921241671939556,
+      "grad_norm": 0.10448817163705826,
+      "learning_rate": 0.0015,
+      "loss": 2.8335,
+      "step": 7725
+    },
+    {
+      "epoch": 0.048076326596444216,
+      "grad_norm": 0.10800398141145706,
+      "learning_rate": 0.0015,
+      "loss": 2.8138,
+      "step": 7750
+    },
+    {
+      "epoch": 0.04823141152094887,
+      "grad_norm": 0.10268035531044006,
+      "learning_rate": 0.0015,
+      "loss": 2.8074,
+      "step": 7775
+    },
+    {
+      "epoch": 0.04838649644545353,
+      "grad_norm": 0.145925372838974,
+      "learning_rate": 0.0015,
+      "loss": 2.8161,
+      "step": 7800
+    },
+    {
+      "epoch": 0.04838649644545353,
+      "eval_loss": 4.628528118133545,
+      "perplexity": 102.36328887939453,
+      "step": 7800
+    },
+    {
+      "epoch": 0.04854158136995819,
+      "grad_norm": 0.1422831267118454,
+      "learning_rate": 0.0015,
+      "loss": 2.8179,
+      "step": 7825
+    },
+    {
+      "epoch": 0.04869666629446285,
+      "grad_norm": 0.10019826889038086,
+      "learning_rate": 0.0015,
+      "loss": 2.8228,
+      "step": 7850
+    },
+    {
+      "epoch": 0.04885175121896751,
+      "grad_norm": 0.12028387933969498,
+      "learning_rate": 0.0015,
+      "loss": 2.8359,
+      "step": 7875
+    },
+    {
+      "epoch": 0.04900683614347216,
+      "grad_norm": 0.08171118795871735,
+      "learning_rate": 0.0015,
+      "loss": 2.7829,
+      "step": 7900
+    },
+    {
+      "epoch": 0.04916192106797682,
+      "grad_norm": 0.138522207736969,
+      "learning_rate": 0.0015,
+      "loss": 2.7992,
+      "step": 7925
+    },
+    {
+      "epoch": 0.04931700599248148,
+      "grad_norm": 0.10419227927923203,
+      "learning_rate": 0.0015,
+      "loss": 2.8097,
+      "step": 7950
+    },
+    {
+      "epoch": 0.04947209091698614,
+      "grad_norm": 0.1020691841840744,
+      "learning_rate": 0.0015,
+      "loss": 2.8152,
+      "step": 7975
+    },
+    {
+      "epoch": 0.0496271758414908,
+      "grad_norm": 0.12423787266016006,
+      "learning_rate": 0.0015,
+      "loss": 2.7966,
+      "step": 8000
+    },
+    {
+      "epoch": 0.0496271758414908,
+      "eval_loss": 4.6273722648620605,
+      "perplexity": 102.24504089355469,
+      "step": 8000
+    },
+    {
+      "epoch": 0.049782260765995456,
+      "grad_norm": 0.15230977535247803,
+      "learning_rate": 0.0015,
+      "loss": 2.7575,
+      "step": 8025
+    },
+    {
+      "epoch": 0.049937345690500116,
+      "grad_norm": 0.12649676203727722,
+      "learning_rate": 0.0015,
+      "loss": 2.7897,
+      "step": 8050
+    },
+    {
+      "epoch": 0.05009243061500478,
+      "grad_norm": 0.11257271468639374,
+      "learning_rate": 0.0015,
+      "loss": 2.8115,
+      "step": 8075
+    },
+    {
+      "epoch": 0.05024751553950944,
+      "grad_norm": 0.09349871426820755,
+      "learning_rate": 0.0015,
+      "loss": 2.8041,
+      "step": 8100
+    },
+    {
+      "epoch": 0.0504026004640141,
+      "grad_norm": 0.14108401536941528,
+      "learning_rate": 0.0015,
+      "loss": 2.7772,
+      "step": 8125
+    },
+    {
+      "epoch": 0.05055768538851875,
+      "grad_norm": 0.17286863923072815,
+      "learning_rate": 0.0015,
+      "loss": 2.8197,
+      "step": 8150
+    },
+    {
+      "epoch": 0.05071277031302341,
+      "grad_norm": 0.10759209096431732,
+      "learning_rate": 0.0015,
+      "loss": 2.8396,
+      "step": 8175
+    },
+    {
+      "epoch": 0.05086785523752807,
+      "grad_norm": 0.10236554592847824,
+      "learning_rate": 0.0015,
+      "loss": 2.8175,
+      "step": 8200
+    },
+    {
+      "epoch": 0.05086785523752807,
+      "eval_loss": 4.610519886016846,
+      "perplexity": 100.5363998413086,
+      "step": 8200
+    },
+    {
+      "epoch": 0.05102294016203273,
+      "grad_norm": 0.12348885089159012,
+      "learning_rate": 0.0015,
+      "loss": 2.8139,
+      "step": 8225
+    },
+    {
+      "epoch": 0.05117802508653739,
+      "grad_norm": 0.10251584649085999,
+      "learning_rate": 0.0015,
+      "loss": 2.8436,
+      "step": 8250
+    },
+    {
+      "epoch": 0.051333110011042044,
+      "grad_norm": 0.10069389641284943,
+      "learning_rate": 0.0015,
+      "loss": 2.8409,
+      "step": 8275
+    },
+    {
+      "epoch": 0.051488194935546704,
+      "grad_norm": 0.1546829789876938,
+      "learning_rate": 0.0015,
+      "loss": 2.8199,
+      "step": 8300
+    },
+    {
+      "epoch": 0.051643279860051364,
+      "grad_norm": 0.10704527795314789,
+      "learning_rate": 0.0015,
+      "loss": 2.7721,
+      "step": 8325
+    },
+    {
+      "epoch": 0.051798364784556024,
+      "grad_norm": 0.12251198291778564,
+      "learning_rate": 0.0015,
+      "loss": 2.8175,
+      "step": 8350
+    },
+    {
+      "epoch": 0.051953449709060684,
+      "grad_norm": 0.11113474518060684,
+      "learning_rate": 0.0015,
+      "loss": 2.8085,
+      "step": 8375
+    },
+    {
+      "epoch": 0.05210853463356534,
+      "grad_norm": 0.1341187059879303,
+      "learning_rate": 0.0015,
+      "loss": 2.8169,
+      "step": 8400
+    },
+    {
+      "epoch": 0.05210853463356534,
+      "eval_loss": 4.610434532165527,
+      "perplexity": 100.52782440185547,
+      "step": 8400
+    },
+    {
+      "epoch": 0.05226361955807,
+      "grad_norm": 0.16195224225521088,
+      "learning_rate": 0.0015,
+      "loss": 2.8266,
+      "step": 8425
+    },
+    {
+      "epoch": 0.05241870448257466,
+      "grad_norm": 0.1637653261423111,
+      "learning_rate": 0.0015,
+      "loss": 2.8106,
+      "step": 8450
+    },
+    {
+      "epoch": 0.05257378940707932,
+      "grad_norm": 0.10014921426773071,
+      "learning_rate": 0.0015,
+      "loss": 2.8103,
+      "step": 8475
+    },
+    {
+      "epoch": 0.05272887433158398,
+      "grad_norm": 0.11419603228569031,
+      "learning_rate": 0.0015,
+      "loss": 2.7965,
+      "step": 8500
+    },
+    {
+      "epoch": 0.05288395925608863,
+      "grad_norm": 0.08137035369873047,
+      "learning_rate": 0.0015,
+      "loss": 2.7802,
+      "step": 8525
+    },
+    {
+      "epoch": 0.05303904418059329,
+      "grad_norm": 0.08078640699386597,
+      "learning_rate": 0.0015,
+      "loss": 2.7819,
+      "step": 8550
+    },
+    {
+      "epoch": 0.05319412910509795,
+      "grad_norm": 0.13133442401885986,
+      "learning_rate": 0.0015,
+      "loss": 2.83,
+      "step": 8575
+    },
+    {
+      "epoch": 0.05334921402960261,
+      "grad_norm": 0.08819993585348129,
+      "learning_rate": 0.0015,
+      "loss": 2.833,
+      "step": 8600
+    },
+    {
+      "epoch": 0.05334921402960261,
+      "eval_loss": 4.603670120239258,
+      "perplexity": 99.85010528564453,
+      "step": 8600
+    },
+    {
+      "epoch": 0.05350429895410727,
+      "grad_norm": 0.14662431180477142,
+      "learning_rate": 0.0015,
+      "loss": 2.8201,
+      "step": 8625
+    },
+    {
+      "epoch": 0.05365938387861193,
+      "grad_norm": 0.10400764644145966,
+      "learning_rate": 0.0015,
+      "loss": 2.7944,
+      "step": 8650
+    },
+    {
+      "epoch": 0.053814468803116584,
+      "grad_norm": 0.2790142297744751,
+      "learning_rate": 0.0015,
+      "loss": 2.8307,
+      "step": 8675
+    },
+    {
+      "epoch": 0.053969553727621244,
+      "grad_norm": 0.13645683228969574,
+      "learning_rate": 0.0015,
+      "loss": 2.7904,
+      "step": 8700
+    },
+    {
+      "epoch": 0.054124638652125905,
+      "grad_norm": 0.09604925662279129,
+      "learning_rate": 0.0015,
+      "loss": 2.76,
+      "step": 8725
+    },
+    {
+      "epoch": 0.054279723576630565,
+      "grad_norm": 0.07631650567054749,
+      "learning_rate": 0.0015,
+      "loss": 2.7955,
+      "step": 8750
+    },
+    {
+      "epoch": 0.054434808501135225,
+      "grad_norm": 0.13132531940937042,
+      "learning_rate": 0.0015,
+      "loss": 2.8308,
+      "step": 8775
+    },
+    {
+      "epoch": 0.05458989342563988,
+      "grad_norm": 0.08334681391716003,
+      "learning_rate": 0.0015,
+      "loss": 2.755,
+      "step": 8800
+    },
+    {
+      "epoch": 0.05458989342563988,
+      "eval_loss": 4.597860336303711,
+      "perplexity": 99.27168273925781,
+      "step": 8800
+    },
+    {
+      "epoch": 0.05474497835014454,
+      "grad_norm": 0.10585317760705948,
+      "learning_rate": 0.0015,
+      "loss": 2.7708,
+      "step": 8825
+    },
+    {
+      "epoch": 0.0549000632746492,
+      "grad_norm": 0.08953095227479935,
+      "learning_rate": 0.0015,
+      "loss": 2.7622,
+      "step": 8850
+    },
+    {
+      "epoch": 0.05505514819915386,
+      "grad_norm": 0.10430523008108139,
+      "learning_rate": 0.0015,
+      "loss": 2.8255,
+      "step": 8875
+    },
+    {
+      "epoch": 0.05521023312365852,
+      "grad_norm": 0.08961856365203857,
+      "learning_rate": 0.0015,
+      "loss": 2.7835,
+      "step": 8900
+    },
+    {
+      "epoch": 0.05536531804816317,
+      "grad_norm": 0.13602201640605927,
+      "learning_rate": 0.0015,
+      "loss": 2.813,
+      "step": 8925
+    },
+    {
+      "epoch": 0.05552040297266783,
+      "grad_norm": 0.1858643889427185,
+      "learning_rate": 0.0015,
+      "loss": 2.8296,
+      "step": 8950
+    },
+    {
+      "epoch": 0.05567548789717249,
+      "grad_norm": 0.12873806059360504,
+      "learning_rate": 0.0015,
+      "loss": 2.7669,
+      "step": 8975
+    },
+    {
+      "epoch": 0.05583057282167715,
+      "grad_norm": 0.09891733527183533,
+      "learning_rate": 0.0015,
+      "loss": 2.7829,
+      "step": 9000
+    },
+    {
+      "epoch": 0.05583057282167715,
+      "eval_loss": 4.606179714202881,
+      "perplexity": 100.10100555419922,
+      "step": 9000
+    },
+    {
+      "epoch": 0.05598565774618181,
+      "grad_norm": 0.1619413048028946,
+      "learning_rate": 0.0015,
+      "loss": 2.7885,
+      "step": 9025
+    },
+    {
+      "epoch": 0.056140742670686465,
+      "grad_norm": 0.1223379522562027,
+      "learning_rate": 0.0015,
+      "loss": 2.7829,
+      "step": 9050
+    },
+    {
+      "epoch": 0.056295827595191125,
+      "grad_norm": 0.10872245579957962,
+      "learning_rate": 0.0015,
+      "loss": 2.7962,
+      "step": 9075
+    },
+    {
+      "epoch": 0.056450912519695785,
+      "grad_norm": 0.11461862176656723,
+      "learning_rate": 0.0015,
+      "loss": 2.7476,
+      "step": 9100
+    },
+    {
+      "epoch": 0.056605997444200445,
+      "grad_norm": 0.08933119475841522,
+      "learning_rate": 0.0015,
+      "loss": 2.7745,
+      "step": 9125
+    },
+    {
+      "epoch": 0.056761082368705106,
+      "grad_norm": 0.12911683320999146,
+      "learning_rate": 0.0015,
+      "loss": 2.8029,
+      "step": 9150
+    },
+    {
+      "epoch": 0.05691616729320976,
+      "grad_norm": 0.13963252305984497,
+      "learning_rate": 0.0015,
+      "loss": 2.7931,
+      "step": 9175
+    },
+    {
+      "epoch": 0.05707125221771442,
+      "grad_norm": 0.13462606072425842,
+      "learning_rate": 0.0015,
+      "loss": 2.7771,
+      "step": 9200
+    },
+    {
+      "epoch": 0.05707125221771442,
+      "eval_loss": 4.619841575622559,
+      "perplexity": 101.47795104980469,
+      "step": 9200
+    },
+    {
+      "epoch": 0.05722633714221908,
+      "grad_norm": 0.12551379203796387,
+      "learning_rate": 0.0015,
+      "loss": 2.7934,
+      "step": 9225
+    },
+    {
+      "epoch": 0.05738142206672374,
+      "grad_norm": 0.12379872798919678,
+      "learning_rate": 0.0015,
+      "loss": 2.7882,
+      "step": 9250
+    },
+    {
+      "epoch": 0.0575365069912284,
+      "grad_norm": 0.0940781831741333,
+      "learning_rate": 0.0015,
+      "loss": 2.7658,
+      "step": 9275
+    },
+    {
+      "epoch": 0.05769159191573305,
+      "grad_norm": 0.14165829122066498,
+      "learning_rate": 0.0015,
+      "loss": 2.7973,
+      "step": 9300
+    },
+    {
+      "epoch": 0.05784667684023771,
+      "grad_norm": 0.10727201402187347,
+      "learning_rate": 0.0015,
+      "loss": 2.815,
+      "step": 9325
+    },
+    {
+      "epoch": 0.05800176176474237,
+      "grad_norm": 0.1628653109073639,
+      "learning_rate": 0.0015,
+      "loss": 2.7854,
+      "step": 9350
+    },
+    {
+      "epoch": 0.05815684668924703,
+      "grad_norm": 0.09925588220357895,
+      "learning_rate": 0.0015,
+      "loss": 2.7578,
+      "step": 9375
+    },
+    {
+      "epoch": 0.05831193161375169,
+      "grad_norm": 0.1587476134300232,
+      "learning_rate": 0.0015,
+      "loss": 2.7296,
+      "step": 9400
+    },
+    {
+      "epoch": 0.05831193161375169,
+      "eval_loss": 4.604221343994141,
+      "perplexity": 99.90515899658203,
+      "step": 9400
+    },
+    {
+      "epoch": 0.058467016538256346,
+      "grad_norm": 0.10519708693027496,
+      "learning_rate": 0.0015,
+      "loss": 2.7712,
+      "step": 9425
+    },
+    {
+      "epoch": 0.058622101462761006,
+      "grad_norm": 0.10321429371833801,
+      "learning_rate": 0.0015,
+      "loss": 2.7281,
+      "step": 9450
+    },
+    {
+      "epoch": 0.058777186387265666,
+      "grad_norm": 0.20060209929943085,
+      "learning_rate": 0.0015,
+      "loss": 2.807,
+      "step": 9475
+    },
+    {
+      "epoch": 0.058932271311770326,
+      "grad_norm": 0.10847010463476181,
+      "learning_rate": 0.0015,
+      "loss": 2.8078,
+      "step": 9500
+    },
+    {
+      "epoch": 0.059087356236274986,
+      "grad_norm": 0.11248752474784851,
+      "learning_rate": 0.0015,
+      "loss": 2.796,
+      "step": 9525
+    },
+    {
+      "epoch": 0.059242441160779646,
+      "grad_norm": 0.13171915709972382,
+      "learning_rate": 0.0015,
+      "loss": 2.7658,
+      "step": 9550
+    },
+    {
+      "epoch": 0.0593975260852843,
+      "grad_norm": 0.12041529268026352,
+      "learning_rate": 0.0015,
+      "loss": 2.7507,
+      "step": 9575
+    },
+    {
+      "epoch": 0.05955261100978896,
+      "grad_norm": 0.11275593191385269,
+      "learning_rate": 0.0015,
+      "loss": 2.8022,
+      "step": 9600
+    },
+    {
+      "epoch": 0.05955261100978896,
+      "eval_loss": 4.5886077880859375,
+      "perplexity": 98.3573989868164,
+      "step": 9600
+    },
+    {
+      "epoch": 0.05970769593429362,
+      "grad_norm": 0.1715971678495407,
+      "learning_rate": 0.0015,
+      "loss": 2.8003,
+      "step": 9625
+    },
+    {
+      "epoch": 0.05986278085879828,
+      "grad_norm": 0.1223614364862442,
+      "learning_rate": 0.0015,
+      "loss": 2.8012,
+      "step": 9650
+    },
+    {
+      "epoch": 0.06001786578330294,
+      "grad_norm": 0.114704430103302,
+      "learning_rate": 0.0015,
+      "loss": 2.7963,
+      "step": 9675
+    },
+    {
+      "epoch": 0.06017295070780759,
+      "grad_norm": 0.10282139480113983,
+      "learning_rate": 0.0015,
+      "loss": 2.7965,
+      "step": 9700
+    },
+    {
+      "epoch": 0.06032803563231225,
+      "grad_norm": 0.10494767129421234,
+      "learning_rate": 0.0015,
+      "loss": 2.7698,
+      "step": 9725
+    },
+    {
+      "epoch": 0.06048312055681691,
+      "grad_norm": 0.0908605083823204,
+      "learning_rate": 0.0015,
+      "loss": 2.749,
+      "step": 9750
+    },
+    {
+      "epoch": 0.06063820548132157,
+      "grad_norm": 0.0847998857498169,
+      "learning_rate": 0.0015,
+      "loss": 2.838,
+      "step": 9775
+    },
+    {
+      "epoch": 0.060793290405826234,
+      "grad_norm": 0.24615754187107086,
+      "learning_rate": 0.0015,
+      "loss": 2.8117,
+      "step": 9800
+    },
+    {
+      "epoch": 0.060793290405826234,
+      "eval_loss": 4.593789100646973,
+      "perplexity": 98.86833953857422,
+      "step": 9800
+    },
+    {
+      "epoch": 0.06094837533033089,
+      "grad_norm": 0.0959208682179451,
+      "learning_rate": 0.0015,
+      "loss": 2.7845,
+      "step": 9825
+    },
+    {
+      "epoch": 0.06110346025483555,
+      "grad_norm": 0.09963307529687881,
+      "learning_rate": 0.0015,
+      "loss": 2.8296,
+      "step": 9850
+    },
+    {
+      "epoch": 0.06125854517934021,
+      "grad_norm": 0.1115136444568634,
+      "learning_rate": 0.0015,
+      "loss": 2.7586,
+      "step": 9875
+    },
+    {
+      "epoch": 0.06141363010384487,
+      "grad_norm": 0.13883067667484283,
+      "learning_rate": 0.0015,
+      "loss": 2.7978,
+      "step": 9900
+    },
+    {
+      "epoch": 0.06156871502834953,
+      "grad_norm": 0.2048570066690445,
+      "learning_rate": 0.0015,
+      "loss": 2.8397,
+      "step": 9925
+    },
+    {
+      "epoch": 0.06172379995285418,
+      "grad_norm": 0.1306881606578827,
+      "learning_rate": 0.0015,
+      "loss": 2.8084,
+      "step": 9950
+    },
+    {
+      "epoch": 0.06187888487735884,
+      "grad_norm": 0.18285603821277618,
+      "learning_rate": 0.0015,
+      "loss": 2.7989,
+      "step": 9975
+    },
+    {
+      "epoch": 0.0620339698018635,
+      "grad_norm": 0.1109723299741745,
+      "learning_rate": 0.0015,
+      "loss": 2.8064,
+      "step": 10000
+    },
+    {
+      "epoch": 0.0620339698018635,
+      "eval_loss": 4.5877556800842285,
+      "perplexity": 98.27362823486328,
+      "step": 10000
+    },
+    {
+      "epoch": 0.06218905472636816,
+      "grad_norm": 0.12350066751241684,
+      "learning_rate": 0.0015,
+      "loss": 2.7684,
+      "step": 10025
+    },
+    {
+      "epoch": 0.06234413965087282,
+      "grad_norm": 0.11565285176038742,
+      "learning_rate": 0.0015,
+      "loss": 2.7748,
+      "step": 10050
+    },
+    {
+      "epoch": 0.062499224575377474,
+      "grad_norm": 0.1117839589715004,
+      "learning_rate": 0.0015,
+      "loss": 2.8044,
+      "step": 10075
+    },
+    {
+      "epoch": 0.06265430949988214,
+      "grad_norm": 0.1102209985256195,
+      "learning_rate": 0.0015,
+      "loss": 2.7844,
+      "step": 10100
+    },
+    {
+      "epoch": 0.0628093944243868,
+      "grad_norm": 0.10270575433969498,
+      "learning_rate": 0.0015,
+      "loss": 2.7685,
+      "step": 10125
+    },
+    {
+      "epoch": 0.06296447934889145,
+      "grad_norm": 0.09842963516712189,
+      "learning_rate": 0.0015,
+      "loss": 2.8048,
+      "step": 10150
+    },
+    {
+      "epoch": 0.06311956427339611,
+      "grad_norm": 0.10446088761091232,
+      "learning_rate": 0.0015,
+      "loss": 2.8051,
+      "step": 10175
+    },
+    {
+      "epoch": 0.06327464919790077,
+      "grad_norm": 0.14759957790374756,
+      "learning_rate": 0.0015,
+      "loss": 2.8089,
+      "step": 10200
+    },
+    {
+      "epoch": 0.06327464919790077,
+      "eval_loss": 4.588883399963379,
+      "perplexity": 98.38451385498047,
+      "step": 10200
+    },
+    {
+      "epoch": 0.06342973412240543,
+      "grad_norm": 0.12910906970500946,
+      "learning_rate": 0.0015,
+      "loss": 2.8193,
+      "step": 10225
+    },
+    {
+      "epoch": 0.06358481904691009,
+      "grad_norm": 0.13095402717590332,
+      "learning_rate": 0.0015,
+      "loss": 2.7509,
+      "step": 10250
+    },
+    {
+      "epoch": 0.06373990397141474,
+      "grad_norm": 0.16069594025611877,
+      "learning_rate": 0.0015,
+      "loss": 2.7911,
+      "step": 10275
+    },
+    {
+      "epoch": 0.06389498889591941,
+      "grad_norm": 0.08322907984256744,
+      "learning_rate": 0.0015,
+      "loss": 2.8025,
+      "step": 10300
+    },
+    {
+      "epoch": 0.06405007382042406,
+      "grad_norm": 0.2328927367925644,
+      "learning_rate": 0.0015,
+      "loss": 2.7863,
+      "step": 10325
+    },
+    {
+      "epoch": 0.06420515874492873,
+      "grad_norm": 0.09172859787940979,
+      "learning_rate": 0.0015,
+      "loss": 2.8101,
+      "step": 10350
+    },
+    {
+      "epoch": 0.06436024366943338,
+      "grad_norm": 0.13464473187923431,
+      "learning_rate": 0.0015,
+      "loss": 2.7718,
+      "step": 10375
+    },
+    {
+      "epoch": 0.06451532859393803,
+      "grad_norm": 0.1284090131521225,
+      "learning_rate": 0.0015,
+      "loss": 2.7667,
+      "step": 10400
+    },
+    {
+      "epoch": 0.06451532859393803,
+      "eval_loss": 4.59510612487793,
+      "perplexity": 98.99864196777344,
+      "step": 10400
+    },
+    {
+      "epoch": 0.0646704135184427,
+      "grad_norm": 0.13565704226493835,
+      "learning_rate": 0.0015,
+      "loss": 2.7552,
+      "step": 10425
+    },
+    {
+      "epoch": 0.06482549844294735,
+      "grad_norm": 0.1089024469256401,
+      "learning_rate": 0.0015,
+      "loss": 2.7838,
+      "step": 10450
+    },
+    {
+      "epoch": 0.06498058336745202,
+      "grad_norm": 0.11035135388374329,
+      "learning_rate": 0.0015,
+      "loss": 2.7986,
+      "step": 10475
+    },
+    {
+      "epoch": 0.06513566829195667,
+      "grad_norm": 0.08107917010784149,
+      "learning_rate": 0.0015,
+      "loss": 2.7791,
+      "step": 10500
+    },
+    {
+      "epoch": 0.06529075321646133,
+      "grad_norm": 0.10200012475252151,
+      "learning_rate": 0.0015,
+      "loss": 2.7636,
+      "step": 10525
+    },
+    {
+      "epoch": 0.065445838140966,
+      "grad_norm": 0.08427785336971283,
+      "learning_rate": 0.0015,
+      "loss": 2.794,
+      "step": 10550
+    },
+    {
+      "epoch": 0.06560092306547065,
+      "grad_norm": 0.10828018933534622,
+      "learning_rate": 0.0015,
+      "loss": 2.7778,
+      "step": 10575
+    },
+    {
+      "epoch": 0.06575600798997532,
+      "grad_norm": 0.12101134657859802,
+      "learning_rate": 0.0015,
+      "loss": 2.7469,
+      "step": 10600
+    },
+    {
+      "epoch": 0.06575600798997532,
+      "eval_loss": 4.597805500030518,
+      "perplexity": 99.2662353515625,
+      "step": 10600
+    },
+    {
+      "epoch": 0.06591109291447997,
+      "grad_norm": 0.11220554262399673,
+      "learning_rate": 0.0015,
+      "loss": 2.7294,
+      "step": 10625
+    },
+    {
+      "epoch": 0.06606617783898462,
+      "grad_norm": 0.13899332284927368,
+      "learning_rate": 0.0015,
+      "loss": 2.763,
+      "step": 10650
+    },
+    {
+      "epoch": 0.06622126276348929,
+      "grad_norm": 0.11773937195539474,
+      "learning_rate": 0.0015,
+      "loss": 2.7866,
+      "step": 10675
+    },
+    {
+      "epoch": 0.06637634768799394,
+      "grad_norm": 0.11059702187776566,
+      "learning_rate": 0.0015,
+      "loss": 2.8076,
+      "step": 10700
+    },
+    {
+      "epoch": 0.06653143261249861,
+      "grad_norm": 0.1251254379749298,
+      "learning_rate": 0.0015,
+      "loss": 2.7674,
+      "step": 10725
+    },
+    {
+      "epoch": 0.06668651753700326,
+      "grad_norm": 0.12195979803800583,
+      "learning_rate": 0.0015,
+      "loss": 2.768,
+      "step": 10750
+    },
+    {
+      "epoch": 0.06684160246150792,
+      "grad_norm": 0.1487302929162979,
+      "learning_rate": 0.0015,
+      "loss": 2.762,
+      "step": 10775
+    },
+    {
+      "epoch": 0.06699668738601258,
+      "grad_norm": 0.1315547525882721,
+      "learning_rate": 0.0015,
+      "loss": 2.7348,
+      "step": 10800
+    },
+    {
+      "epoch": 0.06699668738601258,
+      "eval_loss": 4.566490650177002,
+      "perplexity": 96.20589447021484,
+      "step": 10800
+    },
+    {
+      "epoch": 0.06715177231051724,
+      "grad_norm": 0.13864025473594666,
+      "learning_rate": 0.0015,
+      "loss": 2.7517,
+      "step": 10825
+    },
+    {
+      "epoch": 0.0673068572350219,
+      "grad_norm": 0.08808566629886627,
+      "learning_rate": 0.0015,
+      "loss": 2.7718,
+      "step": 10850
+    },
+    {
+      "epoch": 0.06746194215952656,
+      "grad_norm": 0.115321584045887,
+      "learning_rate": 0.0015,
+      "loss": 2.7007,
+      "step": 10875
+    },
+    {
+      "epoch": 0.06761702708403121,
+      "grad_norm": 0.10276370495557785,
+      "learning_rate": 0.0015,
+      "loss": 2.7692,
+      "step": 10900
+    },
+    {
+      "epoch": 0.06777211200853588,
+      "grad_norm": 0.09534792602062225,
+      "learning_rate": 0.0015,
+      "loss": 2.8186,
+      "step": 10925
+    },
+    {
+      "epoch": 0.06792719693304053,
+      "grad_norm": 0.14239507913589478,
+      "learning_rate": 0.0015,
+      "loss": 2.7801,
+      "step": 10950
+    },
+    {
+      "epoch": 0.0680822818575452,
+      "grad_norm": 0.11848737299442291,
+      "learning_rate": 0.0015,
+      "loss": 2.7394,
+      "step": 10975
+    },
+    {
+      "epoch": 0.06823736678204985,
+      "grad_norm": 0.09367898106575012,
+      "learning_rate": 0.0015,
+      "loss": 2.8043,
+      "step": 11000
+    },
+    {
+      "epoch": 0.06823736678204985,
+      "eval_loss": 4.5800089836120605,
+      "perplexity": 97.51527404785156,
+      "step": 11000
+    },
+    {
+      "epoch": 0.0683924517065545,
+      "grad_norm": 0.1494915634393692,
+      "learning_rate": 0.0015,
+      "loss": 2.7841,
+      "step": 11025
+    },
+    {
+      "epoch": 0.06854753663105917,
+      "grad_norm": 0.09982737898826599,
+      "learning_rate": 0.0015,
+      "loss": 2.7933,
+      "step": 11050
+    },
+    {
+      "epoch": 0.06870262155556382,
+      "grad_norm": 0.12379477173089981,
+      "learning_rate": 0.0015,
+      "loss": 2.7419,
+      "step": 11075
+    },
+    {
+      "epoch": 0.06885770648006849,
+      "grad_norm": 0.11405149102210999,
+      "learning_rate": 0.0015,
+      "loss": 2.763,
+      "step": 11100
+    },
+    {
+      "epoch": 0.06901279140457314,
+      "grad_norm": 0.09574620425701141,
+      "learning_rate": 0.0015,
+      "loss": 2.7961,
+      "step": 11125
+    },
+    {
+      "epoch": 0.06916787632907781,
+      "grad_norm": 0.2947874963283539,
+      "learning_rate": 0.0015,
+      "loss": 2.789,
+      "step": 11150
+    },
+    {
+      "epoch": 0.06932296125358246,
+      "grad_norm": 0.09219149500131607,
+      "learning_rate": 0.0015,
+      "loss": 2.7951,
+      "step": 11175
+    },
+    {
+      "epoch": 0.06947804617808712,
+      "grad_norm": 0.11840498447418213,
+      "learning_rate": 0.0015,
+      "loss": 2.7717,
+      "step": 11200
+    },
+    {
+      "epoch": 0.06947804617808712,
+      "eval_loss": 4.564184188842773,
+      "perplexity": 95.98426055908203,
+      "step": 11200
+    },
+    {
+      "epoch": 0.06963313110259178,
+      "grad_norm": 0.09422053396701813,
+      "learning_rate": 0.0015,
+      "loss": 2.7976,
+      "step": 11225
+    },
+    {
+      "epoch": 0.06978821602709644,
+      "grad_norm": 0.11220031976699829,
+      "learning_rate": 0.0015,
+      "loss": 2.7634,
+      "step": 11250
+    },
+    {
+      "epoch": 0.0699433009516011,
+      "grad_norm": 0.10228817909955978,
+      "learning_rate": 0.0015,
+      "loss": 2.7256,
+      "step": 11275
+    },
+    {
+      "epoch": 0.07009838587610576,
+      "grad_norm": 0.0929483100771904,
+      "learning_rate": 0.0015,
+      "loss": 2.8005,
+      "step": 11300
+    },
+    {
+      "epoch": 0.07025347080061041,
+      "grad_norm": 0.11491668224334717,
+      "learning_rate": 0.0015,
+      "loss": 2.7504,
+      "step": 11325
+    },
+    {
+      "epoch": 0.07040855572511508,
+      "grad_norm": 0.15256111323833466,
+      "learning_rate": 0.0015,
+      "loss": 2.7609,
+      "step": 11350
+    },
+    {
+      "epoch": 0.07056364064961973,
+      "grad_norm": 0.11576159298419952,
+      "learning_rate": 0.0015,
+      "loss": 2.7742,
+      "step": 11375
+    },
+    {
+      "epoch": 0.0707187255741244,
+      "grad_norm": 0.08809765428304672,
+      "learning_rate": 0.0015,
+      "loss": 2.7891,
+      "step": 11400
+    },
+    {
+      "epoch": 0.0707187255741244,
+      "eval_loss": 4.568883895874023,
+      "perplexity": 96.43641662597656,
+      "step": 11400
+    },
+    {
+      "epoch": 0.07087381049862905,
+      "grad_norm": 0.08563827723264694,
+      "learning_rate": 0.0015,
+      "loss": 2.8066,
+      "step": 11425
+    },
+    {
+      "epoch": 0.0710288954231337,
+      "grad_norm": 0.18896931409835815,
+      "learning_rate": 0.0015,
+      "loss": 2.8055,
+      "step": 11450
+    },
+    {
+      "epoch": 0.07118398034763837,
+      "grad_norm": 0.13940319418907166,
+      "learning_rate": 0.0015,
+      "loss": 2.7766,
+      "step": 11475
+    },
+    {
+      "epoch": 0.07133906527214302,
+      "grad_norm": 0.09737322479486465,
+      "learning_rate": 0.0015,
+      "loss": 2.7945,
+      "step": 11500
+    },
+    {
+      "epoch": 0.07149415019664769,
+      "grad_norm": 0.11357785761356354,
+      "learning_rate": 0.0015,
+      "loss": 2.7799,
+      "step": 11525
+    },
+    {
+      "epoch": 0.07164923512115234,
+      "grad_norm": 0.10513681918382645,
+      "learning_rate": 0.0015,
+      "loss": 2.7627,
+      "step": 11550
+    },
+    {
+      "epoch": 0.071804320045657,
+      "grad_norm": 0.1434682458639145,
+      "learning_rate": 0.0015,
+      "loss": 2.8055,
+      "step": 11575
+    },
+    {
+      "epoch": 0.07195940497016166,
+      "grad_norm": 0.10169105976819992,
+      "learning_rate": 0.0015,
+      "loss": 2.7832,
+      "step": 11600
+    },
+    {
+      "epoch": 0.07195940497016166,
+      "eval_loss": 4.560365676879883,
+      "perplexity": 95.61843872070312,
+      "step": 11600
+    },
+    {
+      "epoch": 0.07211448989466632,
+      "grad_norm": 0.1385478526353836,
+      "learning_rate": 0.0015,
+      "loss": 2.7548,
+      "step": 11625
+    },
+    {
+      "epoch": 0.07226957481917098,
+      "grad_norm": 0.1300746351480484,
+      "learning_rate": 0.0015,
+      "loss": 2.7553,
+      "step": 11650
+    },
+    {
+      "epoch": 0.07242465974367564,
+      "grad_norm": 0.11596991866827011,
+      "learning_rate": 0.0015,
+      "loss": 2.8095,
+      "step": 11675
+    },
+    {
+      "epoch": 0.07257974466818029,
+      "grad_norm": 0.11611347645521164,
+      "learning_rate": 0.0015,
+      "loss": 2.76,
+      "step": 11700
+    },
+    {
+      "epoch": 0.07273482959268496,
+      "grad_norm": 0.11249697953462601,
+      "learning_rate": 0.0015,
+      "loss": 2.7827,
+      "step": 11725
+    },
+    {
+      "epoch": 0.07288991451718961,
+      "grad_norm": 0.1243973895907402,
+      "learning_rate": 0.0015,
+      "loss": 2.7754,
+      "step": 11750
+    },
+    {
+      "epoch": 0.07304499944169428,
+      "grad_norm": 0.08843350410461426,
+      "learning_rate": 0.0015,
+      "loss": 2.8079,
+      "step": 11775
+    },
+    {
+      "epoch": 0.07320008436619893,
+      "grad_norm": 0.09881053864955902,
+      "learning_rate": 0.0015,
+      "loss": 2.7961,
+      "step": 11800
+    },
+    {
+      "epoch": 0.07320008436619893,
+      "eval_loss": 4.567913055419922,
+      "perplexity": 96.34283447265625,
+      "step": 11800
+    },
+    {
+      "epoch": 0.07335516929070358,
+      "grad_norm": 0.08978071063756943,
+      "learning_rate": 0.0015,
+      "loss": 2.7786,
+      "step": 11825
+    },
+    {
+      "epoch": 0.07351025421520825,
+      "grad_norm": 0.1376107782125473,
+      "learning_rate": 0.0015,
+      "loss": 2.7931,
+      "step": 11850
+    },
+    {
+      "epoch": 0.0736653391397129,
+      "grad_norm": 0.09934777021408081,
+      "learning_rate": 0.0015,
+      "loss": 2.7787,
+      "step": 11875
+    },
+    {
+      "epoch": 0.07382042406421757,
+      "grad_norm": 0.17031100392341614,
+      "learning_rate": 0.0015,
+      "loss": 2.7997,
+      "step": 11900
+    },
+    {
+      "epoch": 0.07397550898872222,
+      "grad_norm": 0.13974526524543762,
+      "learning_rate": 0.0015,
+      "loss": 2.7975,
+      "step": 11925
+    },
+    {
+      "epoch": 0.07413059391322688,
+      "grad_norm": 0.12611718475818634,
+      "learning_rate": 0.0015,
+      "loss": 2.792,
+      "step": 11950
+    },
+    {
+      "epoch": 0.07428567883773154,
+      "grad_norm": 0.15177124738693237,
+      "learning_rate": 0.0015,
+      "loss": 2.7904,
+      "step": 11975
+    },
+    {
+      "epoch": 0.0744407637622362,
+      "grad_norm": 0.1411113739013672,
+      "learning_rate": 0.0015,
+      "loss": 2.7677,
+      "step": 12000
+    },
+    {
+      "epoch": 0.0744407637622362,
+      "eval_loss": 4.5571770668029785,
+      "perplexity": 95.31403350830078,
+      "step": 12000
+    },
+    {
+      "epoch": 0.07459584868674086,
+      "grad_norm": 0.08981940150260925,
+      "learning_rate": 0.0015,
+      "loss": 2.7765,
+      "step": 12025
+    },
+    {
+      "epoch": 0.07475093361124552,
+      "grad_norm": 0.09796686470508575,
+      "learning_rate": 0.0015,
+      "loss": 2.7503,
+      "step": 12050
+    },
+    {
+      "epoch": 0.07490601853575017,
+      "grad_norm": 0.1125386580824852,
+      "learning_rate": 0.0015,
+      "loss": 2.7263,
+      "step": 12075
+    },
+    {
+      "epoch": 0.07506110346025484,
+      "grad_norm": 0.11394508183002472,
+      "learning_rate": 0.0015,
+      "loss": 2.7855,
+      "step": 12100
+    },
+    {
+      "epoch": 0.07521618838475949,
+      "grad_norm": 0.11744117736816406,
+      "learning_rate": 0.0015,
+      "loss": 2.7698,
+      "step": 12125
+    },
+    {
+      "epoch": 0.07537127330926416,
+      "grad_norm": 0.17264704406261444,
+      "learning_rate": 0.0015,
+      "loss": 2.7592,
+      "step": 12150
+    },
+    {
+      "epoch": 0.07552635823376881,
+      "grad_norm": 0.10691671818494797,
+      "learning_rate": 0.0015,
+      "loss": 2.7519,
+      "step": 12175
+    },
+    {
+      "epoch": 0.07568144315827346,
+      "grad_norm": 0.1205432191491127,
+      "learning_rate": 0.0015,
+      "loss": 2.7676,
+      "step": 12200
+    },
+    {
+      "epoch": 0.07568144315827346,
+      "eval_loss": 4.544521808624268,
+      "perplexity": 94.11540985107422,
+      "step": 12200
+    },
+    {
+      "epoch": 0.07583652808277813,
+      "grad_norm": 0.1253867894411087,
+      "learning_rate": 0.0015,
+      "loss": 2.7698,
+      "step": 12225
+    },
+    {
+      "epoch": 0.07599161300728279,
+      "grad_norm": 0.1450471729040146,
+      "learning_rate": 0.0015,
+      "loss": 2.77,
+      "step": 12250
+    },
+    {
+      "epoch": 0.07614669793178745,
+      "grad_norm": 0.17055222392082214,
+      "learning_rate": 0.0015,
+      "loss": 2.7352,
+      "step": 12275
+    },
+    {
+      "epoch": 0.0763017828562921,
+      "grad_norm": 0.10687011480331421,
+      "learning_rate": 0.0015,
+      "loss": 2.7988,
+      "step": 12300
+    },
+    {
+      "epoch": 0.07645686778079676,
+      "grad_norm": 0.15520496666431427,
+      "learning_rate": 0.0015,
+      "loss": 2.7828,
+      "step": 12325
+    },
+    {
+      "epoch": 0.07661195270530143,
+      "grad_norm": 0.09279755502939224,
+      "learning_rate": 0.0015,
+      "loss": 2.7222,
+      "step": 12350
+    },
+    {
+      "epoch": 0.07676703762980608,
+      "grad_norm": 0.18024928867816925,
+      "learning_rate": 0.0015,
+      "loss": 2.7555,
+      "step": 12375
+    },
+    {
+      "epoch": 0.07692212255431075,
+      "grad_norm": 0.13292630016803741,
+      "learning_rate": 0.0015,
+      "loss": 2.733,
+      "step": 12400
+    },
+    {
+      "epoch": 0.07692212255431075,
+      "eval_loss": 4.538700103759766,
+      "perplexity": 93.569091796875,
+      "step": 12400
+    },
+    {
+      "epoch": 0.0770772074788154,
+      "grad_norm": 0.09353446960449219,
+      "learning_rate": 0.0015,
+      "loss": 2.7768,
+      "step": 12425
+    },
+    {
+      "epoch": 0.07723229240332005,
+      "grad_norm": 0.0946316123008728,
+      "learning_rate": 0.0015,
+      "loss": 2.7321,
+      "step": 12450
+    },
+    {
+      "epoch": 0.07738737732782472,
+      "grad_norm": 0.11109050363302231,
+      "learning_rate": 0.0015,
+      "loss": 2.7607,
+      "step": 12475
+    },
+    {
+      "epoch": 0.07754246225232937,
+      "grad_norm": 0.10057735443115234,
+      "learning_rate": 0.0015,
+      "loss": 2.7707,
+      "step": 12500
+    },
+    {
+      "epoch": 0.07769754717683404,
+      "grad_norm": 0.1466909795999527,
+      "learning_rate": 0.0015,
+      "loss": 2.7434,
+      "step": 12525
+    },
+    {
+      "epoch": 0.07785263210133869,
+      "grad_norm": 0.09831534326076508,
+      "learning_rate": 0.0015,
+      "loss": 2.7858,
+      "step": 12550
+    },
+    {
+      "epoch": 0.07800771702584335,
+      "grad_norm": 0.13202817738056183,
+      "learning_rate": 0.0015,
+      "loss": 2.7884,
+      "step": 12575
+    },
+    {
+      "epoch": 0.07816280195034801,
+      "grad_norm": 0.10797799378633499,
+      "learning_rate": 0.0015,
+      "loss": 2.7788,
+      "step": 12600
+    },
+    {
+      "epoch": 0.07816280195034801,
+      "eval_loss": 4.5452494621276855,
+      "perplexity": 94.18392181396484,
+      "step": 12600
+    },
+    {
+      "epoch": 0.07831788687485267,
+      "grad_norm": 0.10239394754171371,
+      "learning_rate": 0.0015,
+      "loss": 2.7803,
+      "step": 12625
+    },
+    {
+      "epoch": 0.07847297179935733,
+      "grad_norm": 0.10468672215938568,
+      "learning_rate": 0.0015,
+      "loss": 2.7449,
+      "step": 12650
+    },
+    {
+      "epoch": 0.07862805672386199,
+      "grad_norm": 0.13691146671772003,
+      "learning_rate": 0.0015,
+      "loss": 2.7837,
+      "step": 12675
+    },
+    {
+      "epoch": 0.07878314164836664,
+      "grad_norm": 0.16976097226142883,
+      "learning_rate": 0.0015,
+      "loss": 2.7557,
+      "step": 12700
+    },
+    {
+      "epoch": 0.0789382265728713,
+      "grad_norm": 0.09623986482620239,
+      "learning_rate": 0.0015,
+      "loss": 2.7576,
+      "step": 12725
+    },
+    {
+      "epoch": 0.07909331149737596,
+      "grad_norm": 0.11203131079673767,
+      "learning_rate": 0.0015,
+      "loss": 2.7846,
+      "step": 12750
+    },
+    {
+      "epoch": 0.07924839642188063,
+      "grad_norm": 0.12257611751556396,
+      "learning_rate": 0.0015,
+      "loss": 2.8015,
+      "step": 12775
+    },
+    {
+      "epoch": 0.07940348134638528,
+      "grad_norm": 0.08369628340005875,
+      "learning_rate": 0.0015,
+      "loss": 2.7616,
+      "step": 12800
+    },
+    {
+      "epoch": 0.07940348134638528,
+      "eval_loss": 4.548933506011963,
+      "perplexity": 94.53153991699219,
+      "step": 12800
+    },
+    {
+      "epoch": 0.07955856627088993,
+      "grad_norm": 0.12149519473314285,
+      "learning_rate": 0.0015,
+      "loss": 2.7651,
+      "step": 12825
+    },
+    {
+      "epoch": 0.0797136511953946,
+      "grad_norm": 0.09911686927080154,
+      "learning_rate": 0.0015,
+      "loss": 2.7964,
+      "step": 12850
+    },
+    {
+      "epoch": 0.07986873611989925,
+      "grad_norm": 0.09883631020784378,
+      "learning_rate": 0.0015,
+      "loss": 2.7461,
+      "step": 12875
+    },
+    {
+      "epoch": 0.08002382104440392,
+      "grad_norm": 0.08828576654195786,
+      "learning_rate": 0.0015,
+      "loss": 2.7735,
+      "step": 12900
+    },
+    {
+      "epoch": 0.08017890596890857,
+      "grad_norm": 0.18119321763515472,
+      "learning_rate": 0.0015,
+      "loss": 2.7863,
+      "step": 12925
+    },
+    {
+      "epoch": 0.08033399089341323,
+      "grad_norm": 0.09123501181602478,
+      "learning_rate": 0.0015,
+      "loss": 2.7559,
+      "step": 12950
+    },
+    {
+      "epoch": 0.0804890758179179,
+      "grad_norm": 0.18334759771823883,
+      "learning_rate": 0.0015,
+      "loss": 2.7357,
+      "step": 12975
+    },
+    {
+      "epoch": 0.08064416074242255,
+      "grad_norm": 0.08934136480093002,
+      "learning_rate": 0.0015,
+      "loss": 2.8003,
+      "step": 13000
+    },
+    {
+      "epoch": 0.08064416074242255,
+      "eval_loss": 4.537932395935059,
+      "perplexity": 93.49728393554688,
+      "step": 13000
+    },
+    {
+      "epoch": 0.08079924566692721,
+      "grad_norm": 0.117793008685112,
+      "learning_rate": 0.0015,
+      "loss": 2.738,
+      "step": 13025
+    },
+    {
+      "epoch": 0.08095433059143187,
+      "grad_norm": 0.1012151837348938,
+      "learning_rate": 0.0015,
+      "loss": 2.767,
+      "step": 13050
+    },
+    {
+      "epoch": 0.08110941551593653,
+      "grad_norm": 0.1099851131439209,
+      "learning_rate": 0.0015,
+      "loss": 2.7899,
+      "step": 13075
+    },
+    {
+      "epoch": 0.08126450044044119,
+      "grad_norm": 0.105575330555439,
+      "learning_rate": 0.0015,
+      "loss": 2.7857,
+      "step": 13100
+    },
+    {
+      "epoch": 0.08141958536494584,
+      "grad_norm": 0.11926279962062836,
+      "learning_rate": 0.0015,
+      "loss": 2.7821,
+      "step": 13125
+    },
+    {
+      "epoch": 0.08157467028945051,
+      "grad_norm": 0.1669924259185791,
+      "learning_rate": 0.0015,
+      "loss": 2.7673,
+      "step": 13150
+    },
+    {
+      "epoch": 0.08172975521395516,
+      "grad_norm": 0.11445988714694977,
+      "learning_rate": 0.0015,
+      "loss": 2.8081,
+      "step": 13175
+    },
+    {
+      "epoch": 0.08188484013845983,
+      "grad_norm": 0.09700124710798264,
+      "learning_rate": 0.0015,
+      "loss": 2.7841,
+      "step": 13200
+    },
+    {
+      "epoch": 0.08188484013845983,
+      "eval_loss": 4.540359973907471,
+      "perplexity": 93.72453308105469,
+      "step": 13200
+    },
+    {
+      "epoch": 0.08203992506296448,
+      "grad_norm": 0.11112058907747269,
+      "learning_rate": 0.0015,
+      "loss": 2.7471,
+      "step": 13225
+    },
+    {
+      "epoch": 0.08219500998746913,
+      "grad_norm": 0.17890195548534393,
+      "learning_rate": 0.0015,
+      "loss": 2.7898,
+      "step": 13250
+    },
+    {
+      "epoch": 0.0823500949119738,
+      "grad_norm": 0.12197751551866531,
+      "learning_rate": 0.0015,
+      "loss": 2.7328,
+      "step": 13275
+    },
+    {
+      "epoch": 0.08250517983647845,
+      "grad_norm": 0.11677111685276031,
+      "learning_rate": 0.0015,
+      "loss": 2.7849,
+      "step": 13300
+    },
+    {
+      "epoch": 0.08266026476098312,
+      "grad_norm": 0.15514017641544342,
+      "learning_rate": 0.0015,
+      "loss": 2.7561,
+      "step": 13325
+    },
+    {
+      "epoch": 0.08281534968548777,
+      "grad_norm": 0.10389192402362823,
+      "learning_rate": 0.0015,
+      "loss": 2.7611,
+      "step": 13350
+    },
+    {
+      "epoch": 0.08297043460999243,
+      "grad_norm": 0.10176412016153336,
+      "learning_rate": 0.0015,
+      "loss": 2.7793,
+      "step": 13375
+    },
+    {
+      "epoch": 0.0831255195344971,
+      "grad_norm": 0.1043052077293396,
+      "learning_rate": 0.0015,
+      "loss": 2.7375,
+      "step": 13400
+    },
+    {
+      "epoch": 0.0831255195344971,
+      "eval_loss": 4.5388336181640625,
+      "perplexity": 93.58158111572266,
+      "step": 13400
+    },
+    {
+      "epoch": 0.08328060445900175,
+      "grad_norm": 0.08918718248605728,
+      "learning_rate": 0.0015,
+      "loss": 2.7465,
+      "step": 13425
+    },
+    {
+      "epoch": 0.08343568938350641,
+      "grad_norm": 0.10008233785629272,
+      "learning_rate": 0.0015,
+      "loss": 2.7776,
+      "step": 13450
+    },
+    {
+      "epoch": 0.08359077430801107,
+      "grad_norm": 0.10228800773620605,
+      "learning_rate": 0.0015,
+      "loss": 2.756,
+      "step": 13475
+    },
+    {
+      "epoch": 0.08374585923251572,
+      "grad_norm": 0.0868915542960167,
+      "learning_rate": 0.0015,
+      "loss": 2.7556,
+      "step": 13500
+    },
+    {
+      "epoch": 0.08390094415702039,
+      "grad_norm": 0.11076166480779648,
+      "learning_rate": 0.0015,
+      "loss": 2.6975,
+      "step": 13525
+    },
+    {
+      "epoch": 0.08405602908152504,
+      "grad_norm": 0.13617128133773804,
+      "learning_rate": 0.0015,
+      "loss": 2.7643,
+      "step": 13550
+    },
+    {
+      "epoch": 0.08421111400602971,
+      "grad_norm": 0.15346932411193848,
+      "learning_rate": 0.0015,
+      "loss": 2.7966,
+      "step": 13575
+    },
+    {
+      "epoch": 0.08436619893053436,
+      "grad_norm": 0.17080894112586975,
+      "learning_rate": 0.0015,
+      "loss": 2.7636,
+      "step": 13600
+    },
+    {
+      "epoch": 0.08436619893053436,
+      "eval_loss": 4.513378620147705,
+      "perplexity": 91.22953033447266,
+      "step": 13600
+    },
+    {
+      "epoch": 0.08452128385503901,
+      "grad_norm": 0.11548548936843872,
+      "learning_rate": 0.0015,
+      "loss": 2.7729,
+      "step": 13625
+    },
+    {
+      "epoch": 0.08467636877954368,
+      "grad_norm": 0.14650912582874298,
+      "learning_rate": 0.0015,
+      "loss": 2.7063,
+      "step": 13650
+    },
+    {
+      "epoch": 0.08483145370404833,
+      "grad_norm": 0.09750749915838242,
+      "learning_rate": 0.0015,
+      "loss": 2.7648,
+      "step": 13675
+    },
+    {
+      "epoch": 0.084986538628553,
+      "grad_norm": 0.18051239848136902,
+      "learning_rate": 0.0015,
+      "loss": 2.754,
+      "step": 13700
+    },
+    {
+      "epoch": 0.08514162355305765,
+      "grad_norm": 0.21637938916683197,
+      "learning_rate": 0.0015,
+      "loss": 2.7529,
+      "step": 13725
+    },
+    {
+      "epoch": 0.08529670847756231,
+      "grad_norm": 0.10037226974964142,
+      "learning_rate": 0.0015,
+      "loss": 2.7638,
+      "step": 13750
+    },
+    {
+      "epoch": 0.08545179340206698,
+      "grad_norm": 0.1033267229795456,
+      "learning_rate": 0.0015,
+      "loss": 2.7713,
+      "step": 13775
+    },
+    {
+      "epoch": 0.08560687832657163,
+      "grad_norm": 0.09179462492465973,
+      "learning_rate": 0.0015,
+      "loss": 2.8278,
+      "step": 13800
+    },
+    {
+      "epoch": 0.08560687832657163,
+      "eval_loss": 4.508410453796387,
+      "perplexity": 90.77741241455078,
+      "step": 13800
+    },
+    {
+      "epoch": 0.0857619632510763,
+      "grad_norm": 0.09874552488327026,
+      "learning_rate": 0.0015,
+      "loss": 2.7544,
+      "step": 13825
+    },
+    {
+      "epoch": 0.08591704817558095,
+      "grad_norm": 0.17807777225971222,
+      "learning_rate": 0.0015,
+      "loss": 2.7401,
+      "step": 13850
+    },
+    {
+      "epoch": 0.0860721331000856,
+      "grad_norm": 0.14388497173786163,
+      "learning_rate": 0.0015,
+      "loss": 2.7879,
+      "step": 13875
+    },
+    {
+      "epoch": 0.08622721802459027,
+      "grad_norm": 0.13081450760364532,
+      "learning_rate": 0.0015,
+      "loss": 2.7162,
+      "step": 13900
+    },
+    {
+      "epoch": 0.08638230294909492,
+      "grad_norm": 0.15077342092990875,
+      "learning_rate": 0.0015,
+      "loss": 2.757,
+      "step": 13925
+    },
+    {
+      "epoch": 0.08653738787359959,
+      "grad_norm": 0.11368410289287567,
+      "learning_rate": 0.0015,
+      "loss": 2.7546,
+      "step": 13950
+    },
+    {
+      "epoch": 0.08669247279810424,
+      "grad_norm": 0.16447153687477112,
+      "learning_rate": 0.0015,
+      "loss": 2.7371,
+      "step": 13975
+    },
+    {
+      "epoch": 0.0868475577226089,
+      "grad_norm": 0.20563559234142303,
+      "learning_rate": 0.0015,
+      "loss": 2.7474,
+      "step": 14000
+    },
+    {
+      "epoch": 0.0868475577226089,
+      "eval_loss": 4.525671005249023,
+      "perplexity": 92.35787963867188,
+      "step": 14000
+    },
+    {
+      "epoch": 0.08700264264711356,
+      "grad_norm": 0.10695035755634308,
+      "learning_rate": 0.0015,
+      "loss": 2.7565,
+      "step": 14025
+    },
+    {
+      "epoch": 0.08715772757161822,
+      "grad_norm": 0.12368099391460419,
+      "learning_rate": 0.0015,
+      "loss": 2.784,
+      "step": 14050
+    },
+    {
+      "epoch": 0.08731281249612288,
+      "grad_norm": 0.11491699516773224,
+      "learning_rate": 0.0015,
+      "loss": 2.7477,
+      "step": 14075
+    },
+    {
+      "epoch": 0.08746789742062754,
+      "grad_norm": 0.10570378601551056,
+      "learning_rate": 0.0015,
+      "loss": 2.7575,
+      "step": 14100
+    },
+    {
+      "epoch": 0.08762298234513219,
+      "grad_norm": 0.09137633442878723,
+      "learning_rate": 0.0015,
+      "loss": 2.7517,
+      "step": 14125
+    },
+    {
+      "epoch": 0.08777806726963686,
+      "grad_norm": 0.09999803453683853,
+      "learning_rate": 0.0015,
+      "loss": 2.7446,
+      "step": 14150
+    },
+    {
+      "epoch": 0.08793315219414151,
+      "grad_norm": 0.15709616243839264,
+      "learning_rate": 0.0015,
+      "loss": 2.7606,
+      "step": 14175
+    },
+    {
+      "epoch": 0.08808823711864618,
+      "grad_norm": 0.10327859222888947,
+      "learning_rate": 0.0015,
+      "loss": 2.7441,
+      "step": 14200
+    },
+    {
+      "epoch": 0.08808823711864618,
+      "eval_loss": 4.521189212799072,
+      "perplexity": 91.94487762451172,
+      "step": 14200
+    },
+    {
+      "epoch": 0.08824332204315083,
+      "grad_norm": 0.1964125633239746,
+      "learning_rate": 0.0015,
+      "loss": 2.7109,
+      "step": 14225
+    },
+    {
+      "epoch": 0.08839840696765548,
+      "grad_norm": 0.12792247533798218,
+      "learning_rate": 0.0015,
+      "loss": 2.7401,
+      "step": 14250
+    },
+    {
+      "epoch": 0.08855349189216015,
+      "grad_norm": 0.17532923817634583,
+      "learning_rate": 0.0015,
+      "loss": 2.7609,
+      "step": 14275
+    },
+    {
+      "epoch": 0.0887085768166648,
+      "grad_norm": 0.096143439412117,
+      "learning_rate": 0.0015,
+      "loss": 2.7749,
+      "step": 14300
+    },
+    {
+      "epoch": 0.08886366174116947,
+      "grad_norm": 0.12778601050376892,
+      "learning_rate": 0.0015,
+      "loss": 2.6981,
+      "step": 14325
+    },
+    {
+      "epoch": 0.08901874666567412,
+      "grad_norm": 0.1130848377943039,
+      "learning_rate": 0.0015,
+      "loss": 2.7255,
+      "step": 14350
+    },
+    {
+      "epoch": 0.08917383159017878,
+      "grad_norm": 0.0818464607000351,
+      "learning_rate": 0.0015,
+      "loss": 2.7223,
+      "step": 14375
+    },
+    {
+      "epoch": 0.08932891651468344,
+      "grad_norm": 0.10516222566366196,
+      "learning_rate": 0.0015,
+      "loss": 2.7672,
+      "step": 14400
+    },
+    {
+      "epoch": 0.08932891651468344,
+      "eval_loss": 4.524067401885986,
+      "perplexity": 92.20989227294922,
+      "step": 14400
+    },
+    {
+      "epoch": 0.0894840014391881,
+      "grad_norm": 0.08912840485572815,
+      "learning_rate": 0.0015,
+      "loss": 2.7349,
+      "step": 14425
+    },
+    {
+      "epoch": 0.08963908636369276,
+      "grad_norm": 0.11931388080120087,
+      "learning_rate": 0.0015,
+      "loss": 2.7326,
+      "step": 14450
+    },
+    {
+      "epoch": 0.08979417128819742,
+      "grad_norm": 0.12271756678819656,
+      "learning_rate": 0.0015,
+      "loss": 2.7327,
+      "step": 14475
+    },
+    {
+      "epoch": 0.08994925621270207,
+      "grad_norm": 0.1567191183567047,
+      "learning_rate": 0.0015,
+      "loss": 2.7573,
+      "step": 14500
+    },
+    {
+      "epoch": 0.09010434113720674,
+      "grad_norm": 0.1841791719198227,
+      "learning_rate": 0.0015,
+      "loss": 2.7582,
+      "step": 14525
+    },
+    {
+      "epoch": 0.09025942606171139,
+      "grad_norm": 0.12743189930915833,
+      "learning_rate": 0.0015,
+      "loss": 2.8061,
+      "step": 14550
+    },
+    {
+      "epoch": 0.09041451098621606,
+      "grad_norm": 0.11932828277349472,
+      "learning_rate": 0.0015,
+      "loss": 2.7447,
+      "step": 14575
+    },
+    {
+      "epoch": 0.09056959591072071,
+      "grad_norm": 0.18284690380096436,
+      "learning_rate": 0.0015,
+      "loss": 2.7436,
+      "step": 14600
+    },
+    {
+      "epoch": 0.09056959591072071,
+      "eval_loss": 4.515897750854492,
+      "perplexity": 91.45964050292969,
+      "step": 14600
+    },
+    {
+      "epoch": 0.09072468083522536,
+      "grad_norm": 0.17987670004367828,
+      "learning_rate": 0.0015,
+      "loss": 2.7831,
+      "step": 14625
+    },
+    {
+      "epoch": 0.09087976575973003,
+      "grad_norm": 0.10992395132780075,
+      "learning_rate": 0.0015,
+      "loss": 2.7516,
+      "step": 14650
+    },
+    {
+      "epoch": 0.09103485068423468,
+      "grad_norm": 0.09343726187944412,
+      "learning_rate": 0.0015,
+      "loss": 2.7475,
+      "step": 14675
+    },
+    {
+      "epoch": 0.09118993560873935,
+      "grad_norm": 0.10370751470327377,
+      "learning_rate": 0.0015,
+      "loss": 2.7518,
+      "step": 14700
+    },
+    {
+      "epoch": 0.091345020533244,
+      "grad_norm": 0.11190348863601685,
+      "learning_rate": 0.0015,
+      "loss": 2.7482,
+      "step": 14725
+    },
+    {
+      "epoch": 0.09150010545774866,
+      "grad_norm": 0.12450053542852402,
+      "learning_rate": 0.0015,
+      "loss": 2.7726,
+      "step": 14750
+    },
+    {
+      "epoch": 0.09165519038225332,
+      "grad_norm": 0.11882703006267548,
+      "learning_rate": 0.0015,
+      "loss": 2.7318,
+      "step": 14775
+    },
+    {
+      "epoch": 0.09181027530675798,
+      "grad_norm": 0.1315181404352188,
+      "learning_rate": 0.0015,
+      "loss": 2.757,
+      "step": 14800
+    },
+    {
+      "epoch": 0.09181027530675798,
+      "eval_loss": 4.521557807922363,
+      "perplexity": 91.97877502441406,
+      "step": 14800
+    },
+    {
+      "epoch": 0.09196536023126264,
+      "grad_norm": 0.18574784696102142,
+      "learning_rate": 0.0015,
+      "loss": 2.7353,
+      "step": 14825
+    },
+    {
+      "epoch": 0.0921204451557673,
+      "grad_norm": 0.17665444314479828,
+      "learning_rate": 0.0015,
+      "loss": 2.7687,
+      "step": 14850
+    },
+    {
+      "epoch": 0.09227553008027195,
+      "grad_norm": 0.12507860362529755,
+      "learning_rate": 0.0015,
+      "loss": 2.7386,
+      "step": 14875
+    },
+    {
+      "epoch": 0.09243061500477662,
+      "grad_norm": 0.10472691059112549,
+      "learning_rate": 0.0015,
+      "loss": 2.7716,
+      "step": 14900
+    },
+    {
+      "epoch": 0.09258569992928127,
+      "grad_norm": 0.10282575339078903,
+      "learning_rate": 0.0015,
+      "loss": 2.7312,
+      "step": 14925
+    },
+    {
+      "epoch": 0.09274078485378594,
+      "grad_norm": 0.12706094980239868,
+      "learning_rate": 0.0015,
+      "loss": 2.7995,
+      "step": 14950
+    },
+    {
+      "epoch": 0.09289586977829059,
+      "grad_norm": 0.15283973515033722,
+      "learning_rate": 0.0015,
+      "loss": 2.7313,
+      "step": 14975
+    },
+    {
+      "epoch": 0.09305095470279524,
+      "grad_norm": 0.12476324290037155,
+      "learning_rate": 0.0015,
+      "loss": 2.7727,
+      "step": 15000
+    },
+    {
+      "epoch": 0.09305095470279524,
+      "eval_loss": 4.547565937042236,
+      "perplexity": 94.40234375,
+      "step": 15000
+    },
+    {
+      "epoch": 0.09320603962729991,
+      "grad_norm": 0.12369734048843384,
+      "learning_rate": 0.0015,
+      "loss": 2.7565,
+      "step": 15025
+    },
+    {
+      "epoch": 0.09336112455180456,
+      "grad_norm": 0.1322038471698761,
+      "learning_rate": 0.0015,
+      "loss": 2.7588,
+      "step": 15050
+    },
+    {
+      "epoch": 0.09351620947630923,
+      "grad_norm": 0.0926559790968895,
+      "learning_rate": 0.0015,
+      "loss": 2.7393,
+      "step": 15075
+    },
+    {
+      "epoch": 0.09367129440081388,
+      "grad_norm": 0.17404210567474365,
+      "learning_rate": 0.0015,
+      "loss": 2.723,
+      "step": 15100
+    },
+    {
+      "epoch": 0.09382637932531855,
+      "grad_norm": 0.10326647758483887,
+      "learning_rate": 0.0015,
+      "loss": 2.7853,
+      "step": 15125
+    },
+    {
+      "epoch": 0.0939814642498232,
+      "grad_norm": 0.13869203627109528,
+      "learning_rate": 0.0015,
+      "loss": 2.7535,
+      "step": 15150
+    },
+    {
+      "epoch": 0.09413654917432786,
+      "grad_norm": 0.14325955510139465,
+      "learning_rate": 0.0015,
+      "loss": 2.7597,
+      "step": 15175
+    },
+    {
+      "epoch": 0.09429163409883252,
+      "grad_norm": 0.11783768236637115,
+      "learning_rate": 0.0015,
+      "loss": 2.7524,
+      "step": 15200
+    },
+    {
+      "epoch": 0.09429163409883252,
+      "eval_loss": 4.5251593589782715,
+      "perplexity": 92.31063842773438,
+      "step": 15200
+    },
+    {
+      "epoch": 0.09444671902333718,
+      "grad_norm": 0.12261676043272018,
+      "learning_rate": 0.0015,
+      "loss": 2.7279,
+      "step": 15225
+    },
+    {
+      "epoch": 0.09460180394784184,
+      "grad_norm": 0.09966279566287994,
+      "learning_rate": 0.0015,
+      "loss": 2.8119,
+      "step": 15250
+    },
+    {
+      "epoch": 0.0947568888723465,
+      "grad_norm": 0.1052974984049797,
+      "learning_rate": 0.0015,
+      "loss": 2.7392,
+      "step": 15275
+    },
+    {
+      "epoch": 0.09491197379685115,
+      "grad_norm": 0.11074663698673248,
+      "learning_rate": 0.0015,
+      "loss": 2.7319,
+      "step": 15300
+    },
+    {
+      "epoch": 0.09506705872135582,
+      "grad_norm": 0.09762706607580185,
+      "learning_rate": 0.0015,
+      "loss": 2.7806,
+      "step": 15325
+    },
+    {
+      "epoch": 0.09522214364586047,
+      "grad_norm": 0.08552476018667221,
+      "learning_rate": 0.0015,
+      "loss": 2.7351,
+      "step": 15350
+    },
+    {
+      "epoch": 0.09537722857036514,
+      "grad_norm": 0.13211695849895477,
+      "learning_rate": 0.0015,
+      "loss": 2.7667,
+      "step": 15375
+    },
+    {
+      "epoch": 0.09553231349486979,
+      "grad_norm": 0.12074939906597137,
+      "learning_rate": 0.0015,
+      "loss": 2.7614,
+      "step": 15400
+    },
+    {
+      "epoch": 0.09553231349486979,
+      "eval_loss": 4.53213357925415,
+      "perplexity": 92.95668029785156,
+      "step": 15400
+    },
+    {
+      "epoch": 0.09568739841937444,
+      "grad_norm": 0.11755666136741638,
+      "learning_rate": 0.0015,
+      "loss": 2.7101,
+      "step": 15425
+    },
+    {
+      "epoch": 0.09584248334387911,
+      "grad_norm": 0.10476246476173401,
+      "learning_rate": 0.0015,
+      "loss": 2.7391,
+      "step": 15450
+    },
+    {
+      "epoch": 0.09599756826838376,
+      "grad_norm": 0.10921350121498108,
+      "learning_rate": 0.0015,
+      "loss": 2.7423,
+      "step": 15475
+    },
+    {
+      "epoch": 0.09615265319288843,
+      "grad_norm": 0.11517275124788284,
+      "learning_rate": 0.0015,
+      "loss": 2.7374,
+      "step": 15500
+    },
+    {
+      "epoch": 0.09630773811739309,
+      "grad_norm": 0.10500945895910263,
+      "learning_rate": 0.0015,
+      "loss": 2.73,
+      "step": 15525
+    },
+    {
+      "epoch": 0.09646282304189774,
+      "grad_norm": 0.0962584912776947,
+      "learning_rate": 0.0015,
+      "loss": 2.7597,
+      "step": 15550
+    },
+    {
+      "epoch": 0.0966179079664024,
+      "grad_norm": 0.1273050308227539,
+      "learning_rate": 0.0015,
+      "loss": 2.7306,
+      "step": 15575
+    },
+    {
+      "epoch": 0.09677299289090706,
+      "grad_norm": 0.11249135434627533,
+      "learning_rate": 0.0015,
+      "loss": 2.7859,
+      "step": 15600
+    },
+    {
+      "epoch": 0.09677299289090706,
+      "eval_loss": 4.537318706512451,
+      "perplexity": 93.43992614746094,
+      "step": 15600
+    },
+    {
+      "epoch": 0.09692807781541173,
+      "grad_norm": 0.19111056625843048,
+      "learning_rate": 0.0015,
+      "loss": 2.7386,
+      "step": 15625
+    },
+    {
+      "epoch": 0.09708316273991638,
+      "grad_norm": 0.10486472398042679,
+      "learning_rate": 0.0015,
+      "loss": 2.7462,
+      "step": 15650
+    },
+    {
+      "epoch": 0.09723824766442103,
+      "grad_norm": 0.1453208327293396,
+      "learning_rate": 0.0015,
+      "loss": 2.762,
+      "step": 15675
+    },
+    {
+      "epoch": 0.0973933325889257,
+      "grad_norm": 0.08459452539682388,
+      "learning_rate": 0.0015,
+      "loss": 2.7353,
+      "step": 15700
+    },
+    {
+      "epoch": 0.09754841751343035,
+      "grad_norm": 0.11150529980659485,
+      "learning_rate": 0.0015,
+      "loss": 2.7617,
+      "step": 15725
+    },
+    {
+      "epoch": 0.09770350243793502,
+      "grad_norm": 0.11301703006029129,
+      "learning_rate": 0.0015,
+      "loss": 2.7623,
+      "step": 15750
+    },
+    {
+      "epoch": 0.09785858736243967,
+      "grad_norm": 0.16564789414405823,
+      "learning_rate": 0.0015,
+      "loss": 2.7315,
+      "step": 15775
+    },
+    {
+      "epoch": 0.09801367228694433,
+      "grad_norm": 0.08968822658061981,
+      "learning_rate": 0.0015,
+      "loss": 2.7842,
+      "step": 15800
+    },
+    {
+      "epoch": 0.09801367228694433,
+      "eval_loss": 4.528219223022461,
+      "perplexity": 92.5935287475586,
+      "step": 15800
+    },
+    {
+      "epoch": 0.09816875721144899,
+      "grad_norm": 0.1233256533741951,
+      "learning_rate": 0.0015,
+      "loss": 2.7584,
+      "step": 15825
+    },
+    {
+      "epoch": 0.09832384213595365,
+      "grad_norm": 0.18926863372325897,
+      "learning_rate": 0.0015,
+      "loss": 2.7651,
+      "step": 15850
+    },
+    {
+      "epoch": 0.09847892706045831,
+      "grad_norm": 0.0912550836801529,
+      "learning_rate": 0.0015,
+      "loss": 2.7551,
+      "step": 15875
+    },
+    {
+      "epoch": 0.09863401198496297,
+      "grad_norm": 0.1443813592195511,
+      "learning_rate": 0.0015,
+      "loss": 2.7378,
+      "step": 15900
+    },
+    {
+      "epoch": 0.09878909690946762,
+      "grad_norm": 0.11620072275400162,
+      "learning_rate": 0.0015,
+      "loss": 2.7706,
+      "step": 15925
+    },
+    {
+      "epoch": 0.09894418183397229,
+      "grad_norm": 0.10275860130786896,
+      "learning_rate": 0.0015,
+      "loss": 2.7502,
+      "step": 15950
+    },
+    {
+      "epoch": 0.09909926675847694,
+      "grad_norm": 0.1417694240808487,
+      "learning_rate": 0.0015,
+      "loss": 2.706,
+      "step": 15975
+    },
+    {
+      "epoch": 0.0992543516829816,
+      "grad_norm": 0.1121877133846283,
+      "learning_rate": 0.0015,
+      "loss": 2.7537,
+      "step": 16000
+    },
+    {
+      "epoch": 0.0992543516829816,
+      "eval_loss": 4.520648956298828,
+      "perplexity": 91.89521789550781,
+      "step": 16000
+    },
+    {
+      "epoch": 0.09940943660748626,
+      "grad_norm": 0.10022582858800888,
+      "learning_rate": 0.0015,
+      "loss": 2.7213,
+      "step": 16025
+    },
+    {
+      "epoch": 0.09956452153199091,
+      "grad_norm": 0.09722616523504257,
+      "learning_rate": 0.0015,
+      "loss": 2.7437,
+      "step": 16050
+    },
+    {
+      "epoch": 0.09971960645649558,
+      "grad_norm": 0.11053729802370071,
+      "learning_rate": 0.0015,
+      "loss": 2.7495,
+      "step": 16075
+    },
+    {
+      "epoch": 0.09987469138100023,
+      "grad_norm": 0.10231011360883713,
+      "learning_rate": 0.0015,
+      "loss": 2.7505,
+      "step": 16100
+    },
+    {
+      "epoch": 0.1000297763055049,
+      "grad_norm": 0.135975643992424,
+      "learning_rate": 0.0015,
+      "loss": 2.7487,
+      "step": 16125
+    },
+    {
+      "epoch": 0.10018486123000955,
+      "grad_norm": 0.11350739002227783,
+      "learning_rate": 0.0015,
+      "loss": 2.7484,
+      "step": 16150
+    },
+    {
+      "epoch": 0.1003399461545142,
+      "grad_norm": 0.10639143735170364,
+      "learning_rate": 0.0015,
+      "loss": 2.7429,
+      "step": 16175
+    },
+    {
+      "epoch": 0.10049503107901887,
+      "grad_norm": 0.09016221761703491,
+      "learning_rate": 0.0015,
+      "loss": 2.7891,
+      "step": 16200
+    },
+    {
+      "epoch": 0.10049503107901887,
+      "eval_loss": 4.5112504959106445,
+      "perplexity": 91.03558349609375,
+      "step": 16200
+    },
+    {
+      "epoch": 0.10065011600352353,
+      "grad_norm": 0.11324500292539597,
+      "learning_rate": 0.0015,
+      "loss": 2.7678,
+      "step": 16225
+    },
+    {
+      "epoch": 0.1008052009280282,
+      "grad_norm": 0.13268886506557465,
+      "learning_rate": 0.0015,
+      "loss": 2.723,
+      "step": 16250
+    },
+    {
+      "epoch": 0.10096028585253285,
+      "grad_norm": 0.11448831856250763,
+      "learning_rate": 0.0015,
+      "loss": 2.7328,
+      "step": 16275
+    },
+    {
+      "epoch": 0.1011153707770375,
+      "grad_norm": 0.10799309611320496,
+      "learning_rate": 0.0015,
+      "loss": 2.7478,
+      "step": 16300
+    },
+    {
+      "epoch": 0.10127045570154217,
+      "grad_norm": 0.19559204578399658,
+      "learning_rate": 0.0015,
+      "loss": 2.7606,
+      "step": 16325
+    },
+    {
+      "epoch": 0.10142554062604682,
+      "grad_norm": 0.14151975512504578,
+      "learning_rate": 0.0015,
+      "loss": 2.7279,
+      "step": 16350
+    },
+    {
+      "epoch": 0.10158062555055149,
+      "grad_norm": 0.10044725239276886,
+      "learning_rate": 0.0015,
+      "loss": 2.7609,
+      "step": 16375
+    },
+    {
+      "epoch": 0.10173571047505614,
+      "grad_norm": 0.10686340183019638,
+      "learning_rate": 0.0015,
+      "loss": 2.7295,
+      "step": 16400
+    },
+    {
+      "epoch": 0.10173571047505614,
+      "eval_loss": 4.521287441253662,
+      "perplexity": 91.95391082763672,
+      "step": 16400
+    },
+    {
+      "epoch": 0.1018907953995608,
+      "grad_norm": 0.1561044305562973,
+      "learning_rate": 0.0015,
+      "loss": 2.7769,
+      "step": 16425
+    },
+    {
+      "epoch": 0.10204588032406546,
+      "grad_norm": 0.12182148545980453,
+      "learning_rate": 0.0015,
+      "loss": 2.757,
+      "step": 16450
+    },
+    {
+      "epoch": 0.10220096524857011,
+      "grad_norm": 0.20665724575519562,
+      "learning_rate": 0.0015,
+      "loss": 2.7349,
+      "step": 16475
+    },
+    {
+      "epoch": 0.10235605017307478,
+      "grad_norm": 0.09160878509283066,
+      "learning_rate": 0.0015,
+      "loss": 2.7393,
+      "step": 16500
+    },
+    {
+      "epoch": 0.10251113509757943,
+      "grad_norm": 0.16651533544063568,
+      "learning_rate": 0.0015,
+      "loss": 2.7441,
+      "step": 16525
+    },
+    {
+      "epoch": 0.10266622002208409,
+      "grad_norm": 0.09358719736337662,
+      "learning_rate": 0.0015,
+      "loss": 2.7297,
+      "step": 16550
+    },
+    {
+      "epoch": 0.10282130494658875,
+      "grad_norm": 0.20277003943920135,
+      "learning_rate": 0.0015,
+      "loss": 2.7506,
+      "step": 16575
+    },
+    {
+      "epoch": 0.10297638987109341,
+      "grad_norm": 0.13382607698440552,
+      "learning_rate": 0.0015,
+      "loss": 2.7924,
+      "step": 16600
+    },
+    {
+      "epoch": 0.10297638987109341,
+      "eval_loss": 4.525242328643799,
+      "perplexity": 92.31829833984375,
+      "step": 16600
+    },
+    {
+      "epoch": 0.10313147479559807,
+      "grad_norm": 0.09686290472745895,
+      "learning_rate": 0.0015,
+      "loss": 2.7417,
+      "step": 16625
+    },
+    {
+      "epoch": 0.10328655972010273,
+      "grad_norm": 0.11446567624807358,
+      "learning_rate": 0.0015,
+      "loss": 2.7582,
+      "step": 16650
+    },
+    {
+      "epoch": 0.10344164464460738,
+      "grad_norm": 0.15948985517024994,
+      "learning_rate": 0.0015,
+      "loss": 2.7254,
+      "step": 16675
+    },
+    {
+      "epoch": 0.10359672956911205,
+      "grad_norm": 0.1254827231168747,
+      "learning_rate": 0.0015,
+      "loss": 2.7515,
+      "step": 16700
+    },
+    {
+      "epoch": 0.1037518144936167,
+      "grad_norm": 0.11295375972986221,
+      "learning_rate": 0.0015,
+      "loss": 2.7058,
+      "step": 16725
+    },
+    {
+      "epoch": 0.10390689941812137,
+      "grad_norm": 0.10659389197826385,
+      "learning_rate": 0.0015,
+      "loss": 2.7281,
+      "step": 16750
+    },
+    {
+      "epoch": 0.10406198434262602,
+      "grad_norm": 0.1045156791806221,
+      "learning_rate": 0.0015,
+      "loss": 2.7131,
+      "step": 16775
+    },
+    {
+      "epoch": 0.10421706926713067,
+      "grad_norm": 0.13835974037647247,
+      "learning_rate": 0.0015,
+      "loss": 2.744,
+      "step": 16800
+    },
+    {
+      "epoch": 0.10421706926713067,
+      "eval_loss": 4.507747650146484,
+      "perplexity": 90.7172622680664,
+      "step": 16800
+    },
+    {
+      "epoch": 0.10437215419163534,
+      "grad_norm": 0.19872727990150452,
+      "learning_rate": 0.0015,
+      "loss": 2.7642,
+      "step": 16825
+    },
+    {
+      "epoch": 0.10452723911614,
+      "grad_norm": 0.13754956424236298,
+      "learning_rate": 0.0015,
+      "loss": 2.7652,
+      "step": 16850
+    },
+    {
+      "epoch": 0.10468232404064466,
+      "grad_norm": 0.1451335996389389,
+      "learning_rate": 0.0015,
+      "loss": 2.7561,
+      "step": 16875
+    },
+    {
+      "epoch": 0.10483740896514931,
+      "grad_norm": 0.16750144958496094,
+      "learning_rate": 0.0015,
+      "loss": 2.7206,
+      "step": 16900
+    },
+    {
+      "epoch": 0.10499249388965397,
+      "grad_norm": 0.12020619958639145,
+      "learning_rate": 0.0015,
+      "loss": 2.699,
+      "step": 16925
+    },
+    {
+      "epoch": 0.10514757881415863,
+      "grad_norm": 0.16792155802249908,
+      "learning_rate": 0.0015,
+      "loss": 2.8062,
+      "step": 16950
+    },
+    {
+      "epoch": 0.10530266373866329,
+      "grad_norm": 0.11066465824842453,
+      "learning_rate": 0.0015,
+      "loss": 2.6968,
+      "step": 16975
+    },
+    {
+      "epoch": 0.10545774866316796,
+      "grad_norm": 0.11885298788547516,
+      "learning_rate": 0.0015,
+      "loss": 2.7699,
+      "step": 17000
+    },
+    {
+      "epoch": 0.10545774866316796,
+      "eval_loss": 4.524214744567871,
+      "perplexity": 92.22348022460938,
+      "step": 17000
+    },
+    {
+      "epoch": 0.10561283358767261,
+      "grad_norm": 0.1298653483390808,
+      "learning_rate": 0.0015,
+      "loss": 2.7199,
+      "step": 17025
+    },
+    {
+      "epoch": 0.10576791851217726,
+      "grad_norm": 0.11387672275304794,
+      "learning_rate": 0.0015,
+      "loss": 2.7528,
+      "step": 17050
+    },
+    {
+      "epoch": 0.10592300343668193,
+      "grad_norm": 0.09852533042430878,
+      "learning_rate": 0.0015,
+      "loss": 2.7277,
+      "step": 17075
+    },
+    {
+      "epoch": 0.10607808836118658,
+      "grad_norm": 0.11046476662158966,
+      "learning_rate": 0.0015,
+      "loss": 2.722,
+      "step": 17100
+    },
+    {
+      "epoch": 0.10623317328569125,
+      "grad_norm": 0.11632421612739563,
+      "learning_rate": 0.0015,
+      "loss": 2.726,
+      "step": 17125
+    },
+    {
+      "epoch": 0.1063882582101959,
+      "grad_norm": 0.11760540306568146,
+      "learning_rate": 0.0015,
+      "loss": 2.7267,
+      "step": 17150
+    },
+    {
+      "epoch": 0.10654334313470057,
+      "grad_norm": 0.12264183163642883,
+      "learning_rate": 0.0015,
+      "loss": 2.8037,
+      "step": 17175
+    },
+    {
+      "epoch": 0.10669842805920522,
+      "grad_norm": 0.15346336364746094,
+      "learning_rate": 0.0015,
+      "loss": 2.7668,
+      "step": 17200
+    },
+    {
+      "epoch": 0.10669842805920522,
+      "eval_loss": 4.503612995147705,
+      "perplexity": 90.34294891357422,
+      "step": 17200
+    },
+    {
+      "epoch": 0.10685351298370988,
+      "grad_norm": 0.10642746090888977,
+      "learning_rate": 0.0015,
+      "loss": 2.7295,
+      "step": 17225
+    },
+    {
+      "epoch": 0.10700859790821454,
+      "grad_norm": 0.10965430736541748,
+      "learning_rate": 0.0015,
+      "loss": 2.7113,
+      "step": 17250
+    },
+    {
+      "epoch": 0.1071636828327192,
+      "grad_norm": 0.09912869334220886,
+      "learning_rate": 0.0015,
+      "loss": 2.7353,
+      "step": 17275
+    },
+    {
+      "epoch": 0.10731876775722386,
+      "grad_norm": 0.14111942052841187,
+      "learning_rate": 0.0015,
+      "loss": 2.7064,
+      "step": 17300
+    },
+    {
+      "epoch": 0.10747385268172852,
+      "grad_norm": 0.11583065241575241,
+      "learning_rate": 0.0015,
+      "loss": 2.722,
+      "step": 17325
+    },
+    {
+      "epoch": 0.10762893760623317,
+      "grad_norm": 0.09374859184026718,
+      "learning_rate": 0.0015,
+      "loss": 2.6964,
+      "step": 17350
+    },
+    {
+      "epoch": 0.10778402253073784,
+      "grad_norm": 0.11704573035240173,
+      "learning_rate": 0.0015,
+      "loss": 2.7518,
+      "step": 17375
+    },
+    {
+      "epoch": 0.10793910745524249,
+      "grad_norm": 0.13960668444633484,
+      "learning_rate": 0.0015,
+      "loss": 2.7373,
+      "step": 17400
+    },
+    {
+      "epoch": 0.10793910745524249,
+      "eval_loss": 4.514464378356934,
+      "perplexity": 91.3286361694336,
+      "step": 17400
+    },
+    {
+      "epoch": 0.10809419237974716,
+      "grad_norm": 0.1006089448928833,
+      "learning_rate": 0.0015,
+      "loss": 2.7199,
+      "step": 17425
+    },
+    {
+      "epoch": 0.10824927730425181,
+      "grad_norm": 0.14851173758506775,
+      "learning_rate": 0.0015,
+      "loss": 2.7202,
+      "step": 17450
+    },
+    {
+      "epoch": 0.10840436222875646,
+      "grad_norm": 0.11992091685533524,
+      "learning_rate": 0.0015,
+      "loss": 2.6932,
+      "step": 17475
+    },
+    {
+      "epoch": 0.10855944715326113,
+      "grad_norm": 0.12420158833265305,
+      "learning_rate": 0.0015,
+      "loss": 2.7395,
+      "step": 17500
+    },
+    {
+      "epoch": 0.10871453207776578,
+      "grad_norm": 0.09945713728666306,
+      "learning_rate": 0.0015,
+      "loss": 2.7323,
+      "step": 17525
+    },
+    {
+      "epoch": 0.10886961700227045,
+      "grad_norm": 0.13007710874080658,
+      "learning_rate": 0.0015,
+      "loss": 2.7438,
+      "step": 17550
+    },
+    {
+      "epoch": 0.1090247019267751,
+      "grad_norm": 0.10875315964221954,
+      "learning_rate": 0.0015,
+      "loss": 2.7656,
+      "step": 17575
+    },
+    {
+      "epoch": 0.10917978685127976,
+      "grad_norm": 0.1075393334031105,
+      "learning_rate": 0.0015,
+      "loss": 2.7174,
+      "step": 17600
+    },
+    {
+      "epoch": 0.10917978685127976,
+      "eval_loss": 4.4858293533325195,
+      "perplexity": 88.75052642822266,
+      "step": 17600
+    },
+    {
+      "epoch": 0.10933487177578442,
+      "grad_norm": 0.16400013864040375,
+      "learning_rate": 0.0015,
+      "loss": 2.7389,
+      "step": 17625
+    },
+    {
+      "epoch": 0.10948995670028908,
+      "grad_norm": 0.1368722766637802,
+      "learning_rate": 0.0015,
+      "loss": 2.7198,
+      "step": 17650
+    },
+    {
+      "epoch": 0.10964504162479374,
+      "grad_norm": 0.23104597628116608,
+      "learning_rate": 0.0015,
+      "loss": 2.7346,
+      "step": 17675
+    },
+    {
+      "epoch": 0.1098001265492984,
+      "grad_norm": 0.12463794648647308,
+      "learning_rate": 0.0015,
+      "loss": 2.691,
+      "step": 17700
+    },
+    {
+      "epoch": 0.10995521147380305,
+      "grad_norm": 0.19538962841033936,
+      "learning_rate": 0.0015,
+      "loss": 2.6917,
+      "step": 17725
+    },
+    {
+      "epoch": 0.11011029639830772,
+      "grad_norm": 0.12000603973865509,
+      "learning_rate": 0.0015,
+      "loss": 2.7431,
+      "step": 17750
+    },
+    {
+      "epoch": 0.11026538132281237,
+      "grad_norm": 0.15090298652648926,
+      "learning_rate": 0.0015,
+      "loss": 2.7493,
+      "step": 17775
+    },
+    {
+      "epoch": 0.11042046624731704,
+      "grad_norm": 0.13190440833568573,
+      "learning_rate": 0.0015,
+      "loss": 2.7582,
+      "step": 17800
+    },
+    {
+      "epoch": 0.11042046624731704,
+      "eval_loss": 4.493134021759033,
+      "perplexity": 89.40119171142578,
+      "step": 17800
+    },
+    {
+      "epoch": 0.11057555117182169,
+      "grad_norm": 0.12455850094556808,
+      "learning_rate": 0.0015,
+      "loss": 2.7574,
+      "step": 17825
+    },
+    {
+      "epoch": 0.11073063609632634,
+      "grad_norm": 0.14911110699176788,
+      "learning_rate": 0.0015,
+      "loss": 2.7285,
+      "step": 17850
+    },
+    {
+      "epoch": 0.11088572102083101,
+      "grad_norm": 0.16008728742599487,
+      "learning_rate": 0.0015,
+      "loss": 2.733,
+      "step": 17875
+    },
+    {
+      "epoch": 0.11104080594533566,
+      "grad_norm": 0.1668420433998108,
+      "learning_rate": 0.0015,
+      "loss": 2.7259,
+      "step": 17900
+    },
+    {
+      "epoch": 0.11119589086984033,
+      "grad_norm": 0.11736566573381424,
+      "learning_rate": 0.0015,
+      "loss": 2.7682,
+      "step": 17925
+    },
+    {
+      "epoch": 0.11135097579434498,
+      "grad_norm": 0.11538700759410858,
+      "learning_rate": 0.0015,
+      "loss": 2.7656,
+      "step": 17950
+    },
+    {
+      "epoch": 0.11150606071884964,
+      "grad_norm": 0.09440570324659348,
+      "learning_rate": 0.0015,
+      "loss": 2.7517,
+      "step": 17975
+    },
+    {
+      "epoch": 0.1116611456433543,
+      "grad_norm": 0.20621652901172638,
+      "learning_rate": 0.0015,
+      "loss": 2.7292,
+      "step": 18000
+    },
+    {
+      "epoch": 0.1116611456433543,
+      "eval_loss": 4.493429183959961,
+      "perplexity": 89.42758178710938,
+      "step": 18000
+    },
+    {
+      "epoch": 0.11181623056785896,
+      "grad_norm": 0.12027841061353683,
+      "learning_rate": 0.0015,
+      "loss": 2.7049,
+      "step": 18025
+    },
+    {
+      "epoch": 0.11197131549236362,
+      "grad_norm": 0.08760379254817963,
+      "learning_rate": 0.0015,
+      "loss": 2.7291,
+      "step": 18050
+    },
+    {
+      "epoch": 0.11212640041686828,
+      "grad_norm": 0.1251729428768158,
+      "learning_rate": 0.0015,
+      "loss": 2.7149,
+      "step": 18075
+    },
+    {
+      "epoch": 0.11228148534137293,
+      "grad_norm": 0.10340214520692825,
+      "learning_rate": 0.0015,
+      "loss": 2.7437,
+      "step": 18100
+    },
+    {
+      "epoch": 0.1124365702658776,
+      "grad_norm": 0.10546920448541641,
+      "learning_rate": 0.0015,
+      "loss": 2.7656,
+      "step": 18125
+    },
+    {
+      "epoch": 0.11259165519038225,
+      "grad_norm": 0.12438227981328964,
+      "learning_rate": 0.0015,
+      "loss": 2.7171,
+      "step": 18150
+    },
+    {
+      "epoch": 0.11274674011488692,
+      "grad_norm": 0.14557534456253052,
+      "learning_rate": 0.0015,
+      "loss": 2.7395,
+      "step": 18175
+    },
+    {
+      "epoch": 0.11290182503939157,
+      "grad_norm": 0.13714823126792908,
+      "learning_rate": 0.0015,
+      "loss": 2.7066,
+      "step": 18200
+    },
+    {
+      "epoch": 0.11290182503939157,
+      "eval_loss": 4.4876604080200195,
+      "perplexity": 88.9131851196289,
+      "step": 18200
+    },
+    {
+      "epoch": 0.11305690996389622,
+      "grad_norm": 0.12662547826766968,
+      "learning_rate": 0.0015,
+      "loss": 2.6665,
+      "step": 18225
+    },
+    {
+      "epoch": 0.11321199488840089,
+      "grad_norm": 0.10047092288732529,
+      "learning_rate": 0.0015,
+      "loss": 2.7332,
+      "step": 18250
+    },
+    {
+      "epoch": 0.11336707981290554,
+      "grad_norm": 0.11126455664634705,
+      "learning_rate": 0.0015,
+      "loss": 2.7154,
+      "step": 18275
+    },
+    {
+      "epoch": 0.11352216473741021,
+      "grad_norm": 0.10023871064186096,
+      "learning_rate": 0.0015,
+      "loss": 2.7007,
+      "step": 18300
+    },
+    {
+      "epoch": 0.11367724966191486,
+      "grad_norm": 0.11821885406970978,
+      "learning_rate": 0.0015,
+      "loss": 2.7081,
+      "step": 18325
+    },
+    {
+      "epoch": 0.11383233458641952,
+      "grad_norm": 0.1216677874326706,
+      "learning_rate": 0.0015,
+      "loss": 2.74,
+      "step": 18350
+    },
+    {
+      "epoch": 0.11398741951092418,
+      "grad_norm": 0.1125161275267601,
+      "learning_rate": 0.0015,
+      "loss": 2.733,
+      "step": 18375
+    },
+    {
+      "epoch": 0.11414250443542884,
+      "grad_norm": 0.18253153562545776,
+      "learning_rate": 0.0015,
+      "loss": 2.7085,
+      "step": 18400
+    },
+    {
+      "epoch": 0.11414250443542884,
+      "eval_loss": 4.501376628875732,
+      "perplexity": 90.1411361694336,
+      "step": 18400
+    },
+    {
+      "epoch": 0.1142975893599335,
+      "grad_norm": 0.13288918137550354,
+      "learning_rate": 0.0015,
+      "loss": 2.7033,
+      "step": 18425
+    },
+    {
+      "epoch": 0.11445267428443816,
+      "grad_norm": 0.1069432720541954,
+      "learning_rate": 0.0015,
+      "loss": 2.7063,
+      "step": 18450
+    },
+    {
+      "epoch": 0.11460775920894281,
+      "grad_norm": 0.1035354733467102,
+      "learning_rate": 0.0015,
+      "loss": 2.7174,
+      "step": 18475
+    },
+    {
+      "epoch": 0.11476284413344748,
+      "grad_norm": 0.1121230348944664,
+      "learning_rate": 0.0015,
+      "loss": 2.7,
+      "step": 18500
+    },
+    {
+      "epoch": 0.11491792905795213,
+      "grad_norm": 0.13324719667434692,
+      "learning_rate": 0.0015,
+      "loss": 2.7423,
+      "step": 18525
+    },
+    {
+      "epoch": 0.1150730139824568,
+      "grad_norm": 0.0891190841794014,
+      "learning_rate": 0.0015,
+      "loss": 2.7418,
+      "step": 18550
+    },
+    {
+      "epoch": 0.11522809890696145,
+      "grad_norm": 0.10579492896795273,
+      "learning_rate": 0.0015,
+      "loss": 2.7321,
+      "step": 18575
+    },
+    {
+      "epoch": 0.1153831838314661,
+      "grad_norm": 0.1010003387928009,
+      "learning_rate": 0.0015,
+      "loss": 2.7071,
+      "step": 18600
+    },
+    {
+      "epoch": 0.1153831838314661,
+      "eval_loss": 4.508904933929443,
+      "perplexity": 90.82231140136719,
+      "step": 18600
+    },
+    {
+      "epoch": 0.11553826875597077,
+      "grad_norm": 0.1599242389202118,
+      "learning_rate": 0.0015,
+      "loss": 2.7222,
+      "step": 18625
+    },
+    {
+      "epoch": 0.11569335368047542,
+      "grad_norm": 0.09344537556171417,
+      "learning_rate": 0.0015,
+      "loss": 2.7424,
+      "step": 18650
+    },
+    {
+      "epoch": 0.11584843860498009,
+      "grad_norm": 0.13959461450576782,
+      "learning_rate": 0.0015,
+      "loss": 2.7584,
+      "step": 18675
+    },
+    {
+      "epoch": 0.11600352352948474,
+      "grad_norm": 0.11661764234304428,
+      "learning_rate": 0.0015,
+      "loss": 2.7363,
+      "step": 18700
+    },
+    {
+      "epoch": 0.1161586084539894,
+      "grad_norm": 0.11968798190355301,
+      "learning_rate": 0.0015,
+      "loss": 2.7314,
+      "step": 18725
+    },
+    {
+      "epoch": 0.11631369337849407,
+      "grad_norm": 0.22232107818126678,
+      "learning_rate": 0.0015,
+      "loss": 2.6992,
+      "step": 18750
+    },
+    {
+      "epoch": 0.11646877830299872,
+      "grad_norm": 0.1387198567390442,
+      "learning_rate": 0.0015,
+      "loss": 2.7001,
+      "step": 18775
+    },
+    {
+      "epoch": 0.11662386322750339,
+      "grad_norm": 0.17059509456157684,
+      "learning_rate": 0.0015,
+      "loss": 2.7002,
+      "step": 18800
+    },
+    {
+      "epoch": 0.11662386322750339,
+      "eval_loss": 4.516000270843506,
+      "perplexity": 91.4690170288086,
+      "step": 18800
+    },
+    {
+      "epoch": 0.11677894815200804,
+      "grad_norm": 0.10877668112516403,
+      "learning_rate": 0.0015,
+      "loss": 2.7171,
+      "step": 18825
+    },
+    {
+      "epoch": 0.11693403307651269,
+      "grad_norm": 0.11746638268232346,
+      "learning_rate": 0.0015,
+      "loss": 2.7006,
+      "step": 18850
+    },
+    {
+      "epoch": 0.11708911800101736,
+      "grad_norm": 0.17617632448673248,
+      "learning_rate": 0.0015,
+      "loss": 2.7427,
+      "step": 18875
+    },
+    {
+      "epoch": 0.11724420292552201,
+      "grad_norm": 0.09788820147514343,
+      "learning_rate": 0.0015,
+      "loss": 2.7507,
+      "step": 18900
+    },
+    {
+      "epoch": 0.11739928785002668,
+      "grad_norm": 0.1285056471824646,
+      "learning_rate": 0.0015,
+      "loss": 2.7386,
+      "step": 18925
+    },
+    {
+      "epoch": 0.11755437277453133,
+      "grad_norm": 0.11705992370843887,
+      "learning_rate": 0.0015,
+      "loss": 2.7234,
+      "step": 18950
+    },
+    {
+      "epoch": 0.11770945769903599,
+      "grad_norm": 0.09166467934846878,
+      "learning_rate": 0.0015,
+      "loss": 2.7825,
+      "step": 18975
+    },
+    {
+      "epoch": 0.11786454262354065,
+      "grad_norm": 0.11318054795265198,
+      "learning_rate": 0.0015,
+      "loss": 2.778,
+      "step": 19000
+    },
+    {
+      "epoch": 0.11786454262354065,
+      "eval_loss": 4.499363422393799,
+      "perplexity": 89.95984649658203,
+      "step": 19000
+    }
+  ],
+  "logging_steps": 25,
+  "max_steps": 161202,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": true,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 60,
+  "trial_name": null,
+  "trial_params": null
+}