{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500.0, "global_step": 13267, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003768749528906309, "grad_norm": 0.19456887245178223, "learning_rate": 9.999649547444612e-05, "loss": 0.4609, "step": 50 }, { "epoch": 0.007537499057812618, "grad_norm": 0.18202780187129974, "learning_rate": 9.998598238905239e-05, "loss": 0.4407, "step": 100 }, { "epoch": 0.011306248586718927, "grad_norm": 0.17280590534210205, "learning_rate": 9.996846221755392e-05, "loss": 0.4401, "step": 150 }, { "epoch": 0.015074998115625236, "grad_norm": 0.18477164208889008, "learning_rate": 9.994393741594623e-05, "loss": 0.4424, "step": 200 }, { "epoch": 0.018843747644531544, "grad_norm": 0.18784017860889435, "learning_rate": 9.99124114221411e-05, "loss": 0.4424, "step": 250 }, { "epoch": 0.022612497173437853, "grad_norm": 0.16860812902450562, "learning_rate": 9.987388865548454e-05, "loss": 0.4418, "step": 300 }, { "epoch": 0.026381246702344163, "grad_norm": 0.1637168675661087, "learning_rate": 9.982837451613738e-05, "loss": 0.4426, "step": 350 }, { "epoch": 0.030149996231250472, "grad_norm": 0.15581847727298737, "learning_rate": 9.977587538431816e-05, "loss": 0.4415, "step": 400 }, { "epoch": 0.03391874576015678, "grad_norm": 0.16367992758750916, "learning_rate": 9.971639861940889e-05, "loss": 0.4401, "step": 450 }, { "epoch": 0.03768749528906309, "grad_norm": 0.1706039160490036, "learning_rate": 9.964995255892323e-05, "loss": 0.4442, "step": 500 }, { "epoch": 0.0414562448179694, "grad_norm": 0.16303901374340057, "learning_rate": 9.957654651733788e-05, "loss": 0.4427, "step": 550 }, { "epoch": 0.045224994346875706, "grad_norm": 0.15467339754104614, "learning_rate": 9.949619078478677e-05, "loss": 0.4397, "step": 600 }, { "epoch": 0.048993743875782016, "grad_norm": 0.1724122315645218, "learning_rate": 9.940889662561864e-05, "loss": 0.4421, "step": 650 }, { "epoch": 0.052762493404688325, "grad_norm": 0.15842895209789276, "learning_rate": 9.931467627681792e-05, "loss": 0.439, "step": 700 }, { "epoch": 0.056531242933594635, "grad_norm": 0.1511755883693695, "learning_rate": 9.921354294628944e-05, "loss": 0.4391, "step": 750 }, { "epoch": 0.060299992462500944, "grad_norm": 0.14909741282463074, "learning_rate": 9.910551081100684e-05, "loss": 0.4397, "step": 800 }, { "epoch": 0.06406874199140725, "grad_norm": 0.15353120863437653, "learning_rate": 9.899059501502526e-05, "loss": 0.4389, "step": 850 }, { "epoch": 0.06783749152031356, "grad_norm": 0.1551615446805954, "learning_rate": 9.886881166735846e-05, "loss": 0.4371, "step": 900 }, { "epoch": 0.07160624104921987, "grad_norm": 0.1446385234594345, "learning_rate": 9.874017783972058e-05, "loss": 0.4392, "step": 950 }, { "epoch": 0.07537499057812617, "grad_norm": 0.14661286771297455, "learning_rate": 9.860471156413309e-05, "loss": 0.4372, "step": 1000 }, { "epoch": 0.07914374010703248, "grad_norm": 0.15160425007343292, "learning_rate": 9.846243183039694e-05, "loss": 0.4376, "step": 1050 }, { "epoch": 0.0829124896359388, "grad_norm": 0.15164785087108612, "learning_rate": 9.831335858343064e-05, "loss": 0.4363, "step": 1100 }, { "epoch": 0.0866812391648451, "grad_norm": 0.15714821219444275, "learning_rate": 9.815751272047434e-05, "loss": 0.4369, "step": 1150 }, { "epoch": 0.09044998869375141, "grad_norm": 0.14024978876113892, "learning_rate": 9.79949160881604e-05, "loss": 0.4349, "step": 1200 }, { "epoch": 0.09421873822265772, "grad_norm": 0.15335111320018768, "learning_rate": 9.782559147945094e-05, "loss": 0.4361, "step": 1250 }, { "epoch": 0.09798748775156403, "grad_norm": 0.14330938458442688, "learning_rate": 9.76495626304427e-05, "loss": 0.4347, "step": 1300 }, { "epoch": 0.10175623728047034, "grad_norm": 0.14001183211803436, "learning_rate": 9.746685421703961e-05, "loss": 0.4331, "step": 1350 }, { "epoch": 0.10552498680937665, "grad_norm": 0.16508500277996063, "learning_rate": 9.727749185149388e-05, "loss": 0.4339, "step": 1400 }, { "epoch": 0.10929373633828296, "grad_norm": 0.14644689857959747, "learning_rate": 9.708150207881543e-05, "loss": 0.4337, "step": 1450 }, { "epoch": 0.11306248586718927, "grad_norm": 0.14279577136039734, "learning_rate": 9.687891237305096e-05, "loss": 0.4339, "step": 1500 }, { "epoch": 0.11683123539609558, "grad_norm": 0.14988760650157928, "learning_rate": 9.666975113343246e-05, "loss": 0.4321, "step": 1550 }, { "epoch": 0.12059998492500189, "grad_norm": 0.1408597081899643, "learning_rate": 9.645404768039633e-05, "loss": 0.4311, "step": 1600 }, { "epoch": 0.1243687344539082, "grad_norm": 0.1554020494222641, "learning_rate": 9.623183225147308e-05, "loss": 0.4311, "step": 1650 }, { "epoch": 0.1281374839828145, "grad_norm": 0.14362740516662598, "learning_rate": 9.600313599704869e-05, "loss": 0.429, "step": 1700 }, { "epoch": 0.13190623351172082, "grad_norm": 0.1427031308412552, "learning_rate": 9.576799097599786e-05, "loss": 0.4298, "step": 1750 }, { "epoch": 0.1356749830406271, "grad_norm": 0.145565927028656, "learning_rate": 9.552643015118998e-05, "loss": 0.427, "step": 1800 }, { "epoch": 0.13944373256953344, "grad_norm": 0.14019770920276642, "learning_rate": 9.527848738486842e-05, "loss": 0.4265, "step": 1850 }, { "epoch": 0.14321248209843973, "grad_norm": 0.14158335328102112, "learning_rate": 9.502419743390357e-05, "loss": 0.4283, "step": 1900 }, { "epoch": 0.14698123162734605, "grad_norm": 0.14075861871242523, "learning_rate": 9.476359594492068e-05, "loss": 0.4277, "step": 1950 }, { "epoch": 0.15074998115625235, "grad_norm": 0.1477021872997284, "learning_rate": 9.449671944930288e-05, "loss": 0.4277, "step": 2000 }, { "epoch": 0.15451873068515867, "grad_norm": 0.13484624028205872, "learning_rate": 9.422360535807009e-05, "loss": 0.4285, "step": 2050 }, { "epoch": 0.15828748021406497, "grad_norm": 0.13757042586803436, "learning_rate": 9.394429195663478e-05, "loss": 0.4268, "step": 2100 }, { "epoch": 0.1620562297429713, "grad_norm": 0.1443628966808319, "learning_rate": 9.365881839943508e-05, "loss": 0.4249, "step": 2150 }, { "epoch": 0.1658249792718776, "grad_norm": 0.138884037733078, "learning_rate": 9.336722470444604e-05, "loss": 0.4265, "step": 2200 }, { "epoch": 0.1695937288007839, "grad_norm": 0.14065410196781158, "learning_rate": 9.306955174756985e-05, "loss": 0.4258, "step": 2250 }, { "epoch": 0.1733624783296902, "grad_norm": 0.1357513815164566, "learning_rate": 9.27658412569059e-05, "loss": 0.4246, "step": 2300 }, { "epoch": 0.17713122785859653, "grad_norm": 0.13815779983997345, "learning_rate": 9.24561358069012e-05, "loss": 0.4239, "step": 2350 }, { "epoch": 0.18089997738750282, "grad_norm": 0.135328009724617, "learning_rate": 9.214047881238233e-05, "loss": 0.4233, "step": 2400 }, { "epoch": 0.18466872691640915, "grad_norm": 0.13310644030570984, "learning_rate": 9.181891452246937e-05, "loss": 0.4262, "step": 2450 }, { "epoch": 0.18843747644531544, "grad_norm": 0.15285935997962952, "learning_rate": 9.149148801437321e-05, "loss": 0.4233, "step": 2500 }, { "epoch": 0.19220622597422174, "grad_norm": 0.1368732899427414, "learning_rate": 9.115824518707644e-05, "loss": 0.4225, "step": 2550 }, { "epoch": 0.19597497550312806, "grad_norm": 0.1352555751800537, "learning_rate": 9.08192327548992e-05, "loss": 0.4219, "step": 2600 }, { "epoch": 0.19974372503203436, "grad_norm": 0.1387627273797989, "learning_rate": 9.047449824095075e-05, "loss": 0.421, "step": 2650 }, { "epoch": 0.20351247456094068, "grad_norm": 0.13286112248897552, "learning_rate": 9.012408997046766e-05, "loss": 0.4217, "step": 2700 }, { "epoch": 0.20728122408984698, "grad_norm": 0.1302906721830368, "learning_rate": 8.976805706403942e-05, "loss": 0.4207, "step": 2750 }, { "epoch": 0.2110499736187533, "grad_norm": 0.13901737332344055, "learning_rate": 8.94064494307228e-05, "loss": 0.4193, "step": 2800 }, { "epoch": 0.2148187231476596, "grad_norm": 0.12851282954216003, "learning_rate": 8.903931776104545e-05, "loss": 0.4182, "step": 2850 }, { "epoch": 0.21858747267656592, "grad_norm": 0.13808579742908478, "learning_rate": 8.866671351990007e-05, "loss": 0.4181, "step": 2900 }, { "epoch": 0.22235622220547221, "grad_norm": 0.14110170304775238, "learning_rate": 8.82886889393301e-05, "loss": 0.4193, "step": 2950 }, { "epoch": 0.22612497173437854, "grad_norm": 0.13282662630081177, "learning_rate": 8.790529701120759e-05, "loss": 0.4166, "step": 3000 }, { "epoch": 0.22989372126328483, "grad_norm": 0.12828828394412994, "learning_rate": 8.751659147980493e-05, "loss": 0.4185, "step": 3050 }, { "epoch": 0.23366247079219116, "grad_norm": 0.13336990773677826, "learning_rate": 8.712262683426082e-05, "loss": 0.4149, "step": 3100 }, { "epoch": 0.23743122032109745, "grad_norm": 0.13983511924743652, "learning_rate": 8.672345830094199e-05, "loss": 0.4174, "step": 3150 }, { "epoch": 0.24119996985000378, "grad_norm": 0.13716702163219452, "learning_rate": 8.631914183570143e-05, "loss": 0.4167, "step": 3200 }, { "epoch": 0.24496871937891007, "grad_norm": 0.12907662987709045, "learning_rate": 8.590973411603452e-05, "loss": 0.4159, "step": 3250 }, { "epoch": 0.2487374689078164, "grad_norm": 0.12837421894073486, "learning_rate": 8.549529253313386e-05, "loss": 0.4165, "step": 3300 }, { "epoch": 0.2525062184367227, "grad_norm": 0.13946649432182312, "learning_rate": 8.507587518384421e-05, "loss": 0.414, "step": 3350 }, { "epoch": 0.256274967965629, "grad_norm": 0.128893181681633, "learning_rate": 8.465154086251828e-05, "loss": 0.4133, "step": 3400 }, { "epoch": 0.2600437174945353, "grad_norm": 0.13107603788375854, "learning_rate": 8.422234905277495e-05, "loss": 0.414, "step": 3450 }, { "epoch": 0.26381246702344163, "grad_norm": 0.1320282369852066, "learning_rate": 8.378835991916083e-05, "loss": 0.4139, "step": 3500 }, { "epoch": 0.26758121655234796, "grad_norm": 0.135846346616745, "learning_rate": 8.334963429871627e-05, "loss": 0.4154, "step": 3550 }, { "epoch": 0.2713499660812542, "grad_norm": 0.1381276398897171, "learning_rate": 8.290623369244721e-05, "loss": 0.413, "step": 3600 }, { "epoch": 0.27511871561016055, "grad_norm": 0.1333329826593399, "learning_rate": 8.245822025670384e-05, "loss": 0.4146, "step": 3650 }, { "epoch": 0.27888746513906687, "grad_norm": 0.12488547712564468, "learning_rate": 8.200565679446753e-05, "loss": 0.4102, "step": 3700 }, { "epoch": 0.2826562146679732, "grad_norm": 0.13603095710277557, "learning_rate": 8.154860674654698e-05, "loss": 0.4114, "step": 3750 }, { "epoch": 0.28642496419687946, "grad_norm": 0.13111622631549835, "learning_rate": 8.108713418268514e-05, "loss": 0.4112, "step": 3800 }, { "epoch": 0.2901937137257858, "grad_norm": 0.1286476105451584, "learning_rate": 8.062130379257764e-05, "loss": 0.4112, "step": 3850 }, { "epoch": 0.2939624632546921, "grad_norm": 0.13606774806976318, "learning_rate": 8.015118087680477e-05, "loss": 0.4117, "step": 3900 }, { "epoch": 0.2977312127835984, "grad_norm": 0.12688776850700378, "learning_rate": 7.96768313376774e-05, "loss": 0.4077, "step": 3950 }, { "epoch": 0.3014999623125047, "grad_norm": 0.13127557933330536, "learning_rate": 7.919832166999874e-05, "loss": 0.4091, "step": 4000 }, { "epoch": 0.305268711841411, "grad_norm": 0.12965704500675201, "learning_rate": 7.871571895174316e-05, "loss": 0.4076, "step": 4050 }, { "epoch": 0.30903746137031735, "grad_norm": 0.1248898133635521, "learning_rate": 7.822909083465298e-05, "loss": 0.4099, "step": 4100 }, { "epoch": 0.3128062108992236, "grad_norm": 0.13600467145442963, "learning_rate": 7.773850553475508e-05, "loss": 0.4071, "step": 4150 }, { "epoch": 0.31657496042812994, "grad_norm": 0.13761986792087555, "learning_rate": 7.724403182279823e-05, "loss": 0.4097, "step": 4200 }, { "epoch": 0.32034370995703626, "grad_norm": 0.13098478317260742, "learning_rate": 7.674573901461282e-05, "loss": 0.4046, "step": 4250 }, { "epoch": 0.3241124594859426, "grad_norm": 0.1297547072172165, "learning_rate": 7.624369696139402e-05, "loss": 0.406, "step": 4300 }, { "epoch": 0.32788120901484885, "grad_norm": 0.12281239032745361, "learning_rate": 7.573797603991004e-05, "loss": 0.4072, "step": 4350 }, { "epoch": 0.3316499585437552, "grad_norm": 0.1250068098306656, "learning_rate": 7.522864714263655e-05, "loss": 0.4068, "step": 4400 }, { "epoch": 0.3354187080726615, "grad_norm": 0.12702082097530365, "learning_rate": 7.471578166781899e-05, "loss": 0.4069, "step": 4450 }, { "epoch": 0.3391874576015678, "grad_norm": 0.1302025467157364, "learning_rate": 7.419945150946386e-05, "loss": 0.4045, "step": 4500 }, { "epoch": 0.3429562071304741, "grad_norm": 0.13086843490600586, "learning_rate": 7.367972904726055e-05, "loss": 0.4065, "step": 4550 }, { "epoch": 0.3467249566593804, "grad_norm": 0.12231607735157013, "learning_rate": 7.3156687136435e-05, "loss": 0.4041, "step": 4600 }, { "epoch": 0.35049370618828674, "grad_norm": 0.12740549445152283, "learning_rate": 7.26303990975369e-05, "loss": 0.4045, "step": 4650 }, { "epoch": 0.35426245571719306, "grad_norm": 0.1309334933757782, "learning_rate": 7.210093870616155e-05, "loss": 0.4043, "step": 4700 }, { "epoch": 0.3580312052460993, "grad_norm": 0.12529818713665009, "learning_rate": 7.156838018260776e-05, "loss": 0.4034, "step": 4750 }, { "epoch": 0.36179995477500565, "grad_norm": 0.12599386274814606, "learning_rate": 7.103279818147371e-05, "loss": 0.4014, "step": 4800 }, { "epoch": 0.365568704303912, "grad_norm": 0.13122481107711792, "learning_rate": 7.049426778119179e-05, "loss": 0.402, "step": 4850 }, { "epoch": 0.3693374538328183, "grad_norm": 0.1257227212190628, "learning_rate": 6.995286447350397e-05, "loss": 0.402, "step": 4900 }, { "epoch": 0.37310620336172456, "grad_norm": 0.13369211554527283, "learning_rate": 6.940866415287931e-05, "loss": 0.4021, "step": 4950 }, { "epoch": 0.3768749528906309, "grad_norm": 0.12336255609989166, "learning_rate": 6.886174310587501e-05, "loss": 0.3999, "step": 5000 }, { "epoch": 0.3806437024195372, "grad_norm": 0.12576647102832794, "learning_rate": 6.831217800044252e-05, "loss": 0.4027, "step": 5050 }, { "epoch": 0.3844124519484435, "grad_norm": 0.12260672450065613, "learning_rate": 6.776004587518001e-05, "loss": 0.4018, "step": 5100 }, { "epoch": 0.3881812014773498, "grad_norm": 0.12775105237960815, "learning_rate": 6.720542412853319e-05, "loss": 0.3993, "step": 5150 }, { "epoch": 0.3919499510062561, "grad_norm": 0.126275435090065, "learning_rate": 6.66483905079454e-05, "loss": 0.3991, "step": 5200 }, { "epoch": 0.39571870053516245, "grad_norm": 0.12239239364862442, "learning_rate": 6.608902309895895e-05, "loss": 0.3974, "step": 5250 }, { "epoch": 0.3994874500640687, "grad_norm": 0.12439530342817307, "learning_rate": 6.552740031426902e-05, "loss": 0.3949, "step": 5300 }, { "epoch": 0.40325619959297504, "grad_norm": 0.12401723861694336, "learning_rate": 6.496360088273161e-05, "loss": 0.3974, "step": 5350 }, { "epoch": 0.40702494912188136, "grad_norm": 0.12568959593772888, "learning_rate": 6.439770383832732e-05, "loss": 0.3977, "step": 5400 }, { "epoch": 0.4107936986507877, "grad_norm": 0.12248090654611588, "learning_rate": 6.382978850908226e-05, "loss": 0.3969, "step": 5450 }, { "epoch": 0.41456244817969395, "grad_norm": 0.12089215964078903, "learning_rate": 6.325993450594782e-05, "loss": 0.3973, "step": 5500 }, { "epoch": 0.4183311977086003, "grad_norm": 0.13026227056980133, "learning_rate": 6.26882217116406e-05, "loss": 0.3959, "step": 5550 }, { "epoch": 0.4220999472375066, "grad_norm": 0.12836764752864838, "learning_rate": 6.211473026944452e-05, "loss": 0.3945, "step": 5600 }, { "epoch": 0.4258686967664129, "grad_norm": 0.12013324350118637, "learning_rate": 6.153954057197612e-05, "loss": 0.3955, "step": 5650 }, { "epoch": 0.4296374462953192, "grad_norm": 0.125695139169693, "learning_rate": 6.0962733249915135e-05, "loss": 0.3942, "step": 5700 }, { "epoch": 0.4334061958242255, "grad_norm": 0.12257810682058334, "learning_rate": 6.038438916070155e-05, "loss": 0.3953, "step": 5750 }, { "epoch": 0.43717494535313184, "grad_norm": 0.12903869152069092, "learning_rate": 5.9804589377200946e-05, "loss": 0.3946, "step": 5800 }, { "epoch": 0.44094369488203816, "grad_norm": 0.12327694892883301, "learning_rate": 5.922341517633965e-05, "loss": 0.3953, "step": 5850 }, { "epoch": 0.44471244441094443, "grad_norm": 0.11745862662792206, "learning_rate": 5.864094802771115e-05, "loss": 0.3925, "step": 5900 }, { "epoch": 0.44848119393985075, "grad_norm": 0.1281508505344391, "learning_rate": 5.8057269582155735e-05, "loss": 0.395, "step": 5950 }, { "epoch": 0.4522499434687571, "grad_norm": 0.12625160813331604, "learning_rate": 5.7472461660314504e-05, "loss": 0.3928, "step": 6000 }, { "epoch": 0.4560186929976634, "grad_norm": 0.1253785938024521, "learning_rate": 5.6886606241159714e-05, "loss": 0.3915, "step": 6050 }, { "epoch": 0.45978744252656967, "grad_norm": 0.1176779568195343, "learning_rate": 5.6299785450502853e-05, "loss": 0.3912, "step": 6100 }, { "epoch": 0.463556192055476, "grad_norm": 0.1299523264169693, "learning_rate": 5.571208154948218e-05, "loss": 0.3916, "step": 6150 }, { "epoch": 0.4673249415843823, "grad_norm": 0.12234111875295639, "learning_rate": 5.5123576923031253e-05, "loss": 0.3907, "step": 6200 }, { "epoch": 0.47109369111328864, "grad_norm": 0.12556277215480804, "learning_rate": 5.453435406833017e-05, "loss": 0.3877, "step": 6250 }, { "epoch": 0.4748624406421949, "grad_norm": 0.12356381118297577, "learning_rate": 5.3944495583240987e-05, "loss": 0.3922, "step": 6300 }, { "epoch": 0.47863119017110123, "grad_norm": 0.12636062502861023, "learning_rate": 5.3354084154729034e-05, "loss": 0.3889, "step": 6350 }, { "epoch": 0.48239993970000755, "grad_norm": 0.12002749741077423, "learning_rate": 5.276320254727187e-05, "loss": 0.3894, "step": 6400 }, { "epoch": 0.4861686892289138, "grad_norm": 0.11866312474012375, "learning_rate": 5.217193359125724e-05, "loss": 0.3904, "step": 6450 }, { "epoch": 0.48993743875782014, "grad_norm": 0.12735402584075928, "learning_rate": 5.15803601713717e-05, "loss": 0.3898, "step": 6500 }, { "epoch": 0.49370618828672647, "grad_norm": 0.12688295543193817, "learning_rate": 5.0988565214981976e-05, "loss": 0.3875, "step": 6550 }, { "epoch": 0.4974749378156328, "grad_norm": 0.12292376905679703, "learning_rate": 5.0396631680509945e-05, "loss": 0.3878, "step": 6600 }, { "epoch": 0.5012436873445391, "grad_norm": 0.12782977521419525, "learning_rate": 4.9804642545803524e-05, "loss": 0.3882, "step": 6650 }, { "epoch": 0.5050124368734454, "grad_norm": 0.12405069172382355, "learning_rate": 4.9212680796504704e-05, "loss": 0.387, "step": 6700 }, { "epoch": 0.5087811864023517, "grad_norm": 0.12035728245973587, "learning_rate": 4.8620829414416615e-05, "loss": 0.3875, "step": 6750 }, { "epoch": 0.512549935931258, "grad_norm": 0.12531401216983795, "learning_rate": 4.8029171365870926e-05, "loss": 0.3864, "step": 6800 }, { "epoch": 0.5163186854601644, "grad_norm": 0.12338712066411972, "learning_rate": 4.743778959009766e-05, "loss": 0.3848, "step": 6850 }, { "epoch": 0.5200874349890706, "grad_norm": 0.11947856098413467, "learning_rate": 4.684676698759864e-05, "loss": 0.385, "step": 6900 }, { "epoch": 0.5238561845179769, "grad_norm": 0.12007743120193481, "learning_rate": 4.62561864085264e-05, "loss": 0.3843, "step": 6950 }, { "epoch": 0.5276249340468833, "grad_norm": 0.1278475821018219, "learning_rate": 4.566613064107015e-05, "loss": 0.3853, "step": 7000 }, { "epoch": 0.5313936835757895, "grad_norm": 0.1218569353222847, "learning_rate": 4.507668239985055e-05, "loss": 0.3845, "step": 7050 }, { "epoch": 0.5351624331046959, "grad_norm": 0.11835400015115738, "learning_rate": 4.448792431432451e-05, "loss": 0.3859, "step": 7100 }, { "epoch": 0.5389311826336022, "grad_norm": 0.12840606272220612, "learning_rate": 4.389993891720232e-05, "loss": 0.3845, "step": 7150 }, { "epoch": 0.5426999321625084, "grad_norm": 0.13272738456726074, "learning_rate": 4.3312808632877924e-05, "loss": 0.3811, "step": 7200 }, { "epoch": 0.5464686816914148, "grad_norm": 0.12915217876434326, "learning_rate": 4.27266157658747e-05, "loss": 0.382, "step": 7250 }, { "epoch": 0.5502374312203211, "grad_norm": 0.12107321619987488, "learning_rate": 4.214144248930797e-05, "loss": 0.3826, "step": 7300 }, { "epoch": 0.5540061807492274, "grad_norm": 0.11897026747465134, "learning_rate": 4.155737083336575e-05, "loss": 0.3823, "step": 7350 }, { "epoch": 0.5577749302781337, "grad_norm": 0.12942850589752197, "learning_rate": 4.097448267380979e-05, "loss": 0.3832, "step": 7400 }, { "epoch": 0.56154367980704, "grad_norm": 0.12150803208351135, "learning_rate": 4.03928597204981e-05, "loss": 0.3794, "step": 7450 }, { "epoch": 0.5653124293359464, "grad_norm": 0.12691834568977356, "learning_rate": 3.9812583505930786e-05, "loss": 0.3823, "step": 7500 }, { "epoch": 0.5690811788648527, "grad_norm": 0.12072654068470001, "learning_rate": 3.923373537382074e-05, "loss": 0.3805, "step": 7550 }, { "epoch": 0.5728499283937589, "grad_norm": 0.1254051923751831, "learning_rate": 3.86563964676908e-05, "loss": 0.3801, "step": 7600 }, { "epoch": 0.5766186779226653, "grad_norm": 0.12295497953891754, "learning_rate": 3.808064771949893e-05, "loss": 0.3798, "step": 7650 }, { "epoch": 0.5803874274515716, "grad_norm": 0.1204095408320427, "learning_rate": 3.75065698382932e-05, "loss": 0.3804, "step": 7700 }, { "epoch": 0.5841561769804778, "grad_norm": 0.1274949461221695, "learning_rate": 3.693424329889776e-05, "loss": 0.3797, "step": 7750 }, { "epoch": 0.5879249265093842, "grad_norm": 0.12017184495925903, "learning_rate": 3.636374833063191e-05, "loss": 0.3793, "step": 7800 }, { "epoch": 0.5916936760382905, "grad_norm": 0.11911458522081375, "learning_rate": 3.579516490606346e-05, "loss": 0.378, "step": 7850 }, { "epoch": 0.5954624255671968, "grad_norm": 0.12406555563211441, "learning_rate": 3.522857272979804e-05, "loss": 0.3772, "step": 7900 }, { "epoch": 0.5992311750961031, "grad_norm": 0.1240847259759903, "learning_rate": 3.4664051227306026e-05, "loss": 0.3767, "step": 7950 }, { "epoch": 0.6029999246250094, "grad_norm": 0.12354228645563126, "learning_rate": 3.4101679533788734e-05, "loss": 0.3756, "step": 8000 }, { "epoch": 0.6067686741539158, "grad_norm": 0.12415247410535812, "learning_rate": 3.354153648308492e-05, "loss": 0.3758, "step": 8050 }, { "epoch": 0.610537423682822, "grad_norm": 0.12234577536582947, "learning_rate": 3.298370059662004e-05, "loss": 0.376, "step": 8100 }, { "epoch": 0.6143061732117283, "grad_norm": 0.1265120804309845, "learning_rate": 3.2428250072398846e-05, "loss": 0.3753, "step": 8150 }, { "epoch": 0.6180749227406347, "grad_norm": 0.12359097599983215, "learning_rate": 3.187526277404355e-05, "loss": 0.3766, "step": 8200 }, { "epoch": 0.621843672269541, "grad_norm": 0.12145058065652847, "learning_rate": 3.1324816219878903e-05, "loss": 0.3752, "step": 8250 }, { "epoch": 0.6256124217984472, "grad_norm": 0.12867897748947144, "learning_rate": 3.077698757206552e-05, "loss": 0.3759, "step": 8300 }, { "epoch": 0.6293811713273536, "grad_norm": 0.11933314055204391, "learning_rate": 3.0231853625783163e-05, "loss": 0.3737, "step": 8350 }, { "epoch": 0.6331499208562599, "grad_norm": 0.12191013246774673, "learning_rate": 2.9689490798465698e-05, "loss": 0.3739, "step": 8400 }, { "epoch": 0.6369186703851663, "grad_norm": 0.12023235857486725, "learning_rate": 2.9149975119088596e-05, "loss": 0.3739, "step": 8450 }, { "epoch": 0.6406874199140725, "grad_norm": 0.1173822283744812, "learning_rate": 2.8613382217511265e-05, "loss": 0.3721, "step": 8500 }, { "epoch": 0.6444561694429788, "grad_norm": 0.11978928744792938, "learning_rate": 2.807978731387516e-05, "loss": 0.3744, "step": 8550 }, { "epoch": 0.6482249189718852, "grad_norm": 0.12471877038478851, "learning_rate": 2.754926520805925e-05, "loss": 0.3734, "step": 8600 }, { "epoch": 0.6519936685007914, "grad_norm": 0.12483327090740204, "learning_rate": 2.702189026919465e-05, "loss": 0.3739, "step": 8650 }, { "epoch": 0.6557624180296977, "grad_norm": 0.12658704817295074, "learning_rate": 2.6497736425239315e-05, "loss": 0.3721, "step": 8700 }, { "epoch": 0.6595311675586041, "grad_norm": 0.12170502543449402, "learning_rate": 2.597687715261484e-05, "loss": 0.3725, "step": 8750 }, { "epoch": 0.6632999170875103, "grad_norm": 0.12154766917228699, "learning_rate": 2.5459385465906517e-05, "loss": 0.3727, "step": 8800 }, { "epoch": 0.6670686666164167, "grad_norm": 0.12545736134052277, "learning_rate": 2.4945333907627892e-05, "loss": 0.3733, "step": 8850 }, { "epoch": 0.670837416145323, "grad_norm": 0.12335515767335892, "learning_rate": 2.443479453805189e-05, "loss": 0.3694, "step": 8900 }, { "epoch": 0.6746061656742293, "grad_norm": 0.12594442069530487, "learning_rate": 2.392783892510917e-05, "loss": 0.3707, "step": 8950 }, { "epoch": 0.6783749152031356, "grad_norm": 0.12440726161003113, "learning_rate": 2.3424538134355715e-05, "loss": 0.3718, "step": 9000 }, { "epoch": 0.6821436647320419, "grad_norm": 0.1183919832110405, "learning_rate": 2.2924962719010874e-05, "loss": 0.371, "step": 9050 }, { "epoch": 0.6859124142609482, "grad_norm": 0.12334096431732178, "learning_rate": 2.242918271006698e-05, "loss": 0.371, "step": 9100 }, { "epoch": 0.6896811637898546, "grad_norm": 0.12674276530742645, "learning_rate": 2.193726760647245e-05, "loss": 0.3688, "step": 9150 }, { "epoch": 0.6934499133187608, "grad_norm": 0.1269780695438385, "learning_rate": 2.1449286365389342e-05, "loss": 0.3696, "step": 9200 }, { "epoch": 0.6972186628476671, "grad_norm": 0.1259194165468216, "learning_rate": 2.0965307392526818e-05, "loss": 0.369, "step": 9250 }, { "epoch": 0.7009874123765735, "grad_norm": 0.12153580784797668, "learning_rate": 2.048539853255197e-05, "loss": 0.3703, "step": 9300 }, { "epoch": 0.7047561619054797, "grad_norm": 0.12010648101568222, "learning_rate": 2.0009627059579372e-05, "loss": 0.3705, "step": 9350 }, { "epoch": 0.7085249114343861, "grad_norm": 0.1195657029747963, "learning_rate": 1.953805966774037e-05, "loss": 0.3709, "step": 9400 }, { "epoch": 0.7122936609632924, "grad_norm": 0.1207772046327591, "learning_rate": 1.9070762461834018e-05, "loss": 0.3685, "step": 9450 }, { "epoch": 0.7160624104921987, "grad_norm": 0.1287909597158432, "learning_rate": 1.8607800948060266e-05, "loss": 0.3677, "step": 9500 }, { "epoch": 0.719831160021105, "grad_norm": 0.12034378945827484, "learning_rate": 1.8149240024837315e-05, "loss": 0.3672, "step": 9550 }, { "epoch": 0.7235999095500113, "grad_norm": 0.12253709137439728, "learning_rate": 1.7695143973704143e-05, "loss": 0.3667, "step": 9600 }, { "epoch": 0.7273686590789176, "grad_norm": 0.12438009679317474, "learning_rate": 1.7245576450309316e-05, "loss": 0.368, "step": 9650 }, { "epoch": 0.731137408607824, "grad_norm": 0.12267523258924484, "learning_rate": 1.6800600475487826e-05, "loss": 0.3686, "step": 9700 }, { "epoch": 0.7349061581367302, "grad_norm": 0.12052857130765915, "learning_rate": 1.6360278426426624e-05, "loss": 0.3671, "step": 9750 }, { "epoch": 0.7386749076656366, "grad_norm": 0.12073458731174469, "learning_rate": 1.5924672027920663e-05, "loss": 0.3659, "step": 9800 }, { "epoch": 0.7424436571945429, "grad_norm": 0.12209140509366989, "learning_rate": 1.5493842343720104e-05, "loss": 0.3651, "step": 9850 }, { "epoch": 0.7462124067234491, "grad_norm": 0.12427693605422974, "learning_rate": 1.5067849767970488e-05, "loss": 0.365, "step": 9900 }, { "epoch": 0.7499811562523555, "grad_norm": 0.1223672404885292, "learning_rate": 1.4646754016746483e-05, "loss": 0.3672, "step": 9950 }, { "epoch": 0.7537499057812618, "grad_norm": 0.11954417079687119, "learning_rate": 1.4230614119680957e-05, "loss": 0.3673, "step": 10000 }, { "epoch": 0.757518655310168, "grad_norm": 0.12223341315984726, "learning_rate": 1.3819488411690018e-05, "loss": 0.366, "step": 10050 }, { "epoch": 0.7612874048390744, "grad_norm": 0.12244118750095367, "learning_rate": 1.3413434524795631e-05, "loss": 0.3655, "step": 10100 }, { "epoch": 0.7650561543679807, "grad_norm": 0.1303243190050125, "learning_rate": 1.3012509380046745e-05, "loss": 0.3635, "step": 10150 }, { "epoch": 0.768824903896887, "grad_norm": 0.12242712080478668, "learning_rate": 1.2616769179539944e-05, "loss": 0.3653, "step": 10200 }, { "epoch": 0.7725936534257933, "grad_norm": 0.12988024950027466, "learning_rate": 1.222626939854103e-05, "loss": 0.3634, "step": 10250 }, { "epoch": 0.7763624029546996, "grad_norm": 0.12370813637971878, "learning_rate": 1.1841064777708483e-05, "loss": 0.3648, "step": 10300 }, { "epoch": 0.780131152483606, "grad_norm": 0.12255989760160446, "learning_rate": 1.1461209315419758e-05, "loss": 0.3619, "step": 10350 }, { "epoch": 0.7838999020125123, "grad_norm": 0.1202356219291687, "learning_rate": 1.1086756260201859e-05, "loss": 0.3635, "step": 10400 }, { "epoch": 0.7876686515414185, "grad_norm": 0.12302669882774353, "learning_rate": 1.0717758103266805e-05, "loss": 0.3637, "step": 10450 }, { "epoch": 0.7914374010703249, "grad_norm": 0.12413894385099411, "learning_rate": 1.0354266571153399e-05, "loss": 0.3628, "step": 10500 }, { "epoch": 0.7952061505992312, "grad_norm": 0.13576340675354004, "learning_rate": 9.996332618476172e-06, "loss": 0.3642, "step": 10550 }, { "epoch": 0.7989749001281374, "grad_norm": 0.11875317990779877, "learning_rate": 9.644006420782476e-06, "loss": 0.3616, "step": 10600 }, { "epoch": 0.8027436496570438, "grad_norm": 0.11939128488302231, "learning_rate": 9.29733736751881e-06, "loss": 0.3645, "step": 10650 }, { "epoch": 0.8065123991859501, "grad_norm": 0.12436193227767944, "learning_rate": 8.956374055107442e-06, "loss": 0.3628, "step": 10700 }, { "epoch": 0.8102811487148565, "grad_norm": 0.1196485161781311, "learning_rate": 8.621164280134004e-06, "loss": 0.3627, "step": 10750 }, { "epoch": 0.8140498982437627, "grad_norm": 0.12489930540323257, "learning_rate": 8.291755032647402e-06, "loss": 0.3623, "step": 10800 }, { "epoch": 0.817818647772669, "grad_norm": 0.123841792345047, "learning_rate": 7.96819248957265e-06, "loss": 0.3625, "step": 10850 }, { "epoch": 0.8215873973015754, "grad_norm": 0.11904341727495193, "learning_rate": 7.650522008237754e-06, "loss": 0.3621, "step": 10900 }, { "epoch": 0.8253561468304816, "grad_norm": 0.12214882671833038, "learning_rate": 7.338788120015522e-06, "loss": 0.3615, "step": 10950 }, { "epoch": 0.8291248963593879, "grad_norm": 0.11963903903961182, "learning_rate": 7.033034524081023e-06, "loss": 0.3616, "step": 11000 }, { "epoch": 0.8328936458882943, "grad_norm": 0.1206718236207962, "learning_rate": 6.733304081285874e-06, "loss": 0.3615, "step": 11050 }, { "epoch": 0.8366623954172006, "grad_norm": 0.12292595952749252, "learning_rate": 6.439638808149923e-06, "loss": 0.3626, "step": 11100 }, { "epoch": 0.8404311449461069, "grad_norm": 0.11864078789949417, "learning_rate": 6.152079870971311e-06, "loss": 0.3604, "step": 11150 }, { "epoch": 0.8441998944750132, "grad_norm": 0.11779944598674774, "learning_rate": 5.870667580055805e-06, "loss": 0.362, "step": 11200 }, { "epoch": 0.8479686440039195, "grad_norm": 0.1210256963968277, "learning_rate": 5.595441384065986e-06, "loss": 0.3625, "step": 11250 }, { "epoch": 0.8517373935328258, "grad_norm": 0.11869262158870697, "learning_rate": 5.3264398644913114e-06, "loss": 0.3606, "step": 11300 }, { "epoch": 0.8555061430617321, "grad_norm": 0.11657856404781342, "learning_rate": 5.063700730239784e-06, "loss": 0.3603, "step": 11350 }, { "epoch": 0.8592748925906384, "grad_norm": 0.12736374139785767, "learning_rate": 4.807260812351793e-06, "loss": 0.3597, "step": 11400 }, { "epoch": 0.8630436421195448, "grad_norm": 0.12483314424753189, "learning_rate": 4.557156058837137e-06, "loss": 0.3595, "step": 11450 }, { "epoch": 0.866812391648451, "grad_norm": 0.12260041385889053, "learning_rate": 4.31342152963583e-06, "loss": 0.3605, "step": 11500 }, { "epoch": 0.8705811411773573, "grad_norm": 0.12077975273132324, "learning_rate": 4.076091391703302e-06, "loss": 0.361, "step": 11550 }, { "epoch": 0.8743498907062637, "grad_norm": 0.12066470086574554, "learning_rate": 3.845198914220871e-06, "loss": 0.3595, "step": 11600 }, { "epoch": 0.87811864023517, "grad_norm": 0.11941241472959518, "learning_rate": 3.6207764639320462e-06, "loss": 0.3598, "step": 11650 }, { "epoch": 0.8818873897640763, "grad_norm": 0.11988034099340439, "learning_rate": 3.4028555006052953e-06, "loss": 0.3623, "step": 11700 }, { "epoch": 0.8856561392929826, "grad_norm": 0.11922150105237961, "learning_rate": 3.191466572624019e-06, "loss": 0.359, "step": 11750 }, { "epoch": 0.8894248888218889, "grad_norm": 0.12553803622722626, "learning_rate": 2.986639312704209e-06, "loss": 0.3591, "step": 11800 }, { "epoch": 0.8931936383507952, "grad_norm": 0.12020330131053925, "learning_rate": 2.788402433740517e-06, "loss": 0.3594, "step": 11850 }, { "epoch": 0.8969623878797015, "grad_norm": 0.12430017441511154, "learning_rate": 2.596783724781282e-06, "loss": 0.3611, "step": 11900 }, { "epoch": 0.9007311374086078, "grad_norm": 0.11724447458982468, "learning_rate": 2.4118100471329787e-06, "loss": 0.3586, "step": 11950 }, { "epoch": 0.9044998869375142, "grad_norm": 0.12089824676513672, "learning_rate": 2.2335073305948086e-06, "loss": 0.3599, "step": 12000 }, { "epoch": 0.9082686364664204, "grad_norm": 0.12007749080657959, "learning_rate": 2.0619005698238437e-06, "loss": 0.3596, "step": 12050 }, { "epoch": 0.9120373859953268, "grad_norm": 0.12171461433172226, "learning_rate": 1.8970138208311949e-06, "loss": 0.3583, "step": 12100 }, { "epoch": 0.9158061355242331, "grad_norm": 0.12043970823287964, "learning_rate": 1.7388701976099041e-06, "loss": 0.3602, "step": 12150 }, { "epoch": 0.9195748850531393, "grad_norm": 0.11760008335113525, "learning_rate": 1.5874918688946972e-06, "loss": 0.3595, "step": 12200 }, { "epoch": 0.9233436345820457, "grad_norm": 0.1200682744383812, "learning_rate": 1.4429000550544414e-06, "loss": 0.36, "step": 12250 }, { "epoch": 0.927112384110952, "grad_norm": 0.11784795671701431, "learning_rate": 1.305115025117387e-06, "loss": 0.361, "step": 12300 }, { "epoch": 0.9308811336398582, "grad_norm": 0.12539437413215637, "learning_rate": 1.1741560939298791e-06, "loss": 0.3604, "step": 12350 }, { "epoch": 0.9346498831687646, "grad_norm": 0.11890075355768204, "learning_rate": 1.0500416194487384e-06, "loss": 0.3603, "step": 12400 }, { "epoch": 0.9384186326976709, "grad_norm": 0.11838522553443909, "learning_rate": 9.327890001678719e-07, "loss": 0.3583, "step": 12450 }, { "epoch": 0.9421873822265773, "grad_norm": 0.11805940419435501, "learning_rate": 8.224146726792947e-07, "loss": 0.3584, "step": 12500 }, { "epoch": 0.9459561317554835, "grad_norm": 0.12155980616807938, "learning_rate": 7.189341093690627e-07, "loss": 0.3573, "step": 12550 }, { "epoch": 0.9497248812843898, "grad_norm": 0.12234574556350708, "learning_rate": 6.223618162483014e-07, "loss": 0.3602, "step": 12600 }, { "epoch": 0.9534936308132962, "grad_norm": 0.12211139500141144, "learning_rate": 5.327113309197828e-07, "loss": 0.3607, "step": 12650 }, { "epoch": 0.9572623803422025, "grad_norm": 0.12177923321723938, "learning_rate": 4.4999522068017164e-07, "loss": 0.3593, "step": 12700 }, { "epoch": 0.9610311298711087, "grad_norm": 0.12205006927251816, "learning_rate": 3.7422508075835583e-07, "loss": 0.3581, "step": 12750 }, { "epoch": 0.9647998794000151, "grad_norm": 0.12369778752326965, "learning_rate": 3.05411532689992e-07, "loss": 0.3574, "step": 12800 }, { "epoch": 0.9685686289289214, "grad_norm": 0.11952103674411774, "learning_rate": 2.435642228285906e-07, "loss": 0.3591, "step": 12850 }, { "epoch": 0.9723373784578276, "grad_norm": 0.12717215716838837, "learning_rate": 1.886918209932642e-07, "loss": 0.3591, "step": 12900 }, { "epoch": 0.976106127986734, "grad_norm": 0.11970841139554977, "learning_rate": 1.4080201925338322e-07, "loss": 0.359, "step": 12950 }, { "epoch": 0.9798748775156403, "grad_norm": 0.11662206053733826, "learning_rate": 9.99015308503215e-08, "loss": 0.361, "step": 13000 }, { "epoch": 0.9836436270445467, "grad_norm": 0.11980729550123215, "learning_rate": 6.599608925633715e-08, "loss": 0.359, "step": 13050 }, { "epoch": 0.9874123765734529, "grad_norm": 0.11847749352455139, "learning_rate": 3.909044737089307e-08, "loss": 0.3599, "step": 13100 }, { "epoch": 0.9911811261023592, "grad_norm": 0.11814071238040924, "learning_rate": 1.9188376854373246e-08, "loss": 0.3593, "step": 13150 }, { "epoch": 0.9949498756312656, "grad_norm": 0.12487523257732391, "learning_rate": 6.292667599366864e-09, "loss": 0.3591, "step": 13200 }, { "epoch": 0.9987186251601718, "grad_norm": 0.1255567967891693, "learning_rate": 4.0512733956998837e-10, "loss": 0.3566, "step": 13250 } ], "logging_steps": 50, "max_steps": 13267, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.473625900334952e+20, "train_batch_size": 2, "trial_name": null, "trial_params": null }