{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 174828, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0028599537831468643, "grad_norm": 8.294376373291016, "learning_rate": 4.985700231084266e-05, "loss": 6.665, "step": 500 }, { "epoch": 0.005719907566293729, "grad_norm": 8.308354377746582, "learning_rate": 4.971400462168532e-05, "loss": 5.1044, "step": 1000 }, { "epoch": 0.008579861349440594, "grad_norm": 7.017335891723633, "learning_rate": 4.9571006932527974e-05, "loss": 4.6249, "step": 1500 }, { "epoch": 0.011439815132587457, "grad_norm": 7.528384685516357, "learning_rate": 4.942800924337063e-05, "loss": 4.3456, "step": 2000 }, { "epoch": 0.014299768915734323, "grad_norm": 7.852795600891113, "learning_rate": 4.928501155421328e-05, "loss": 4.0772, "step": 2500 }, { "epoch": 0.017159722698881188, "grad_norm": 7.606760025024414, "learning_rate": 4.914201386505594e-05, "loss": 3.974, "step": 3000 }, { "epoch": 0.02001967648202805, "grad_norm": 7.45611572265625, "learning_rate": 4.8999016175898596e-05, "loss": 3.8115, "step": 3500 }, { "epoch": 0.022879630265174915, "grad_norm": 7.1126861572265625, "learning_rate": 4.885601848674126e-05, "loss": 3.7529, "step": 4000 }, { "epoch": 0.02573958404832178, "grad_norm": 7.052072525024414, "learning_rate": 4.871302079758392e-05, "loss": 3.654, "step": 4500 }, { "epoch": 0.028599537831468645, "grad_norm": 7.367290019989014, "learning_rate": 4.8570023108426574e-05, "loss": 3.5671, "step": 5000 }, { "epoch": 0.03145949161461551, "grad_norm": 8.534124374389648, "learning_rate": 4.842702541926923e-05, "loss": 3.5032, "step": 5500 }, { "epoch": 0.034319445397762376, "grad_norm": 7.498807907104492, "learning_rate": 4.828402773011189e-05, "loss": 3.5244, "step": 6000 }, { "epoch": 0.037179399180909234, "grad_norm": 6.6923298835754395, "learning_rate": 4.814103004095454e-05, "loss": 3.4814, "step": 6500 }, { "epoch": 0.0400393529640561, "grad_norm": 6.853496551513672, "learning_rate": 4.7998032351797196e-05, "loss": 3.3781, "step": 7000 }, { "epoch": 0.042899306747202964, "grad_norm": 7.179901599884033, "learning_rate": 4.785503466263985e-05, "loss": 3.3461, "step": 7500 }, { "epoch": 0.04575926053034983, "grad_norm": 7.566349506378174, "learning_rate": 4.771203697348251e-05, "loss": 3.3024, "step": 8000 }, { "epoch": 0.048619214313496695, "grad_norm": 7.144839763641357, "learning_rate": 4.756903928432517e-05, "loss": 3.2582, "step": 8500 }, { "epoch": 0.05147916809664356, "grad_norm": 6.929514408111572, "learning_rate": 4.7426041595167824e-05, "loss": 3.2371, "step": 9000 }, { "epoch": 0.054339121879790425, "grad_norm": 6.699374198913574, "learning_rate": 4.728304390601048e-05, "loss": 3.201, "step": 9500 }, { "epoch": 0.05719907566293729, "grad_norm": 5.757383346557617, "learning_rate": 4.714004621685314e-05, "loss": 3.1877, "step": 10000 }, { "epoch": 0.06005902944608415, "grad_norm": 7.66983699798584, "learning_rate": 4.6997048527695796e-05, "loss": 3.1514, "step": 10500 }, { "epoch": 0.06291898322923102, "grad_norm": 7.166614532470703, "learning_rate": 4.6854050838538446e-05, "loss": 3.1203, "step": 11000 }, { "epoch": 0.06577893701237789, "grad_norm": 8.80114459991455, "learning_rate": 4.671105314938111e-05, "loss": 3.1312, "step": 11500 }, { "epoch": 0.06863889079552475, "grad_norm": 29.41587257385254, "learning_rate": 4.656805546022377e-05, "loss": 3.1019, "step": 12000 }, { "epoch": 0.0714988445786716, "grad_norm": 6.1705145835876465, "learning_rate": 4.6425057771066424e-05, "loss": 3.0878, "step": 12500 }, { "epoch": 0.07435879836181847, "grad_norm": 7.218475341796875, "learning_rate": 4.628206008190908e-05, "loss": 3.0758, "step": 13000 }, { "epoch": 0.07721875214496533, "grad_norm": 6.435647964477539, "learning_rate": 4.613906239275174e-05, "loss": 3.0396, "step": 13500 }, { "epoch": 0.0800787059281122, "grad_norm": 7.471750736236572, "learning_rate": 4.5996064703594396e-05, "loss": 3.0454, "step": 14000 }, { "epoch": 0.08293865971125906, "grad_norm": 6.561801910400391, "learning_rate": 4.585306701443705e-05, "loss": 3.0095, "step": 14500 }, { "epoch": 0.08579861349440593, "grad_norm": 7.1273369789123535, "learning_rate": 4.57100693252797e-05, "loss": 3.0249, "step": 15000 }, { "epoch": 0.0886585672775528, "grad_norm": 6.540430545806885, "learning_rate": 4.556707163612236e-05, "loss": 3.0156, "step": 15500 }, { "epoch": 0.09151852106069966, "grad_norm": 6.394286632537842, "learning_rate": 4.542407394696502e-05, "loss": 2.9634, "step": 16000 }, { "epoch": 0.09437847484384652, "grad_norm": 7.856606960296631, "learning_rate": 4.5281076257807674e-05, "loss": 2.9866, "step": 16500 }, { "epoch": 0.09723842862699339, "grad_norm": 7.8352861404418945, "learning_rate": 4.513807856865033e-05, "loss": 2.9864, "step": 17000 }, { "epoch": 0.10009838241014025, "grad_norm": 6.253101348876953, "learning_rate": 4.499508087949299e-05, "loss": 2.9592, "step": 17500 }, { "epoch": 0.10295833619328712, "grad_norm": 6.485994815826416, "learning_rate": 4.4852083190335646e-05, "loss": 2.9405, "step": 18000 }, { "epoch": 0.10581828997643399, "grad_norm": 6.409724712371826, "learning_rate": 4.47090855011783e-05, "loss": 2.9567, "step": 18500 }, { "epoch": 0.10867824375958085, "grad_norm": 7.388598918914795, "learning_rate": 4.456608781202096e-05, "loss": 2.9123, "step": 19000 }, { "epoch": 0.11153819754272772, "grad_norm": 7.503371715545654, "learning_rate": 4.442309012286362e-05, "loss": 2.922, "step": 19500 }, { "epoch": 0.11439815132587458, "grad_norm": 6.702953338623047, "learning_rate": 4.4280092433706274e-05, "loss": 2.9321, "step": 20000 }, { "epoch": 0.11725810510902143, "grad_norm": 7.328106880187988, "learning_rate": 4.413709474454893e-05, "loss": 2.8965, "step": 20500 }, { "epoch": 0.1201180588921683, "grad_norm": 6.787193775177002, "learning_rate": 4.399409705539159e-05, "loss": 2.9204, "step": 21000 }, { "epoch": 0.12297801267531516, "grad_norm": 5.832542896270752, "learning_rate": 4.3851099366234246e-05, "loss": 2.8724, "step": 21500 }, { "epoch": 0.12583796645846204, "grad_norm": 6.784033298492432, "learning_rate": 4.37081016770769e-05, "loss": 2.889, "step": 22000 }, { "epoch": 0.1286979202416089, "grad_norm": 7.457705497741699, "learning_rate": 4.356510398791956e-05, "loss": 2.8845, "step": 22500 }, { "epoch": 0.13155787402475577, "grad_norm": 7.377457141876221, "learning_rate": 4.342210629876222e-05, "loss": 2.876, "step": 23000 }, { "epoch": 0.13441782780790262, "grad_norm": 6.810230731964111, "learning_rate": 4.327910860960487e-05, "loss": 2.8881, "step": 23500 }, { "epoch": 0.1372777815910495, "grad_norm": 6.137091636657715, "learning_rate": 4.3136110920447525e-05, "loss": 2.8599, "step": 24000 }, { "epoch": 0.14013773537419635, "grad_norm": 27.535808563232422, "learning_rate": 4.299311323129018e-05, "loss": 2.8432, "step": 24500 }, { "epoch": 0.1429976891573432, "grad_norm": 6.044827461242676, "learning_rate": 4.285011554213284e-05, "loss": 2.8644, "step": 25000 }, { "epoch": 0.14585764294049008, "grad_norm": 6.300295829772949, "learning_rate": 4.2707117852975496e-05, "loss": 2.8269, "step": 25500 }, { "epoch": 0.14871759672363694, "grad_norm": 5.811293125152588, "learning_rate": 4.256412016381815e-05, "loss": 2.8308, "step": 26000 }, { "epoch": 0.15157755050678381, "grad_norm": 6.52765417098999, "learning_rate": 4.242112247466081e-05, "loss": 2.8309, "step": 26500 }, { "epoch": 0.15443750428993067, "grad_norm": 6.731512546539307, "learning_rate": 4.227812478550347e-05, "loss": 2.8066, "step": 27000 }, { "epoch": 0.15729745807307755, "grad_norm": 6.837157249450684, "learning_rate": 4.2135127096346125e-05, "loss": 2.8282, "step": 27500 }, { "epoch": 0.1601574118562244, "grad_norm": 5.657121181488037, "learning_rate": 4.199212940718878e-05, "loss": 2.7904, "step": 28000 }, { "epoch": 0.16301736563937128, "grad_norm": 8.501928329467773, "learning_rate": 4.184913171803144e-05, "loss": 2.8224, "step": 28500 }, { "epoch": 0.16587731942251813, "grad_norm": 6.447242736816406, "learning_rate": 4.1706134028874096e-05, "loss": 2.7731, "step": 29000 }, { "epoch": 0.168737273205665, "grad_norm": 6.049993991851807, "learning_rate": 4.156313633971675e-05, "loss": 2.7726, "step": 29500 }, { "epoch": 0.17159722698881186, "grad_norm": 5.747082710266113, "learning_rate": 4.142013865055941e-05, "loss": 2.799, "step": 30000 }, { "epoch": 0.17445718077195874, "grad_norm": 6.7925615310668945, "learning_rate": 4.127714096140207e-05, "loss": 2.7704, "step": 30500 }, { "epoch": 0.1773171345551056, "grad_norm": 7.164943218231201, "learning_rate": 4.1134143272244725e-05, "loss": 2.7611, "step": 31000 }, { "epoch": 0.18017708833825247, "grad_norm": 6.813632011413574, "learning_rate": 4.099114558308738e-05, "loss": 2.7637, "step": 31500 }, { "epoch": 0.18303704212139932, "grad_norm": 5.981168270111084, "learning_rate": 4.084814789393003e-05, "loss": 2.7495, "step": 32000 }, { "epoch": 0.18589699590454617, "grad_norm": 6.125492095947266, "learning_rate": 4.070515020477269e-05, "loss": 2.7592, "step": 32500 }, { "epoch": 0.18875694968769305, "grad_norm": 44.21103286743164, "learning_rate": 4.0562152515615347e-05, "loss": 2.7711, "step": 33000 }, { "epoch": 0.1916169034708399, "grad_norm": 5.714451789855957, "learning_rate": 4.0419154826458004e-05, "loss": 2.7456, "step": 33500 }, { "epoch": 0.19447685725398678, "grad_norm": 5.732424736022949, "learning_rate": 4.027615713730066e-05, "loss": 2.7412, "step": 34000 }, { "epoch": 0.19733681103713363, "grad_norm": 7.989277362823486, "learning_rate": 4.013315944814332e-05, "loss": 2.7429, "step": 34500 }, { "epoch": 0.2001967648202805, "grad_norm": 6.200708389282227, "learning_rate": 3.9990161758985975e-05, "loss": 2.7497, "step": 35000 }, { "epoch": 0.20305671860342736, "grad_norm": 6.867748260498047, "learning_rate": 3.984716406982863e-05, "loss": 2.7387, "step": 35500 }, { "epoch": 0.20591667238657424, "grad_norm": 5.795921325683594, "learning_rate": 3.970416638067129e-05, "loss": 2.7262, "step": 36000 }, { "epoch": 0.2087766261697211, "grad_norm": 6.110116958618164, "learning_rate": 3.9561168691513947e-05, "loss": 2.7416, "step": 36500 }, { "epoch": 0.21163657995286797, "grad_norm": 6.253924369812012, "learning_rate": 3.9418171002356604e-05, "loss": 2.7215, "step": 37000 }, { "epoch": 0.21449653373601482, "grad_norm": 6.117007732391357, "learning_rate": 3.927517331319926e-05, "loss": 2.7231, "step": 37500 }, { "epoch": 0.2173564875191617, "grad_norm": 8.227131843566895, "learning_rate": 3.913217562404192e-05, "loss": 2.71, "step": 38000 }, { "epoch": 0.22021644130230855, "grad_norm": 6.146326541900635, "learning_rate": 3.8989177934884575e-05, "loss": 2.7076, "step": 38500 }, { "epoch": 0.22307639508545543, "grad_norm": 6.7277398109436035, "learning_rate": 3.884618024572723e-05, "loss": 2.7226, "step": 39000 }, { "epoch": 0.22593634886860228, "grad_norm": 6.300662994384766, "learning_rate": 3.870318255656989e-05, "loss": 2.7025, "step": 39500 }, { "epoch": 0.22879630265174916, "grad_norm": 5.755123138427734, "learning_rate": 3.8560184867412547e-05, "loss": 2.7025, "step": 40000 }, { "epoch": 0.231656256434896, "grad_norm": 6.393768310546875, "learning_rate": 3.8417187178255204e-05, "loss": 2.7113, "step": 40500 }, { "epoch": 0.23451621021804286, "grad_norm": 5.855433464050293, "learning_rate": 3.8274189489097854e-05, "loss": 2.721, "step": 41000 }, { "epoch": 0.23737616400118974, "grad_norm": 4.719547271728516, "learning_rate": 3.813119179994051e-05, "loss": 2.6774, "step": 41500 }, { "epoch": 0.2402361177843366, "grad_norm": 5.75437068939209, "learning_rate": 3.798819411078317e-05, "loss": 2.6922, "step": 42000 }, { "epoch": 0.24309607156748347, "grad_norm": 6.258277416229248, "learning_rate": 3.7845196421625825e-05, "loss": 2.701, "step": 42500 }, { "epoch": 0.24595602535063033, "grad_norm": 5.8440165519714355, "learning_rate": 3.770219873246848e-05, "loss": 2.697, "step": 43000 }, { "epoch": 0.2488159791337772, "grad_norm": 5.4940009117126465, "learning_rate": 3.755920104331114e-05, "loss": 2.6826, "step": 43500 }, { "epoch": 0.2516759329169241, "grad_norm": 8.00302791595459, "learning_rate": 3.7416203354153804e-05, "loss": 2.6782, "step": 44000 }, { "epoch": 0.25453588670007093, "grad_norm": 6.31597375869751, "learning_rate": 3.727320566499646e-05, "loss": 2.7113, "step": 44500 }, { "epoch": 0.2573958404832178, "grad_norm": 6.734432697296143, "learning_rate": 3.713020797583911e-05, "loss": 2.6883, "step": 45000 }, { "epoch": 0.26025579426636464, "grad_norm": 8.607872009277344, "learning_rate": 3.698721028668177e-05, "loss": 2.675, "step": 45500 }, { "epoch": 0.26311574804951154, "grad_norm": 6.785426139831543, "learning_rate": 3.6844212597524425e-05, "loss": 2.6662, "step": 46000 }, { "epoch": 0.2659757018326584, "grad_norm": 5.7255072593688965, "learning_rate": 3.670121490836708e-05, "loss": 2.6566, "step": 46500 }, { "epoch": 0.26883565561580525, "grad_norm": 5.778408527374268, "learning_rate": 3.655821721920974e-05, "loss": 2.6869, "step": 47000 }, { "epoch": 0.2716956093989521, "grad_norm": 7.3644490242004395, "learning_rate": 3.64152195300524e-05, "loss": 2.6548, "step": 47500 }, { "epoch": 0.274555563182099, "grad_norm": 9.922218322753906, "learning_rate": 3.6272221840895054e-05, "loss": 2.6548, "step": 48000 }, { "epoch": 0.27741551696524586, "grad_norm": 6.6563944816589355, "learning_rate": 3.612922415173771e-05, "loss": 2.6466, "step": 48500 }, { "epoch": 0.2802754707483927, "grad_norm": 5.308610439300537, "learning_rate": 3.598622646258037e-05, "loss": 2.6744, "step": 49000 }, { "epoch": 0.28313542453153956, "grad_norm": 6.213603973388672, "learning_rate": 3.584322877342302e-05, "loss": 2.6484, "step": 49500 }, { "epoch": 0.2859953783146864, "grad_norm": 5.715392589569092, "learning_rate": 3.5700231084265676e-05, "loss": 2.6573, "step": 50000 }, { "epoch": 0.2888553320978333, "grad_norm": 6.067576885223389, "learning_rate": 3.555723339510833e-05, "loss": 2.6487, "step": 50500 }, { "epoch": 0.29171528588098017, "grad_norm": 6.300750255584717, "learning_rate": 3.541423570595099e-05, "loss": 2.6445, "step": 51000 }, { "epoch": 0.294575239664127, "grad_norm": 6.036895275115967, "learning_rate": 3.5271238016793654e-05, "loss": 2.6756, "step": 51500 }, { "epoch": 0.29743519344727387, "grad_norm": 5.856159687042236, "learning_rate": 3.512824032763631e-05, "loss": 2.6415, "step": 52000 }, { "epoch": 0.3002951472304208, "grad_norm": 12.173583984375, "learning_rate": 3.498524263847897e-05, "loss": 2.6386, "step": 52500 }, { "epoch": 0.30315510101356763, "grad_norm": 5.493927478790283, "learning_rate": 3.4842244949321625e-05, "loss": 2.6515, "step": 53000 }, { "epoch": 0.3060150547967145, "grad_norm": 5.786694526672363, "learning_rate": 3.4699247260164276e-05, "loss": 2.6449, "step": 53500 }, { "epoch": 0.30887500857986133, "grad_norm": 5.755667686462402, "learning_rate": 3.455624957100693e-05, "loss": 2.6357, "step": 54000 }, { "epoch": 0.31173496236300824, "grad_norm": 5.9297027587890625, "learning_rate": 3.441325188184959e-05, "loss": 2.6493, "step": 54500 }, { "epoch": 0.3145949161461551, "grad_norm": 6.182466983795166, "learning_rate": 3.427025419269225e-05, "loss": 2.6298, "step": 55000 }, { "epoch": 0.31745486992930194, "grad_norm": 6.565801620483398, "learning_rate": 3.4127256503534904e-05, "loss": 2.6611, "step": 55500 }, { "epoch": 0.3203148237124488, "grad_norm": 5.94129753112793, "learning_rate": 3.398425881437756e-05, "loss": 2.6201, "step": 56000 }, { "epoch": 0.32317477749559564, "grad_norm": 6.72519063949585, "learning_rate": 3.384126112522022e-05, "loss": 2.5961, "step": 56500 }, { "epoch": 0.32603473127874255, "grad_norm": 6.440931797027588, "learning_rate": 3.3698263436062876e-05, "loss": 2.6322, "step": 57000 }, { "epoch": 0.3288946850618894, "grad_norm": 6.059328079223633, "learning_rate": 3.355526574690553e-05, "loss": 2.615, "step": 57500 }, { "epoch": 0.33175463884503625, "grad_norm": 6.007944107055664, "learning_rate": 3.341226805774818e-05, "loss": 2.5992, "step": 58000 }, { "epoch": 0.3346145926281831, "grad_norm": 6.9386982917785645, "learning_rate": 3.326927036859084e-05, "loss": 2.6317, "step": 58500 }, { "epoch": 0.33747454641133, "grad_norm": 5.493308067321777, "learning_rate": 3.3126272679433504e-05, "loss": 2.5975, "step": 59000 }, { "epoch": 0.34033450019447686, "grad_norm": 7.026157855987549, "learning_rate": 3.298327499027616e-05, "loss": 2.6139, "step": 59500 }, { "epoch": 0.3431944539776237, "grad_norm": 5.790646553039551, "learning_rate": 3.284027730111882e-05, "loss": 2.5916, "step": 60000 }, { "epoch": 0.34605440776077057, "grad_norm": 5.980741024017334, "learning_rate": 3.2697279611961476e-05, "loss": 2.5811, "step": 60500 }, { "epoch": 0.3489143615439175, "grad_norm": 6.555883407592773, "learning_rate": 3.255428192280413e-05, "loss": 2.5909, "step": 61000 }, { "epoch": 0.3517743153270643, "grad_norm": 5.8480706214904785, "learning_rate": 3.241128423364679e-05, "loss": 2.6127, "step": 61500 }, { "epoch": 0.3546342691102112, "grad_norm": 6.341095924377441, "learning_rate": 3.226828654448944e-05, "loss": 2.5959, "step": 62000 }, { "epoch": 0.357494222893358, "grad_norm": 5.832342147827148, "learning_rate": 3.21252888553321e-05, "loss": 2.5946, "step": 62500 }, { "epoch": 0.36035417667650493, "grad_norm": 6.495291709899902, "learning_rate": 3.1982291166174755e-05, "loss": 2.6122, "step": 63000 }, { "epoch": 0.3632141304596518, "grad_norm": 6.527446746826172, "learning_rate": 3.183929347701741e-05, "loss": 2.573, "step": 63500 }, { "epoch": 0.36607408424279864, "grad_norm": 6.4324951171875, "learning_rate": 3.169629578786007e-05, "loss": 2.6119, "step": 64000 }, { "epoch": 0.3689340380259455, "grad_norm": 7.166018009185791, "learning_rate": 3.1553298098702726e-05, "loss": 2.6124, "step": 64500 }, { "epoch": 0.37179399180909234, "grad_norm": 6.462119102478027, "learning_rate": 3.141030040954538e-05, "loss": 2.5552, "step": 65000 }, { "epoch": 0.37465394559223925, "grad_norm": 6.0564703941345215, "learning_rate": 3.126730272038804e-05, "loss": 2.5672, "step": 65500 }, { "epoch": 0.3775138993753861, "grad_norm": 5.307662487030029, "learning_rate": 3.11243050312307e-05, "loss": 2.5611, "step": 66000 }, { "epoch": 0.38037385315853295, "grad_norm": 5.18694543838501, "learning_rate": 3.0981307342073355e-05, "loss": 2.5691, "step": 66500 }, { "epoch": 0.3832338069416798, "grad_norm": 5.568657398223877, "learning_rate": 3.083830965291601e-05, "loss": 2.575, "step": 67000 }, { "epoch": 0.3860937607248267, "grad_norm": 10.616528511047363, "learning_rate": 3.069531196375867e-05, "loss": 2.5886, "step": 67500 }, { "epoch": 0.38895371450797356, "grad_norm": 6.7568206787109375, "learning_rate": 3.0552314274601326e-05, "loss": 2.5822, "step": 68000 }, { "epoch": 0.3918136682911204, "grad_norm": 6.087740421295166, "learning_rate": 3.040931658544398e-05, "loss": 2.5472, "step": 68500 }, { "epoch": 0.39467362207426726, "grad_norm": 6.702504634857178, "learning_rate": 3.0266318896286637e-05, "loss": 2.5897, "step": 69000 }, { "epoch": 0.39753357585741417, "grad_norm": 6.2178053855896, "learning_rate": 3.0123321207129297e-05, "loss": 2.5698, "step": 69500 }, { "epoch": 0.400393529640561, "grad_norm": 6.559543609619141, "learning_rate": 2.9980323517971955e-05, "loss": 2.5725, "step": 70000 }, { "epoch": 0.40325348342370787, "grad_norm": 5.918066501617432, "learning_rate": 2.9837325828814605e-05, "loss": 2.5847, "step": 70500 }, { "epoch": 0.4061134372068547, "grad_norm": 5.602575778961182, "learning_rate": 2.9694328139657262e-05, "loss": 2.583, "step": 71000 }, { "epoch": 0.40897339099000163, "grad_norm": 5.304308891296387, "learning_rate": 2.955133045049992e-05, "loss": 2.5632, "step": 71500 }, { "epoch": 0.4118333447731485, "grad_norm": 5.540666103363037, "learning_rate": 2.9408332761342576e-05, "loss": 2.5756, "step": 72000 }, { "epoch": 0.41469329855629533, "grad_norm": 6.2000861167907715, "learning_rate": 2.9265335072185234e-05, "loss": 2.5357, "step": 72500 }, { "epoch": 0.4175532523394422, "grad_norm": 5.1564459800720215, "learning_rate": 2.912233738302789e-05, "loss": 2.5516, "step": 73000 }, { "epoch": 0.42041320612258903, "grad_norm": 6.008329391479492, "learning_rate": 2.897933969387055e-05, "loss": 2.5738, "step": 73500 }, { "epoch": 0.42327315990573594, "grad_norm": 6.52450704574585, "learning_rate": 2.883634200471321e-05, "loss": 2.578, "step": 74000 }, { "epoch": 0.4261331136888828, "grad_norm": 5.788220405578613, "learning_rate": 2.8693344315555866e-05, "loss": 2.5578, "step": 74500 }, { "epoch": 0.42899306747202964, "grad_norm": 5.5810112953186035, "learning_rate": 2.8550346626398516e-05, "loss": 2.5643, "step": 75000 }, { "epoch": 0.4318530212551765, "grad_norm": 5.334226608276367, "learning_rate": 2.8407348937241173e-05, "loss": 2.5376, "step": 75500 }, { "epoch": 0.4347129750383234, "grad_norm": 5.804100513458252, "learning_rate": 2.826435124808383e-05, "loss": 2.541, "step": 76000 }, { "epoch": 0.43757292882147025, "grad_norm": 5.555410385131836, "learning_rate": 2.8121353558926487e-05, "loss": 2.5364, "step": 76500 }, { "epoch": 0.4404328826046171, "grad_norm": 5.454427719116211, "learning_rate": 2.7978355869769148e-05, "loss": 2.5602, "step": 77000 }, { "epoch": 0.44329283638776396, "grad_norm": 16.772747039794922, "learning_rate": 2.7835358180611805e-05, "loss": 2.5674, "step": 77500 }, { "epoch": 0.44615279017091086, "grad_norm": 8.047761917114258, "learning_rate": 2.7692360491454462e-05, "loss": 2.5334, "step": 78000 }, { "epoch": 0.4490127439540577, "grad_norm": 6.612277507781982, "learning_rate": 2.754936280229712e-05, "loss": 2.5525, "step": 78500 }, { "epoch": 0.45187269773720457, "grad_norm": 6.439370632171631, "learning_rate": 2.740636511313977e-05, "loss": 2.5349, "step": 79000 }, { "epoch": 0.4547326515203514, "grad_norm": 6.890873908996582, "learning_rate": 2.7263367423982427e-05, "loss": 2.5145, "step": 79500 }, { "epoch": 0.4575926053034983, "grad_norm": 5.4768500328063965, "learning_rate": 2.7120369734825084e-05, "loss": 2.5277, "step": 80000 }, { "epoch": 0.4604525590866452, "grad_norm": 5.825018405914307, "learning_rate": 2.697737204566774e-05, "loss": 2.5505, "step": 80500 }, { "epoch": 0.463312512869792, "grad_norm": 6.583479881286621, "learning_rate": 2.68343743565104e-05, "loss": 2.5562, "step": 81000 }, { "epoch": 0.4661724666529389, "grad_norm": 6.420114040374756, "learning_rate": 2.669137666735306e-05, "loss": 2.5094, "step": 81500 }, { "epoch": 0.46903242043608573, "grad_norm": 6.8168110847473145, "learning_rate": 2.6548378978195716e-05, "loss": 2.5347, "step": 82000 }, { "epoch": 0.47189237421923264, "grad_norm": 6.224096298217773, "learning_rate": 2.6405381289038373e-05, "loss": 2.5154, "step": 82500 }, { "epoch": 0.4747523280023795, "grad_norm": 6.240240097045898, "learning_rate": 2.626238359988103e-05, "loss": 2.535, "step": 83000 }, { "epoch": 0.47761228178552634, "grad_norm": 6.053983211517334, "learning_rate": 2.611938591072368e-05, "loss": 2.5275, "step": 83500 }, { "epoch": 0.4804722355686732, "grad_norm": 5.546879768371582, "learning_rate": 2.5976388221566338e-05, "loss": 2.5329, "step": 84000 }, { "epoch": 0.4833321893518201, "grad_norm": 6.190423011779785, "learning_rate": 2.5833390532408995e-05, "loss": 2.5174, "step": 84500 }, { "epoch": 0.48619214313496695, "grad_norm": 5.437402248382568, "learning_rate": 2.5690392843251655e-05, "loss": 2.49, "step": 85000 }, { "epoch": 0.4890520969181138, "grad_norm": 6.8163557052612305, "learning_rate": 2.5547395154094312e-05, "loss": 2.524, "step": 85500 }, { "epoch": 0.49191205070126065, "grad_norm": 6.754604816436768, "learning_rate": 2.540439746493697e-05, "loss": 2.5041, "step": 86000 }, { "epoch": 0.49477200448440756, "grad_norm": 5.496472358703613, "learning_rate": 2.5261399775779627e-05, "loss": 2.5277, "step": 86500 }, { "epoch": 0.4976319582675544, "grad_norm": 5.616280555725098, "learning_rate": 2.5118402086622284e-05, "loss": 2.5061, "step": 87000 }, { "epoch": 0.5004919120507013, "grad_norm": 6.141283988952637, "learning_rate": 2.4975404397464938e-05, "loss": 2.5214, "step": 87500 }, { "epoch": 0.5033518658338482, "grad_norm": 6.124631404876709, "learning_rate": 2.4832406708307595e-05, "loss": 2.4854, "step": 88000 }, { "epoch": 0.506211819616995, "grad_norm": 6.740499496459961, "learning_rate": 2.4689409019150252e-05, "loss": 2.5054, "step": 88500 }, { "epoch": 0.5090717734001419, "grad_norm": 6.040327548980713, "learning_rate": 2.454641132999291e-05, "loss": 2.5042, "step": 89000 }, { "epoch": 0.5119317271832887, "grad_norm": 5.564330577850342, "learning_rate": 2.4403413640835566e-05, "loss": 2.5021, "step": 89500 }, { "epoch": 0.5147916809664356, "grad_norm": 6.915059566497803, "learning_rate": 2.4260415951678223e-05, "loss": 2.5227, "step": 90000 }, { "epoch": 0.5176516347495824, "grad_norm": 6.181910991668701, "learning_rate": 2.411741826252088e-05, "loss": 2.5098, "step": 90500 }, { "epoch": 0.5205115885327293, "grad_norm": 5.829164505004883, "learning_rate": 2.3974420573363534e-05, "loss": 2.5133, "step": 91000 }, { "epoch": 0.5233715423158761, "grad_norm": 14.621573448181152, "learning_rate": 2.383142288420619e-05, "loss": 2.503, "step": 91500 }, { "epoch": 0.5262314960990231, "grad_norm": 6.3930511474609375, "learning_rate": 2.368842519504885e-05, "loss": 2.5124, "step": 92000 }, { "epoch": 0.5290914498821699, "grad_norm": 5.840575695037842, "learning_rate": 2.3545427505891506e-05, "loss": 2.5177, "step": 92500 }, { "epoch": 0.5319514036653168, "grad_norm": 6.612518787384033, "learning_rate": 2.3402429816734163e-05, "loss": 2.4881, "step": 93000 }, { "epoch": 0.5348113574484636, "grad_norm": 6.505732536315918, "learning_rate": 2.325943212757682e-05, "loss": 2.4872, "step": 93500 }, { "epoch": 0.5376713112316105, "grad_norm": 7.19988489151001, "learning_rate": 2.3116434438419477e-05, "loss": 2.4958, "step": 94000 }, { "epoch": 0.5405312650147573, "grad_norm": 5.988187789916992, "learning_rate": 2.2973436749262134e-05, "loss": 2.5094, "step": 94500 }, { "epoch": 0.5433912187979042, "grad_norm": 5.709506511688232, "learning_rate": 2.2830439060104788e-05, "loss": 2.4882, "step": 95000 }, { "epoch": 0.546251172581051, "grad_norm": 5.567132949829102, "learning_rate": 2.2687441370947445e-05, "loss": 2.4909, "step": 95500 }, { "epoch": 0.549111126364198, "grad_norm": 11.825920104980469, "learning_rate": 2.2544443681790102e-05, "loss": 2.4944, "step": 96000 }, { "epoch": 0.5519710801473449, "grad_norm": 5.969587802886963, "learning_rate": 2.240144599263276e-05, "loss": 2.4912, "step": 96500 }, { "epoch": 0.5548310339304917, "grad_norm": 6.31153678894043, "learning_rate": 2.225844830347542e-05, "loss": 2.4901, "step": 97000 }, { "epoch": 0.5576909877136386, "grad_norm": 7.130558013916016, "learning_rate": 2.2115450614318074e-05, "loss": 2.4768, "step": 97500 }, { "epoch": 0.5605509414967854, "grad_norm": 5.947187900543213, "learning_rate": 2.197245292516073e-05, "loss": 2.4971, "step": 98000 }, { "epoch": 0.5634108952799323, "grad_norm": 6.830575466156006, "learning_rate": 2.1829455236003388e-05, "loss": 2.4901, "step": 98500 }, { "epoch": 0.5662708490630791, "grad_norm": 5.682921409606934, "learning_rate": 2.1686457546846045e-05, "loss": 2.4946, "step": 99000 }, { "epoch": 0.569130802846226, "grad_norm": 5.174154758453369, "learning_rate": 2.15434598576887e-05, "loss": 2.4813, "step": 99500 }, { "epoch": 0.5719907566293728, "grad_norm": 5.400365352630615, "learning_rate": 2.1400462168531356e-05, "loss": 2.4498, "step": 100000 }, { "epoch": 0.5748507104125198, "grad_norm": 5.433869361877441, "learning_rate": 2.1257464479374013e-05, "loss": 2.523, "step": 100500 }, { "epoch": 0.5777106641956666, "grad_norm": 6.321377754211426, "learning_rate": 2.1114466790216674e-05, "loss": 2.4731, "step": 101000 }, { "epoch": 0.5805706179788135, "grad_norm": 6.643988609313965, "learning_rate": 2.0971469101059327e-05, "loss": 2.4837, "step": 101500 }, { "epoch": 0.5834305717619603, "grad_norm": 6.258885383605957, "learning_rate": 2.0828471411901985e-05, "loss": 2.4735, "step": 102000 }, { "epoch": 0.5862905255451072, "grad_norm": 5.747689723968506, "learning_rate": 2.068547372274464e-05, "loss": 2.4742, "step": 102500 }, { "epoch": 0.589150479328254, "grad_norm": 6.016144275665283, "learning_rate": 2.05424760335873e-05, "loss": 2.4633, "step": 103000 }, { "epoch": 0.5920104331114009, "grad_norm": 5.250337600708008, "learning_rate": 2.0399478344429953e-05, "loss": 2.467, "step": 103500 }, { "epoch": 0.5948703868945477, "grad_norm": 5.667397975921631, "learning_rate": 2.025648065527261e-05, "loss": 2.4709, "step": 104000 }, { "epoch": 0.5977303406776946, "grad_norm": 6.414941310882568, "learning_rate": 2.0113482966115267e-05, "loss": 2.4805, "step": 104500 }, { "epoch": 0.6005902944608416, "grad_norm": 6.118762493133545, "learning_rate": 1.9970485276957927e-05, "loss": 2.46, "step": 105000 }, { "epoch": 0.6034502482439884, "grad_norm": 7.456865310668945, "learning_rate": 1.9827487587800584e-05, "loss": 2.4863, "step": 105500 }, { "epoch": 0.6063102020271353, "grad_norm": 7.2666096687316895, "learning_rate": 1.9684489898643238e-05, "loss": 2.431, "step": 106000 }, { "epoch": 0.6091701558102821, "grad_norm": 6.135725975036621, "learning_rate": 1.9541492209485895e-05, "loss": 2.4833, "step": 106500 }, { "epoch": 0.612030109593429, "grad_norm": 6.930655002593994, "learning_rate": 1.9398494520328553e-05, "loss": 2.4791, "step": 107000 }, { "epoch": 0.6148900633765758, "grad_norm": 5.848691940307617, "learning_rate": 1.925549683117121e-05, "loss": 2.4744, "step": 107500 }, { "epoch": 0.6177500171597227, "grad_norm": 6.593609809875488, "learning_rate": 1.9112499142013863e-05, "loss": 2.4818, "step": 108000 }, { "epoch": 0.6206099709428695, "grad_norm": 5.148362636566162, "learning_rate": 1.8969501452856524e-05, "loss": 2.4863, "step": 108500 }, { "epoch": 0.6234699247260165, "grad_norm": 6.264626979827881, "learning_rate": 1.882650376369918e-05, "loss": 2.4896, "step": 109000 }, { "epoch": 0.6263298785091633, "grad_norm": 7.046905040740967, "learning_rate": 1.8683506074541838e-05, "loss": 2.4746, "step": 109500 }, { "epoch": 0.6291898322923102, "grad_norm": 6.274538993835449, "learning_rate": 1.8540508385384492e-05, "loss": 2.4395, "step": 110000 }, { "epoch": 0.632049786075457, "grad_norm": 5.889391899108887, "learning_rate": 1.839751069622715e-05, "loss": 2.4307, "step": 110500 }, { "epoch": 0.6349097398586039, "grad_norm": 5.6989030838012695, "learning_rate": 1.8254513007069806e-05, "loss": 2.4297, "step": 111000 }, { "epoch": 0.6377696936417507, "grad_norm": 6.275044918060303, "learning_rate": 1.8111515317912463e-05, "loss": 2.4504, "step": 111500 }, { "epoch": 0.6406296474248976, "grad_norm": 6.444321155548096, "learning_rate": 1.7968517628755117e-05, "loss": 2.4286, "step": 112000 }, { "epoch": 0.6434896012080444, "grad_norm": 6.624863147735596, "learning_rate": 1.7825519939597778e-05, "loss": 2.463, "step": 112500 }, { "epoch": 0.6463495549911913, "grad_norm": 7.994183540344238, "learning_rate": 1.7682522250440435e-05, "loss": 2.4362, "step": 113000 }, { "epoch": 0.6492095087743383, "grad_norm": 5.6794257164001465, "learning_rate": 1.7539524561283092e-05, "loss": 2.4355, "step": 113500 }, { "epoch": 0.6520694625574851, "grad_norm": 5.606757164001465, "learning_rate": 1.739652687212575e-05, "loss": 2.4525, "step": 114000 }, { "epoch": 0.654929416340632, "grad_norm": 6.253554344177246, "learning_rate": 1.7253529182968403e-05, "loss": 2.4511, "step": 114500 }, { "epoch": 0.6577893701237788, "grad_norm": 6.014497756958008, "learning_rate": 1.711053149381106e-05, "loss": 2.4571, "step": 115000 }, { "epoch": 0.6606493239069257, "grad_norm": 6.601302146911621, "learning_rate": 1.6967533804653717e-05, "loss": 2.4505, "step": 115500 }, { "epoch": 0.6635092776900725, "grad_norm": 7.215948104858398, "learning_rate": 1.6824536115496374e-05, "loss": 2.4351, "step": 116000 }, { "epoch": 0.6663692314732194, "grad_norm": 5.974714279174805, "learning_rate": 1.668153842633903e-05, "loss": 2.4435, "step": 116500 }, { "epoch": 0.6692291852563662, "grad_norm": 6.903178691864014, "learning_rate": 1.653854073718169e-05, "loss": 2.4388, "step": 117000 }, { "epoch": 0.6720891390395132, "grad_norm": 6.214517116546631, "learning_rate": 1.6395543048024346e-05, "loss": 2.4405, "step": 117500 }, { "epoch": 0.67494909282266, "grad_norm": 6.263461589813232, "learning_rate": 1.6252545358867003e-05, "loss": 2.4496, "step": 118000 }, { "epoch": 0.6778090466058069, "grad_norm": 8.066364288330078, "learning_rate": 1.6109547669709657e-05, "loss": 2.4368, "step": 118500 }, { "epoch": 0.6806690003889537, "grad_norm": 5.834959506988525, "learning_rate": 1.5966549980552314e-05, "loss": 2.4481, "step": 119000 }, { "epoch": 0.6835289541721006, "grad_norm": 6.710206031799316, "learning_rate": 1.582355229139497e-05, "loss": 2.4325, "step": 119500 }, { "epoch": 0.6863889079552474, "grad_norm": 5.984834671020508, "learning_rate": 1.5680554602237628e-05, "loss": 2.4454, "step": 120000 }, { "epoch": 0.6892488617383943, "grad_norm": 5.370354652404785, "learning_rate": 1.5537556913080285e-05, "loss": 2.4279, "step": 120500 }, { "epoch": 0.6921088155215411, "grad_norm": 6.09434175491333, "learning_rate": 1.5394559223922942e-05, "loss": 2.4314, "step": 121000 }, { "epoch": 0.694968769304688, "grad_norm": 6.878710746765137, "learning_rate": 1.52515615347656e-05, "loss": 2.4191, "step": 121500 }, { "epoch": 0.697828723087835, "grad_norm": 5.660272121429443, "learning_rate": 1.5108563845608257e-05, "loss": 2.433, "step": 122000 }, { "epoch": 0.7006886768709818, "grad_norm": 6.489835739135742, "learning_rate": 1.4965566156450914e-05, "loss": 2.4491, "step": 122500 }, { "epoch": 0.7035486306541286, "grad_norm": 5.600217819213867, "learning_rate": 1.4822568467293567e-05, "loss": 2.4235, "step": 123000 }, { "epoch": 0.7064085844372755, "grad_norm": 5.281232833862305, "learning_rate": 1.4679570778136226e-05, "loss": 2.4219, "step": 123500 }, { "epoch": 0.7092685382204224, "grad_norm": 5.651204586029053, "learning_rate": 1.4536573088978883e-05, "loss": 2.448, "step": 124000 }, { "epoch": 0.7121284920035692, "grad_norm": 5.520606994628906, "learning_rate": 1.439357539982154e-05, "loss": 2.4118, "step": 124500 }, { "epoch": 0.714988445786716, "grad_norm": 6.359561920166016, "learning_rate": 1.4250577710664196e-05, "loss": 2.4502, "step": 125000 }, { "epoch": 0.7178483995698629, "grad_norm": 6.264361381530762, "learning_rate": 1.4107580021506853e-05, "loss": 2.4214, "step": 125500 }, { "epoch": 0.7207083533530099, "grad_norm": 15.211498260498047, "learning_rate": 1.396458233234951e-05, "loss": 2.4476, "step": 126000 }, { "epoch": 0.7235683071361567, "grad_norm": 6.165014266967773, "learning_rate": 1.3821584643192167e-05, "loss": 2.4255, "step": 126500 }, { "epoch": 0.7264282609193036, "grad_norm": 5.279512882232666, "learning_rate": 1.3678586954034823e-05, "loss": 2.4458, "step": 127000 }, { "epoch": 0.7292882147024504, "grad_norm": 6.13384485244751, "learning_rate": 1.353558926487748e-05, "loss": 2.4022, "step": 127500 }, { "epoch": 0.7321481684855973, "grad_norm": 5.577615261077881, "learning_rate": 1.3392591575720137e-05, "loss": 2.4174, "step": 128000 }, { "epoch": 0.7350081222687441, "grad_norm": 5.860058784484863, "learning_rate": 1.3249593886562794e-05, "loss": 2.4043, "step": 128500 }, { "epoch": 0.737868076051891, "grad_norm": 6.8798065185546875, "learning_rate": 1.3106596197405451e-05, "loss": 2.3858, "step": 129000 }, { "epoch": 0.7407280298350378, "grad_norm": 7.996329307556152, "learning_rate": 1.2963598508248107e-05, "loss": 2.3993, "step": 129500 }, { "epoch": 0.7435879836181847, "grad_norm": 6.488850116729736, "learning_rate": 1.2820600819090764e-05, "loss": 2.4204, "step": 130000 }, { "epoch": 0.7464479374013316, "grad_norm": 5.177313804626465, "learning_rate": 1.2677603129933421e-05, "loss": 2.433, "step": 130500 }, { "epoch": 0.7493078911844785, "grad_norm": 6.9536895751953125, "learning_rate": 1.2534605440776078e-05, "loss": 2.4145, "step": 131000 }, { "epoch": 0.7521678449676253, "grad_norm": 5.639203071594238, "learning_rate": 1.2391607751618735e-05, "loss": 2.3906, "step": 131500 }, { "epoch": 0.7550277987507722, "grad_norm": 5.76200532913208, "learning_rate": 1.2248610062461391e-05, "loss": 2.4065, "step": 132000 }, { "epoch": 0.757887752533919, "grad_norm": 7.033239364624023, "learning_rate": 1.2105612373304048e-05, "loss": 2.4045, "step": 132500 }, { "epoch": 0.7607477063170659, "grad_norm": 6.319807529449463, "learning_rate": 1.1962614684146704e-05, "loss": 2.3646, "step": 133000 }, { "epoch": 0.7636076601002127, "grad_norm": 6.506091117858887, "learning_rate": 1.1819616994989362e-05, "loss": 2.4247, "step": 133500 }, { "epoch": 0.7664676138833596, "grad_norm": 6.245853424072266, "learning_rate": 1.1676619305832018e-05, "loss": 2.3998, "step": 134000 }, { "epoch": 0.7693275676665066, "grad_norm": 6.403684616088867, "learning_rate": 1.1533621616674675e-05, "loss": 2.4072, "step": 134500 }, { "epoch": 0.7721875214496534, "grad_norm": 6.385560035705566, "learning_rate": 1.1390623927517332e-05, "loss": 2.4078, "step": 135000 }, { "epoch": 0.7750474752328003, "grad_norm": 6.857175350189209, "learning_rate": 1.124762623835999e-05, "loss": 2.4167, "step": 135500 }, { "epoch": 0.7779074290159471, "grad_norm": 5.734222888946533, "learning_rate": 1.1104628549202645e-05, "loss": 2.411, "step": 136000 }, { "epoch": 0.780767382799094, "grad_norm": 6.311659812927246, "learning_rate": 1.0961630860045302e-05, "loss": 2.4232, "step": 136500 }, { "epoch": 0.7836273365822408, "grad_norm": 6.344162940979004, "learning_rate": 1.0818633170887959e-05, "loss": 2.3997, "step": 137000 }, { "epoch": 0.7864872903653877, "grad_norm": 5.971358776092529, "learning_rate": 1.0675635481730616e-05, "loss": 2.4181, "step": 137500 }, { "epoch": 0.7893472441485345, "grad_norm": 5.663905620574951, "learning_rate": 1.0532637792573273e-05, "loss": 2.3939, "step": 138000 }, { "epoch": 0.7922071979316814, "grad_norm": 5.739428520202637, "learning_rate": 1.0389640103415929e-05, "loss": 2.3803, "step": 138500 }, { "epoch": 0.7950671517148283, "grad_norm": 6.558109760284424, "learning_rate": 1.0246642414258586e-05, "loss": 2.3794, "step": 139000 }, { "epoch": 0.7979271054979752, "grad_norm": 7.577678203582764, "learning_rate": 1.0103644725101243e-05, "loss": 2.4035, "step": 139500 }, { "epoch": 0.800787059281122, "grad_norm": 6.890414237976074, "learning_rate": 9.9606470359439e-06, "loss": 2.3791, "step": 140000 }, { "epoch": 0.8036470130642689, "grad_norm": 6.212318420410156, "learning_rate": 9.817649346786556e-06, "loss": 2.363, "step": 140500 }, { "epoch": 0.8065069668474157, "grad_norm": 6.501023292541504, "learning_rate": 9.674651657629213e-06, "loss": 2.3794, "step": 141000 }, { "epoch": 0.8093669206305626, "grad_norm": 6.136830806732178, "learning_rate": 9.53165396847187e-06, "loss": 2.3835, "step": 141500 }, { "epoch": 0.8122268744137094, "grad_norm": 6.386491298675537, "learning_rate": 9.388656279314527e-06, "loss": 2.3836, "step": 142000 }, { "epoch": 0.8150868281968563, "grad_norm": 6.060532093048096, "learning_rate": 9.245658590157182e-06, "loss": 2.3714, "step": 142500 }, { "epoch": 0.8179467819800033, "grad_norm": 6.481443405151367, "learning_rate": 9.10266090099984e-06, "loss": 2.3842, "step": 143000 }, { "epoch": 0.8208067357631501, "grad_norm": 6.378634929656982, "learning_rate": 8.959663211842497e-06, "loss": 2.4011, "step": 143500 }, { "epoch": 0.823666689546297, "grad_norm": 7.321898937225342, "learning_rate": 8.816665522685154e-06, "loss": 2.3874, "step": 144000 }, { "epoch": 0.8265266433294438, "grad_norm": 5.878232479095459, "learning_rate": 8.673667833527811e-06, "loss": 2.3747, "step": 144500 }, { "epoch": 0.8293865971125907, "grad_norm": 6.182088375091553, "learning_rate": 8.530670144370468e-06, "loss": 2.3928, "step": 145000 }, { "epoch": 0.8322465508957375, "grad_norm": 6.2058258056640625, "learning_rate": 8.387672455213125e-06, "loss": 2.3784, "step": 145500 }, { "epoch": 0.8351065046788844, "grad_norm": 6.231584072113037, "learning_rate": 8.24467476605578e-06, "loss": 2.3715, "step": 146000 }, { "epoch": 0.8379664584620312, "grad_norm": 6.14652156829834, "learning_rate": 8.101677076898438e-06, "loss": 2.3789, "step": 146500 }, { "epoch": 0.8408264122451781, "grad_norm": 6.431158065795898, "learning_rate": 7.958679387741095e-06, "loss": 2.3792, "step": 147000 }, { "epoch": 0.843686366028325, "grad_norm": 5.822235584259033, "learning_rate": 7.815681698583752e-06, "loss": 2.4062, "step": 147500 }, { "epoch": 0.8465463198114719, "grad_norm": 5.64607048034668, "learning_rate": 7.672684009426408e-06, "loss": 2.368, "step": 148000 }, { "epoch": 0.8494062735946187, "grad_norm": 6.182931900024414, "learning_rate": 7.5296863202690655e-06, "loss": 2.3877, "step": 148500 }, { "epoch": 0.8522662273777656, "grad_norm": 6.151760578155518, "learning_rate": 7.386688631111721e-06, "loss": 2.3915, "step": 149000 }, { "epoch": 0.8551261811609124, "grad_norm": 6.303664684295654, "learning_rate": 7.243690941954379e-06, "loss": 2.3565, "step": 149500 }, { "epoch": 0.8579861349440593, "grad_norm": 6.381216526031494, "learning_rate": 7.100693252797034e-06, "loss": 2.3697, "step": 150000 }, { "epoch": 0.8608460887272061, "grad_norm": 5.706302165985107, "learning_rate": 6.957695563639692e-06, "loss": 2.4026, "step": 150500 }, { "epoch": 0.863706042510353, "grad_norm": 7.22359561920166, "learning_rate": 6.814697874482348e-06, "loss": 2.3759, "step": 151000 }, { "epoch": 0.8665659962935, "grad_norm": 5.458381652832031, "learning_rate": 6.671700185325006e-06, "loss": 2.3836, "step": 151500 }, { "epoch": 0.8694259500766468, "grad_norm": 5.785479545593262, "learning_rate": 6.528702496167661e-06, "loss": 2.3655, "step": 152000 }, { "epoch": 0.8722859038597937, "grad_norm": 5.856048583984375, "learning_rate": 6.385704807010319e-06, "loss": 2.3669, "step": 152500 }, { "epoch": 0.8751458576429405, "grad_norm": 5.491500377655029, "learning_rate": 6.2427071178529756e-06, "loss": 2.4154, "step": 153000 }, { "epoch": 0.8780058114260874, "grad_norm": 5.936758518218994, "learning_rate": 6.099709428695633e-06, "loss": 2.3702, "step": 153500 }, { "epoch": 0.8808657652092342, "grad_norm": 7.138918399810791, "learning_rate": 5.956711739538289e-06, "loss": 2.3582, "step": 154000 }, { "epoch": 0.8837257189923811, "grad_norm": 6.457569122314453, "learning_rate": 5.813714050380946e-06, "loss": 2.381, "step": 154500 }, { "epoch": 0.8865856727755279, "grad_norm": 6.026115894317627, "learning_rate": 5.6707163612236024e-06, "loss": 2.385, "step": 155000 }, { "epoch": 0.8894456265586748, "grad_norm": 6.851065158843994, "learning_rate": 5.52771867206626e-06, "loss": 2.3664, "step": 155500 }, { "epoch": 0.8923055803418217, "grad_norm": 6.16819953918457, "learning_rate": 5.384720982908916e-06, "loss": 2.3814, "step": 156000 }, { "epoch": 0.8951655341249686, "grad_norm": 5.917440891265869, "learning_rate": 5.241723293751574e-06, "loss": 2.3701, "step": 156500 }, { "epoch": 0.8980254879081154, "grad_norm": 10.217552185058594, "learning_rate": 5.09872560459423e-06, "loss": 2.3516, "step": 157000 }, { "epoch": 0.9008854416912623, "grad_norm": 7.088205814361572, "learning_rate": 4.955727915436887e-06, "loss": 2.3936, "step": 157500 }, { "epoch": 0.9037453954744091, "grad_norm": 6.357458591461182, "learning_rate": 4.812730226279544e-06, "loss": 2.3672, "step": 158000 }, { "epoch": 0.906605349257556, "grad_norm": 6.871440887451172, "learning_rate": 4.669732537122201e-06, "loss": 2.3691, "step": 158500 }, { "epoch": 0.9094653030407028, "grad_norm": 6.192137718200684, "learning_rate": 4.526734847964857e-06, "loss": 2.3608, "step": 159000 }, { "epoch": 0.9123252568238497, "grad_norm": 6.265544414520264, "learning_rate": 4.383737158807514e-06, "loss": 2.3682, "step": 159500 }, { "epoch": 0.9151852106069966, "grad_norm": 5.907118320465088, "learning_rate": 4.2407394696501705e-06, "loss": 2.3423, "step": 160000 }, { "epoch": 0.9180451643901435, "grad_norm": 6.204267501831055, "learning_rate": 4.097741780492828e-06, "loss": 2.3605, "step": 160500 }, { "epoch": 0.9209051181732903, "grad_norm": 6.978556156158447, "learning_rate": 3.954744091335484e-06, "loss": 2.3594, "step": 161000 }, { "epoch": 0.9237650719564372, "grad_norm": 6.3842082023620605, "learning_rate": 3.811746402178141e-06, "loss": 2.3677, "step": 161500 }, { "epoch": 0.926625025739584, "grad_norm": 6.20996618270874, "learning_rate": 3.6687487130207977e-06, "loss": 2.3538, "step": 162000 }, { "epoch": 0.9294849795227309, "grad_norm": 6.184482574462891, "learning_rate": 3.5257510238634545e-06, "loss": 2.3787, "step": 162500 }, { "epoch": 0.9323449333058778, "grad_norm": 6.219623565673828, "learning_rate": 3.382753334706111e-06, "loss": 2.3774, "step": 163000 }, { "epoch": 0.9352048870890246, "grad_norm": 6.634711742401123, "learning_rate": 3.239755645548768e-06, "loss": 2.3671, "step": 163500 }, { "epoch": 0.9380648408721715, "grad_norm": 7.119485855102539, "learning_rate": 3.096757956391425e-06, "loss": 2.356, "step": 164000 }, { "epoch": 0.9409247946553184, "grad_norm": 6.833123207092285, "learning_rate": 2.9537602672340818e-06, "loss": 2.3451, "step": 164500 }, { "epoch": 0.9437847484384653, "grad_norm": 6.631540298461914, "learning_rate": 2.8107625780767385e-06, "loss": 2.3324, "step": 165000 }, { "epoch": 0.9466447022216121, "grad_norm": 6.187737941741943, "learning_rate": 2.667764888919395e-06, "loss": 2.3573, "step": 165500 }, { "epoch": 0.949504656004759, "grad_norm": 5.523457050323486, "learning_rate": 2.524767199762052e-06, "loss": 2.3468, "step": 166000 }, { "epoch": 0.9523646097879058, "grad_norm": 6.898806095123291, "learning_rate": 2.381769510604709e-06, "loss": 2.3534, "step": 166500 }, { "epoch": 0.9552245635710527, "grad_norm": 6.348108291625977, "learning_rate": 2.2387718214473658e-06, "loss": 2.3588, "step": 167000 }, { "epoch": 0.9580845173541995, "grad_norm": 6.188412189483643, "learning_rate": 2.0957741322900225e-06, "loss": 2.3607, "step": 167500 }, { "epoch": 0.9609444711373464, "grad_norm": 6.769163608551025, "learning_rate": 1.952776443132679e-06, "loss": 2.3721, "step": 168000 }, { "epoch": 0.9638044249204932, "grad_norm": 6.389153957366943, "learning_rate": 1.8097787539753357e-06, "loss": 2.381, "step": 168500 }, { "epoch": 0.9666643787036402, "grad_norm": 5.625518798828125, "learning_rate": 1.6667810648179926e-06, "loss": 2.3656, "step": 169000 }, { "epoch": 0.969524332486787, "grad_norm": 6.03477144241333, "learning_rate": 1.5237833756606493e-06, "loss": 2.3796, "step": 169500 }, { "epoch": 0.9723842862699339, "grad_norm": 6.034476280212402, "learning_rate": 1.3807856865033063e-06, "loss": 2.3407, "step": 170000 }, { "epoch": 0.9752442400530807, "grad_norm": 6.318973541259766, "learning_rate": 1.237787997345963e-06, "loss": 2.3537, "step": 170500 }, { "epoch": 0.9781041938362276, "grad_norm": 6.3570237159729, "learning_rate": 1.0947903081886197e-06, "loss": 2.3744, "step": 171000 }, { "epoch": 0.9809641476193744, "grad_norm": 5.440378189086914, "learning_rate": 9.517926190312765e-07, "loss": 2.3775, "step": 171500 }, { "epoch": 0.9838241014025213, "grad_norm": 7.5823655128479, "learning_rate": 8.087949298739332e-07, "loss": 2.3301, "step": 172000 }, { "epoch": 0.9866840551856682, "grad_norm": 6.07295560836792, "learning_rate": 6.6579724071659e-07, "loss": 2.3347, "step": 172500 }, { "epoch": 0.9895440089688151, "grad_norm": 7.158942222595215, "learning_rate": 5.227995515592468e-07, "loss": 2.3567, "step": 173000 }, { "epoch": 0.992403962751962, "grad_norm": 6.406834125518799, "learning_rate": 3.798018624019036e-07, "loss": 2.3204, "step": 173500 }, { "epoch": 0.9952639165351088, "grad_norm": 5.863027572631836, "learning_rate": 2.3680417324456038e-07, "loss": 2.3569, "step": 174000 }, { "epoch": 0.9981238703182557, "grad_norm": 6.552116394042969, "learning_rate": 9.380648408721716e-08, "loss": 2.3332, "step": 174500 }, { "epoch": 1.0, "step": 174828, "total_flos": 1.8427441878551347e+17, "train_loss": 1.5726176189089465, "train_runtime": 27622.4465, "train_samples_per_second": 25.317, "train_steps_per_second": 6.329 } ], "logging_steps": 500, "max_steps": 174828, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8427441878551347e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }