{ "best_metric": 1.5174657106399536, "best_model_checkpoint": "flan-t5-base-paragrapher/checkpoint-39000", "epoch": 10.0, "eval_steps": 500, "global_step": 44420, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11256190904997748, "grad_norm": 2.3381476402282715, "learning_rate": 4.9437190454750117e-05, "loss": 2.0748, "num_input_tokens_seen": 562752, "step": 500 }, { "epoch": 0.11256190904997748, "eval_loss": 1.7587194442749023, "eval_runtime": 6.6407, "eval_samples_per_second": 150.586, "eval_steps_per_second": 18.823, "num_input_tokens_seen": 562752, "step": 500 }, { "epoch": 0.22512381809995496, "grad_norm": 2.58415150642395, "learning_rate": 4.887438090950023e-05, "loss": 1.9699, "num_input_tokens_seen": 1119424, "step": 1000 }, { "epoch": 0.22512381809995496, "eval_loss": 1.7031291723251343, "eval_runtime": 7.0071, "eval_samples_per_second": 142.713, "eval_steps_per_second": 17.839, "num_input_tokens_seen": 1119424, "step": 1000 }, { "epoch": 0.33768572714993245, "grad_norm": 2.1401476860046387, "learning_rate": 4.831157136425034e-05, "loss": 1.9177, "num_input_tokens_seen": 1676620, "step": 1500 }, { "epoch": 0.33768572714993245, "eval_loss": 1.6701327562332153, "eval_runtime": 7.0023, "eval_samples_per_second": 142.809, "eval_steps_per_second": 17.851, "num_input_tokens_seen": 1676620, "step": 1500 }, { "epoch": 0.45024763619990993, "grad_norm": 2.9264276027679443, "learning_rate": 4.774876181900045e-05, "loss": 1.9179, "num_input_tokens_seen": 2244928, "step": 2000 }, { "epoch": 0.45024763619990993, "eval_loss": 1.6646634340286255, "eval_runtime": 6.7352, "eval_samples_per_second": 148.473, "eval_steps_per_second": 18.559, "num_input_tokens_seen": 2244928, "step": 2000 }, { "epoch": 0.5628095452498875, "grad_norm": 1.9992469549179077, "learning_rate": 4.7185952273750566e-05, "loss": 1.8908, "num_input_tokens_seen": 2806840, "step": 2500 }, { "epoch": 0.5628095452498875, "eval_loss": 1.650177240371704, "eval_runtime": 6.9902, "eval_samples_per_second": 143.057, "eval_steps_per_second": 17.882, "num_input_tokens_seen": 2806840, "step": 2500 }, { "epoch": 0.6753714542998649, "grad_norm": 2.2747724056243896, "learning_rate": 4.662314272850068e-05, "loss": 1.8666, "num_input_tokens_seen": 3364792, "step": 3000 }, { "epoch": 0.6753714542998649, "eval_loss": 1.6427327394485474, "eval_runtime": 6.7654, "eval_samples_per_second": 147.81, "eval_steps_per_second": 18.476, "num_input_tokens_seen": 3364792, "step": 3000 }, { "epoch": 0.7879333633498424, "grad_norm": 2.3144521713256836, "learning_rate": 4.6060333183250794e-05, "loss": 1.8456, "num_input_tokens_seen": 3925172, "step": 3500 }, { "epoch": 0.7879333633498424, "eval_loss": 1.6245118379592896, "eval_runtime": 7.045, "eval_samples_per_second": 141.944, "eval_steps_per_second": 17.743, "num_input_tokens_seen": 3925172, "step": 3500 }, { "epoch": 0.9004952723998199, "grad_norm": 2.394990921020508, "learning_rate": 4.54975236380009e-05, "loss": 1.8542, "num_input_tokens_seen": 4490100, "step": 4000 }, { "epoch": 0.9004952723998199, "eval_loss": 1.6217968463897705, "eval_runtime": 7.1659, "eval_samples_per_second": 139.549, "eval_steps_per_second": 17.444, "num_input_tokens_seen": 4490100, "step": 4000 }, { "epoch": 1.0130571814497973, "grad_norm": 2.0307397842407227, "learning_rate": 4.4934714092751016e-05, "loss": 1.8305, "num_input_tokens_seen": 5052066, "step": 4500 }, { "epoch": 1.0130571814497973, "eval_loss": 1.6211210489273071, "eval_runtime": 6.7485, "eval_samples_per_second": 148.18, "eval_steps_per_second": 18.523, "num_input_tokens_seen": 5052066, "step": 4500 }, { "epoch": 1.125619090499775, "grad_norm": 2.627432346343994, "learning_rate": 4.437190454750112e-05, "loss": 1.7588, "num_input_tokens_seen": 5607258, "step": 5000 }, { "epoch": 1.125619090499775, "eval_loss": 1.6039692163467407, "eval_runtime": 6.9517, "eval_samples_per_second": 143.85, "eval_steps_per_second": 17.981, "num_input_tokens_seen": 5607258, "step": 5000 }, { "epoch": 1.2381809995497524, "grad_norm": 2.0901269912719727, "learning_rate": 4.380909500225124e-05, "loss": 1.7606, "num_input_tokens_seen": 6165278, "step": 5500 }, { "epoch": 1.2381809995497524, "eval_loss": 1.6020277738571167, "eval_runtime": 6.7991, "eval_samples_per_second": 147.079, "eval_steps_per_second": 18.385, "num_input_tokens_seen": 6165278, "step": 5500 }, { "epoch": 1.3507429085997298, "grad_norm": 2.733879804611206, "learning_rate": 4.324628545700135e-05, "loss": 1.7426, "num_input_tokens_seen": 6727290, "step": 6000 }, { "epoch": 1.3507429085997298, "eval_loss": 1.5992920398712158, "eval_runtime": 6.9187, "eval_samples_per_second": 144.535, "eval_steps_per_second": 18.067, "num_input_tokens_seen": 6727290, "step": 6000 }, { "epoch": 1.4633048176497074, "grad_norm": 2.134584665298462, "learning_rate": 4.2683475911751466e-05, "loss": 1.7477, "num_input_tokens_seen": 7292338, "step": 6500 }, { "epoch": 1.4633048176497074, "eval_loss": 1.5869497060775757, "eval_runtime": 7.0346, "eval_samples_per_second": 142.154, "eval_steps_per_second": 17.769, "num_input_tokens_seen": 7292338, "step": 6500 }, { "epoch": 1.5758667266996849, "grad_norm": 2.1596298217773438, "learning_rate": 4.212066636650158e-05, "loss": 1.7413, "num_input_tokens_seen": 7849466, "step": 7000 }, { "epoch": 1.5758667266996849, "eval_loss": 1.5790966749191284, "eval_runtime": 7.0544, "eval_samples_per_second": 141.755, "eval_steps_per_second": 17.719, "num_input_tokens_seen": 7849466, "step": 7000 }, { "epoch": 1.6884286357496623, "grad_norm": 2.1379499435424805, "learning_rate": 4.155785682125169e-05, "loss": 1.7342, "num_input_tokens_seen": 8415302, "step": 7500 }, { "epoch": 1.6884286357496623, "eval_loss": 1.5791916847229004, "eval_runtime": 6.899, "eval_samples_per_second": 144.949, "eval_steps_per_second": 18.119, "num_input_tokens_seen": 8415302, "step": 7500 }, { "epoch": 1.8009905447996397, "grad_norm": 3.6986210346221924, "learning_rate": 4.09950472760018e-05, "loss": 1.7247, "num_input_tokens_seen": 8970490, "step": 8000 }, { "epoch": 1.8009905447996397, "eval_loss": 1.5758883953094482, "eval_runtime": 7.0586, "eval_samples_per_second": 141.671, "eval_steps_per_second": 17.709, "num_input_tokens_seen": 8970490, "step": 8000 }, { "epoch": 1.9135524538496171, "grad_norm": 2.2056398391723633, "learning_rate": 4.0432237730751915e-05, "loss": 1.7423, "num_input_tokens_seen": 9529290, "step": 8500 }, { "epoch": 1.9135524538496171, "eval_loss": 1.574432134628296, "eval_runtime": 7.087, "eval_samples_per_second": 141.104, "eval_steps_per_second": 17.638, "num_input_tokens_seen": 9529290, "step": 8500 }, { "epoch": 2.0261143628995946, "grad_norm": 2.6801469326019287, "learning_rate": 3.986942818550203e-05, "loss": 1.7138, "num_input_tokens_seen": 10091652, "step": 9000 }, { "epoch": 2.0261143628995946, "eval_loss": 1.5655481815338135, "eval_runtime": 6.8545, "eval_samples_per_second": 145.889, "eval_steps_per_second": 18.236, "num_input_tokens_seen": 10091652, "step": 9000 }, { "epoch": 2.1386762719495724, "grad_norm": 1.9888643026351929, "learning_rate": 3.9306618640252144e-05, "loss": 1.6719, "num_input_tokens_seen": 10650544, "step": 9500 }, { "epoch": 2.1386762719495724, "eval_loss": 1.562954306602478, "eval_runtime": 6.8029, "eval_samples_per_second": 146.995, "eval_steps_per_second": 18.374, "num_input_tokens_seen": 10650544, "step": 9500 }, { "epoch": 2.25123818099955, "grad_norm": 2.0444724559783936, "learning_rate": 3.874380909500225e-05, "loss": 1.6637, "num_input_tokens_seen": 11208648, "step": 10000 }, { "epoch": 2.25123818099955, "eval_loss": 1.5584429502487183, "eval_runtime": 6.8702, "eval_samples_per_second": 145.557, "eval_steps_per_second": 18.195, "num_input_tokens_seen": 11208648, "step": 10000 }, { "epoch": 2.3638000900495273, "grad_norm": 3.0980803966522217, "learning_rate": 3.8180999549752365e-05, "loss": 1.6415, "num_input_tokens_seen": 11776396, "step": 10500 }, { "epoch": 2.3638000900495273, "eval_loss": 1.5608967542648315, "eval_runtime": 6.9589, "eval_samples_per_second": 143.702, "eval_steps_per_second": 17.963, "num_input_tokens_seen": 11776396, "step": 10500 }, { "epoch": 2.4763619990995047, "grad_norm": 2.2694685459136963, "learning_rate": 3.761819000450248e-05, "loss": 1.6565, "num_input_tokens_seen": 12338500, "step": 11000 }, { "epoch": 2.4763619990995047, "eval_loss": 1.5557788610458374, "eval_runtime": 6.8357, "eval_samples_per_second": 146.29, "eval_steps_per_second": 18.286, "num_input_tokens_seen": 12338500, "step": 11000 }, { "epoch": 2.588923908149482, "grad_norm": 2.054405689239502, "learning_rate": 3.705538045925259e-05, "loss": 1.6597, "num_input_tokens_seen": 12897552, "step": 11500 }, { "epoch": 2.588923908149482, "eval_loss": 1.5530343055725098, "eval_runtime": 6.9444, "eval_samples_per_second": 144.001, "eval_steps_per_second": 18.0, "num_input_tokens_seen": 12897552, "step": 11500 }, { "epoch": 2.7014858171994596, "grad_norm": 2.121612548828125, "learning_rate": 3.649257091400271e-05, "loss": 1.6709, "num_input_tokens_seen": 13460052, "step": 12000 }, { "epoch": 2.7014858171994596, "eval_loss": 1.547659993171692, "eval_runtime": 6.9397, "eval_samples_per_second": 144.099, "eval_steps_per_second": 18.012, "num_input_tokens_seen": 13460052, "step": 12000 }, { "epoch": 2.814047726249437, "grad_norm": 2.2037246227264404, "learning_rate": 3.5929761368752815e-05, "loss": 1.648, "num_input_tokens_seen": 14021984, "step": 12500 }, { "epoch": 2.814047726249437, "eval_loss": 1.5424396991729736, "eval_runtime": 6.9369, "eval_samples_per_second": 144.156, "eval_steps_per_second": 18.019, "num_input_tokens_seen": 14021984, "step": 12500 }, { "epoch": 2.926609635299415, "grad_norm": 3.3693389892578125, "learning_rate": 3.536695182350293e-05, "loss": 1.642, "num_input_tokens_seen": 14586256, "step": 13000 }, { "epoch": 2.926609635299415, "eval_loss": 1.5432666540145874, "eval_runtime": 6.899, "eval_samples_per_second": 144.948, "eval_steps_per_second": 18.119, "num_input_tokens_seen": 14586256, "step": 13000 }, { "epoch": 3.0391715443493923, "grad_norm": 2.0641028881073, "learning_rate": 3.480414227825304e-05, "loss": 1.6258, "num_input_tokens_seen": 15140609, "step": 13500 }, { "epoch": 3.0391715443493923, "eval_loss": 1.541858196258545, "eval_runtime": 6.9082, "eval_samples_per_second": 144.756, "eval_steps_per_second": 18.095, "num_input_tokens_seen": 15140609, "step": 13500 }, { "epoch": 3.1517334533993697, "grad_norm": 2.103282928466797, "learning_rate": 3.424133273300315e-05, "loss": 1.6067, "num_input_tokens_seen": 15700397, "step": 14000 }, { "epoch": 3.1517334533993697, "eval_loss": 1.541473627090454, "eval_runtime": 6.8691, "eval_samples_per_second": 145.579, "eval_steps_per_second": 18.197, "num_input_tokens_seen": 15700397, "step": 14000 }, { "epoch": 3.264295362449347, "grad_norm": 1.831855297088623, "learning_rate": 3.3678523187753265e-05, "loss": 1.5946, "num_input_tokens_seen": 16265849, "step": 14500 }, { "epoch": 3.264295362449347, "eval_loss": 1.54502272605896, "eval_runtime": 6.9275, "eval_samples_per_second": 144.353, "eval_steps_per_second": 18.044, "num_input_tokens_seen": 16265849, "step": 14500 }, { "epoch": 3.3768572714993246, "grad_norm": 1.8841536045074463, "learning_rate": 3.311571364250338e-05, "loss": 1.5835, "num_input_tokens_seen": 16827557, "step": 15000 }, { "epoch": 3.3768572714993246, "eval_loss": 1.5415329933166504, "eval_runtime": 7.0688, "eval_samples_per_second": 141.466, "eval_steps_per_second": 17.683, "num_input_tokens_seen": 16827557, "step": 15000 }, { "epoch": 3.489419180549302, "grad_norm": 2.257237672805786, "learning_rate": 3.2552904097253486e-05, "loss": 1.5996, "num_input_tokens_seen": 17384857, "step": 15500 }, { "epoch": 3.489419180549302, "eval_loss": 1.5411442518234253, "eval_runtime": 6.997, "eval_samples_per_second": 142.919, "eval_steps_per_second": 17.865, "num_input_tokens_seen": 17384857, "step": 15500 }, { "epoch": 3.6019810895992794, "grad_norm": 3.1065425872802734, "learning_rate": 3.19900945520036e-05, "loss": 1.5834, "num_input_tokens_seen": 17945909, "step": 16000 }, { "epoch": 3.6019810895992794, "eval_loss": 1.5382109880447388, "eval_runtime": 7.0342, "eval_samples_per_second": 142.163, "eval_steps_per_second": 17.77, "num_input_tokens_seen": 17945909, "step": 16000 }, { "epoch": 3.7145429986492573, "grad_norm": 2.736241579055786, "learning_rate": 3.1427285006753714e-05, "loss": 1.5956, "num_input_tokens_seen": 18507721, "step": 16500 }, { "epoch": 3.7145429986492573, "eval_loss": 1.5350807905197144, "eval_runtime": 6.9983, "eval_samples_per_second": 142.893, "eval_steps_per_second": 17.862, "num_input_tokens_seen": 18507721, "step": 16500 }, { "epoch": 3.8271049076992347, "grad_norm": 2.040745973587036, "learning_rate": 3.086447546150383e-05, "loss": 1.5825, "num_input_tokens_seen": 19069425, "step": 17000 }, { "epoch": 3.8271049076992347, "eval_loss": 1.5356193780899048, "eval_runtime": 6.7845, "eval_samples_per_second": 147.395, "eval_steps_per_second": 18.424, "num_input_tokens_seen": 19069425, "step": 17000 }, { "epoch": 3.939666816749212, "grad_norm": 1.7128684520721436, "learning_rate": 3.030166591625394e-05, "loss": 1.6001, "num_input_tokens_seen": 19631905, "step": 17500 }, { "epoch": 3.939666816749212, "eval_loss": 1.5294198989868164, "eval_runtime": 6.8242, "eval_samples_per_second": 146.538, "eval_steps_per_second": 18.317, "num_input_tokens_seen": 19631905, "step": 17500 }, { "epoch": 4.052228725799189, "grad_norm": 2.3983848094940186, "learning_rate": 2.9738856371004053e-05, "loss": 1.5677, "num_input_tokens_seen": 20185192, "step": 18000 }, { "epoch": 4.052228725799189, "eval_loss": 1.5368764400482178, "eval_runtime": 6.9147, "eval_samples_per_second": 144.619, "eval_steps_per_second": 18.077, "num_input_tokens_seen": 20185192, "step": 18000 }, { "epoch": 4.164790634849167, "grad_norm": 2.142731189727783, "learning_rate": 2.9176046825754167e-05, "loss": 1.5415, "num_input_tokens_seen": 20739888, "step": 18500 }, { "epoch": 4.164790634849167, "eval_loss": 1.5318347215652466, "eval_runtime": 6.8209, "eval_samples_per_second": 146.608, "eval_steps_per_second": 18.326, "num_input_tokens_seen": 20739888, "step": 18500 }, { "epoch": 4.277352543899145, "grad_norm": 2.7573177814483643, "learning_rate": 2.8613237280504278e-05, "loss": 1.5362, "num_input_tokens_seen": 21304584, "step": 19000 }, { "epoch": 4.277352543899145, "eval_loss": 1.531069278717041, "eval_runtime": 6.7833, "eval_samples_per_second": 147.42, "eval_steps_per_second": 18.428, "num_input_tokens_seen": 21304584, "step": 19000 }, { "epoch": 4.389914452949122, "grad_norm": 3.4846112728118896, "learning_rate": 2.8050427735254392e-05, "loss": 1.5251, "num_input_tokens_seen": 21862856, "step": 19500 }, { "epoch": 4.389914452949122, "eval_loss": 1.5322602987289429, "eval_runtime": 6.8222, "eval_samples_per_second": 146.58, "eval_steps_per_second": 18.323, "num_input_tokens_seen": 21862856, "step": 19500 }, { "epoch": 4.5024763619991, "grad_norm": 1.7864114046096802, "learning_rate": 2.7487618190004506e-05, "loss": 1.5388, "num_input_tokens_seen": 22427236, "step": 20000 }, { "epoch": 4.5024763619991, "eval_loss": 1.5306612253189087, "eval_runtime": 6.7822, "eval_samples_per_second": 147.445, "eval_steps_per_second": 18.431, "num_input_tokens_seen": 22427236, "step": 20000 }, { "epoch": 4.615038271049077, "grad_norm": 2.4305617809295654, "learning_rate": 2.6924808644754617e-05, "loss": 1.5508, "num_input_tokens_seen": 22985184, "step": 20500 }, { "epoch": 4.615038271049077, "eval_loss": 1.528159260749817, "eval_runtime": 6.7935, "eval_samples_per_second": 147.2, "eval_steps_per_second": 18.4, "num_input_tokens_seen": 22985184, "step": 20500 }, { "epoch": 4.727600180099055, "grad_norm": 2.421140193939209, "learning_rate": 2.636199909950473e-05, "loss": 1.5692, "num_input_tokens_seen": 23548396, "step": 21000 }, { "epoch": 4.727600180099055, "eval_loss": 1.5264862775802612, "eval_runtime": 6.7777, "eval_samples_per_second": 147.543, "eval_steps_per_second": 18.443, "num_input_tokens_seen": 23548396, "step": 21000 }, { "epoch": 4.8401620891490325, "grad_norm": 2.5409975051879883, "learning_rate": 2.5799189554254842e-05, "loss": 1.5391, "num_input_tokens_seen": 24111452, "step": 21500 }, { "epoch": 4.8401620891490325, "eval_loss": 1.5276471376419067, "eval_runtime": 6.7577, "eval_samples_per_second": 147.979, "eval_steps_per_second": 18.497, "num_input_tokens_seen": 24111452, "step": 21500 }, { "epoch": 4.952723998199009, "grad_norm": 2.3315558433532715, "learning_rate": 2.5236380009004956e-05, "loss": 1.5431, "num_input_tokens_seen": 24673344, "step": 22000 }, { "epoch": 4.952723998199009, "eval_loss": 1.5270482301712036, "eval_runtime": 6.7599, "eval_samples_per_second": 147.932, "eval_steps_per_second": 18.491, "num_input_tokens_seen": 24673344, "step": 22000 }, { "epoch": 5.065285907248987, "grad_norm": 1.7638120651245117, "learning_rate": 2.4673570463755067e-05, "loss": 1.5147, "num_input_tokens_seen": 25236559, "step": 22500 }, { "epoch": 5.065285907248987, "eval_loss": 1.529248833656311, "eval_runtime": 6.7621, "eval_samples_per_second": 147.883, "eval_steps_per_second": 18.485, "num_input_tokens_seen": 25236559, "step": 22500 }, { "epoch": 5.177847816298964, "grad_norm": 2.1643288135528564, "learning_rate": 2.4110760918505178e-05, "loss": 1.4908, "num_input_tokens_seen": 25799675, "step": 23000 }, { "epoch": 5.177847816298964, "eval_loss": 1.5288372039794922, "eval_runtime": 6.7612, "eval_samples_per_second": 147.902, "eval_steps_per_second": 18.488, "num_input_tokens_seen": 25799675, "step": 23000 }, { "epoch": 5.290409725348942, "grad_norm": 2.620457172393799, "learning_rate": 2.3547951373255292e-05, "loss": 1.5153, "num_input_tokens_seen": 26352767, "step": 23500 }, { "epoch": 5.290409725348942, "eval_loss": 1.5288450717926025, "eval_runtime": 6.7495, "eval_samples_per_second": 148.159, "eval_steps_per_second": 18.52, "num_input_tokens_seen": 26352767, "step": 23500 }, { "epoch": 5.402971634398919, "grad_norm": 1.847611427307129, "learning_rate": 2.2985141828005406e-05, "loss": 1.5099, "num_input_tokens_seen": 26916707, "step": 24000 }, { "epoch": 5.402971634398919, "eval_loss": 1.5249587297439575, "eval_runtime": 6.5756, "eval_samples_per_second": 152.077, "eval_steps_per_second": 19.01, "num_input_tokens_seen": 26916707, "step": 24000 }, { "epoch": 5.515533543448897, "grad_norm": 2.3133625984191895, "learning_rate": 2.2422332282755517e-05, "loss": 1.5064, "num_input_tokens_seen": 27483639, "step": 24500 }, { "epoch": 5.515533543448897, "eval_loss": 1.5258936882019043, "eval_runtime": 6.6146, "eval_samples_per_second": 151.18, "eval_steps_per_second": 18.897, "num_input_tokens_seen": 27483639, "step": 24500 }, { "epoch": 5.628095452498874, "grad_norm": 2.402250289916992, "learning_rate": 2.185952273750563e-05, "loss": 1.5146, "num_input_tokens_seen": 28040307, "step": 25000 }, { "epoch": 5.628095452498874, "eval_loss": 1.5248527526855469, "eval_runtime": 6.7384, "eval_samples_per_second": 148.403, "eval_steps_per_second": 18.55, "num_input_tokens_seen": 28040307, "step": 25000 }, { "epoch": 5.740657361548852, "grad_norm": 2.146390438079834, "learning_rate": 2.129671319225574e-05, "loss": 1.4938, "num_input_tokens_seen": 28600639, "step": 25500 }, { "epoch": 5.740657361548852, "eval_loss": 1.5232993364334106, "eval_runtime": 6.8312, "eval_samples_per_second": 146.387, "eval_steps_per_second": 18.298, "num_input_tokens_seen": 28600639, "step": 25500 }, { "epoch": 5.85321927059883, "grad_norm": 2.0159800052642822, "learning_rate": 2.0733903647005852e-05, "loss": 1.5034, "num_input_tokens_seen": 29164539, "step": 26000 }, { "epoch": 5.85321927059883, "eval_loss": 1.52369225025177, "eval_runtime": 6.8837, "eval_samples_per_second": 145.271, "eval_steps_per_second": 18.159, "num_input_tokens_seen": 29164539, "step": 26000 }, { "epoch": 5.965781179648807, "grad_norm": 2.001739740371704, "learning_rate": 2.0171094101755966e-05, "loss": 1.5091, "num_input_tokens_seen": 29730199, "step": 26500 }, { "epoch": 5.965781179648807, "eval_loss": 1.5219199657440186, "eval_runtime": 6.8886, "eval_samples_per_second": 145.168, "eval_steps_per_second": 18.146, "num_input_tokens_seen": 29730199, "step": 26500 }, { "epoch": 6.078343088698785, "grad_norm": 1.9858044385910034, "learning_rate": 1.960828455650608e-05, "loss": 1.4853, "num_input_tokens_seen": 30286010, "step": 27000 }, { "epoch": 6.078343088698785, "eval_loss": 1.5240556001663208, "eval_runtime": 6.7755, "eval_samples_per_second": 147.591, "eval_steps_per_second": 18.449, "num_input_tokens_seen": 30286010, "step": 27000 }, { "epoch": 6.190904997748762, "grad_norm": 1.812340259552002, "learning_rate": 1.904547501125619e-05, "loss": 1.4797, "num_input_tokens_seen": 30840802, "step": 27500 }, { "epoch": 6.190904997748762, "eval_loss": 1.5201354026794434, "eval_runtime": 6.8727, "eval_samples_per_second": 145.503, "eval_steps_per_second": 18.188, "num_input_tokens_seen": 30840802, "step": 27500 }, { "epoch": 6.3034669067987394, "grad_norm": 2.370309829711914, "learning_rate": 1.8482665466006305e-05, "loss": 1.466, "num_input_tokens_seen": 31403710, "step": 28000 }, { "epoch": 6.3034669067987394, "eval_loss": 1.5237922668457031, "eval_runtime": 6.9361, "eval_samples_per_second": 144.174, "eval_steps_per_second": 18.022, "num_input_tokens_seen": 31403710, "step": 28000 }, { "epoch": 6.416028815848716, "grad_norm": 2.5866804122924805, "learning_rate": 1.7919855920756416e-05, "loss": 1.4666, "num_input_tokens_seen": 31962730, "step": 28500 }, { "epoch": 6.416028815848716, "eval_loss": 1.522592306137085, "eval_runtime": 6.8335, "eval_samples_per_second": 146.339, "eval_steps_per_second": 18.292, "num_input_tokens_seen": 31962730, "step": 28500 }, { "epoch": 6.528590724898694, "grad_norm": 2.151406764984131, "learning_rate": 1.735704637550653e-05, "loss": 1.4732, "num_input_tokens_seen": 32518854, "step": 29000 }, { "epoch": 6.528590724898694, "eval_loss": 1.519935965538025, "eval_runtime": 6.7994, "eval_samples_per_second": 147.072, "eval_steps_per_second": 18.384, "num_input_tokens_seen": 32518854, "step": 29000 }, { "epoch": 6.641152633948671, "grad_norm": 2.2219040393829346, "learning_rate": 1.6794236830256644e-05, "loss": 1.4756, "num_input_tokens_seen": 33083634, "step": 29500 }, { "epoch": 6.641152633948671, "eval_loss": 1.5219242572784424, "eval_runtime": 6.7754, "eval_samples_per_second": 147.592, "eval_steps_per_second": 18.449, "num_input_tokens_seen": 33083634, "step": 29500 }, { "epoch": 6.753714542998649, "grad_norm": 1.9344135522842407, "learning_rate": 1.6231427285006755e-05, "loss": 1.4778, "num_input_tokens_seen": 33644482, "step": 30000 }, { "epoch": 6.753714542998649, "eval_loss": 1.519468069076538, "eval_runtime": 6.8057, "eval_samples_per_second": 146.936, "eval_steps_per_second": 18.367, "num_input_tokens_seen": 33644482, "step": 30000 }, { "epoch": 6.866276452048627, "grad_norm": 1.8874679803848267, "learning_rate": 1.5668617739756866e-05, "loss": 1.4674, "num_input_tokens_seen": 34207738, "step": 30500 }, { "epoch": 6.866276452048627, "eval_loss": 1.5181845426559448, "eval_runtime": 6.9025, "eval_samples_per_second": 144.875, "eval_steps_per_second": 18.109, "num_input_tokens_seen": 34207738, "step": 30500 }, { "epoch": 6.978838361098604, "grad_norm": 1.885331392288208, "learning_rate": 1.510580819450698e-05, "loss": 1.4813, "num_input_tokens_seen": 34772050, "step": 31000 }, { "epoch": 6.978838361098604, "eval_loss": 1.5201555490493774, "eval_runtime": 6.832, "eval_samples_per_second": 146.37, "eval_steps_per_second": 18.296, "num_input_tokens_seen": 34772050, "step": 31000 }, { "epoch": 7.091400270148582, "grad_norm": 2.135857582092285, "learning_rate": 1.4542998649257092e-05, "loss": 1.4543, "num_input_tokens_seen": 35331657, "step": 31500 }, { "epoch": 7.091400270148582, "eval_loss": 1.521092414855957, "eval_runtime": 6.9064, "eval_samples_per_second": 144.792, "eval_steps_per_second": 18.099, "num_input_tokens_seen": 35331657, "step": 31500 }, { "epoch": 7.203962179198559, "grad_norm": 2.2138864994049072, "learning_rate": 1.3980189104007205e-05, "loss": 1.4389, "num_input_tokens_seen": 35888749, "step": 32000 }, { "epoch": 7.203962179198559, "eval_loss": 1.5221294164657593, "eval_runtime": 6.9475, "eval_samples_per_second": 143.937, "eval_steps_per_second": 17.992, "num_input_tokens_seen": 35888749, "step": 32000 }, { "epoch": 7.316524088248537, "grad_norm": 2.255690336227417, "learning_rate": 1.3417379558757317e-05, "loss": 1.4534, "num_input_tokens_seen": 36455101, "step": 32500 }, { "epoch": 7.316524088248537, "eval_loss": 1.5215495824813843, "eval_runtime": 6.9343, "eval_samples_per_second": 144.211, "eval_steps_per_second": 18.026, "num_input_tokens_seen": 36455101, "step": 32500 }, { "epoch": 7.429085997298515, "grad_norm": 1.9412790536880493, "learning_rate": 1.285457001350743e-05, "loss": 1.4401, "num_input_tokens_seen": 37016889, "step": 33000 }, { "epoch": 7.429085997298515, "eval_loss": 1.5207875967025757, "eval_runtime": 6.7928, "eval_samples_per_second": 147.215, "eval_steps_per_second": 18.402, "num_input_tokens_seen": 37016889, "step": 33000 }, { "epoch": 7.541647906348492, "grad_norm": 2.0584607124328613, "learning_rate": 1.2291760468257542e-05, "loss": 1.4435, "num_input_tokens_seen": 37570517, "step": 33500 }, { "epoch": 7.541647906348492, "eval_loss": 1.5211970806121826, "eval_runtime": 6.8598, "eval_samples_per_second": 145.776, "eval_steps_per_second": 18.222, "num_input_tokens_seen": 37570517, "step": 33500 }, { "epoch": 7.6542098153984695, "grad_norm": 2.090921401977539, "learning_rate": 1.1728950923007654e-05, "loss": 1.4443, "num_input_tokens_seen": 38134577, "step": 34000 }, { "epoch": 7.6542098153984695, "eval_loss": 1.5204721689224243, "eval_runtime": 6.9765, "eval_samples_per_second": 143.338, "eval_steps_per_second": 17.917, "num_input_tokens_seen": 38134577, "step": 34000 }, { "epoch": 7.766771724448446, "grad_norm": 2.349177360534668, "learning_rate": 1.1166141377757767e-05, "loss": 1.4533, "num_input_tokens_seen": 38700917, "step": 34500 }, { "epoch": 7.766771724448446, "eval_loss": 1.5209357738494873, "eval_runtime": 7.024, "eval_samples_per_second": 142.37, "eval_steps_per_second": 17.796, "num_input_tokens_seen": 38700917, "step": 34500 }, { "epoch": 7.879333633498424, "grad_norm": 2.0737385749816895, "learning_rate": 1.0603331832507881e-05, "loss": 1.4589, "num_input_tokens_seen": 39259257, "step": 35000 }, { "epoch": 7.879333633498424, "eval_loss": 1.5217865705490112, "eval_runtime": 6.8333, "eval_samples_per_second": 146.343, "eval_steps_per_second": 18.293, "num_input_tokens_seen": 39259257, "step": 35000 }, { "epoch": 7.991895542548401, "grad_norm": 2.072783946990967, "learning_rate": 1.0040522287257992e-05, "loss": 1.4548, "num_input_tokens_seen": 39819093, "step": 35500 }, { "epoch": 7.991895542548401, "eval_loss": 1.5185105800628662, "eval_runtime": 6.7618, "eval_samples_per_second": 147.889, "eval_steps_per_second": 18.486, "num_input_tokens_seen": 39819093, "step": 35500 }, { "epoch": 8.104457451598378, "grad_norm": 2.2963035106658936, "learning_rate": 9.477712742008104e-06, "loss": 1.4322, "num_input_tokens_seen": 40382907, "step": 36000 }, { "epoch": 8.104457451598378, "eval_loss": 1.520738959312439, "eval_runtime": 6.783, "eval_samples_per_second": 147.428, "eval_steps_per_second": 18.428, "num_input_tokens_seen": 40382907, "step": 36000 }, { "epoch": 8.217019360648356, "grad_norm": 2.450338840484619, "learning_rate": 8.914903196758218e-06, "loss": 1.4271, "num_input_tokens_seen": 40938983, "step": 36500 }, { "epoch": 8.217019360648356, "eval_loss": 1.5220232009887695, "eval_runtime": 6.9451, "eval_samples_per_second": 143.986, "eval_steps_per_second": 17.998, "num_input_tokens_seen": 40938983, "step": 36500 }, { "epoch": 8.329581269698334, "grad_norm": 2.414069890975952, "learning_rate": 8.35209365150833e-06, "loss": 1.4165, "num_input_tokens_seen": 41498811, "step": 37000 }, { "epoch": 8.329581269698334, "eval_loss": 1.520321011543274, "eval_runtime": 6.9316, "eval_samples_per_second": 144.266, "eval_steps_per_second": 18.033, "num_input_tokens_seen": 41498811, "step": 37000 }, { "epoch": 8.442143178748312, "grad_norm": 2.3508474826812744, "learning_rate": 7.789284106258443e-06, "loss": 1.4273, "num_input_tokens_seen": 42053427, "step": 37500 }, { "epoch": 8.442143178748312, "eval_loss": 1.5197160243988037, "eval_runtime": 6.8382, "eval_samples_per_second": 146.237, "eval_steps_per_second": 18.28, "num_input_tokens_seen": 42053427, "step": 37500 }, { "epoch": 8.55470508779829, "grad_norm": 1.876745581626892, "learning_rate": 7.226474561008555e-06, "loss": 1.4281, "num_input_tokens_seen": 42615135, "step": 38000 }, { "epoch": 8.55470508779829, "eval_loss": 1.519529104232788, "eval_runtime": 6.7991, "eval_samples_per_second": 147.078, "eval_steps_per_second": 18.385, "num_input_tokens_seen": 42615135, "step": 38000 }, { "epoch": 8.667266996848266, "grad_norm": 2.1636829376220703, "learning_rate": 6.663665015758667e-06, "loss": 1.4372, "num_input_tokens_seen": 43173055, "step": 38500 }, { "epoch": 8.667266996848266, "eval_loss": 1.5196864604949951, "eval_runtime": 6.9956, "eval_samples_per_second": 142.947, "eval_steps_per_second": 17.868, "num_input_tokens_seen": 43173055, "step": 38500 }, { "epoch": 8.779828905898244, "grad_norm": 2.752340078353882, "learning_rate": 6.1008554705087804e-06, "loss": 1.4374, "num_input_tokens_seen": 43737723, "step": 39000 }, { "epoch": 8.779828905898244, "eval_loss": 1.5174657106399536, "eval_runtime": 6.8823, "eval_samples_per_second": 145.301, "eval_steps_per_second": 18.163, "num_input_tokens_seen": 43737723, "step": 39000 }, { "epoch": 8.892390814948222, "grad_norm": 2.1042685508728027, "learning_rate": 5.538045925258893e-06, "loss": 1.4278, "num_input_tokens_seen": 44300547, "step": 39500 }, { "epoch": 8.892390814948222, "eval_loss": 1.5211328268051147, "eval_runtime": 6.8661, "eval_samples_per_second": 145.644, "eval_steps_per_second": 18.205, "num_input_tokens_seen": 44300547, "step": 39500 }, { "epoch": 9.0049527239982, "grad_norm": 2.175323486328125, "learning_rate": 4.975236380009005e-06, "loss": 1.442, "num_input_tokens_seen": 44864787, "step": 40000 }, { "epoch": 9.0049527239982, "eval_loss": 1.5188645124435425, "eval_runtime": 6.7745, "eval_samples_per_second": 147.612, "eval_steps_per_second": 18.452, "num_input_tokens_seen": 44864787, "step": 40000 }, { "epoch": 9.117514633048177, "grad_norm": 2.275874376296997, "learning_rate": 4.412426834759118e-06, "loss": 1.4235, "num_input_tokens_seen": 45418155, "step": 40500 }, { "epoch": 9.117514633048177, "eval_loss": 1.5225725173950195, "eval_runtime": 6.8797, "eval_samples_per_second": 145.354, "eval_steps_per_second": 18.169, "num_input_tokens_seen": 45418155, "step": 40500 }, { "epoch": 9.230076542098153, "grad_norm": 2.1484689712524414, "learning_rate": 3.84961728950923e-06, "loss": 1.413, "num_input_tokens_seen": 45985195, "step": 41000 }, { "epoch": 9.230076542098153, "eval_loss": 1.5219917297363281, "eval_runtime": 6.7725, "eval_samples_per_second": 147.655, "eval_steps_per_second": 18.457, "num_input_tokens_seen": 45985195, "step": 41000 }, { "epoch": 9.342638451148131, "grad_norm": 1.917220115661621, "learning_rate": 3.286807744259343e-06, "loss": 1.4193, "num_input_tokens_seen": 46538675, "step": 41500 }, { "epoch": 9.342638451148131, "eval_loss": 1.5200846195220947, "eval_runtime": 6.7567, "eval_samples_per_second": 148.001, "eval_steps_per_second": 18.5, "num_input_tokens_seen": 46538675, "step": 41500 }, { "epoch": 9.45520036019811, "grad_norm": 1.9557278156280518, "learning_rate": 2.7239981990094554e-06, "loss": 1.414, "num_input_tokens_seen": 47101815, "step": 42000 }, { "epoch": 9.45520036019811, "eval_loss": 1.5202205181121826, "eval_runtime": 6.7702, "eval_samples_per_second": 147.707, "eval_steps_per_second": 18.463, "num_input_tokens_seen": 47101815, "step": 42000 }, { "epoch": 9.567762269248087, "grad_norm": 2.2344467639923096, "learning_rate": 2.161188653759568e-06, "loss": 1.4084, "num_input_tokens_seen": 47655583, "step": 42500 }, { "epoch": 9.567762269248087, "eval_loss": 1.5190742015838623, "eval_runtime": 6.853, "eval_samples_per_second": 145.921, "eval_steps_per_second": 18.24, "num_input_tokens_seen": 47655583, "step": 42500 }, { "epoch": 9.680324178298063, "grad_norm": 2.065092086791992, "learning_rate": 1.5983791085096803e-06, "loss": 1.408, "num_input_tokens_seen": 48217371, "step": 43000 }, { "epoch": 9.680324178298063, "eval_loss": 1.5206738710403442, "eval_runtime": 6.9763, "eval_samples_per_second": 143.343, "eval_steps_per_second": 17.918, "num_input_tokens_seen": 48217371, "step": 43000 }, { "epoch": 9.792886087348041, "grad_norm": 2.3241419792175293, "learning_rate": 1.035569563259793e-06, "loss": 1.4207, "num_input_tokens_seen": 48781351, "step": 43500 }, { "epoch": 9.792886087348041, "eval_loss": 1.5199604034423828, "eval_runtime": 6.8579, "eval_samples_per_second": 145.818, "eval_steps_per_second": 18.227, "num_input_tokens_seen": 48781351, "step": 43500 }, { "epoch": 9.905447996398019, "grad_norm": 2.3469886779785156, "learning_rate": 4.727600180099055e-07, "loss": 1.4293, "num_input_tokens_seen": 49345155, "step": 44000 }, { "epoch": 9.905447996398019, "eval_loss": 1.519752860069275, "eval_runtime": 6.7637, "eval_samples_per_second": 147.848, "eval_steps_per_second": 18.481, "num_input_tokens_seen": 49345155, "step": 44000 }, { "epoch": 10.0, "num_input_tokens_seen": 49815380, "step": 44420, "total_flos": 6.662920679892173e+16, "train_loss": 1.5719221366734615, "train_runtime": 7656.0711, "train_samples_per_second": 23.204, "train_steps_per_second": 5.802, "train_tokens_per_second": 6514.629 } ], "logging_steps": 500, "max_steps": 44420, "num_input_tokens_seen": 49815380, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.662920679892173e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }