{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 250, "global_step": 1445, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011072664359861591, "grad_norm": 7.3027024269104, "learning_rate": 0.00036363636363636367, "loss": 7.4688, "step": 16 }, { "epoch": 0.022145328719723183, "grad_norm": 3.441852331161499, "learning_rate": 0.0006363636363636364, "loss": 6.1486, "step": 32 }, { "epoch": 0.03321799307958478, "grad_norm": 0.9797470569610596, "learning_rate": 0.001, "loss": 5.1898, "step": 48 }, { "epoch": 0.044290657439446365, "grad_norm": 0.4113083481788635, "learning_rate": 0.0009996782216198338, "loss": 5.0792, "step": 64 }, { "epoch": 0.05536332179930796, "grad_norm": 0.23998336493968964, "learning_rate": 0.0009987133006446386, "loss": 4.9765, "step": 80 }, { "epoch": 0.06643598615916955, "grad_norm": 0.34719282388687134, "learning_rate": 0.0009971064790372482, "loss": 5.0589, "step": 96 }, { "epoch": 0.07750865051903114, "grad_norm": 0.28703269362449646, "learning_rate": 0.0009948598249594788, "loss": 5.0273, "step": 112 }, { "epoch": 0.08858131487889273, "grad_norm": 0.3285124599933624, "learning_rate": 0.00099197623011017, "loss": 4.9965, "step": 128 }, { "epoch": 0.09965397923875433, "grad_norm": 0.21146929264068604, "learning_rate": 0.0009884594060032405, "loss": 5.0386, "step": 144 }, { "epoch": 0.11072664359861592, "grad_norm": 0.2574313282966614, "learning_rate": 0.0009843138791905482, "loss": 4.9744, "step": 160 }, { "epoch": 0.12179930795847752, "grad_norm": 0.2936093211174011, "learning_rate": 0.000979544985435704, "loss": 5.0448, "step": 176 }, { "epoch": 0.1328719723183391, "grad_norm": 0.2813468277454376, "learning_rate": 0.0009741588628463382, "loss": 5.0399, "step": 192 }, { "epoch": 0.1439446366782007, "grad_norm": 0.27917245030403137, "learning_rate": 0.0009681624439736599, "loss": 5.0001, "step": 208 }, { "epoch": 0.15501730103806227, "grad_norm": 0.23994863033294678, "learning_rate": 0.0009615634468894752, "loss": 4.9829, "step": 224 }, { "epoch": 0.16608996539792387, "grad_norm": 0.18816818296909332, "learning_rate": 0.0009543703652521542, "loss": 4.9348, "step": 240 }, { "epoch": 0.17301038062283736, "eval_bleu": 0.07082991922728418, "eval_cap_loss": 1.862730330052497, "eval_con_loss": 2.0789193881875687, "eval_loss": 3.941649715827423, "step": 250 }, { "epoch": 0.17301038062283736, "eval_bleu": 0.07082991922728418, "eval_cap_loss": 1.862730330052497, "eval_con_loss": 2.0789193881875687, "eval_loss": 3.941649715827423, "eval_runtime": 809.5614, "eval_samples_per_second": 19.036, "eval_steps_per_second": 2.38, "step": 250 }, { "epoch": 0.17716262975778546, "grad_norm": 0.24259746074676514, "learning_rate": 0.0009465924573743279, "loss": 4.9856, "step": 256 }, { "epoch": 0.18823529411764706, "grad_norm": 0.2658279836177826, "learning_rate": 0.0009382397343063877, "loss": 4.9553, "step": 272 }, { "epoch": 0.19930795847750865, "grad_norm": 0.18191872537136078, "learning_rate": 0.0009293229469511293, "loss": 4.9995, "step": 288 }, { "epoch": 0.21038062283737025, "grad_norm": 0.29174473881721497, "learning_rate": 0.000919853572226118, "loss": 5.0197, "step": 304 }, { "epoch": 0.22145328719723184, "grad_norm": 0.21258646249771118, "learning_rate": 0.0009098437982915953, "loss": 4.9636, "step": 320 }, { "epoch": 0.23252595155709344, "grad_norm": 0.18101181089878082, "learning_rate": 0.0008993065088629304, "loss": 4.9274, "step": 336 }, { "epoch": 0.24359861591695503, "grad_norm": 0.2041468620300293, "learning_rate": 0.0008882552666278186, "loss": 5.0077, "step": 352 }, { "epoch": 0.2546712802768166, "grad_norm": 0.3073960244655609, "learning_rate": 0.0008767042957895606, "loss": 4.9781, "step": 368 }, { "epoch": 0.2657439446366782, "grad_norm": 0.17274291813373566, "learning_rate": 0.0008646684637588991, "loss": 4.911, "step": 384 }, { "epoch": 0.2768166089965398, "grad_norm": 0.20907478034496307, "learning_rate": 0.0008521632620179735, "loss": 5.0089, "step": 400 }, { "epoch": 0.2878892733564014, "grad_norm": 0.2937226891517639, "learning_rate": 0.0008392047861810229, "loss": 4.9306, "step": 416 }, { "epoch": 0.29896193771626295, "grad_norm": 0.35890915989875793, "learning_rate": 0.0008258097152775044, "loss": 4.9363, "step": 432 }, { "epoch": 0.31003460207612454, "grad_norm": 0.19902297854423523, "learning_rate": 0.0008119952902842882, "loss": 4.9872, "step": 448 }, { "epoch": 0.32110726643598614, "grad_norm": 0.2420748621225357, "learning_rate": 0.0007977792919345632, "loss": 5.0178, "step": 464 }, { "epoch": 0.33217993079584773, "grad_norm": 0.21633224189281464, "learning_rate": 0.0007831800178320152, "loss": 4.9588, "step": 480 }, { "epoch": 0.34325259515570933, "grad_norm": 0.2581658363342285, "learning_rate": 0.0007682162588997332, "loss": 4.9214, "step": 496 }, { "epoch": 0.3460207612456747, "eval_bleu": 0.0743834461108624, "eval_cap_loss": 1.8318203130285544, "eval_con_loss": 2.0786952485553205, "eval_loss": 3.9105155644914187, "step": 500 }, { "epoch": 0.3460207612456747, "eval_bleu": 0.0743834461108624, "eval_cap_loss": 1.8318203130285544, "eval_con_loss": 2.0786952485553205, "eval_loss": 3.9105155644914187, "eval_runtime": 820.0211, "eval_samples_per_second": 18.793, "eval_steps_per_second": 2.35, "step": 500 }, { "epoch": 0.3543252595155709, "grad_norm": 0.24933111667633057, "learning_rate": 0.0007529072751941595, "loss": 4.9951, "step": 512 }, { "epoch": 0.3653979238754325, "grad_norm": 0.1792280375957489, "learning_rate": 0.0007372727711152087, "loss": 4.8813, "step": 528 }, { "epoch": 0.3764705882352941, "grad_norm": 0.13637550175189972, "learning_rate": 0.0007213328700444696, "loss": 4.9125, "step": 544 }, { "epoch": 0.3875432525951557, "grad_norm": 0.3409987688064575, "learning_rate": 0.0007051080884441287, "loss": 4.9073, "step": 560 }, { "epoch": 0.3986159169550173, "grad_norm": 0.2467418760061264, "learning_rate": 0.0006886193094499536, "loss": 4.9227, "step": 576 }, { "epoch": 0.4096885813148789, "grad_norm": 0.2751147747039795, "learning_rate": 0.000671887755992327, "loss": 4.9871, "step": 592 }, { "epoch": 0.4207612456747405, "grad_norm": 0.21447473764419556, "learning_rate": 0.000654934963479926, "loss": 4.9484, "step": 608 }, { "epoch": 0.4318339100346021, "grad_norm": 0.26142337918281555, "learning_rate": 0.0006377827520812061, "loss": 4.9204, "step": 624 }, { "epoch": 0.4429065743944637, "grad_norm": 0.3584180176258087, "learning_rate": 0.0006204531986393678, "loss": 5.0, "step": 640 }, { "epoch": 0.4539792387543253, "grad_norm": 0.2877948582172394, "learning_rate": 0.0006029686082569537, "loss": 5.0003, "step": 656 }, { "epoch": 0.46505190311418687, "grad_norm": 0.2049325406551361, "learning_rate": 0.000585351485586648, "loss": 4.9349, "step": 672 }, { "epoch": 0.47612456747404847, "grad_norm": 0.200492262840271, "learning_rate": 0.0005676245058652349, "loss": 4.9385, "step": 688 }, { "epoch": 0.48719723183391006, "grad_norm": 0.24712397158145905, "learning_rate": 0.000549810485727994, "loss": 4.9516, "step": 704 }, { "epoch": 0.4982698961937716, "grad_norm": 0.380741149187088, "learning_rate": 0.0005319323538411021, "loss": 4.9121, "step": 720 }, { "epoch": 0.5093425605536333, "grad_norm": 0.2339814305305481, "learning_rate": 0.0005140131213898345, "loss": 4.9058, "step": 736 }, { "epoch": 0.5190311418685121, "eval_bleu": 0.0755992902414665, "eval_cap_loss": 1.8195727369484656, "eval_con_loss": 2.0776209530585406, "eval_loss": 3.897193689945144, "step": 750 }, { "epoch": 0.5190311418685121, "eval_bleu": 0.0755992902414665, "eval_cap_loss": 1.8195727369484656, "eval_con_loss": 2.0776209530585406, "eval_loss": 3.897193689945144, "eval_runtime": 819.7019, "eval_samples_per_second": 18.801, "eval_steps_per_second": 2.351, "step": 750 }, { "epoch": 0.5204152249134948, "grad_norm": 0.23871256411075592, "learning_rate": 0.0004960758524605593, "loss": 4.9217, "step": 752 }, { "epoch": 0.5314878892733564, "grad_norm": 0.19021867215633392, "learning_rate": 0.0004781436343546391, "loss": 4.9383, "step": 768 }, { "epoch": 0.542560553633218, "grad_norm": 0.39012765884399414, "learning_rate": 0.0004602395478724539, "loss": 4.8852, "step": 784 }, { "epoch": 0.5536332179930796, "grad_norm": 0.2299482375383377, "learning_rate": 0.00044238663760578963, "loss": 4.9461, "step": 800 }, { "epoch": 0.5647058823529412, "grad_norm": 0.20332716405391693, "learning_rate": 0.0004246078822768339, "loss": 4.8672, "step": 816 }, { "epoch": 0.5757785467128028, "grad_norm": 0.2742558717727661, "learning_rate": 0.00040692616516195134, "loss": 5.0016, "step": 832 }, { "epoch": 0.5868512110726644, "grad_norm": 0.18639664351940155, "learning_rate": 0.0003893642446383089, "loss": 4.8718, "step": 848 }, { "epoch": 0.5979238754325259, "grad_norm": 0.14423900842666626, "learning_rate": 0.00037194472489126174, "loss": 4.9254, "step": 864 }, { "epoch": 0.6089965397923875, "grad_norm": 0.25753453373908997, "learning_rate": 0.00035469002682019933, "loss": 5.0226, "step": 880 }, { "epoch": 0.6200692041522491, "grad_norm": 0.29433926939964294, "learning_rate": 0.00033762235918030425, "loss": 4.9075, "step": 896 }, { "epoch": 0.6311418685121107, "grad_norm": 0.16973747313022614, "learning_rate": 0.0003207636899973617, "loss": 4.9367, "step": 912 }, { "epoch": 0.6422145328719723, "grad_norm": 0.23007385432720184, "learning_rate": 0.0003041357182924178, "loss": 4.8899, "step": 928 }, { "epoch": 0.6532871972318339, "grad_norm": 0.23643143475055695, "learning_rate": 0.000287759846152675, "loss": 4.8813, "step": 944 }, { "epoch": 0.6643598615916955, "grad_norm": 0.17182067036628723, "learning_rate": 0.00027165715118457735, "loss": 4.9201, "step": 960 }, { "epoch": 0.6754325259515571, "grad_norm": 0.29925912618637085, "learning_rate": 0.0002558483593845372, "loss": 4.898, "step": 976 }, { "epoch": 0.6865051903114187, "grad_norm": 0.17474640905857086, "learning_rate": 0.00024035381846222555, "loss": 4.8839, "step": 992 }, { "epoch": 0.6920415224913494, "eval_bleu": 0.07638727198557162, "eval_cap_loss": 1.797022891217832, "eval_con_loss": 2.0767276909416923, "eval_loss": 3.87375058110786, "step": 1000 }, { "epoch": 0.6920415224913494, "eval_bleu": 0.07638727198557162, "eval_cap_loss": 1.797022891217832, "eval_con_loss": 2.0767276909416923, "eval_loss": 3.87375058110786, "eval_runtime": 815.983, "eval_samples_per_second": 18.886, "eval_steps_per_second": 2.362, "step": 1000 }, { "epoch": 0.6975778546712803, "grad_norm": 0.2200097143650055, "learning_rate": 0.00022519347165076065, "loss": 4.9202, "step": 1008 }, { "epoch": 0.7086505190311418, "grad_norm": 0.2346457690000534, "learning_rate": 0.00021038683203750092, "loss": 4.8698, "step": 1024 }, { "epoch": 0.7197231833910035, "grad_norm": 0.24073997139930725, "learning_rate": 0.00019595295744848825, "loss": 5.01, "step": 1040 }, { "epoch": 0.730795847750865, "grad_norm": 0.2730049192905426, "learning_rate": 0.00018191042591886197, "loss": 4.8847, "step": 1056 }, { "epoch": 0.7418685121107267, "grad_norm": 0.21411365270614624, "learning_rate": 0.00016827731178081822, "loss": 4.9802, "step": 1072 }, { "epoch": 0.7529411764705882, "grad_norm": 0.2260352373123169, "learning_rate": 0.0001550711623998926, "loss": 4.9593, "step": 1088 }, { "epoch": 0.7640138408304499, "grad_norm": 0.2655484974384308, "learning_rate": 0.0001423089755895095, "loss": 4.8097, "step": 1104 }, { "epoch": 0.7750865051903114, "grad_norm": 0.2067459672689438, "learning_rate": 0.0001300071777328658, "loss": 4.947, "step": 1120 }, { "epoch": 0.7861591695501731, "grad_norm": 0.21391142904758453, "learning_rate": 0.00011818160264031097, "loss": 4.8673, "step": 1136 }, { "epoch": 0.7972318339100346, "grad_norm": 0.3288532793521881, "learning_rate": 0.00010684747116943683, "loss": 4.9652, "step": 1152 }, { "epoch": 0.8083044982698961, "grad_norm": 0.1693842113018036, "learning_rate": 9.60193716341039e-05, "loss": 4.9003, "step": 1168 }, { "epoch": 0.8193771626297578, "grad_norm": 0.3056645691394806, "learning_rate": 8.571124102762767e-05, "loss": 4.9436, "step": 1184 }, { "epoch": 0.8304498269896193, "grad_norm": 0.3365015387535095, "learning_rate": 7.593634708428437e-05, "loss": 5.0349, "step": 1200 }, { "epoch": 0.841522491349481, "grad_norm": 0.2918214201927185, "learning_rate": 6.670727120223142e-05, "loss": 4.8812, "step": 1216 }, { "epoch": 0.8525951557093425, "grad_norm": 0.2237936407327652, "learning_rate": 5.8035892249820085e-05, "loss": 4.8842, "step": 1232 }, { "epoch": 0.8636678200692042, "grad_norm": 0.23957201838493347, "learning_rate": 4.993337127614273e-05, "loss": 4.9281, "step": 1248 }, { "epoch": 0.8650519031141869, "eval_bleu": 0.07725825664704841, "eval_cap_loss": 1.781457468055355, "eval_con_loss": 2.0761843878924693, "eval_loss": 3.8576418577727716, "step": 1250 }, { "epoch": 0.8650519031141869, "eval_bleu": 0.07725825664704841, "eval_cap_loss": 1.781457468055355, "eval_con_loss": 2.0761843878924693, "eval_loss": 3.8576418577727716, "eval_runtime": 819.4117, "eval_samples_per_second": 18.807, "eval_steps_per_second": 2.352, "step": 1250 }, { "epoch": 0.8747404844290657, "grad_norm": 0.09429222345352173, "learning_rate": 4.2410137145495964e-05, "loss": 4.8982, "step": 1264 }, { "epoch": 0.8858131487889274, "grad_norm": 0.23660314083099365, "learning_rate": 3.54758731142486e-05, "loss": 4.9127, "step": 1280 }, { "epoch": 0.8968858131487889, "grad_norm": 0.2092629075050354, "learning_rate": 2.9139504367391158e-05, "loss": 5.0183, "step": 1296 }, { "epoch": 0.9079584775086506, "grad_norm": 0.2870592772960663, "learning_rate": 2.3409186530809423e-05, "loss": 4.9319, "step": 1312 }, { "epoch": 0.9190311418685121, "grad_norm": 0.23483090102672577, "learning_rate": 1.8292295174068717e-05, "loss": 4.836, "step": 1328 }, { "epoch": 0.9301038062283737, "grad_norm": 0.24700886011123657, "learning_rate": 1.3795416317218035e-05, "loss": 4.9113, "step": 1344 }, { "epoch": 0.9411764705882353, "grad_norm": 0.20898666977882385, "learning_rate": 9.924337953834795e-06, "loss": 4.9519, "step": 1360 }, { "epoch": 0.9522491349480969, "grad_norm": 0.14755718410015106, "learning_rate": 6.684042601220186e-06, "loss": 4.8827, "step": 1376 }, { "epoch": 0.9633217993079585, "grad_norm": 0.2121778130531311, "learning_rate": 4.078700887333364e-06, "loss": 4.861, "step": 1392 }, { "epoch": 0.9743944636678201, "grad_norm": 0.2525366544723511, "learning_rate": 2.1116661827202956e-06, "loss": 4.8674, "step": 1408 }, { "epoch": 0.9854671280276817, "grad_norm": 0.278390109539032, "learning_rate": 7.854702843449468e-07, "loss": 4.9493, "step": 1424 }, { "epoch": 0.9965397923875432, "grad_norm": 0.22038041055202484, "learning_rate": 1.0182015687909552e-07, "loss": 4.9065, "step": 1440 } ], "logging_steps": 16, "max_steps": 1445, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null, "tau_value": 0.8081 }