{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 41640, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12007684918347743, "grad_norm": 0.9133885502815247, "learning_rate": 1.9759846301633046e-05, "loss": 0.1965, "step": 500 }, { "epoch": 0.24015369836695485, "grad_norm": 0.7431137561798096, "learning_rate": 1.951969260326609e-05, "loss": 0.1331, "step": 1000 }, { "epoch": 0.36023054755043227, "grad_norm": 0.9133521318435669, "learning_rate": 1.927953890489914e-05, "loss": 0.1226, "step": 1500 }, { "epoch": 0.4803073967339097, "grad_norm": 0.6364143490791321, "learning_rate": 1.903938520653218e-05, "loss": 0.1166, "step": 2000 }, { "epoch": 0.6003842459173871, "grad_norm": 0.6985872387886047, "learning_rate": 1.879923150816523e-05, "loss": 0.1127, "step": 2500 }, { "epoch": 0.7204610951008645, "grad_norm": 0.7310335040092468, "learning_rate": 1.855907780979827e-05, "loss": 0.1105, "step": 3000 }, { "epoch": 0.840537944284342, "grad_norm": 0.6750699877738953, "learning_rate": 1.831892411143132e-05, "loss": 0.1082, "step": 3500 }, { "epoch": 0.9606147934678194, "grad_norm": 0.6943972110748291, "learning_rate": 1.8078770413064363e-05, "loss": 0.1062, "step": 4000 }, { "epoch": 1.080691642651297, "grad_norm": 0.6837930083274841, "learning_rate": 1.7838616714697408e-05, "loss": 0.0992, "step": 4500 }, { "epoch": 1.2007684918347743, "grad_norm": 0.7417839765548706, "learning_rate": 1.7598463016330453e-05, "loss": 0.0954, "step": 5000 }, { "epoch": 1.3208453410182517, "grad_norm": 0.6663339734077454, "learning_rate": 1.7358309317963498e-05, "loss": 0.0943, "step": 5500 }, { "epoch": 1.440922190201729, "grad_norm": 0.838683545589447, "learning_rate": 1.7118155619596542e-05, "loss": 0.0945, "step": 6000 }, { "epoch": 1.5609990393852065, "grad_norm": 0.7603605389595032, "learning_rate": 1.6878001921229587e-05, "loss": 0.0945, "step": 6500 }, { "epoch": 1.6810758885686838, "grad_norm": 0.7760754227638245, "learning_rate": 1.6637848222862635e-05, "loss": 0.0934, "step": 7000 }, { "epoch": 1.8011527377521612, "grad_norm": 0.7395870089530945, "learning_rate": 1.6397694524495677e-05, "loss": 0.094, "step": 7500 }, { "epoch": 1.9212295869356388, "grad_norm": 0.6932771801948547, "learning_rate": 1.6157540826128725e-05, "loss": 0.0924, "step": 8000 }, { "epoch": 2.0413064361191164, "grad_norm": 0.8017829656600952, "learning_rate": 1.591738712776177e-05, "loss": 0.0882, "step": 8500 }, { "epoch": 2.161383285302594, "grad_norm": 0.669118344783783, "learning_rate": 1.5677233429394814e-05, "loss": 0.0814, "step": 9000 }, { "epoch": 2.281460134486071, "grad_norm": 0.8860443830490112, "learning_rate": 1.543707973102786e-05, "loss": 0.0804, "step": 9500 }, { "epoch": 2.4015369836695486, "grad_norm": 0.9437547326087952, "learning_rate": 1.5196926032660904e-05, "loss": 0.0812, "step": 10000 }, { "epoch": 2.521613832853026, "grad_norm": 0.7827271223068237, "learning_rate": 1.495677233429395e-05, "loss": 0.0808, "step": 10500 }, { "epoch": 2.6416906820365034, "grad_norm": 0.8598589897155762, "learning_rate": 1.4716618635926993e-05, "loss": 0.0821, "step": 11000 }, { "epoch": 2.7617675312199808, "grad_norm": 0.8088162541389465, "learning_rate": 1.447646493756004e-05, "loss": 0.0814, "step": 11500 }, { "epoch": 2.881844380403458, "grad_norm": 0.8733372688293457, "learning_rate": 1.4236311239193086e-05, "loss": 0.0803, "step": 12000 }, { "epoch": 3.0019212295869355, "grad_norm": 0.7828580141067505, "learning_rate": 1.399615754082613e-05, "loss": 0.0804, "step": 12500 }, { "epoch": 3.121998078770413, "grad_norm": 0.8635441064834595, "learning_rate": 1.3756003842459176e-05, "loss": 0.0687, "step": 13000 }, { "epoch": 3.2420749279538903, "grad_norm": 0.8870617747306824, "learning_rate": 1.3515850144092219e-05, "loss": 0.0697, "step": 13500 }, { "epoch": 3.3621517771373677, "grad_norm": 0.8219842910766602, "learning_rate": 1.3275696445725266e-05, "loss": 0.0691, "step": 14000 }, { "epoch": 3.4822286263208455, "grad_norm": 0.8110594749450684, "learning_rate": 1.303554274735831e-05, "loss": 0.0697, "step": 14500 }, { "epoch": 3.602305475504323, "grad_norm": 0.7413811087608337, "learning_rate": 1.2795389048991355e-05, "loss": 0.0698, "step": 15000 }, { "epoch": 3.7223823246878003, "grad_norm": 0.868746817111969, "learning_rate": 1.25552353506244e-05, "loss": 0.0709, "step": 15500 }, { "epoch": 3.8424591738712777, "grad_norm": 0.8406381011009216, "learning_rate": 1.2315081652257446e-05, "loss": 0.0708, "step": 16000 }, { "epoch": 3.962536023054755, "grad_norm": 0.9255300164222717, "learning_rate": 1.2074927953890491e-05, "loss": 0.0704, "step": 16500 }, { "epoch": 4.082612872238233, "grad_norm": 0.805314838886261, "learning_rate": 1.1834774255523536e-05, "loss": 0.0621, "step": 17000 }, { "epoch": 4.20268972142171, "grad_norm": 0.8355468511581421, "learning_rate": 1.1594620557156582e-05, "loss": 0.0593, "step": 17500 }, { "epoch": 4.322766570605188, "grad_norm": 0.9546997547149658, "learning_rate": 1.1354466858789625e-05, "loss": 0.0591, "step": 18000 }, { "epoch": 4.442843419788665, "grad_norm": 0.8729753494262695, "learning_rate": 1.1114313160422672e-05, "loss": 0.0597, "step": 18500 }, { "epoch": 4.562920268972142, "grad_norm": 0.8652579188346863, "learning_rate": 1.0874159462055715e-05, "loss": 0.0608, "step": 19000 }, { "epoch": 4.68299711815562, "grad_norm": 1.0657707452774048, "learning_rate": 1.0634005763688761e-05, "loss": 0.06, "step": 19500 }, { "epoch": 4.803073967339097, "grad_norm": 1.0188003778457642, "learning_rate": 1.0393852065321808e-05, "loss": 0.0592, "step": 20000 }, { "epoch": 4.923150816522575, "grad_norm": 1.243787407875061, "learning_rate": 1.0153698366954851e-05, "loss": 0.0601, "step": 20500 }, { "epoch": 5.043227665706052, "grad_norm": 0.8553707599639893, "learning_rate": 9.913544668587897e-06, "loss": 0.0565, "step": 21000 }, { "epoch": 5.163304514889529, "grad_norm": 0.8106087446212769, "learning_rate": 9.673390970220942e-06, "loss": 0.0498, "step": 21500 }, { "epoch": 5.283381364073007, "grad_norm": 0.925154447555542, "learning_rate": 9.433237271853987e-06, "loss": 0.0502, "step": 22000 }, { "epoch": 5.403458213256484, "grad_norm": 1.060344934463501, "learning_rate": 9.193083573487034e-06, "loss": 0.0501, "step": 22500 }, { "epoch": 5.5235350624399615, "grad_norm": 1.026363492012024, "learning_rate": 8.952929875120078e-06, "loss": 0.0505, "step": 23000 }, { "epoch": 5.643611911623439, "grad_norm": 0.9036206603050232, "learning_rate": 8.712776176753123e-06, "loss": 0.051, "step": 23500 }, { "epoch": 5.763688760806916, "grad_norm": 1.036981463432312, "learning_rate": 8.472622478386168e-06, "loss": 0.0506, "step": 24000 }, { "epoch": 5.883765609990394, "grad_norm": 0.9116144776344299, "learning_rate": 8.232468780019213e-06, "loss": 0.0517, "step": 24500 }, { "epoch": 6.003842459173871, "grad_norm": 0.8754561543464661, "learning_rate": 7.992315081652257e-06, "loss": 0.0507, "step": 25000 }, { "epoch": 6.123919308357348, "grad_norm": 0.947286069393158, "learning_rate": 7.752161383285304e-06, "loss": 0.042, "step": 25500 }, { "epoch": 6.243996157540826, "grad_norm": 0.9840195178985596, "learning_rate": 7.512007684918349e-06, "loss": 0.0424, "step": 26000 }, { "epoch": 6.364073006724303, "grad_norm": 0.9733700156211853, "learning_rate": 7.271853986551393e-06, "loss": 0.0428, "step": 26500 }, { "epoch": 6.484149855907781, "grad_norm": 0.927603542804718, "learning_rate": 7.031700288184439e-06, "loss": 0.0429, "step": 27000 }, { "epoch": 6.604226705091259, "grad_norm": 0.9660381078720093, "learning_rate": 6.791546589817484e-06, "loss": 0.0429, "step": 27500 }, { "epoch": 6.724303554274735, "grad_norm": 1.0146026611328125, "learning_rate": 6.551392891450529e-06, "loss": 0.0429, "step": 28000 }, { "epoch": 6.844380403458214, "grad_norm": 1.0021497011184692, "learning_rate": 6.311239193083573e-06, "loss": 0.0431, "step": 28500 }, { "epoch": 6.964457252641691, "grad_norm": 0.8908631801605225, "learning_rate": 6.071085494716619e-06, "loss": 0.043, "step": 29000 }, { "epoch": 7.084534101825168, "grad_norm": 1.148420810699463, "learning_rate": 5.830931796349665e-06, "loss": 0.0386, "step": 29500 }, { "epoch": 7.204610951008646, "grad_norm": 1.3032015562057495, "learning_rate": 5.590778097982709e-06, "loss": 0.0363, "step": 30000 }, { "epoch": 7.324687800192123, "grad_norm": 0.9548340439796448, "learning_rate": 5.350624399615755e-06, "loss": 0.0361, "step": 30500 }, { "epoch": 7.444764649375601, "grad_norm": 1.2898118495941162, "learning_rate": 5.1104707012488e-06, "loss": 0.0365, "step": 31000 }, { "epoch": 7.564841498559078, "grad_norm": 1.2241828441619873, "learning_rate": 4.8703170028818446e-06, "loss": 0.0367, "step": 31500 }, { "epoch": 7.684918347742555, "grad_norm": 1.2958580255508423, "learning_rate": 4.630163304514889e-06, "loss": 0.0366, "step": 32000 }, { "epoch": 7.804995196926033, "grad_norm": 1.0599451065063477, "learning_rate": 4.390009606147935e-06, "loss": 0.036, "step": 32500 }, { "epoch": 7.92507204610951, "grad_norm": 1.230745792388916, "learning_rate": 4.149855907780981e-06, "loss": 0.0362, "step": 33000 }, { "epoch": 8.045148895292987, "grad_norm": 0.964049756526947, "learning_rate": 3.909702209414025e-06, "loss": 0.0348, "step": 33500 }, { "epoch": 8.165225744476466, "grad_norm": 1.253049612045288, "learning_rate": 3.66954851104707e-06, "loss": 0.031, "step": 34000 }, { "epoch": 8.285302593659942, "grad_norm": 1.0042595863342285, "learning_rate": 3.4293948126801158e-06, "loss": 0.0312, "step": 34500 }, { "epoch": 8.40537944284342, "grad_norm": 1.0244569778442383, "learning_rate": 3.189241114313161e-06, "loss": 0.0314, "step": 35000 }, { "epoch": 8.525456292026897, "grad_norm": 1.0348340272903442, "learning_rate": 2.9490874159462057e-06, "loss": 0.0308, "step": 35500 }, { "epoch": 8.645533141210375, "grad_norm": 0.9708086848258972, "learning_rate": 2.708933717579251e-06, "loss": 0.0314, "step": 36000 }, { "epoch": 8.765609990393852, "grad_norm": 1.1302130222320557, "learning_rate": 2.468780019212296e-06, "loss": 0.0311, "step": 36500 }, { "epoch": 8.88568683957733, "grad_norm": 0.9478445649147034, "learning_rate": 2.2286263208453413e-06, "loss": 0.031, "step": 37000 }, { "epoch": 9.005763688760807, "grad_norm": 0.935819149017334, "learning_rate": 1.988472622478386e-06, "loss": 0.0312, "step": 37500 }, { "epoch": 9.125840537944285, "grad_norm": 0.9962431192398071, "learning_rate": 1.7483189241114315e-06, "loss": 0.0279, "step": 38000 }, { "epoch": 9.245917387127761, "grad_norm": 1.0154587030410767, "learning_rate": 1.5081652257444765e-06, "loss": 0.0278, "step": 38500 }, { "epoch": 9.36599423631124, "grad_norm": 1.0626943111419678, "learning_rate": 1.2680115273775217e-06, "loss": 0.0276, "step": 39000 }, { "epoch": 9.486071085494716, "grad_norm": 0.9499347805976868, "learning_rate": 1.027857829010567e-06, "loss": 0.0276, "step": 39500 }, { "epoch": 9.606147934678194, "grad_norm": 1.3397222757339478, "learning_rate": 7.87704130643612e-07, "loss": 0.0275, "step": 40000 }, { "epoch": 9.72622478386167, "grad_norm": 1.0309653282165527, "learning_rate": 5.475504322766571e-07, "loss": 0.0273, "step": 40500 }, { "epoch": 9.84630163304515, "grad_norm": 0.6541324257850647, "learning_rate": 3.0739673390970224e-07, "loss": 0.0275, "step": 41000 }, { "epoch": 9.966378482228626, "grad_norm": 0.9202414155006409, "learning_rate": 6.724303554274736e-08, "loss": 0.0272, "step": 41500 }, { "epoch": 10.0, "step": 41640, "total_flos": 6.208512944497459e+17, "train_loss": 0.06181274460669782, "train_runtime": 22287.4377, "train_samples_per_second": 119.554, "train_steps_per_second": 1.868 } ], "logging_steps": 500, "max_steps": 41640, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.208512944497459e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }