{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.994350282485876,
  "eval_steps": 500,
  "global_step": 795,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03766478342749529,
      "grad_norm": 2.263125496637036,
      "learning_rate": 5e-06,
      "loss": 0.8213,
      "step": 10
    },
    {
      "epoch": 0.07532956685499058,
      "grad_norm": 2.189120038314998,
      "learning_rate": 5e-06,
      "loss": 0.725,
      "step": 20
    },
    {
      "epoch": 0.11299435028248588,
      "grad_norm": 2.7377229620997587,
      "learning_rate": 5e-06,
      "loss": 0.6984,
      "step": 30
    },
    {
      "epoch": 0.15065913370998116,
      "grad_norm": 1.6393030705097076,
      "learning_rate": 5e-06,
      "loss": 0.6949,
      "step": 40
    },
    {
      "epoch": 0.18832391713747645,
      "grad_norm": 1.0398712245575565,
      "learning_rate": 5e-06,
      "loss": 0.6683,
      "step": 50
    },
    {
      "epoch": 0.22598870056497175,
      "grad_norm": 0.8808957646364496,
      "learning_rate": 5e-06,
      "loss": 0.6657,
      "step": 60
    },
    {
      "epoch": 0.263653483992467,
      "grad_norm": 0.7566447407586742,
      "learning_rate": 5e-06,
      "loss": 0.6496,
      "step": 70
    },
    {
      "epoch": 0.3013182674199623,
      "grad_norm": 0.7117741359385918,
      "learning_rate": 5e-06,
      "loss": 0.6458,
      "step": 80
    },
    {
      "epoch": 0.3389830508474576,
      "grad_norm": 0.7784154878802825,
      "learning_rate": 5e-06,
      "loss": 0.643,
      "step": 90
    },
    {
      "epoch": 0.3766478342749529,
      "grad_norm": 1.2764054406718297,
      "learning_rate": 5e-06,
      "loss": 0.634,
      "step": 100
    },
    {
      "epoch": 0.4143126177024482,
      "grad_norm": 0.7389123630080362,
      "learning_rate": 5e-06,
      "loss": 0.642,
      "step": 110
    },
    {
      "epoch": 0.4519774011299435,
      "grad_norm": 0.5996098331338064,
      "learning_rate": 5e-06,
      "loss": 0.6213,
      "step": 120
    },
    {
      "epoch": 0.4896421845574388,
      "grad_norm": 1.1501035575220573,
      "learning_rate": 5e-06,
      "loss": 0.6249,
      "step": 130
    },
    {
      "epoch": 0.527306967984934,
      "grad_norm": 0.6904388049559987,
      "learning_rate": 5e-06,
      "loss": 0.6243,
      "step": 140
    },
    {
      "epoch": 0.5649717514124294,
      "grad_norm": 0.783351095580665,
      "learning_rate": 5e-06,
      "loss": 0.621,
      "step": 150
    },
    {
      "epoch": 0.6026365348399246,
      "grad_norm": 0.5318412680049267,
      "learning_rate": 5e-06,
      "loss": 0.6296,
      "step": 160
    },
    {
      "epoch": 0.64030131826742,
      "grad_norm": 1.1247908870238332,
      "learning_rate": 5e-06,
      "loss": 0.6244,
      "step": 170
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 0.5383437573904913,
      "learning_rate": 5e-06,
      "loss": 0.6174,
      "step": 180
    },
    {
      "epoch": 0.7156308851224106,
      "grad_norm": 2.5733659531838198,
      "learning_rate": 5e-06,
      "loss": 0.6193,
      "step": 190
    },
    {
      "epoch": 0.7532956685499058,
      "grad_norm": 0.7831306502565981,
      "learning_rate": 5e-06,
      "loss": 0.6127,
      "step": 200
    },
    {
      "epoch": 0.7909604519774012,
      "grad_norm": 0.6934442696862589,
      "learning_rate": 5e-06,
      "loss": 0.6244,
      "step": 210
    },
    {
      "epoch": 0.8286252354048964,
      "grad_norm": 0.6677867481758228,
      "learning_rate": 5e-06,
      "loss": 0.613,
      "step": 220
    },
    {
      "epoch": 0.8662900188323918,
      "grad_norm": 0.4859701739274024,
      "learning_rate": 5e-06,
      "loss": 0.6101,
      "step": 230
    },
    {
      "epoch": 0.903954802259887,
      "grad_norm": 0.8722337210188531,
      "learning_rate": 5e-06,
      "loss": 0.606,
      "step": 240
    },
    {
      "epoch": 0.9416195856873822,
      "grad_norm": 1.9266628990003756,
      "learning_rate": 5e-06,
      "loss": 0.6019,
      "step": 250
    },
    {
      "epoch": 0.9792843691148776,
      "grad_norm": 1.056076074715482,
      "learning_rate": 5e-06,
      "loss": 0.6112,
      "step": 260
    },
    {
      "epoch": 0.9981167608286252,
      "eval_loss": 0.605501651763916,
      "eval_runtime": 91.0085,
      "eval_samples_per_second": 78.586,
      "eval_steps_per_second": 0.615,
      "step": 265
    },
    {
      "epoch": 1.0169491525423728,
      "grad_norm": 0.8391047454772584,
      "learning_rate": 5e-06,
      "loss": 0.5915,
      "step": 270
    },
    {
      "epoch": 1.054613935969868,
      "grad_norm": 0.834651626730102,
      "learning_rate": 5e-06,
      "loss": 0.5646,
      "step": 280
    },
    {
      "epoch": 1.0922787193973635,
      "grad_norm": 0.7450681825170591,
      "learning_rate": 5e-06,
      "loss": 0.5518,
      "step": 290
    },
    {
      "epoch": 1.1299435028248588,
      "grad_norm": 0.590980176111281,
      "learning_rate": 5e-06,
      "loss": 0.5601,
      "step": 300
    },
    {
      "epoch": 1.167608286252354,
      "grad_norm": 0.9385625642802127,
      "learning_rate": 5e-06,
      "loss": 0.552,
      "step": 310
    },
    {
      "epoch": 1.2052730696798493,
      "grad_norm": 0.8126168794552087,
      "learning_rate": 5e-06,
      "loss": 0.558,
      "step": 320
    },
    {
      "epoch": 1.2429378531073447,
      "grad_norm": 0.677905810554928,
      "learning_rate": 5e-06,
      "loss": 0.565,
      "step": 330
    },
    {
      "epoch": 1.28060263653484,
      "grad_norm": 0.6544626057539239,
      "learning_rate": 5e-06,
      "loss": 0.5582,
      "step": 340
    },
    {
      "epoch": 1.3182674199623352,
      "grad_norm": 0.8524924080405836,
      "learning_rate": 5e-06,
      "loss": 0.5602,
      "step": 350
    },
    {
      "epoch": 1.3559322033898304,
      "grad_norm": 0.4907188308076832,
      "learning_rate": 5e-06,
      "loss": 0.5607,
      "step": 360
    },
    {
      "epoch": 1.3935969868173257,
      "grad_norm": 0.53907446375581,
      "learning_rate": 5e-06,
      "loss": 0.5547,
      "step": 370
    },
    {
      "epoch": 1.4312617702448212,
      "grad_norm": 0.5927028384991923,
      "learning_rate": 5e-06,
      "loss": 0.5541,
      "step": 380
    },
    {
      "epoch": 1.4689265536723164,
      "grad_norm": 0.7128973727870778,
      "learning_rate": 5e-06,
      "loss": 0.5528,
      "step": 390
    },
    {
      "epoch": 1.5065913370998116,
      "grad_norm": 0.49840825439685243,
      "learning_rate": 5e-06,
      "loss": 0.5668,
      "step": 400
    },
    {
      "epoch": 1.544256120527307,
      "grad_norm": 0.5370743335720791,
      "learning_rate": 5e-06,
      "loss": 0.5575,
      "step": 410
    },
    {
      "epoch": 1.5819209039548023,
      "grad_norm": 0.6150871895812915,
      "learning_rate": 5e-06,
      "loss": 0.5597,
      "step": 420
    },
    {
      "epoch": 1.6195856873822976,
      "grad_norm": 0.563194743905304,
      "learning_rate": 5e-06,
      "loss": 0.5592,
      "step": 430
    },
    {
      "epoch": 1.6572504708097928,
      "grad_norm": 0.5119581124907059,
      "learning_rate": 5e-06,
      "loss": 0.5621,
      "step": 440
    },
    {
      "epoch": 1.694915254237288,
      "grad_norm": 0.5352254655513019,
      "learning_rate": 5e-06,
      "loss": 0.5541,
      "step": 450
    },
    {
      "epoch": 1.7325800376647833,
      "grad_norm": 0.6077433771903062,
      "learning_rate": 5e-06,
      "loss": 0.5563,
      "step": 460
    },
    {
      "epoch": 1.7702448210922788,
      "grad_norm": 0.562877694142977,
      "learning_rate": 5e-06,
      "loss": 0.555,
      "step": 470
    },
    {
      "epoch": 1.807909604519774,
      "grad_norm": 0.5453089094350608,
      "learning_rate": 5e-06,
      "loss": 0.5465,
      "step": 480
    },
    {
      "epoch": 1.8455743879472695,
      "grad_norm": 0.5709862620082578,
      "learning_rate": 5e-06,
      "loss": 0.5592,
      "step": 490
    },
    {
      "epoch": 1.8832391713747647,
      "grad_norm": 0.49785144147435545,
      "learning_rate": 5e-06,
      "loss": 0.5563,
      "step": 500
    },
    {
      "epoch": 1.92090395480226,
      "grad_norm": 0.48543855573710365,
      "learning_rate": 5e-06,
      "loss": 0.5552,
      "step": 510
    },
    {
      "epoch": 1.9585687382297552,
      "grad_norm": 0.5180932799655572,
      "learning_rate": 5e-06,
      "loss": 0.5571,
      "step": 520
    },
    {
      "epoch": 1.9962335216572504,
      "grad_norm": 0.5674984350650156,
      "learning_rate": 5e-06,
      "loss": 0.5554,
      "step": 530
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.5974339842796326,
      "eval_runtime": 92.2503,
      "eval_samples_per_second": 77.528,
      "eval_steps_per_second": 0.607,
      "step": 531
    },
    {
      "epoch": 2.0338983050847457,
      "grad_norm": 0.6380443072327275,
      "learning_rate": 5e-06,
      "loss": 0.5074,
      "step": 540
    },
    {
      "epoch": 2.071563088512241,
      "grad_norm": 0.7526012751703193,
      "learning_rate": 5e-06,
      "loss": 0.5056,
      "step": 550
    },
    {
      "epoch": 2.109227871939736,
      "grad_norm": 0.601125683400543,
      "learning_rate": 5e-06,
      "loss": 0.5081,
      "step": 560
    },
    {
      "epoch": 2.146892655367232,
      "grad_norm": 0.5412801866050161,
      "learning_rate": 5e-06,
      "loss": 0.4964,
      "step": 570
    },
    {
      "epoch": 2.184557438794727,
      "grad_norm": 0.6605525778778812,
      "learning_rate": 5e-06,
      "loss": 0.4924,
      "step": 580
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.5634126387252626,
      "learning_rate": 5e-06,
      "loss": 0.5017,
      "step": 590
    },
    {
      "epoch": 2.2598870056497176,
      "grad_norm": 0.5612826370434433,
      "learning_rate": 5e-06,
      "loss": 0.507,
      "step": 600
    },
    {
      "epoch": 2.297551789077213,
      "grad_norm": 0.5863149934883163,
      "learning_rate": 5e-06,
      "loss": 0.4966,
      "step": 610
    },
    {
      "epoch": 2.335216572504708,
      "grad_norm": 0.5234770461125302,
      "learning_rate": 5e-06,
      "loss": 0.504,
      "step": 620
    },
    {
      "epoch": 2.3728813559322033,
      "grad_norm": 0.6459395940002383,
      "learning_rate": 5e-06,
      "loss": 0.5026,
      "step": 630
    },
    {
      "epoch": 2.4105461393596985,
      "grad_norm": 0.6027956338487243,
      "learning_rate": 5e-06,
      "loss": 0.5025,
      "step": 640
    },
    {
      "epoch": 2.4482109227871938,
      "grad_norm": 0.5328974338222766,
      "learning_rate": 5e-06,
      "loss": 0.5003,
      "step": 650
    },
    {
      "epoch": 2.4858757062146895,
      "grad_norm": 0.6107575449426592,
      "learning_rate": 5e-06,
      "loss": 0.5009,
      "step": 660
    },
    {
      "epoch": 2.5235404896421847,
      "grad_norm": 0.6193028412595688,
      "learning_rate": 5e-06,
      "loss": 0.5068,
      "step": 670
    },
    {
      "epoch": 2.56120527306968,
      "grad_norm": 0.5313172697707192,
      "learning_rate": 5e-06,
      "loss": 0.5087,
      "step": 680
    },
    {
      "epoch": 2.598870056497175,
      "grad_norm": 0.6705815338360445,
      "learning_rate": 5e-06,
      "loss": 0.5072,
      "step": 690
    },
    {
      "epoch": 2.6365348399246704,
      "grad_norm": 0.5631108090258757,
      "learning_rate": 5e-06,
      "loss": 0.5053,
      "step": 700
    },
    {
      "epoch": 2.6741996233521657,
      "grad_norm": 0.6409277069423337,
      "learning_rate": 5e-06,
      "loss": 0.503,
      "step": 710
    },
    {
      "epoch": 2.711864406779661,
      "grad_norm": 0.5852444630897177,
      "learning_rate": 5e-06,
      "loss": 0.5099,
      "step": 720
    },
    {
      "epoch": 2.7495291902071566,
      "grad_norm": 0.6554053610190018,
      "learning_rate": 5e-06,
      "loss": 0.5149,
      "step": 730
    },
    {
      "epoch": 2.7871939736346514,
      "grad_norm": 0.6563071365261379,
      "learning_rate": 5e-06,
      "loss": 0.5018,
      "step": 740
    },
    {
      "epoch": 2.824858757062147,
      "grad_norm": 0.5582449045429995,
      "learning_rate": 5e-06,
      "loss": 0.5103,
      "step": 750
    },
    {
      "epoch": 2.8625235404896423,
      "grad_norm": 0.5062040173398443,
      "learning_rate": 5e-06,
      "loss": 0.5063,
      "step": 760
    },
    {
      "epoch": 2.9001883239171375,
      "grad_norm": 0.6071759917390698,
      "learning_rate": 5e-06,
      "loss": 0.5003,
      "step": 770
    },
    {
      "epoch": 2.937853107344633,
      "grad_norm": 0.5606403524855348,
      "learning_rate": 5e-06,
      "loss": 0.512,
      "step": 780
    },
    {
      "epoch": 2.975517890772128,
      "grad_norm": 0.6859712101741441,
      "learning_rate": 5e-06,
      "loss": 0.5025,
      "step": 790
    },
    {
      "epoch": 2.994350282485876,
      "eval_loss": 0.6022372245788574,
      "eval_runtime": 89.8413,
      "eval_samples_per_second": 79.607,
      "eval_steps_per_second": 0.623,
      "step": 795
    },
    {
      "epoch": 2.994350282485876,
      "step": 795,
      "total_flos": 1331235850813440.0,
      "train_loss": 0.5684360762062313,
      "train_runtime": 14109.8059,
      "train_samples_per_second": 28.892,
      "train_steps_per_second": 0.056
    }
  ],
  "logging_steps": 10,
  "max_steps": 795,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1331235850813440.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}