cass-src-1.5b / trainer_state.json
ahmedheakl's picture
End of training
d1b47d5 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04184100418410042,
"grad_norm": 0.2925145155398498,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.0243,
"step": 10
},
{
"epoch": 0.08368200836820083,
"grad_norm": 0.09332366083077137,
"learning_rate": 7.916666666666667e-06,
"loss": 0.0066,
"step": 20
},
{
"epoch": 0.12552301255230125,
"grad_norm": 0.037000329123698675,
"learning_rate": 1.2083333333333333e-05,
"loss": 0.0044,
"step": 30
},
{
"epoch": 0.16736401673640167,
"grad_norm": 0.024975645672741548,
"learning_rate": 1.6250000000000002e-05,
"loss": 0.0028,
"step": 40
},
{
"epoch": 0.20920502092050208,
"grad_norm": 0.033927305503936314,
"learning_rate": 1.9999733110857237e-05,
"loss": 0.0025,
"step": 50
},
{
"epoch": 0.2510460251046025,
"grad_norm": 0.014612909987451435,
"learning_rate": 1.9967723647752463e-05,
"loss": 0.0018,
"step": 60
},
{
"epoch": 0.2928870292887029,
"grad_norm": 0.017483510752084472,
"learning_rate": 1.988253206622306e-05,
"loss": 0.0019,
"step": 70
},
{
"epoch": 0.33472803347280333,
"grad_norm": 0.014386718401807688,
"learning_rate": 1.9744612900216588e-05,
"loss": 0.0026,
"step": 80
},
{
"epoch": 0.37656903765690375,
"grad_norm": 0.039483682416672744,
"learning_rate": 1.9554702008157567e-05,
"loss": 0.0017,
"step": 90
},
{
"epoch": 0.41841004184100417,
"grad_norm": 0.017654984549017917,
"learning_rate": 1.9313812646824432e-05,
"loss": 0.0013,
"step": 100
},
{
"epoch": 0.4602510460251046,
"grad_norm": 0.00968661778293667,
"learning_rate": 1.9023230065186192e-05,
"loss": 0.0031,
"step": 110
},
{
"epoch": 0.502092050209205,
"grad_norm": 0.023973932216176105,
"learning_rate": 1.8684504647043093e-05,
"loss": 0.0022,
"step": 120
},
{
"epoch": 0.5439330543933054,
"grad_norm": 0.021080771265352845,
"learning_rate": 1.8299443639058238e-05,
"loss": 0.0024,
"step": 130
},
{
"epoch": 0.5857740585774058,
"grad_norm": 0.02233998245288774,
"learning_rate": 1.7870101508314686e-05,
"loss": 0.0008,
"step": 140
},
{
"epoch": 0.6276150627615062,
"grad_norm": 0.015428869613923153,
"learning_rate": 1.7398768980844664e-05,
"loss": 0.002,
"step": 150
},
{
"epoch": 0.6694560669456067,
"grad_norm": 0.011099263805834297,
"learning_rate": 1.6887960819615025e-05,
"loss": 0.0011,
"step": 160
},
{
"epoch": 0.7112970711297071,
"grad_norm": 0.015474457950600984,
"learning_rate": 1.634040240717878e-05,
"loss": 0.0008,
"step": 170
},
{
"epoch": 0.7531380753138075,
"grad_norm": 0.016896916614250686,
"learning_rate": 1.5759015204579958e-05,
"loss": 0.0012,
"step": 180
},
{
"epoch": 0.7949790794979079,
"grad_norm": 0.010764154560653242,
"learning_rate": 1.5146901164094914e-05,
"loss": 0.0013,
"step": 190
},
{
"epoch": 0.8368200836820083,
"grad_norm": 0.02624812740713589,
"learning_rate": 1.4507326178974789e-05,
"loss": 0.0021,
"step": 200
},
{
"epoch": 0.8786610878661087,
"grad_norm": 0.02213838592470387,
"learning_rate": 1.3843702658491961e-05,
"loss": 0.0014,
"step": 210
},
{
"epoch": 0.9205020920502092,
"grad_norm": 0.03256425649897184,
"learning_rate": 1.3159571321260114e-05,
"loss": 0.0007,
"step": 220
},
{
"epoch": 0.9623430962343096,
"grad_norm": 0.008534054334232773,
"learning_rate": 1.2458582303968466e-05,
"loss": 0.0011,
"step": 230
},
{
"epoch": 1.00418410041841,
"grad_norm": 0.015659822283413834,
"learning_rate": 1.1744475686323225e-05,
"loss": 0.0018,
"step": 240
},
{
"epoch": 1.0460251046025104,
"grad_norm": 0.009318785906390915,
"learning_rate": 1.1021061536104093e-05,
"loss": 0.0007,
"step": 250
},
{
"epoch": 1.0878661087866108,
"grad_norm": 0.02012715152590903,
"learning_rate": 1.02921995808042e-05,
"loss": 0.0008,
"step": 260
},
{
"epoch": 1.1297071129707112,
"grad_norm": 0.01535009737960017,
"learning_rate": 9.561778614313876e-06,
"loss": 0.0005,
"step": 270
},
{
"epoch": 1.1715481171548117,
"grad_norm": 0.011963284930504645,
"learning_rate": 8.833695748522702e-06,
"loss": 0.0011,
"step": 280
},
{
"epoch": 1.213389121338912,
"grad_norm": 0.01174691491766897,
"learning_rate": 8.111835620541397e-06,
"loss": 0.0008,
"step": 290
},
{
"epoch": 1.2552301255230125,
"grad_norm": 0.015140100128090778,
"learning_rate": 7.400049666482061e-06,
"loss": 0.0005,
"step": 300
},
{
"epoch": 1.297071129707113,
"grad_norm": 0.0082059300361877,
"learning_rate": 6.702135572380078e-06,
"loss": 0.001,
"step": 310
},
{
"epoch": 1.3389121338912133,
"grad_norm": 0.004079307018079572,
"learning_rate": 6.021817011896004e-06,
"loss": 0.0014,
"step": 320
},
{
"epoch": 1.3807531380753137,
"grad_norm": 0.01109473064765619,
"learning_rate": 5.362723778905427e-06,
"loss": 0.001,
"step": 330
},
{
"epoch": 1.4225941422594142,
"grad_norm": 0.0015855550385638771,
"learning_rate": 4.728372420978119e-06,
"loss": 0.0012,
"step": 340
},
{
"epoch": 1.4644351464435146,
"grad_norm": 0.006116117187679222,
"learning_rate": 4.12214747707527e-06,
"loss": 0.0014,
"step": 350
},
{
"epoch": 1.506276150627615,
"grad_norm": 0.020785630071871383,
"learning_rate": 3.5472834195697017e-06,
"loss": 0.0005,
"step": 360
},
{
"epoch": 1.5481171548117154,
"grad_norm": 0.01356591109411228,
"learning_rate": 3.0068473969362998e-06,
"loss": 0.0016,
"step": 370
},
{
"epoch": 1.5899581589958158,
"grad_norm": 0.0455348358926559,
"learning_rate": 2.5037228691878424e-06,
"loss": 0.0011,
"step": 380
},
{
"epoch": 1.6317991631799162,
"grad_norm": 0.010270285275849788,
"learning_rate": 2.0405942233682017e-06,
"loss": 0.0011,
"step": 390
},
{
"epoch": 1.6736401673640167,
"grad_norm": 0.009898474319911012,
"learning_rate": 1.619932451186048e-06,
"loss": 0.0019,
"step": 400
},
{
"epoch": 1.715481171548117,
"grad_norm": 0.025182308499786255,
"learning_rate": 1.2439819652049178e-06,
"loss": 0.0008,
"step": 410
},
{
"epoch": 1.7573221757322175,
"grad_norm": 0.01013827313449354,
"learning_rate": 9.147486239311032e-07,
"loss": 0.001,
"step": 420
},
{
"epoch": 1.799163179916318,
"grad_norm": 0.03137196761877556,
"learning_rate": 6.339890296906493e-07,
"loss": 0.0013,
"step": 430
},
{
"epoch": 1.8410041841004183,
"grad_norm": 0.016893363525461946,
"learning_rate": 4.032011563958893e-07,
"loss": 0.0014,
"step": 440
},
{
"epoch": 1.8828451882845187,
"grad_norm": 0.015987721995170366,
"learning_rate": 2.2361635720651199e-07,
"loss": 0.0012,
"step": 450
},
{
"epoch": 1.9246861924686192,
"grad_norm": 0.01083855584874376,
"learning_rate": 9.619279472766863e-08,
"loss": 0.0008,
"step": 460
},
{
"epoch": 1.9665271966527196,
"grad_norm": 0.009804991431603429,
"learning_rate": 2.1610328797904145e-08,
"loss": 0.002,
"step": 470
},
{
"epoch": 2.0,
"step": 478,
"total_flos": 506334263902208.0,
"train_loss": 0.002037838656673496,
"train_runtime": 19073.3023,
"train_samples_per_second": 6.413,
"train_steps_per_second": 0.025
}
],
"logging_steps": 10,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 506334263902208.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}