{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.200557103064067, "eval_steps": 5, "global_step": 405, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11142061281337047, "eval_loss": 0.037675488740205765, "eval_runtime": 6.7342, "eval_samples_per_second": 222.743, "eval_steps_per_second": 27.917, "eval_sts_dev_pearson_cosine": 0.7494115429773479, "eval_sts_dev_pearson_dot": 0.6583752142885668, "eval_sts_dev_pearson_euclidean": 0.6941454281465765, "eval_sts_dev_pearson_manhattan": 0.6964259759684527, "eval_sts_dev_pearson_max": 0.7494115429773479, "eval_sts_dev_spearman_cosine": 0.7470700524367354, "eval_sts_dev_spearman_dot": 0.6497928276890669, "eval_sts_dev_spearman_euclidean": 0.684590776689316, "eval_sts_dev_spearman_manhattan": 0.6873610947323412, "eval_sts_dev_spearman_max": 0.7470700524367354, "step": 5 }, { "epoch": 0.22284122562674094, "grad_norm": 6.7429633140563965, "learning_rate": 6.957731779439903e-08, "loss": 0.6923, "step": 10 }, { "epoch": 0.22284122562674094, "eval_loss": 0.0376589335501194, "eval_runtime": 6.8403, "eval_samples_per_second": 219.29, "eval_steps_per_second": 27.484, "eval_sts_dev_pearson_cosine": 0.7494940477075391, "eval_sts_dev_pearson_dot": 0.6584328702717946, "eval_sts_dev_pearson_euclidean": 0.6942213054869852, "eval_sts_dev_pearson_manhattan": 0.6965001647458872, "eval_sts_dev_pearson_max": 0.7494940477075391, "eval_sts_dev_spearman_cosine": 0.7471377072884906, "eval_sts_dev_spearman_dot": 0.6498755431337675, "eval_sts_dev_spearman_euclidean": 0.6846545112671376, "eval_sts_dev_spearman_manhattan": 0.687454500948251, "eval_sts_dev_spearman_max": 0.7471377072884906, "step": 10 }, { "epoch": 0.3342618384401114, "eval_loss": 0.03763080760836601, "eval_runtime": 6.9686, "eval_samples_per_second": 215.253, "eval_steps_per_second": 26.978, "eval_sts_dev_pearson_cosine": 0.7496395035968593, "eval_sts_dev_pearson_dot": 0.6585292611324672, "eval_sts_dev_pearson_euclidean": 0.6943597344549325, "eval_sts_dev_pearson_manhattan": 0.6966356509027943, "eval_sts_dev_pearson_max": 0.7496395035968593, "eval_sts_dev_spearman_cosine": 0.747293071934341, "eval_sts_dev_spearman_dot": 0.6499672916131112, "eval_sts_dev_spearman_euclidean": 0.6848464778088699, "eval_sts_dev_spearman_manhattan": 0.6875927784863133, "eval_sts_dev_spearman_max": 0.747293071934341, "step": 15 }, { "epoch": 0.4456824512534819, "grad_norm": 7.523725986480713, "learning_rate": 1.3915463558879807e-07, "loss": 0.6832, "step": 20 }, { "epoch": 0.4456824512534819, "eval_loss": 0.03759082034230232, "eval_runtime": 6.9891, "eval_samples_per_second": 214.62, "eval_steps_per_second": 26.899, "eval_sts_dev_pearson_cosine": 0.7498515127163549, "eval_sts_dev_pearson_dot": 0.6586892126695529, "eval_sts_dev_pearson_euclidean": 0.6945632277600391, "eval_sts_dev_pearson_manhattan": 0.6968351270246123, "eval_sts_dev_pearson_max": 0.7498515127163549, "eval_sts_dev_spearman_cosine": 0.7475384213284385, "eval_sts_dev_spearman_dot": 0.6500677799755323, "eval_sts_dev_spearman_euclidean": 0.6850767084625934, "eval_sts_dev_spearman_manhattan": 0.6877654793239389, "eval_sts_dev_spearman_max": 0.7475384213284385, "step": 20 }, { "epoch": 0.5571030640668524, "eval_loss": 0.03754143416881561, "eval_runtime": 6.8247, "eval_samples_per_second": 219.79, "eval_steps_per_second": 27.547, "eval_sts_dev_pearson_cosine": 0.7501122656435163, "eval_sts_dev_pearson_dot": 0.6588212748683685, "eval_sts_dev_pearson_euclidean": 0.6948708332777139, "eval_sts_dev_pearson_manhattan": 0.6971351224061912, "eval_sts_dev_pearson_max": 0.7501122656435163, "eval_sts_dev_spearman_cosine": 0.7478755024321192, "eval_sts_dev_spearman_dot": 0.6502167543650381, "eval_sts_dev_spearman_euclidean": 0.6854436169483377, "eval_sts_dev_spearman_manhattan": 0.6880846722054696, "eval_sts_dev_spearman_max": 0.7478755024321192, "step": 25 }, { "epoch": 0.6685236768802229, "grad_norm": 7.176445960998535, "learning_rate": 2.0873195338319708e-07, "loss": 0.6787, "step": 30 }, { "epoch": 0.6685236768802229, "eval_loss": 0.037479061633348465, "eval_runtime": 6.9898, "eval_samples_per_second": 214.598, "eval_steps_per_second": 26.896, "eval_sts_dev_pearson_cosine": 0.7504502235424245, "eval_sts_dev_pearson_dot": 0.6589729935526047, "eval_sts_dev_pearson_euclidean": 0.6952782546669927, "eval_sts_dev_pearson_manhattan": 0.6975315748701472, "eval_sts_dev_pearson_max": 0.7504502235424245, "eval_sts_dev_spearman_cosine": 0.7483727549578874, "eval_sts_dev_spearman_dot": 0.6502927839552382, "eval_sts_dev_spearman_euclidean": 0.6858779938781956, "eval_sts_dev_spearman_manhattan": 0.6885426870287449, "eval_sts_dev_spearman_max": 0.7483727549578874, "step": 30 }, { "epoch": 0.7799442896935933, "eval_loss": 0.03741108253598213, "eval_runtime": 7.0272, "eval_samples_per_second": 213.456, "eval_steps_per_second": 26.753, "eval_sts_dev_pearson_cosine": 0.7508317539918448, "eval_sts_dev_pearson_dot": 0.6592089487188968, "eval_sts_dev_pearson_euclidean": 0.6957145823768739, "eval_sts_dev_pearson_manhattan": 0.6979566424519045, "eval_sts_dev_pearson_max": 0.7508317539918448, "eval_sts_dev_spearman_cosine": 0.7488095875667629, "eval_sts_dev_spearman_dot": 0.6505123414164061, "eval_sts_dev_spearman_euclidean": 0.6863890021142346, "eval_sts_dev_spearman_manhattan": 0.6889574531430644, "eval_sts_dev_spearman_max": 0.7488095875667629, "step": 35 }, { "epoch": 0.8913649025069638, "grad_norm": 5.811614036560059, "learning_rate": 2.7830927117759614e-07, "loss": 0.6154, "step": 40 }, { "epoch": 0.8913649025069638, "eval_loss": 0.03732568398118019, "eval_runtime": 6.9675, "eval_samples_per_second": 215.286, "eval_steps_per_second": 26.983, "eval_sts_dev_pearson_cosine": 0.7512943163494744, "eval_sts_dev_pearson_dot": 0.6595252251920851, "eval_sts_dev_pearson_euclidean": 0.6961931337237875, "eval_sts_dev_pearson_manhattan": 0.6984244275683631, "eval_sts_dev_pearson_max": 0.7512943163494744, "eval_sts_dev_spearman_cosine": 0.7494136836534844, "eval_sts_dev_spearman_dot": 0.6507477353375185, "eval_sts_dev_spearman_euclidean": 0.6869101418254764, "eval_sts_dev_spearman_manhattan": 0.6894841894318411, "eval_sts_dev_spearman_max": 0.7494136836534844, "step": 40 }, { "epoch": 1.0222841225626742, "eval_loss": 0.03723177686333656, "eval_runtime": 6.807, "eval_samples_per_second": 220.362, "eval_steps_per_second": 27.619, "eval_sts_dev_pearson_cosine": 0.7518064092372247, "eval_sts_dev_pearson_dot": 0.659916579804052, "eval_sts_dev_pearson_euclidean": 0.6966963956623822, "eval_sts_dev_pearson_manhattan": 0.6989173818306955, "eval_sts_dev_pearson_max": 0.7518064092372247, "eval_sts_dev_spearman_cosine": 0.7499673977394428, "eval_sts_dev_spearman_dot": 0.6509971484372724, "eval_sts_dev_spearman_euclidean": 0.6874727739859278, "eval_sts_dev_spearman_manhattan": 0.6900562750157024, "eval_sts_dev_spearman_max": 0.7499673977394428, "step": 45 }, { "epoch": 1.1337047353760445, "grad_norm": 5.788002014160156, "learning_rate": 3.4788658897199517e-07, "loss": 0.6231, "step": 50 }, { "epoch": 1.1337047353760445, "eval_loss": 0.03712593764066696, "eval_runtime": 6.8692, "eval_samples_per_second": 218.366, "eval_steps_per_second": 27.369, "eval_sts_dev_pearson_cosine": 0.7523963675159875, "eval_sts_dev_pearson_dot": 0.6603951554863274, "eval_sts_dev_pearson_euclidean": 0.6972537699536556, "eval_sts_dev_pearson_manhattan": 0.6994643420859175, "eval_sts_dev_pearson_max": 0.7523963675159875, "eval_sts_dev_spearman_cosine": 0.750612531997651, "eval_sts_dev_spearman_dot": 0.6513909659491809, "eval_sts_dev_spearman_euclidean": 0.6881037653193015, "eval_sts_dev_spearman_manhattan": 0.690698809264565, "eval_sts_dev_spearman_max": 0.750612531997651, "step": 50 }, { "epoch": 1.2451253481894151, "eval_loss": 0.03701437637209892, "eval_runtime": 6.9079, "eval_samples_per_second": 217.142, "eval_steps_per_second": 27.215, "eval_sts_dev_pearson_cosine": 0.7530298660085821, "eval_sts_dev_pearson_dot": 0.6608066497022934, "eval_sts_dev_pearson_euclidean": 0.6979265726405308, "eval_sts_dev_pearson_manhattan": 0.7001220235641434, "eval_sts_dev_pearson_max": 0.7530298660085821, "eval_sts_dev_spearman_cosine": 0.7512373553393066, "eval_sts_dev_spearman_dot": 0.6517398579494034, "eval_sts_dev_spearman_euclidean": 0.6888011095183327, "eval_sts_dev_spearman_manhattan": 0.691414492932023, "eval_sts_dev_spearman_max": 0.7512373553393066, "step": 55 }, { "epoch": 1.3565459610027855, "grad_norm": 6.15402889251709, "learning_rate": 4.1746390676639416e-07, "loss": 0.6562, "step": 60 }, { "epoch": 1.3565459610027855, "eval_loss": 0.03689862787723541, "eval_runtime": 7.0618, "eval_samples_per_second": 212.409, "eval_steps_per_second": 26.622, "eval_sts_dev_pearson_cosine": 0.7536755590598476, "eval_sts_dev_pearson_dot": 0.6612397236569308, "eval_sts_dev_pearson_euclidean": 0.6986261571902858, "eval_sts_dev_pearson_manhattan": 0.7008037618197723, "eval_sts_dev_pearson_max": 0.7536755590598476, "eval_sts_dev_spearman_cosine": 0.7518746736763288, "eval_sts_dev_spearman_dot": 0.6520463167363649, "eval_sts_dev_spearman_euclidean": 0.6896250409475332, "eval_sts_dev_spearman_manhattan": 0.6921595229559657, "eval_sts_dev_spearman_max": 0.7518746736763288, "step": 60 }, { "epoch": 1.467966573816156, "eval_loss": 0.03677487000823021, "eval_runtime": 7.0563, "eval_samples_per_second": 212.575, "eval_steps_per_second": 26.643, "eval_sts_dev_pearson_cosine": 0.7543139332813571, "eval_sts_dev_pearson_dot": 0.6616585186532069, "eval_sts_dev_pearson_euclidean": 0.6993248351610868, "eval_sts_dev_pearson_manhattan": 0.701480628825091, "eval_sts_dev_pearson_max": 0.7543139332813571, "eval_sts_dev_spearman_cosine": 0.7525649831393398, "eval_sts_dev_spearman_dot": 0.6522844686788962, "eval_sts_dev_spearman_euclidean": 0.6904248656764869, "eval_sts_dev_spearman_manhattan": 0.6929891697203803, "eval_sts_dev_spearman_max": 0.7525649831393398, "step": 65 }, { "epoch": 1.5793871866295266, "grad_norm": 7.177963733673096, "learning_rate": 4.870412245607932e-07, "loss": 0.6578, "step": 70 }, { "epoch": 1.5793871866295266, "eval_loss": 0.036648884415626526, "eval_runtime": 6.959, "eval_samples_per_second": 215.549, "eval_steps_per_second": 27.015, "eval_sts_dev_pearson_cosine": 0.7550016826683398, "eval_sts_dev_pearson_dot": 0.6621754750211006, "eval_sts_dev_pearson_euclidean": 0.7000072525876023, "eval_sts_dev_pearson_manhattan": 0.7021439545430929, "eval_sts_dev_pearson_max": 0.7550016826683398, "eval_sts_dev_spearman_cosine": 0.7533627904462605, "eval_sts_dev_spearman_dot": 0.6528254486243784, "eval_sts_dev_spearman_euclidean": 0.6912255844955055, "eval_sts_dev_spearman_manhattan": 0.693756112728956, "eval_sts_dev_spearman_max": 0.7533627904462605, "step": 70 }, { "epoch": 1.690807799442897, "eval_loss": 0.03651271015405655, "eval_runtime": 7.007, "eval_samples_per_second": 214.073, "eval_steps_per_second": 26.83, "eval_sts_dev_pearson_cosine": 0.7558035417500417, "eval_sts_dev_pearson_dot": 0.6628193044191966, "eval_sts_dev_pearson_euclidean": 0.7007736916543533, "eval_sts_dev_pearson_manhattan": 0.7028932537624881, "eval_sts_dev_pearson_max": 0.7558035417500417, "eval_sts_dev_spearman_cosine": 0.7541058046949494, "eval_sts_dev_spearman_dot": 0.6534460767465545, "eval_sts_dev_spearman_euclidean": 0.6920637091980477, "eval_sts_dev_spearman_manhattan": 0.6945583716986528, "eval_sts_dev_spearman_max": 0.7541058046949494, "step": 75 }, { "epoch": 1.8022284122562673, "grad_norm": 5.0526251792907715, "learning_rate": 5.566185423551923e-07, "loss": 0.6669, "step": 80 }, { "epoch": 1.8022284122562673, "eval_loss": 0.03638559579849243, "eval_runtime": 6.7319, "eval_samples_per_second": 222.821, "eval_steps_per_second": 27.927, "eval_sts_dev_pearson_cosine": 0.7566016073951273, "eval_sts_dev_pearson_dot": 0.6635483139977033, "eval_sts_dev_pearson_euclidean": 0.7014893647689773, "eval_sts_dev_pearson_manhattan": 0.703594324322853, "eval_sts_dev_pearson_max": 0.7566016073951273, "eval_sts_dev_spearman_cosine": 0.7549242270132541, "eval_sts_dev_spearman_dot": 0.6540274190985176, "eval_sts_dev_spearman_euclidean": 0.6927527403904686, "eval_sts_dev_spearman_manhattan": 0.695333682691011, "eval_sts_dev_spearman_max": 0.7549242270132541, "step": 80 }, { "epoch": 1.9136490250696379, "eval_loss": 0.036259058862924576, "eval_runtime": 7.0619, "eval_samples_per_second": 212.407, "eval_steps_per_second": 26.622, "eval_sts_dev_pearson_cosine": 0.7574029038481553, "eval_sts_dev_pearson_dot": 0.6643528168117957, "eval_sts_dev_pearson_euclidean": 0.7021549030740968, "eval_sts_dev_pearson_manhattan": 0.7042456310839478, "eval_sts_dev_pearson_max": 0.7574029038481553, "eval_sts_dev_spearman_cosine": 0.7559011874733633, "eval_sts_dev_spearman_dot": 0.654608486564625, "eval_sts_dev_spearman_euclidean": 0.6934728497203049, "eval_sts_dev_spearman_manhattan": 0.696009977505159, "eval_sts_dev_spearman_max": 0.7559011874733633, "step": 85 }, { "epoch": 2.0445682451253484, "grad_norm": 7.539985656738281, "learning_rate": 6.261958601495913e-07, "loss": 0.6428, "step": 90 }, { "epoch": 2.0445682451253484, "eval_loss": 0.036127302795648575, "eval_runtime": 7.0107, "eval_samples_per_second": 213.958, "eval_steps_per_second": 26.816, "eval_sts_dev_pearson_cosine": 0.7582160683192293, "eval_sts_dev_pearson_dot": 0.6651865048982631, "eval_sts_dev_pearson_euclidean": 0.7028452108161203, "eval_sts_dev_pearson_manhattan": 0.7049193272018017, "eval_sts_dev_pearson_max": 0.7582160683192293, "eval_sts_dev_spearman_cosine": 0.756839547083474, "eval_sts_dev_spearman_dot": 0.6554766310741506, "eval_sts_dev_spearman_euclidean": 0.6942368514501571, "eval_sts_dev_spearman_manhattan": 0.6967968978437559, "eval_sts_dev_spearman_max": 0.756839547083474, "step": 90 }, { "epoch": 2.1559888579387185, "eval_loss": 0.03598429635167122, "eval_runtime": 6.8629, "eval_samples_per_second": 218.568, "eval_steps_per_second": 27.394, "eval_sts_dev_pearson_cosine": 0.7590157349155543, "eval_sts_dev_pearson_dot": 0.6664047922354215, "eval_sts_dev_pearson_euclidean": 0.7032597502450331, "eval_sts_dev_pearson_manhattan": 0.7053311453976816, "eval_sts_dev_pearson_max": 0.7590157349155543, "eval_sts_dev_spearman_cosine": 0.7577360756559688, "eval_sts_dev_spearman_dot": 0.6567130424552957, "eval_sts_dev_spearman_euclidean": 0.694683268380771, "eval_sts_dev_spearman_manhattan": 0.6973000099834088, "eval_sts_dev_spearman_max": 0.7577360756559688, "step": 95 }, { "epoch": 2.267409470752089, "grad_norm": 5.178345680236816, "learning_rate": 6.957731779439903e-07, "loss": 0.5854, "step": 100 }, { "epoch": 2.267409470752089, "eval_loss": 0.03583008423447609, "eval_runtime": 7.173, "eval_samples_per_second": 209.119, "eval_steps_per_second": 26.21, "eval_sts_dev_pearson_cosine": 0.7597921519073876, "eval_sts_dev_pearson_dot": 0.6678122125215467, "eval_sts_dev_pearson_euclidean": 0.7035339087302831, "eval_sts_dev_pearson_manhattan": 0.7056098859433702, "eval_sts_dev_pearson_max": 0.7597921519073876, "eval_sts_dev_spearman_cosine": 0.758602852970159, "eval_sts_dev_spearman_dot": 0.6582467955758544, "eval_sts_dev_spearman_euclidean": 0.6948605697617651, "eval_sts_dev_spearman_manhattan": 0.6975703877172783, "eval_sts_dev_spearman_max": 0.758602852970159, "step": 100 }, { "epoch": 2.3788300835654597, "eval_loss": 0.035686325281858444, "eval_runtime": 7.0872, "eval_samples_per_second": 211.649, "eval_steps_per_second": 26.527, "eval_sts_dev_pearson_cosine": 0.7606568693270315, "eval_sts_dev_pearson_dot": 0.6687514434055418, "eval_sts_dev_pearson_euclidean": 0.704286798579542, "eval_sts_dev_pearson_manhattan": 0.7063472791256069, "eval_sts_dev_pearson_max": 0.7606568693270315, "eval_sts_dev_spearman_cosine": 0.7597087921768803, "eval_sts_dev_spearman_dot": 0.658946428183679, "eval_sts_dev_spearman_euclidean": 0.695592274693547, "eval_sts_dev_spearman_manhattan": 0.6983308228030709, "eval_sts_dev_spearman_max": 0.7597087921768803, "step": 105 }, { "epoch": 2.4902506963788302, "grad_norm": 5.807418346405029, "learning_rate": 7.653504957383893e-07, "loss": 0.6027, "step": 110 }, { "epoch": 2.4902506963788302, "eval_loss": 0.035556692630052567, "eval_runtime": 7.0493, "eval_samples_per_second": 212.788, "eval_steps_per_second": 26.669, "eval_sts_dev_pearson_cosine": 0.761527279815679, "eval_sts_dev_pearson_dot": 0.6695555734987789, "eval_sts_dev_pearson_euclidean": 0.705168673869323, "eval_sts_dev_pearson_manhattan": 0.7072038979059934, "eval_sts_dev_pearson_max": 0.761527279815679, "eval_sts_dev_spearman_cosine": 0.760744250643423, "eval_sts_dev_spearman_dot": 0.6597526569449198, "eval_sts_dev_spearman_euclidean": 0.6967183194293859, "eval_sts_dev_spearman_manhattan": 0.6992467241695522, "eval_sts_dev_spearman_max": 0.760744250643423, "step": 110 }, { "epoch": 2.6016713091922004, "eval_loss": 0.03542407229542732, "eval_runtime": 6.8156, "eval_samples_per_second": 220.083, "eval_steps_per_second": 27.584, "eval_sts_dev_pearson_cosine": 0.7623648733142145, "eval_sts_dev_pearson_dot": 0.6704772598451654, "eval_sts_dev_pearson_euclidean": 0.7059197567148983, "eval_sts_dev_pearson_manhattan": 0.7079355090955533, "eval_sts_dev_pearson_max": 0.7623648733142145, "eval_sts_dev_spearman_cosine": 0.7618131283610858, "eval_sts_dev_spearman_dot": 0.6605908503497494, "eval_sts_dev_spearman_euclidean": 0.6976245585578177, "eval_sts_dev_spearman_manhattan": 0.7002055764519721, "eval_sts_dev_spearman_max": 0.7618131283610858, "step": 115 }, { "epoch": 2.713091922005571, "grad_norm": 4.760545253753662, "learning_rate": 8.349278135327883e-07, "loss": 0.6375, "step": 120 }, { "epoch": 2.713091922005571, "eval_loss": 0.03528669476509094, "eval_runtime": 6.9936, "eval_samples_per_second": 214.481, "eval_steps_per_second": 26.882, "eval_sts_dev_pearson_cosine": 0.7631052098822656, "eval_sts_dev_pearson_dot": 0.6714460378701741, "eval_sts_dev_pearson_euclidean": 0.7064722681555804, "eval_sts_dev_pearson_manhattan": 0.7084736073971417, "eval_sts_dev_pearson_max": 0.7631052098822656, "eval_sts_dev_spearman_cosine": 0.7627318359213398, "eval_sts_dev_spearman_dot": 0.6614807337490313, "eval_sts_dev_spearman_euclidean": 0.6982972981814837, "eval_sts_dev_spearman_manhattan": 0.7008247751818659, "eval_sts_dev_spearman_max": 0.7627318359213398, "step": 120 }, { "epoch": 2.8245125348189415, "eval_loss": 0.03514046594500542, "eval_runtime": 7.0888, "eval_samples_per_second": 211.601, "eval_steps_per_second": 26.521, "eval_sts_dev_pearson_cosine": 0.7638287349941795, "eval_sts_dev_pearson_dot": 0.6724854308235324, "eval_sts_dev_pearson_euclidean": 0.7068315364008582, "eval_sts_dev_pearson_manhattan": 0.7088273928548983, "eval_sts_dev_pearson_max": 0.7638287349941795, "eval_sts_dev_spearman_cosine": 0.7635020295116245, "eval_sts_dev_spearman_dot": 0.6624349213377722, "eval_sts_dev_spearman_euclidean": 0.6987404256446157, "eval_sts_dev_spearman_manhattan": 0.7011827796563965, "eval_sts_dev_spearman_max": 0.7635020295116245, "step": 125 }, { "epoch": 2.935933147632312, "grad_norm": 4.589956760406494, "learning_rate": 9.045051313271874e-07, "loss": 0.6204, "step": 130 }, { "epoch": 2.935933147632312, "eval_loss": 0.03499244153499603, "eval_runtime": 7.03, "eval_samples_per_second": 213.37, "eval_steps_per_second": 26.742, "eval_sts_dev_pearson_cosine": 0.7646248483324349, "eval_sts_dev_pearson_dot": 0.6736502017218999, "eval_sts_dev_pearson_euclidean": 0.707216661995043, "eval_sts_dev_pearson_manhattan": 0.709212008478957, "eval_sts_dev_pearson_max": 0.7646248483324349, "eval_sts_dev_spearman_cosine": 0.7643307027826172, "eval_sts_dev_spearman_dot": 0.6636213615361183, "eval_sts_dev_spearman_euclidean": 0.6991046333767655, "eval_sts_dev_spearman_manhattan": 0.7016301334896569, "eval_sts_dev_spearman_max": 0.7643307027826172, "step": 130 }, { "epoch": 3.066852367688022, "eval_loss": 0.0348396897315979, "eval_runtime": 6.8191, "eval_samples_per_second": 219.972, "eval_steps_per_second": 27.57, "eval_sts_dev_pearson_cosine": 0.7654784319525549, "eval_sts_dev_pearson_dot": 0.6748874130308962, "eval_sts_dev_pearson_euclidean": 0.707641103763532, "eval_sts_dev_pearson_manhattan": 0.7096402166194079, "eval_sts_dev_pearson_max": 0.7654784319525549, "eval_sts_dev_spearman_cosine": 0.7653040232955037, "eval_sts_dev_spearman_dot": 0.6648308844991435, "eval_sts_dev_spearman_euclidean": 0.6995286562724882, "eval_sts_dev_spearman_manhattan": 0.7020556361876031, "eval_sts_dev_spearman_max": 0.7653040232955037, "step": 135 }, { "epoch": 3.1782729805013927, "grad_norm": 5.2525177001953125, "learning_rate": 9.740824491215864e-07, "loss": 0.6077, "step": 140 }, { "epoch": 3.1782729805013927, "eval_loss": 0.034706421196460724, "eval_runtime": 6.9212, "eval_samples_per_second": 216.726, "eval_steps_per_second": 27.163, "eval_sts_dev_pearson_cosine": 0.766263287304504, "eval_sts_dev_pearson_dot": 0.675948414551205, "eval_sts_dev_pearson_euclidean": 0.7081178470450136, "eval_sts_dev_pearson_manhattan": 0.7101145234880011, "eval_sts_dev_pearson_max": 0.766263287304504, "eval_sts_dev_spearman_cosine": 0.7662688094783671, "eval_sts_dev_spearman_dot": 0.666010980931314, "eval_sts_dev_spearman_euclidean": 0.7000434625148538, "eval_sts_dev_spearman_manhattan": 0.7026795019088747, "eval_sts_dev_spearman_max": 0.7662688094783671, "step": 140 }, { "epoch": 3.2896935933147633, "eval_loss": 0.03455406054854393, "eval_runtime": 7.043, "eval_samples_per_second": 212.977, "eval_steps_per_second": 26.693, "eval_sts_dev_pearson_cosine": 0.7669976344527232, "eval_sts_dev_pearson_dot": 0.6772130506339817, "eval_sts_dev_pearson_euclidean": 0.708340308738127, "eval_sts_dev_pearson_manhattan": 0.7103439855119656, "eval_sts_dev_pearson_max": 0.7669976344527232, "eval_sts_dev_spearman_cosine": 0.7671727295922609, "eval_sts_dev_spearman_dot": 0.6673064034335351, "eval_sts_dev_spearman_euclidean": 0.7002877031846776, "eval_sts_dev_spearman_manhattan": 0.702790337375281, "eval_sts_dev_spearman_max": 0.7671727295922609, "step": 145 }, { "epoch": 3.401114206128134, "grad_norm": 4.290932655334473, "learning_rate": 1.0436597669159855e-06, "loss": 0.5772, "step": 150 }, { "epoch": 3.401114206128134, "eval_loss": 0.03440996631979942, "eval_runtime": 6.9991, "eval_samples_per_second": 214.314, "eval_steps_per_second": 26.861, "eval_sts_dev_pearson_cosine": 0.7678013334085165, "eval_sts_dev_pearson_dot": 0.6782264568419771, "eval_sts_dev_pearson_euclidean": 0.7088142668828982, "eval_sts_dev_pearson_manhattan": 0.7108177316956372, "eval_sts_dev_pearson_max": 0.7678013334085165, "eval_sts_dev_spearman_cosine": 0.7681151208619762, "eval_sts_dev_spearman_dot": 0.6684064347787971, "eval_sts_dev_spearman_euclidean": 0.7007326265687747, "eval_sts_dev_spearman_manhattan": 0.7034130912956567, "eval_sts_dev_spearman_max": 0.7681151208619762, "step": 150 }, { "epoch": 3.5125348189415044, "eval_loss": 0.03426254168152809, "eval_runtime": 6.9148, "eval_samples_per_second": 216.927, "eval_steps_per_second": 27.188, "eval_sts_dev_pearson_cosine": 0.7686124311956235, "eval_sts_dev_pearson_dot": 0.6794161416279998, "eval_sts_dev_pearson_euclidean": 0.709205498775362, "eval_sts_dev_pearson_manhattan": 0.7112097548871816, "eval_sts_dev_pearson_max": 0.7686124311956235, "eval_sts_dev_spearman_cosine": 0.7690103394236019, "eval_sts_dev_spearman_dot": 0.6696872196092013, "eval_sts_dev_spearman_euclidean": 0.7011801292985436, "eval_sts_dev_spearman_manhattan": 0.7038459216523878, "eval_sts_dev_spearman_max": 0.7690103394236019, "step": 155 }, { "epoch": 3.6239554317548746, "grad_norm": 4.819970607757568, "learning_rate": 1.1132370847103846e-06, "loss": 0.5793, "step": 160 }, { "epoch": 3.6239554317548746, "eval_loss": 0.034103069454431534, "eval_runtime": 6.9114, "eval_samples_per_second": 217.034, "eval_steps_per_second": 27.202, "eval_sts_dev_pearson_cosine": 0.7693080146114311, "eval_sts_dev_pearson_dot": 0.6808270827318879, "eval_sts_dev_pearson_euclidean": 0.7092760536788121, "eval_sts_dev_pearson_manhattan": 0.7112999822871733, "eval_sts_dev_pearson_max": 0.7693080146114311, "eval_sts_dev_spearman_cosine": 0.7697572707961111, "eval_sts_dev_spearman_dot": 0.6712211078819724, "eval_sts_dev_spearman_euclidean": 0.7012923239631217, "eval_sts_dev_spearman_manhattan": 0.7039518721666204, "eval_sts_dev_spearman_max": 0.7697572707961111, "step": 160 }, { "epoch": 3.735376044568245, "eval_loss": 0.0339648611843586, "eval_runtime": 7.0549, "eval_samples_per_second": 212.619, "eval_steps_per_second": 26.648, "eval_sts_dev_pearson_cosine": 0.7699086602747196, "eval_sts_dev_pearson_dot": 0.6821532615290817, "eval_sts_dev_pearson_euclidean": 0.7093076470422841, "eval_sts_dev_pearson_manhattan": 0.7113457981900373, "eval_sts_dev_pearson_max": 0.7699086602747196, "eval_sts_dev_spearman_cosine": 0.7705074022984313, "eval_sts_dev_spearman_dot": 0.672703308451007, "eval_sts_dev_spearman_euclidean": 0.7012209819190688, "eval_sts_dev_spearman_manhattan": 0.7040382275601695, "eval_sts_dev_spearman_max": 0.7705074022984313, "step": 165 }, { "epoch": 3.8467966573816157, "grad_norm": 4.6819539070129395, "learning_rate": 1.1828144025047836e-06, "loss": 0.5807, "step": 170 }, { "epoch": 3.8467966573816157, "eval_loss": 0.033846523612737656, "eval_runtime": 7.007, "eval_samples_per_second": 214.071, "eval_steps_per_second": 26.83, "eval_sts_dev_pearson_cosine": 0.7705963307037736, "eval_sts_dev_pearson_dot": 0.6833451512490409, "eval_sts_dev_pearson_euclidean": 0.7096001250212141, "eval_sts_dev_pearson_manhattan": 0.711632091113557, "eval_sts_dev_pearson_max": 0.7705963307037736, "eval_sts_dev_spearman_cosine": 0.7712362988663569, "eval_sts_dev_spearman_dot": 0.6739107651886832, "eval_sts_dev_spearman_euclidean": 0.7015732848026783, "eval_sts_dev_spearman_manhattan": 0.7044555432408592, "eval_sts_dev_spearman_max": 0.7712362988663569, "step": 170 }, { "epoch": 3.958217270194986, "eval_loss": 0.03373364359140396, "eval_runtime": 7.0142, "eval_samples_per_second": 213.851, "eval_steps_per_second": 26.803, "eval_sts_dev_pearson_cosine": 0.7712791156976446, "eval_sts_dev_pearson_dot": 0.684640185733316, "eval_sts_dev_pearson_euclidean": 0.7099359677821528, "eval_sts_dev_pearson_manhattan": 0.7119586396389017, "eval_sts_dev_pearson_max": 0.7712791156976446, "eval_sts_dev_spearman_cosine": 0.7720608877912845, "eval_sts_dev_spearman_dot": 0.6752035487866894, "eval_sts_dev_spearman_euclidean": 0.702006901214985, "eval_sts_dev_spearman_manhattan": 0.7046928888569776, "eval_sts_dev_spearman_max": 0.7720608877912845, "step": 175 }, { "epoch": 4.089136490250697, "grad_norm": 5.301053524017334, "learning_rate": 1.2523917202991825e-06, "loss": 0.5576, "step": 180 }, { "epoch": 4.089136490250697, "eval_loss": 0.03359239175915718, "eval_runtime": 6.9089, "eval_samples_per_second": 217.11, "eval_steps_per_second": 27.211, "eval_sts_dev_pearson_cosine": 0.7720094084547121, "eval_sts_dev_pearson_dot": 0.6861711141593462, "eval_sts_dev_pearson_euclidean": 0.7101810374908146, "eval_sts_dev_pearson_manhattan": 0.7122008904979185, "eval_sts_dev_pearson_max": 0.7720094084547121, "eval_sts_dev_spearman_cosine": 0.7729262262575222, "eval_sts_dev_spearman_dot": 0.6768691715243821, "eval_sts_dev_spearman_euclidean": 0.7021927194762703, "eval_sts_dev_spearman_manhattan": 0.7048656110976538, "eval_sts_dev_spearman_max": 0.7729262262575222, "step": 180 }, { "epoch": 4.2005571030640665, "eval_loss": 0.03344343975186348, "eval_runtime": 6.9828, "eval_samples_per_second": 214.815, "eval_steps_per_second": 26.923, "eval_sts_dev_pearson_cosine": 0.7726659560898915, "eval_sts_dev_pearson_dot": 0.687537592193805, "eval_sts_dev_pearson_euclidean": 0.7102002040734198, "eval_sts_dev_pearson_manhattan": 0.7122330129546837, "eval_sts_dev_pearson_max": 0.7726659560898915, "eval_sts_dev_spearman_cosine": 0.773425201401485, "eval_sts_dev_spearman_dot": 0.6783853594387605, "eval_sts_dev_spearman_euclidean": 0.7021304440842328, "eval_sts_dev_spearman_manhattan": 0.7048113806985111, "eval_sts_dev_spearman_max": 0.773425201401485, "step": 185 }, { "epoch": 4.311977715877437, "grad_norm": 6.004096984863281, "learning_rate": 1.3219690380935816e-06, "loss": 0.5244, "step": 190 }, { "epoch": 4.311977715877437, "eval_loss": 0.033325061202049255, "eval_runtime": 7.0443, "eval_samples_per_second": 212.938, "eval_steps_per_second": 26.688, "eval_sts_dev_pearson_cosine": 0.7733338124006317, "eval_sts_dev_pearson_dot": 0.6884427467691409, "eval_sts_dev_pearson_euclidean": 0.7104572948924557, "eval_sts_dev_pearson_manhattan": 0.7124982272648852, "eval_sts_dev_pearson_max": 0.7733338124006317, "eval_sts_dev_spearman_cosine": 0.7740160113372371, "eval_sts_dev_spearman_dot": 0.679314640853313, "eval_sts_dev_spearman_euclidean": 0.7022600917103325, "eval_sts_dev_spearman_manhattan": 0.7049775652371025, "eval_sts_dev_spearman_max": 0.7740160113372371, "step": 190 }, { "epoch": 4.423398328690808, "eval_loss": 0.03322310745716095, "eval_runtime": 7.0234, "eval_samples_per_second": 213.573, "eval_steps_per_second": 26.768, "eval_sts_dev_pearson_cosine": 0.7739695721631923, "eval_sts_dev_pearson_dot": 0.6893517389464994, "eval_sts_dev_pearson_euclidean": 0.7107902241882483, "eval_sts_dev_pearson_manhattan": 0.7128377305936389, "eval_sts_dev_pearson_max": 0.7739695721631923, "eval_sts_dev_spearman_cosine": 0.7747804094168401, "eval_sts_dev_spearman_dot": 0.68031602482782, "eval_sts_dev_spearman_euclidean": 0.702670109171386, "eval_sts_dev_spearman_manhattan": 0.7052134150159887, "eval_sts_dev_spearman_max": 0.7747804094168401, "step": 195 }, { "epoch": 4.534818941504178, "grad_norm": 4.593257427215576, "learning_rate": 1.3915463558879807e-06, "loss": 0.539, "step": 200 }, { "epoch": 4.534818941504178, "eval_loss": 0.033111851662397385, "eval_runtime": 6.8979, "eval_samples_per_second": 217.459, "eval_steps_per_second": 27.255, "eval_sts_dev_pearson_cosine": 0.7745730498768191, "eval_sts_dev_pearson_dot": 0.6906339428851104, "eval_sts_dev_pearson_euclidean": 0.7109426833835167, "eval_sts_dev_pearson_manhattan": 0.7130040632760261, "eval_sts_dev_pearson_max": 0.7745730498768191, "eval_sts_dev_spearman_cosine": 0.775379074216674, "eval_sts_dev_spearman_dot": 0.6818359626434134, "eval_sts_dev_spearman_euclidean": 0.7027815708069985, "eval_sts_dev_spearman_manhattan": 0.7054772962806527, "eval_sts_dev_spearman_max": 0.775379074216674, "step": 200 }, { "epoch": 4.646239554317549, "eval_loss": 0.03302275016903877, "eval_runtime": 6.926, "eval_samples_per_second": 216.576, "eval_steps_per_second": 27.144, "eval_sts_dev_pearson_cosine": 0.7750720264452357, "eval_sts_dev_pearson_dot": 0.6916453477028499, "eval_sts_dev_pearson_euclidean": 0.7112190933233568, "eval_sts_dev_pearson_manhattan": 0.7132769890476369, "eval_sts_dev_pearson_max": 0.7750720264452357, "eval_sts_dev_spearman_cosine": 0.7760014955136583, "eval_sts_dev_spearman_dot": 0.6829098752509514, "eval_sts_dev_spearman_euclidean": 0.7030344024642863, "eval_sts_dev_spearman_manhattan": 0.7059066267642276, "eval_sts_dev_spearman_max": 0.7760014955136583, "step": 205 }, { "epoch": 4.757660167130919, "grad_norm": 4.035131931304932, "learning_rate": 1.4611236736823798e-06, "loss": 0.5517, "step": 210 }, { "epoch": 4.757660167130919, "eval_loss": 0.032943133264780045, "eval_runtime": 7.07, "eval_samples_per_second": 212.164, "eval_steps_per_second": 26.591, "eval_sts_dev_pearson_cosine": 0.775436879881936, "eval_sts_dev_pearson_dot": 0.6925918600460864, "eval_sts_dev_pearson_euclidean": 0.7113376814593186, "eval_sts_dev_pearson_manhattan": 0.7133931376814393, "eval_sts_dev_pearson_max": 0.775436879881936, "eval_sts_dev_spearman_cosine": 0.7764792385291549, "eval_sts_dev_spearman_dot": 0.6839147456943953, "eval_sts_dev_spearman_euclidean": 0.703141329969615, "eval_sts_dev_spearman_manhattan": 0.7059362250994191, "eval_sts_dev_spearman_max": 0.7764792385291549, "step": 210 }, { "epoch": 4.86908077994429, "eval_loss": 0.03284618631005287, "eval_runtime": 6.8565, "eval_samples_per_second": 218.77, "eval_steps_per_second": 27.419, "eval_sts_dev_pearson_cosine": 0.775910266044483, "eval_sts_dev_pearson_dot": 0.6934614331040406, "eval_sts_dev_pearson_euclidean": 0.711548659602332, "eval_sts_dev_pearson_manhattan": 0.7136048948232416, "eval_sts_dev_pearson_max": 0.775910266044483, "eval_sts_dev_spearman_cosine": 0.7769479258303382, "eval_sts_dev_spearman_dot": 0.6848713805423069, "eval_sts_dev_spearman_euclidean": 0.7033494094042918, "eval_sts_dev_spearman_manhattan": 0.7060255698767176, "eval_sts_dev_spearman_max": 0.7769479258303382, "step": 215 }, { "epoch": 4.9805013927576605, "grad_norm": 4.164207458496094, "learning_rate": 1.5307009914767787e-06, "loss": 0.5265, "step": 220 }, { "epoch": 4.9805013927576605, "eval_loss": 0.0327322892844677, "eval_runtime": 6.9904, "eval_samples_per_second": 214.58, "eval_steps_per_second": 26.894, "eval_sts_dev_pearson_cosine": 0.7764785690089298, "eval_sts_dev_pearson_dot": 0.6942342520710683, "eval_sts_dev_pearson_euclidean": 0.7119234281148877, "eval_sts_dev_pearson_manhattan": 0.7139725405773478, "eval_sts_dev_pearson_max": 0.7764785690089298, "eval_sts_dev_spearman_cosine": 0.7776377175908147, "eval_sts_dev_spearman_dot": 0.6856461394544989, "eval_sts_dev_spearman_euclidean": 0.7037087745638393, "eval_sts_dev_spearman_manhattan": 0.7064702298285305, "eval_sts_dev_spearman_max": 0.7776377175908147, "step": 220 }, { "epoch": 5.111420612813371, "eval_loss": 0.032635681331157684, "eval_runtime": 6.8033, "eval_samples_per_second": 220.482, "eval_steps_per_second": 27.634, "eval_sts_dev_pearson_cosine": 0.7768906949758223, "eval_sts_dev_pearson_dot": 0.695219441450241, "eval_sts_dev_pearson_euclidean": 0.7119427716298626, "eval_sts_dev_pearson_manhattan": 0.7139906781614199, "eval_sts_dev_pearson_max": 0.7768906949758223, "eval_sts_dev_spearman_cosine": 0.7779652464100915, "eval_sts_dev_spearman_dot": 0.6869571731826094, "eval_sts_dev_spearman_euclidean": 0.7036077013230951, "eval_sts_dev_spearman_manhattan": 0.7064509076431469, "eval_sts_dev_spearman_max": 0.7779652464100915, "step": 225 }, { "epoch": 5.222841225626741, "grad_norm": 4.008439064025879, "learning_rate": 1.6002783092711777e-06, "loss": 0.5285, "step": 230 }, { "epoch": 5.222841225626741, "eval_loss": 0.03253428637981415, "eval_runtime": 6.8983, "eval_samples_per_second": 217.445, "eval_steps_per_second": 27.253, "eval_sts_dev_pearson_cosine": 0.7772382339972829, "eval_sts_dev_pearson_dot": 0.6962971989781661, "eval_sts_dev_pearson_euclidean": 0.7116605569376889, "eval_sts_dev_pearson_manhattan": 0.7137176755568332, "eval_sts_dev_pearson_max": 0.7772382339972829, "eval_sts_dev_spearman_cosine": 0.7783426175116597, "eval_sts_dev_spearman_dot": 0.6882750477744878, "eval_sts_dev_spearman_euclidean": 0.7031754685029606, "eval_sts_dev_spearman_manhattan": 0.7062052563630147, "eval_sts_dev_spearman_max": 0.7783426175116597, "step": 230 }, { "epoch": 5.334261838440112, "eval_loss": 0.032446879893541336, "eval_runtime": 7.1024, "eval_samples_per_second": 211.195, "eval_steps_per_second": 26.47, "eval_sts_dev_pearson_cosine": 0.7776669424440168, "eval_sts_dev_pearson_dot": 0.6970405122472402, "eval_sts_dev_pearson_euclidean": 0.7117722670287954, "eval_sts_dev_pearson_manhattan": 0.7138312835497453, "eval_sts_dev_pearson_max": 0.7776669424440168, "eval_sts_dev_spearman_cosine": 0.7789160171177805, "eval_sts_dev_spearman_dot": 0.6891076670812013, "eval_sts_dev_spearman_euclidean": 0.7033258975002282, "eval_sts_dev_spearman_manhattan": 0.7062752235073074, "eval_sts_dev_spearman_max": 0.7789160171177805, "step": 235 }, { "epoch": 5.445682451253482, "grad_norm": 3.6369762420654297, "learning_rate": 1.6698556270655766e-06, "loss": 0.4697, "step": 240 }, { "epoch": 5.445682451253482, "eval_loss": 0.03234243392944336, "eval_runtime": 6.782, "eval_samples_per_second": 221.172, "eval_steps_per_second": 27.72, "eval_sts_dev_pearson_cosine": 0.7781440012528016, "eval_sts_dev_pearson_dot": 0.6975764419235699, "eval_sts_dev_pearson_euclidean": 0.712024820219635, "eval_sts_dev_pearson_manhattan": 0.7140934326314853, "eval_sts_dev_pearson_max": 0.7781440012528016, "eval_sts_dev_spearman_cosine": 0.779282426254369, "eval_sts_dev_spearman_dot": 0.6897740636983543, "eval_sts_dev_spearman_euclidean": 0.7035466980830317, "eval_sts_dev_spearman_manhattan": 0.706402706407244, "eval_sts_dev_spearman_max": 0.779282426254369, "step": 240 }, { "epoch": 5.557103064066853, "eval_loss": 0.0322665236890316, "eval_runtime": 7.0287, "eval_samples_per_second": 213.412, "eval_steps_per_second": 26.748, "eval_sts_dev_pearson_cosine": 0.7787273229273541, "eval_sts_dev_pearson_dot": 0.6977971151317023, "eval_sts_dev_pearson_euclidean": 0.7128704818639644, "eval_sts_dev_pearson_manhattan": 0.7149352374625544, "eval_sts_dev_pearson_max": 0.7787273229273541, "eval_sts_dev_spearman_cosine": 0.77981903098488, "eval_sts_dev_spearman_dot": 0.6899867909899472, "eval_sts_dev_spearman_euclidean": 0.7044750738813548, "eval_sts_dev_spearman_manhattan": 0.707203879577786, "eval_sts_dev_spearman_max": 0.77981903098488, "step": 245 }, { "epoch": 5.6685236768802225, "grad_norm": 3.939344882965088, "learning_rate": 1.739432944859976e-06, "loss": 0.4913, "step": 250 }, { "epoch": 5.6685236768802225, "eval_loss": 0.03220539167523384, "eval_runtime": 6.7653, "eval_samples_per_second": 221.72, "eval_steps_per_second": 27.789, "eval_sts_dev_pearson_cosine": 0.7791771917276973, "eval_sts_dev_pearson_dot": 0.6981160056071188, "eval_sts_dev_pearson_euclidean": 0.713488315174772, "eval_sts_dev_pearson_manhattan": 0.7155411689371374, "eval_sts_dev_pearson_max": 0.7791771917276973, "eval_sts_dev_spearman_cosine": 0.7803556746575578, "eval_sts_dev_spearman_dot": 0.6902449156806119, "eval_sts_dev_spearman_euclidean": 0.7052006351141208, "eval_sts_dev_spearman_manhattan": 0.7079806405930662, "eval_sts_dev_spearman_max": 0.7803556746575578, "step": 250 }, { "epoch": 5.779944289693593, "eval_loss": 0.0321136973798275, "eval_runtime": 6.8852, "eval_samples_per_second": 217.857, "eval_steps_per_second": 27.305, "eval_sts_dev_pearson_cosine": 0.7795408783298017, "eval_sts_dev_pearson_dot": 0.698621796206566, "eval_sts_dev_pearson_euclidean": 0.713845705178594, "eval_sts_dev_pearson_manhattan": 0.7158847989781144, "eval_sts_dev_pearson_max": 0.7795408783298017, "eval_sts_dev_spearman_cosine": 0.7808851254829866, "eval_sts_dev_spearman_dot": 0.6910441279803855, "eval_sts_dev_spearman_euclidean": 0.7057147472696849, "eval_sts_dev_spearman_manhattan": 0.7084308417857139, "eval_sts_dev_spearman_max": 0.7808851254829866, "step": 255 }, { "epoch": 5.891364902506964, "grad_norm": 4.813522815704346, "learning_rate": 1.8090102626543748e-06, "loss": 0.5253, "step": 260 }, { "epoch": 5.891364902506964, "eval_loss": 0.03203197568655014, "eval_runtime": 7.0476, "eval_samples_per_second": 212.839, "eval_steps_per_second": 26.676, "eval_sts_dev_pearson_cosine": 0.7799732728461426, "eval_sts_dev_pearson_dot": 0.6992354089058229, "eval_sts_dev_pearson_euclidean": 0.7142404896335972, "eval_sts_dev_pearson_manhattan": 0.716270082443381, "eval_sts_dev_pearson_max": 0.7799732728461426, "eval_sts_dev_spearman_cosine": 0.7812777358255738, "eval_sts_dev_spearman_dot": 0.6917093769490908, "eval_sts_dev_spearman_euclidean": 0.7062223056881557, "eval_sts_dev_spearman_manhattan": 0.7089598550457142, "eval_sts_dev_spearman_max": 0.7812777358255738, "step": 260 }, { "epoch": 6.022284122562674, "eval_loss": 0.03195018321275711, "eval_runtime": 7.099, "eval_samples_per_second": 211.299, "eval_steps_per_second": 26.483, "eval_sts_dev_pearson_cosine": 0.7803233438802165, "eval_sts_dev_pearson_dot": 0.6999738035020234, "eval_sts_dev_pearson_euclidean": 0.7143605362249807, "eval_sts_dev_pearson_manhattan": 0.7163833317778756, "eval_sts_dev_pearson_max": 0.7803233438802165, "eval_sts_dev_spearman_cosine": 0.7817289518382318, "eval_sts_dev_spearman_dot": 0.692658440982393, "eval_sts_dev_spearman_euclidean": 0.7062913822145624, "eval_sts_dev_spearman_manhattan": 0.7091007508962174, "eval_sts_dev_spearman_max": 0.7817289518382318, "step": 265 }, { "epoch": 6.133704735376044, "grad_norm": 3.873243570327759, "learning_rate": 1.8785875804487739e-06, "loss": 0.4924, "step": 270 }, { "epoch": 6.133704735376044, "eval_loss": 0.031853143125772476, "eval_runtime": 7.045, "eval_samples_per_second": 212.918, "eval_steps_per_second": 26.686, "eval_sts_dev_pearson_cosine": 0.7805555688659683, "eval_sts_dev_pearson_dot": 0.7005444051022546, "eval_sts_dev_pearson_euclidean": 0.7142124903197049, "eval_sts_dev_pearson_manhattan": 0.716248059084913, "eval_sts_dev_pearson_max": 0.7805555688659683, "eval_sts_dev_spearman_cosine": 0.7818561644430513, "eval_sts_dev_spearman_dot": 0.6936098988133554, "eval_sts_dev_spearman_euclidean": 0.7060309965817769, "eval_sts_dev_spearman_manhattan": 0.7089509487437853, "eval_sts_dev_spearman_max": 0.7818561644430513, "step": 270 }, { "epoch": 6.245125348189415, "eval_loss": 0.031787075102329254, "eval_runtime": 7.0799, "eval_samples_per_second": 211.867, "eval_steps_per_second": 26.554, "eval_sts_dev_pearson_cosine": 0.7807075804252084, "eval_sts_dev_pearson_dot": 0.7015197969666243, "eval_sts_dev_pearson_euclidean": 0.713830705347577, "eval_sts_dev_pearson_manhattan": 0.7158793994538133, "eval_sts_dev_pearson_max": 0.7807075804252084, "eval_sts_dev_spearman_cosine": 0.7819875621854264, "eval_sts_dev_spearman_dot": 0.694826261852757, "eval_sts_dev_spearman_euclidean": 0.7053731328646764, "eval_sts_dev_spearman_manhattan": 0.7083527948173437, "eval_sts_dev_spearman_max": 0.7819875621854264, "step": 275 }, { "epoch": 6.3565459610027855, "grad_norm": 4.469658374786377, "learning_rate": 1.9481648982431728e-06, "loss": 0.4844, "step": 280 }, { "epoch": 6.3565459610027855, "eval_loss": 0.031746331602334976, "eval_runtime": 6.7748, "eval_samples_per_second": 221.41, "eval_steps_per_second": 27.75, "eval_sts_dev_pearson_cosine": 0.7808289673024869, "eval_sts_dev_pearson_dot": 0.702423126121021, "eval_sts_dev_pearson_euclidean": 0.7134962000576563, "eval_sts_dev_pearson_manhattan": 0.7155503733116253, "eval_sts_dev_pearson_max": 0.7808289673024869, "eval_sts_dev_spearman_cosine": 0.7822111314547963, "eval_sts_dev_spearman_dot": 0.6958278382473629, "eval_sts_dev_spearman_euclidean": 0.7049726585244658, "eval_sts_dev_spearman_manhattan": 0.7078651037745494, "eval_sts_dev_spearman_max": 0.7822111314547963, "step": 280 }, { "epoch": 6.467966573816156, "eval_loss": 0.0316670723259449, "eval_runtime": 7.0619, "eval_samples_per_second": 212.406, "eval_steps_per_second": 26.622, "eval_sts_dev_pearson_cosine": 0.781180936397055, "eval_sts_dev_pearson_dot": 0.7027629453006121, "eval_sts_dev_pearson_euclidean": 0.7136902176147873, "eval_sts_dev_pearson_manhattan": 0.715757628364657, "eval_sts_dev_pearson_max": 0.781180936397055, "eval_sts_dev_spearman_cosine": 0.78250079334828, "eval_sts_dev_spearman_dot": 0.6962981450393402, "eval_sts_dev_spearman_euclidean": 0.7051141445683561, "eval_sts_dev_spearman_manhattan": 0.70821885209965, "eval_sts_dev_spearman_max": 0.78250079334828, "step": 285 }, { "epoch": 6.579387186629527, "grad_norm": 4.325808048248291, "learning_rate": 2.017742216037572e-06, "loss": 0.442, "step": 290 }, { "epoch": 6.579387186629527, "eval_loss": 0.03155314922332764, "eval_runtime": 6.933, "eval_samples_per_second": 216.356, "eval_steps_per_second": 27.117, "eval_sts_dev_pearson_cosine": 0.781592834547759, "eval_sts_dev_pearson_dot": 0.7030321075873802, "eval_sts_dev_pearson_euclidean": 0.7138293804278546, "eval_sts_dev_pearson_manhattan": 0.7159175761814789, "eval_sts_dev_pearson_max": 0.781592834547759, "eval_sts_dev_spearman_cosine": 0.7827403875693918, "eval_sts_dev_spearman_dot": 0.6966818933630766, "eval_sts_dev_spearman_euclidean": 0.705222522900883, "eval_sts_dev_spearman_manhattan": 0.7082679375517423, "eval_sts_dev_spearman_max": 0.7827403875693918, "step": 290 }, { "epoch": 6.690807799442897, "eval_loss": 0.0314662829041481, "eval_runtime": 7.0474, "eval_samples_per_second": 212.844, "eval_steps_per_second": 26.676, "eval_sts_dev_pearson_cosine": 0.7820122068864954, "eval_sts_dev_pearson_dot": 0.703421139648371, "eval_sts_dev_pearson_euclidean": 0.7141068771656474, "eval_sts_dev_pearson_manhattan": 0.7162068261112142, "eval_sts_dev_pearson_max": 0.7820122068864954, "eval_sts_dev_spearman_cosine": 0.7829970553896861, "eval_sts_dev_spearman_dot": 0.6970113959506001, "eval_sts_dev_spearman_euclidean": 0.7054796488454884, "eval_sts_dev_spearman_manhattan": 0.7085587324330124, "eval_sts_dev_spearman_max": 0.7829970553896861, "step": 295 }, { "epoch": 6.802228412256268, "grad_norm": 3.6315908432006836, "learning_rate": 2.087319533831971e-06, "loss": 0.4665, "step": 300 }, { "epoch": 6.802228412256268, "eval_loss": 0.03142312169075012, "eval_runtime": 6.9811, "eval_samples_per_second": 214.864, "eval_steps_per_second": 26.93, "eval_sts_dev_pearson_cosine": 0.7823768397963167, "eval_sts_dev_pearson_dot": 0.7038756871911903, "eval_sts_dev_pearson_euclidean": 0.7145009916374723, "eval_sts_dev_pearson_manhattan": 0.7165993081434159, "eval_sts_dev_pearson_max": 0.7823768397963167, "eval_sts_dev_spearman_cosine": 0.7834457731278, "eval_sts_dev_spearman_dot": 0.6973417239926998, "eval_sts_dev_spearman_euclidean": 0.7059158400220358, "eval_sts_dev_spearman_manhattan": 0.7090603670611569, "eval_sts_dev_spearman_max": 0.7834457731278, "step": 300 }, { "epoch": 6.913649025069638, "eval_loss": 0.03140180557966232, "eval_runtime": 7.0935, "eval_samples_per_second": 211.462, "eval_steps_per_second": 26.503, "eval_sts_dev_pearson_cosine": 0.7827189612475338, "eval_sts_dev_pearson_dot": 0.7043799909167585, "eval_sts_dev_pearson_euclidean": 0.715034388904346, "eval_sts_dev_pearson_manhattan": 0.7171022025564596, "eval_sts_dev_pearson_max": 0.7827189612475338, "eval_sts_dev_spearman_cosine": 0.7839004976206189, "eval_sts_dev_spearman_dot": 0.6975156259478882, "eval_sts_dev_spearman_euclidean": 0.7065303588201288, "eval_sts_dev_spearman_manhattan": 0.7095736568498506, "eval_sts_dev_spearman_max": 0.7839004976206189, "step": 305 }, { "epoch": 7.044568245125348, "grad_norm": 4.26026725769043, "learning_rate": 2.15689685162637e-06, "loss": 0.4672, "step": 310 }, { "epoch": 7.044568245125348, "eval_loss": 0.03136160969734192, "eval_runtime": 6.6776, "eval_samples_per_second": 224.63, "eval_steps_per_second": 28.154, "eval_sts_dev_pearson_cosine": 0.7831698418188764, "eval_sts_dev_pearson_dot": 0.7044122663834302, "eval_sts_dev_pearson_euclidean": 0.7156598421085834, "eval_sts_dev_pearson_manhattan": 0.7176890258722983, "eval_sts_dev_pearson_max": 0.7831698418188764, "eval_sts_dev_spearman_cosine": 0.7843284949390994, "eval_sts_dev_spearman_dot": 0.697639093220699, "eval_sts_dev_spearman_euclidean": 0.7073241375609828, "eval_sts_dev_spearman_manhattan": 0.710185012169815, "eval_sts_dev_spearman_max": 0.7843284949390994, "step": 310 }, { "epoch": 7.155988857938719, "eval_loss": 0.031366512179374695, "eval_runtime": 6.9924, "eval_samples_per_second": 214.518, "eval_steps_per_second": 26.886, "eval_sts_dev_pearson_cosine": 0.7835832006721541, "eval_sts_dev_pearson_dot": 0.7043934252027199, "eval_sts_dev_pearson_euclidean": 0.7164264689263184, "eval_sts_dev_pearson_manhattan": 0.7184030248845167, "eval_sts_dev_pearson_max": 0.7835832006721541, "eval_sts_dev_spearman_cosine": 0.7850548943796795, "eval_sts_dev_spearman_dot": 0.6977756771302583, "eval_sts_dev_spearman_euclidean": 0.708343725874613, "eval_sts_dev_spearman_manhattan": 0.7111504960736558, "eval_sts_dev_spearman_max": 0.7850548943796795, "step": 315 }, { "epoch": 7.2674094707520895, "grad_norm": 3.808695077896118, "learning_rate": 2.226474169420769e-06, "loss": 0.4131, "step": 320 }, { "epoch": 7.2674094707520895, "eval_loss": 0.03135285899043083, "eval_runtime": 6.9057, "eval_samples_per_second": 217.213, "eval_steps_per_second": 27.224, "eval_sts_dev_pearson_cosine": 0.7836045257427042, "eval_sts_dev_pearson_dot": 0.7048735903915628, "eval_sts_dev_pearson_euclidean": 0.7161062363729224, "eval_sts_dev_pearson_manhattan": 0.7180798998241316, "eval_sts_dev_pearson_max": 0.7836045257427042, "eval_sts_dev_spearman_cosine": 0.7849975337135177, "eval_sts_dev_spearman_dot": 0.6982899839848741, "eval_sts_dev_spearman_euclidean": 0.7079431278357644, "eval_sts_dev_spearman_manhattan": 0.710852480857077, "eval_sts_dev_spearman_max": 0.7849975337135177, "step": 320 }, { "epoch": 7.378830083565459, "eval_loss": 0.03127756714820862, "eval_runtime": 6.9241, "eval_samples_per_second": 216.634, "eval_steps_per_second": 27.151, "eval_sts_dev_pearson_cosine": 0.7836610063557831, "eval_sts_dev_pearson_dot": 0.705409260823171, "eval_sts_dev_pearson_euclidean": 0.7154023331837831, "eval_sts_dev_pearson_manhattan": 0.717401985035912, "eval_sts_dev_pearson_max": 0.7836610063557831, "eval_sts_dev_spearman_cosine": 0.7848718916416149, "eval_sts_dev_spearman_dot": 0.6991510364393221, "eval_sts_dev_spearman_euclidean": 0.7071171759954781, "eval_sts_dev_spearman_manhattan": 0.709827734664151, "eval_sts_dev_spearman_max": 0.7848718916416149, "step": 325 }, { "epoch": 7.49025069637883, "grad_norm": 3.8009250164031982, "learning_rate": 2.2960514872151678e-06, "loss": 0.4221, "step": 330 }, { "epoch": 7.49025069637883, "eval_loss": 0.031188100576400757, "eval_runtime": 7.0999, "eval_samples_per_second": 211.272, "eval_steps_per_second": 26.479, "eval_sts_dev_pearson_cosine": 0.7838825238345812, "eval_sts_dev_pearson_dot": 0.7057496676467132, "eval_sts_dev_pearson_euclidean": 0.7150892410708943, "eval_sts_dev_pearson_manhattan": 0.7171064711121474, "eval_sts_dev_pearson_max": 0.7838825238345812, "eval_sts_dev_spearman_cosine": 0.784820320759411, "eval_sts_dev_spearman_dot": 0.6997042671311072, "eval_sts_dev_spearman_euclidean": 0.7065608619879493, "eval_sts_dev_spearman_manhattan": 0.7094620852598932, "eval_sts_dev_spearman_max": 0.784820320759411, "step": 330 }, { "epoch": 7.6016713091922, "eval_loss": 0.0310923233628273, "eval_runtime": 6.9662, "eval_samples_per_second": 215.326, "eval_steps_per_second": 26.987, "eval_sts_dev_pearson_cosine": 0.7843923769897447, "eval_sts_dev_pearson_dot": 0.7058530968248947, "eval_sts_dev_pearson_euclidean": 0.7155332189451762, "eval_sts_dev_pearson_manhattan": 0.7175425736786123, "eval_sts_dev_pearson_max": 0.7843923769897447, "eval_sts_dev_spearman_cosine": 0.7853756910328091, "eval_sts_dev_spearman_dot": 0.6999248217974418, "eval_sts_dev_spearman_euclidean": 0.7071685659073802, "eval_sts_dev_spearman_manhattan": 0.7099135119853421, "eval_sts_dev_spearman_max": 0.7853756910328091, "step": 335 }, { "epoch": 7.713091922005571, "grad_norm": 4.329479694366455, "learning_rate": 2.3656288050095673e-06, "loss": 0.4268, "step": 340 }, { "epoch": 7.713091922005571, "eval_loss": 0.031015686690807343, "eval_runtime": 6.8718, "eval_samples_per_second": 218.283, "eval_steps_per_second": 27.358, "eval_sts_dev_pearson_cosine": 0.7848078944075182, "eval_sts_dev_pearson_dot": 0.7062611613987171, "eval_sts_dev_pearson_euclidean": 0.7156669541008578, "eval_sts_dev_pearson_manhattan": 0.7176849379592309, "eval_sts_dev_pearson_max": 0.7848078944075182, "eval_sts_dev_spearman_cosine": 0.7857175803487115, "eval_sts_dev_spearman_dot": 0.7006071388870717, "eval_sts_dev_spearman_euclidean": 0.7074396606352066, "eval_sts_dev_spearman_manhattan": 0.7101303213368534, "eval_sts_dev_spearman_max": 0.7857175803487115, "step": 340 }, { "epoch": 7.8245125348189415, "eval_loss": 0.030945729464292526, "eval_runtime": 6.9722, "eval_samples_per_second": 215.14, "eval_steps_per_second": 26.964, "eval_sts_dev_pearson_cosine": 0.7852280992749574, "eval_sts_dev_pearson_dot": 0.7063015365766652, "eval_sts_dev_pearson_euclidean": 0.71618048050416, "eval_sts_dev_pearson_manhattan": 0.7181959951306995, "eval_sts_dev_pearson_max": 0.7852280992749574, "eval_sts_dev_spearman_cosine": 0.7861447827888495, "eval_sts_dev_spearman_dot": 0.7007253260607372, "eval_sts_dev_spearman_euclidean": 0.7080307843557273, "eval_sts_dev_spearman_manhattan": 0.710707788624518, "eval_sts_dev_spearman_max": 0.7861447827888495, "step": 345 }, { "epoch": 7.935933147632312, "grad_norm": 4.521576881408691, "learning_rate": 2.435206122803966e-06, "loss": 0.4316, "step": 350 }, { "epoch": 7.935933147632312, "eval_loss": 0.030903467908501625, "eval_runtime": 6.8754, "eval_samples_per_second": 218.169, "eval_steps_per_second": 27.344, "eval_sts_dev_pearson_cosine": 0.7857408106817081, "eval_sts_dev_pearson_dot": 0.7063227803586387, "eval_sts_dev_pearson_euclidean": 0.7171064497768416, "eval_sts_dev_pearson_manhattan": 0.7190977579026478, "eval_sts_dev_pearson_max": 0.7857408106817081, "eval_sts_dev_spearman_cosine": 0.786647063435545, "eval_sts_dev_spearman_dot": 0.7004210617791904, "eval_sts_dev_spearman_euclidean": 0.7090060931384192, "eval_sts_dev_spearman_manhattan": 0.7117304388117395, "eval_sts_dev_spearman_max": 0.786647063435545, "step": 350 }, { "epoch": 8.066852367688023, "eval_loss": 0.03090326115489006, "eval_runtime": 6.7967, "eval_samples_per_second": 220.696, "eval_steps_per_second": 27.661, "eval_sts_dev_pearson_cosine": 0.7860914327083659, "eval_sts_dev_pearson_dot": 0.7067109311815922, "eval_sts_dev_pearson_euclidean": 0.7179978723314155, "eval_sts_dev_pearson_manhattan": 0.7199506434198831, "eval_sts_dev_pearson_max": 0.7860914327083659, "eval_sts_dev_spearman_cosine": 0.7871799411716375, "eval_sts_dev_spearman_dot": 0.7005966817709771, "eval_sts_dev_spearman_euclidean": 0.7099849983444726, "eval_sts_dev_spearman_manhattan": 0.7126081974741519, "eval_sts_dev_spearman_max": 0.7871799411716375, "step": 355 }, { "epoch": 8.178272980501394, "grad_norm": 3.464381217956543, "learning_rate": 2.504783440598365e-06, "loss": 0.4277, "step": 360 }, { "epoch": 8.178272980501394, "eval_loss": 0.030861668288707733, "eval_runtime": 6.8952, "eval_samples_per_second": 217.544, "eval_steps_per_second": 27.265, "eval_sts_dev_pearson_cosine": 0.7862113365203784, "eval_sts_dev_pearson_dot": 0.7070142268847368, "eval_sts_dev_pearson_euclidean": 0.7181137478219999, "eval_sts_dev_pearson_manhattan": 0.7200573508948256, "eval_sts_dev_pearson_max": 0.7862113365203784, "eval_sts_dev_spearman_cosine": 0.7873051906331155, "eval_sts_dev_spearman_dot": 0.700851803333668, "eval_sts_dev_spearman_euclidean": 0.7101326235059475, "eval_sts_dev_spearman_manhattan": 0.7126791959108771, "eval_sts_dev_spearman_max": 0.7873051906331155, "step": 360 }, { "epoch": 8.289693593314762, "eval_loss": 0.03079277276992798, "eval_runtime": 7.0041, "eval_samples_per_second": 214.159, "eval_steps_per_second": 26.841, "eval_sts_dev_pearson_cosine": 0.7861051555153227, "eval_sts_dev_pearson_dot": 0.7077462081618229, "eval_sts_dev_pearson_euclidean": 0.7175047036545574, "eval_sts_dev_pearson_manhattan": 0.7194616943503004, "eval_sts_dev_pearson_max": 0.7861051555153227, "eval_sts_dev_spearman_cosine": 0.7869754283660466, "eval_sts_dev_spearman_dot": 0.7018953525077267, "eval_sts_dev_spearman_euclidean": 0.7093618435488815, "eval_sts_dev_spearman_manhattan": 0.7120432245619701, "eval_sts_dev_spearman_max": 0.7869754283660466, "step": 365 }, { "epoch": 8.401114206128133, "grad_norm": 3.629032850265503, "learning_rate": 2.5743607583927645e-06, "loss": 0.3925, "step": 370 }, { "epoch": 8.401114206128133, "eval_loss": 0.03077574074268341, "eval_runtime": 6.9569, "eval_samples_per_second": 215.613, "eval_steps_per_second": 27.024, "eval_sts_dev_pearson_cosine": 0.7860927703016911, "eval_sts_dev_pearson_dot": 0.7084805810982604, "eval_sts_dev_pearson_euclidean": 0.7171292733763057, "eval_sts_dev_pearson_manhattan": 0.7191008391698412, "eval_sts_dev_pearson_max": 0.7860927703016911, "eval_sts_dev_spearman_cosine": 0.7868465023058949, "eval_sts_dev_spearman_dot": 0.7026257860756843, "eval_sts_dev_spearman_euclidean": 0.7087433915922463, "eval_sts_dev_spearman_manhattan": 0.7115662090675204, "eval_sts_dev_spearman_max": 0.7868465023058949, "step": 370 }, { "epoch": 8.512534818941504, "eval_loss": 0.03077036887407303, "eval_runtime": 6.8481, "eval_samples_per_second": 219.038, "eval_steps_per_second": 27.453, "eval_sts_dev_pearson_cosine": 0.7860543259557101, "eval_sts_dev_pearson_dot": 0.7090029747286515, "eval_sts_dev_pearson_euclidean": 0.7168001987123229, "eval_sts_dev_pearson_manhattan": 0.7187912798445806, "eval_sts_dev_pearson_max": 0.7860543259557101, "eval_sts_dev_spearman_cosine": 0.786577121013552, "eval_sts_dev_spearman_dot": 0.7032207123703509, "eval_sts_dev_spearman_euclidean": 0.7083026579268292, "eval_sts_dev_spearman_manhattan": 0.7111138102646555, "eval_sts_dev_spearman_max": 0.786577121013552, "step": 375 }, { "epoch": 8.623955431754874, "grad_norm": 4.5424346923828125, "learning_rate": 2.643938076187163e-06, "loss": 0.4049, "step": 380 }, { "epoch": 8.623955431754874, "eval_loss": 0.030785972252488136, "eval_runtime": 6.9052, "eval_samples_per_second": 217.228, "eval_steps_per_second": 27.226, "eval_sts_dev_pearson_cosine": 0.786338341456081, "eval_sts_dev_pearson_dot": 0.7090251722360976, "eval_sts_dev_pearson_euclidean": 0.7176375494602096, "eval_sts_dev_pearson_manhattan": 0.7195903686388057, "eval_sts_dev_pearson_max": 0.786338341456081, "eval_sts_dev_spearman_cosine": 0.7869461186588641, "eval_sts_dev_spearman_dot": 0.7030353980707192, "eval_sts_dev_spearman_euclidean": 0.7093240329985625, "eval_sts_dev_spearman_manhattan": 0.7120013731894795, "eval_sts_dev_spearman_max": 0.7869461186588641, "step": 380 }, { "epoch": 8.735376044568245, "eval_loss": 0.03077947534620762, "eval_runtime": 6.94, "eval_samples_per_second": 216.137, "eval_steps_per_second": 27.089, "eval_sts_dev_pearson_cosine": 0.7867836664964302, "eval_sts_dev_pearson_dot": 0.7089649699768177, "eval_sts_dev_pearson_euclidean": 0.7185998785212442, "eval_sts_dev_pearson_manhattan": 0.7205256023581162, "eval_sts_dev_pearson_max": 0.7867836664964302, "eval_sts_dev_spearman_cosine": 0.7875195626790124, "eval_sts_dev_spearman_dot": 0.7028351666319841, "eval_sts_dev_spearman_euclidean": 0.7105482738364566, "eval_sts_dev_spearman_manhattan": 0.7132642042369475, "eval_sts_dev_spearman_max": 0.7875195626790124, "step": 385 }, { "epoch": 8.846796657381615, "grad_norm": 3.7269480228424072, "learning_rate": 2.7135153939815623e-06, "loss": 0.3742, "step": 390 }, { "epoch": 8.846796657381615, "eval_loss": 0.030757909640669823, "eval_runtime": 6.912, "eval_samples_per_second": 217.015, "eval_steps_per_second": 27.199, "eval_sts_dev_pearson_cosine": 0.7873307957198338, "eval_sts_dev_pearson_dot": 0.7087450117938812, "eval_sts_dev_pearson_euclidean": 0.7199394166229915, "eval_sts_dev_pearson_manhattan": 0.7218118008402783, "eval_sts_dev_pearson_max": 0.7873307957198338, "eval_sts_dev_spearman_cosine": 0.7883481466120934, "eval_sts_dev_spearman_dot": 0.702431533404311, "eval_sts_dev_spearman_euclidean": 0.7122286167501692, "eval_sts_dev_spearman_manhattan": 0.7149544811678771, "eval_sts_dev_spearman_max": 0.7883481466120934, "step": 390 }, { "epoch": 8.958217270194986, "eval_loss": 0.03074067085981369, "eval_runtime": 7.0786, "eval_samples_per_second": 211.905, "eval_steps_per_second": 26.559, "eval_sts_dev_pearson_cosine": 0.7875281932009626, "eval_sts_dev_pearson_dot": 0.7091183187974348, "eval_sts_dev_pearson_euclidean": 0.720306579358833, "eval_sts_dev_pearson_manhattan": 0.7221545912209083, "eval_sts_dev_pearson_max": 0.7875281932009626, "eval_sts_dev_spearman_cosine": 0.7884911216315376, "eval_sts_dev_spearman_dot": 0.7026504547905195, "eval_sts_dev_spearman_euclidean": 0.7125846397557779, "eval_sts_dev_spearman_manhattan": 0.7153917764693033, "eval_sts_dev_spearman_max": 0.7884911216315376, "step": 395 }, { "epoch": 9.089136490250697, "grad_norm": 3.8048255443573, "learning_rate": 2.7830927117759614e-06, "loss": 0.3498, "step": 400 }, { "epoch": 9.089136490250697, "eval_loss": 0.03073756769299507, "eval_runtime": 7.1819, "eval_samples_per_second": 208.858, "eval_steps_per_second": 26.177, "eval_sts_dev_pearson_cosine": 0.7875285006609543, "eval_sts_dev_pearson_dot": 0.709718276464936, "eval_sts_dev_pearson_euclidean": 0.7202436438310591, "eval_sts_dev_pearson_manhattan": 0.7220766094080024, "eval_sts_dev_pearson_max": 0.7875285006609543, "eval_sts_dev_spearman_cosine": 0.7885939335328866, "eval_sts_dev_spearman_dot": 0.7032536436958657, "eval_sts_dev_spearman_euclidean": 0.7124855846354039, "eval_sts_dev_spearman_manhattan": 0.7153797502128406, "eval_sts_dev_spearman_max": 0.7885939335328866, "step": 400 }, { "epoch": 9.200557103064067, "eval_loss": 0.03071259893476963, "eval_runtime": 6.8201, "eval_samples_per_second": 219.938, "eval_steps_per_second": 27.566, "eval_sts_dev_pearson_cosine": 0.787184477170156, "eval_sts_dev_pearson_dot": 0.7102603851217889, "eval_sts_dev_pearson_euclidean": 0.7195444208609296, "eval_sts_dev_pearson_manhattan": 0.7213936268781151, "eval_sts_dev_pearson_max": 0.787184477170156, "eval_sts_dev_spearman_cosine": 0.78809909542145, "eval_sts_dev_spearman_dot": 0.7036724949513745, "eval_sts_dev_spearman_euclidean": 0.7115938480269084, "eval_sts_dev_spearman_manhattan": 0.7143300985487689, "eval_sts_dev_spearman_max": 0.78809909542145, "step": 405 } ], "logging_steps": 10, "max_steps": 440, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }