dh-mc committed on
Commit
c8eca2c
·
1 Parent(s): 426d315

final results

Browse files
data/Qwen2-72B-Instruct_shots_metrics.csv CHANGED
@@ -1,2 +1,4 @@
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct_torch/shots-00,0.7516666666666667,0.7949378981748352,0.7516666666666667,0.7572499605227642,0.9773333333333334
 
 
 
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct_torch/shots-00,0.7516666666666667,0.7949378981748352,0.7516666666666667,0.7572499605227642,0.9773333333333334
3
+ 5,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/shots-05,0.778,0.79531681283817,0.778,0.7722405723376975,0.9876666666666667
4
+ 10,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/shots-10,0.775,0.7935761766767606,0.775,0.7740445924385057,0.9946666666666667
data/Qwen2-7B-Instruct_shots_metrics.csv CHANGED
@@ -1,3 +1,8 @@
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-00,0.683,0.7493103872717293,0.683,0.710140098232232,0.9996666666666667
3
- 10,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-10,0.5646666666666667,0.7391197908117386,0.5646666666666667,0.6064049121095652,0.9896666666666667
 
 
 
 
 
 
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
  0,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-00,0.683,0.7493103872717293,0.683,0.710140098232232,0.9996666666666667
3
+ 5,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-05,0.357,0.7558028637770273,0.357,0.4365296526050415,0.997
4
+ 10,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-10,0.5593333333333333,0.7492096614172068,0.5593333333333333,0.5991418028711349,0.99
5
+ 20,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-20,0.5686666666666667,0.7520662661534714,0.5686666666666667,0.606675877273536,0.9976666666666667
6
+ 30,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-30,0.658,0.7560839971561838,0.658,0.6783328633107383,0.9886666666666667
7
+ 40,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-40,0.685,0.748405672861726,0.685,0.6925462636609232,0.991
8
+ 50,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-50,0.6303333333333333,0.7557444295881455,0.6303333333333333,0.6534318328499943,0.983
data/Qwen2.5-1.5B-Instruct_metrics.csv CHANGED
@@ -1,12 +1,12 @@
1
  epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
- 0.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct_torch.bfloat16_lf,0.18366666666666667,0.5244570465301668,0.18366666666666667,0.23286492799102732,0.931
3
- 0.2,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-35_torch.bfloat16_lf,0.521,0.6393141994049955,0.521,0.5543058103456981,1.0
4
- 0.4,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-70_torch.bfloat16_lf,0.5786666666666667,0.6827334710464682,0.5786666666666667,0.6055896299128966,1.0
5
- 0.6,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-105_torch.bfloat16_lf,0.544,0.7064593462910856,0.544,0.5946365105633672,1.0
6
- 0.8,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-140_torch.bfloat16_lf,0.659,0.7267092412287238,0.659,0.6825875108247536,1.0
7
- 1.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-175_torch.bfloat16_lf,0.637,0.7191389576964738,0.637,0.6562859054038414,1.0
8
- 1.2,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-210_torch.bfloat16_lf,0.6086666666666667,0.7293412868960213,0.6086666666666667,0.6479350184617141,1.0
9
- 1.4,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-245_torch.bfloat16_lf,0.6326666666666667,0.716380475510422,0.6326666666666667,0.6591217616290708,1.0
10
- 1.6,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-280_torch.bfloat16_lf,0.6273333333333333,0.7224778228100358,0.6273333333333333,0.6551405164716649,1.0
11
- 1.8,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-315_torch.bfloat16_lf,0.5973333333333334,0.7263124149931549,0.5973333333333334,0.6349391744052281,1.0
12
- 2.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-350_torch.bfloat16_lf,0.6046666666666667,0.7203284046544999,0.6046666666666667,0.6377776248713325,1.0
 
1
  epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct_torch.bfloat16_lf,0.18066666666666667,0.5190896501490828,0.18066666666666667,0.2279835258033138,0.931
3
+ 0.2,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-35_torch.bfloat16_lf,0.518,0.6492636460363231,0.518,0.5572067605634098,1.0
4
+ 0.4,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-70_torch.bfloat16_lf,0.5773333333333334,0.6947418792907191,0.5773333333333334,0.6099122869891875,1.0
5
+ 0.6,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-105_torch.bfloat16_lf,0.538,0.7167362916136871,0.538,0.5968520817286569,1.0
6
+ 0.8,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-140_torch.bfloat16_lf,0.6583333333333333,0.7364753067653045,0.6583333333333333,0.6860876366125171,1.0
7
+ 1.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-175_torch.bfloat16_lf,0.634,0.728416475102691,0.634,0.6585911813020522,1.0
8
+ 1.2,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-210_torch.bfloat16_lf,0.604,0.739271718192585,0.604,0.650207257048292,1.0
9
+ 1.4,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-245_torch.bfloat16_lf,0.627,0.7239095156237754,0.627,0.6595024812639674,1.0
10
+ 1.6,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-280_torch.bfloat16_lf,0.6203333333333333,0.7299176022060372,0.6203333333333333,0.6547846347747823,1.0
11
+ 1.8,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-315_torch.bfloat16_lf,0.591,0.7339511078925822,0.591,0.6350661217426716,1.0
12
+ 2.0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/checkpoint-350_torch.bfloat16_lf,0.5963333333333334,0.7286221052443161,0.5963333333333334,0.6381137491097367,1.0
data/Qwen2.5-1.5B-Instruct_shots_metrics.csv CHANGED
@@ -1,8 +1,8 @@
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
- 0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-00,0.18366666666666667,0.5244570465301668,0.18366666666666667,0.23286492799102732,0.931
3
- 5,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-05,0.349,0.5695965528635436,0.349,0.3771117506970461,0.9756666666666667
4
- 10,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-10,0.457,0.5932373185073849,0.457,0.4641792696031706,0.9933333333333333
5
- 20,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-20,0.24166666666666667,0.5333408149946145,0.24166666666666667,0.30859243868426434,0.8263333333333334
6
- 30,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-30,0.23,0.5479545947886839,0.23,0.3064381040560128,0.661
7
- 40,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-40,0.29233333333333333,0.5608411738006117,0.29233333333333333,0.3751714671158081,0.5206666666666667
8
- 50,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-50,0.29,0.5646814860840066,0.29,0.36883826526592467,0.4603333333333333
 
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-00,0.18066666666666667,0.5190896501490828,0.18066666666666667,0.2279835258033138,0.931
3
+ 5,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-05,0.3466666666666667,0.5781128109800681,0.3466666666666667,0.37886593168708843,0.9756666666666667
4
+ 10,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-10,0.4523333333333333,0.5964896895382023,0.4523333333333333,0.46219676531721876,0.9933333333333333
5
+ 20,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-20,0.243,0.5419108277814879,0.243,0.31071147199535726,0.8263333333333334
6
+ 30,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-30,0.23033333333333333,0.55368556787824,0.23033333333333333,0.3067125355762305,0.661
7
+ 40,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-40,0.292,0.5667420801465655,0.292,0.375496356843247,0.5206666666666667
8
+ 50,Qwen2.5-1.5B-Instruct,Qwen/Qwen2.5-1.5B-Instruct/shots-50,0.2876666666666667,0.5660207537890989,0.2876666666666667,0.36627420118815035,0.4603333333333333
data/Qwen2.5-7B-Instruct_metrics.csv CHANGED
@@ -1,12 +1,12 @@
1
  epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
- 0.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct_torch.bfloat16_lf,0.6436666666666667,0.717651042027604,0.6436666666666667,0.6066932578767255,1.0
3
- 0.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-35_torch.bfloat16_lf,0.7473333333333333,0.759526705532232,0.7473333333333333,0.7480522291877509,0.998
4
- 0.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-70_torch.bfloat16_lf,0.752,0.7774114736945115,0.752,0.7611191332452362,0.9996666666666667
5
- 0.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-105_torch.bfloat16_lf,0.7623333333333333,0.7987495507688677,0.7623333333333333,0.7754658001873385,0.9996666666666667
6
- 0.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-140_torch.bfloat16_lf,0.7596666666666667,0.7923123268836624,0.7596666666666667,0.7724543387690386,1.0
7
- 1.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-175_torch.bfloat16_lf,0.782,0.8023938029436536,0.782,0.7888740758699296,0.9993333333333333
8
- 1.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-210_torch.bfloat16_lf,0.7563333333333333,0.7997527018417315,0.7563333333333333,0.7728761539215637,1.0
9
- 1.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-245_torch.bfloat16_lf,0.762,0.7997301280029149,0.762,0.7743858484379207,0.9993333333333333
10
- 1.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-280_torch.bfloat16_lf,0.7726666666666666,0.8006851113573145,0.7726666666666666,0.7813968284378919,0.9996666666666667
11
- 1.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-315_torch.bfloat16_lf,0.7696666666666667,0.799287702962426,0.7696666666666667,0.7792120245789584,0.9993333333333333
12
- 2.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-350_torch.bfloat16_lf,0.769,0.8010881984531473,0.769,0.7793801070552965,0.9996666666666667
 
1
  epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct_torch.bfloat16_lf,0.644,0.7200261355300325,0.644,0.6101052277961244,1.0
3
+ 0.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-35_torch.bfloat16_lf,0.745,0.7643041174791825,0.745,0.7482828029872421,0.998
4
+ 0.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-70_torch.bfloat16_lf,0.7446666666666667,0.7800215227839997,0.7446666666666667,0.7576550061479678,0.9996666666666667
5
+ 0.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-105_torch.bfloat16_lf,0.7513333333333333,0.7996792149630704,0.7513333333333333,0.7693730206330721,0.9996666666666667
6
+ 0.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-140_torch.bfloat16_lf,0.75,0.7923028105975739,0.75,0.7665531868559959,1.0
7
+ 1.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-175_torch.bfloat16_lf,0.771,0.8005814962709542,0.771,0.7814602739241332,0.9993333333333333
8
+ 1.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-210_torch.bfloat16_lf,0.7443333333333333,0.79978900243777,0.7443333333333333,0.7660506505481828,1.0
9
+ 1.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-245_torch.bfloat16_lf,0.7486666666666667,0.7974562319123832,0.7486666666666667,0.7655275916268014,0.9993333333333333
10
+ 1.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-280_torch.bfloat16_lf,0.7566666666666667,0.7939852407869384,0.7566666666666667,0.7689495073735431,0.9996666666666667
11
+ 1.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-315_torch.bfloat16_lf,0.755,0.7940575522966016,0.755,0.7681326415137147,0.9993333333333333
12
+ 2.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-350_torch.bfloat16_lf,0.756,0.7982464722401461,0.756,0.7704035278260453,0.9996666666666667
data/Qwen2.5-7B-Instruct_shots_metrics.csv CHANGED
@@ -1,8 +1,8 @@
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
- 0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-00,0.6436666666666667,0.717651042027604,0.6436666666666667,0.6066932578767255,1.0
3
- 5,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-05,0.63,0.7622571683877091,0.63,0.6151126410759672,0.998
4
- 10,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-10,0.677,0.7663956674673086,0.677,0.6770580664953397,0.9796666666666667
5
- 20,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-20,0.7343333333333333,0.7730863408305184,0.7343333333333333,0.7243291573141537,0.807
6
- 30,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-30,0.765,0.7840432806350224,0.765,0.7512220322751986,0.805
7
- 40,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-40,0.757,0.7733827213068922,0.757,0.7427592763321033,0.8546666666666667
8
- 50,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-50,0.758,0.763149679724481,0.758,0.7376580515312735,0.7563333333333333
 
1
  shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-00,0.644,0.7200261355300325,0.644,0.6101052277961244,1.0
3
+ 5,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-05,0.6346666666666667,0.7653343185471776,0.6346666666666667,0.6219419633691871,0.998
4
+ 10,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-10,0.678,0.7675951017673515,0.678,0.6790860659550377,0.9796666666666667
5
+ 20,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-20,0.7353333333333333,0.7702034737275962,0.7353333333333333,0.7278047438569933,0.807
6
+ 30,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-30,0.7646666666666667,0.7787918401418651,0.7646666666666667,0.7527649874769439,0.805
7
+ 40,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-40,0.759,0.7736852689131295,0.759,0.7472252604775926,0.8546666666666667
8
+ 50,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-50,0.7586666666666667,0.7640431634617543,0.7586666666666667,0.7414332963557551,0.7563333333333333
data/best_metrics.csv CHANGED
@@ -1,18 +1,17 @@
1
  index,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
- 1,truth,truth,0.0,0.0,0.0,0.0,0.0
3
- 2,Llama3.1-8B (1.0-epoch),Llama3.1-8B (1.0-epoch),0.78,0.810582723471486,0.78,0.7924651054056209,1.0
4
- 3,Llama3.1-70B (1.0-epoch),Llama3.1-70B (1.0-epoch),0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
5
- 4,Mistral-7B (1.4-epoch),Mistral-7B (1.4-epoch),0.75,0.7885868317699068,0.75,0.7648234347578796,1.0
6
- 5,InternLM2.5-7B (0.8-epoch),InternLM2.5-7B (0.8-epoch),0.7496666666666667,0.8041871978859686,0.7496666666666667,0.7660159670998776,1.0
7
- 6,InternLM2.5-7B-1M (0.8-epoch),InternLM2.5-7B-1M (0.8-epoch),0.803,0.8031411888150441,0.803,0.8028064320197301,1.0
8
- 7,InternLM2.5-20B (0.8-epoch),InternLM2.5-20B (0.8-epoch),0.795,0.817457691710893,0.795,0.8027552955647029,1.0
9
- 8,Qwen2-7B (0.4-epoch),Qwen2-7B (0.4-epoch),0.759,0.8005303465799652,0.759,0.7748745026535183,1.0
10
- 9,Qwen2-72B (1.8-epoch),Qwen2-72B (1.8-epoch),0.784,0.8354349234761956,0.784,0.804194683154365,1.0
11
- 10,Qwen2.5-3B (1.4-epoch),Qwen2.5-3B (1.4-epoch),0.7233333333333334,0.7720989063414209,0.7233333333333334,0.7410476466041488,1.0
12
- 11,Qwen2.5-7B (1.0-epoch),Qwen2.5-7B (1.0-epoch),0.771,0.8005814962709542,0.771,0.7814602739241332,0.9993333333333333
13
- 12,Qwen2.5-72B (0.8-epoch),Qwen2.5-72B (0.8-epoch),0.7846666666666666,0.8199033961265727,0.7846666666666666,0.7983932694517433,1.0
14
- 13,gpt-4o-mini (0-shot),gpt-4o-mini (0-shot),0.7176666666666667,0.785706730193659,0.7176666666666667,0.7296061848734905,1.0
15
- 14,gpt-4o (10-shot),gpt-4o (10-shot),0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667
16
- 15,o1-mini (50-shot),o1-mini (50-shot),0.75,0.7767849265833893,0.75,0.7590020698968893,1.0
17
- 16,o1-preview (50-shot),o1-preview (50-shot),0.7546666666666667,0.7979981023789272,0.7546666666666667,0.7708181822112403,0.9996666666666667
18
- 17,Qwen2.5-72B (10-shot),Qwen2.5-72B (10-shot),0.8103333333333333,0.8136844357537636,0.8103333333333333,0.8088046626262355,0.998
 
1
  index,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
2
+ 1,Llama3.1-8B (1.0-epoch),Llama3.1-8B (1.0-epoch),0.78,0.810582723471486,0.78,0.7924651054056209,1.0
3
+ 2,Llama3.1-70B (1.0-epoch),Llama3.1-70B (1.0-epoch),0.7963333333333333,0.8248972880055918,0.7963333333333333,0.8076868978089201,1.0
4
+ 3,Mistral-7B (1.4-epoch),Mistral-7B (1.4-epoch),0.75,0.7885868317699068,0.75,0.7648234347578796,1.0
5
+ 4,InternLM2.5-7B (0.8-epoch),InternLM2.5-7B (0.8-epoch),0.7496666666666667,0.8041871978859686,0.7496666666666667,0.7660159670998776,1.0
6
+ 5,InternLM2.5-7B-1M (0.8-epoch),InternLM2.5-7B-1M (0.8-epoch),0.803,0.8031411888150441,0.803,0.8028064320197301,1.0
7
+ 6,InternLM2.5-20B (0.8-epoch),InternLM2.5-20B (0.8-epoch),0.795,0.817457691710893,0.795,0.8027552955647029,1.0
8
+ 7,Qwen2-7B (0.4-epoch),Qwen2-7B (0.4-epoch),0.759,0.8005303465799652,0.759,0.7748745026535183,1.0
9
+ 8,Qwen2-72B (1.8-epoch),Qwen2-72B (1.8-epoch),0.784,0.8354349234761956,0.784,0.804194683154365,1.0
10
+ 9,Qwen2.5-3B (1.4-epoch),Qwen2.5-3B (1.4-epoch),0.7233333333333334,0.7720989063414209,0.7233333333333334,0.7410476466041488,1.0
11
+ 10,Qwen2.5-7B (1.0-epoch),Qwen2.5-7B (1.0-epoch),0.771,0.8005814962709542,0.771,0.7814602739241332,0.9993333333333333
12
+ 11,Qwen2.5-72B (0.8-epoch),Qwen2.5-72B (0.8-epoch),0.7846666666666666,0.8199033961265727,0.7846666666666666,0.7983932694517433,1.0
13
+ 12,gpt-4o-mini (0-shot),gpt-4o-mini (0-shot),0.7176666666666667,0.785706730193659,0.7176666666666667,0.7296061848734905,1.0
14
+ 13,gpt-4o (10-shot),gpt-4o (10-shot),0.7916666666666666,0.8227707658360168,0.7916666666666666,0.803614688453356,0.9996666666666667
15
+ 14,o1-mini (50-shot),o1-mini (50-shot),0.75,0.7767849265833893,0.75,0.7590020698968893,1.0
16
+ 15,o1-preview (50-shot),o1-preview (50-shot),0.7546666666666667,0.7979981023789272,0.7546666666666667,0.7708181822112403,0.9996666666666667
17
+ 16,Qwen2.5-72B (10-shot),Qwen2.5-72B (10-shot),0.8103333333333333,0.8136844357537636,0.8103333333333333,0.8088046626262355,0.998
 
data/best_results.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/few-shots_metrics.csv CHANGED
@@ -61,8 +61,15 @@ shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
61
  50,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-50,0.7213333333333334,0.7546008508718184,0.7213333333333334,0.70308601382351,0.8846666666666667
62
  0,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/shots-00,0.564,0.7745256693833624,0.564,0.6352190975436365,0.6726666666666666
63
  0,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-00,0.683,0.7493103872717293,0.683,0.710140098232232,0.9996666666666668
64
- 10,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-10,0.5646666666666667,0.7391197908117386,0.5646666666666667,0.6064049121095652,0.9896666666666668
 
 
 
 
 
65
  0,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct_torch/shots-00,0.7516666666666667,0.7949378981748352,0.7516666666666667,0.7572499605227642,0.9773333333333334
 
 
66
  0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-00,0.5796666666666667,0.6966500240864278,0.5796666666666667,0.5506370828782681,1.0
67
  5,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-05,0.639,0.7226431221398603,0.639,0.641568790114368,0.9973333333333332
68
  10,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-10,0.625,0.7164154004131771,0.625,0.6402584852791593,0.995
@@ -70,13 +77,13 @@ shots,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
70
  30,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-30,0.475,0.6880994914236809,0.475,0.5310948082593374,0.904
71
  40,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-40,0.584,0.7065303262365236,0.584,0.6214992664375876,0.7173333333333334
72
  50,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-50,0.6093333333333333,0.7120506480394511,0.6093333333333333,0.6451959368825358,0.574
73
- 0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-00,0.6436666666666667,0.717651042027604,0.6436666666666667,0.6066932578767255,1.0
74
- 5,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-05,0.63,0.7622571683877091,0.63,0.6151126410759672,0.998
75
- 10,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-10,0.677,0.7663956674673086,0.677,0.6770580664953397,0.9796666666666668
76
- 20,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-20,0.7343333333333333,0.7730863408305184,0.7343333333333333,0.7243291573141537,0.807
77
- 30,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-30,0.765,0.7840432806350224,0.765,0.7512220322751986,0.805
78
- 40,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-40,0.757,0.7733827213068922,0.757,0.7427592763321033,0.8546666666666667
79
- 50,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-50,0.758,0.763149679724481,0.758,0.7376580515312735,0.7563333333333333
80
  0,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/shots-00,0.7856666666666666,0.7942511546806512,0.7856666666666666,0.7699212943617263,0.994
81
  5,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/shots-05,0.8113333333333334,0.8112264644451684,0.8113333333333334,0.8039596846574816,0.9416666666666668
82
  10,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/shots-10,0.8103333333333333,0.8136844357537636,0.8103333333333333,0.8088046626262355,0.9123333333333332
 
61
  50,internlm2_5-7b-chat-1m,internlm/internlm2_5-7b-chat-1m/shots-50,0.7213333333333334,0.7546008508718184,0.7213333333333334,0.70308601382351,0.8846666666666667
62
  0,internlm2_5-20b-chat,internlm/internlm2_5-20b-chat/shots-00,0.564,0.7745256693833624,0.564,0.6352190975436365,0.6726666666666666
63
  0,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-00,0.683,0.7493103872717293,0.683,0.710140098232232,0.9996666666666668
64
+ 5,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-05,0.357,0.7558028637770273,0.357,0.4365296526050415,0.997
65
+ 10,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-10,0.5593333333333333,0.7492096614172068,0.5593333333333333,0.5991418028711349,0.99
66
+ 20,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-20,0.5686666666666667,0.7520662661534714,0.5686666666666667,0.606675877273536,0.9976666666666668
67
+ 30,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-30,0.658,0.7560839971561838,0.658,0.6783328633107383,0.9886666666666668
68
+ 40,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-40,0.685,0.748405672861726,0.685,0.6925462636609232,0.991
69
+ 50,Qwen2-7B-Instruct,Qwen/Qwen2-7B-Instruct/shots-50,0.6303333333333333,0.7557444295881455,0.6303333333333333,0.6534318328499943,0.983
70
  0,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct_torch/shots-00,0.7516666666666667,0.7949378981748352,0.7516666666666667,0.7572499605227642,0.9773333333333334
71
+ 5,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/shots-05,0.778,0.79531681283817,0.778,0.7722405723376975,0.9876666666666668
72
+ 10,Qwen2-72B-Instruct,Qwen/Qwen2-72B-Instruct/shots-10,0.775,0.7935761766767606,0.775,0.7740445924385057,0.9946666666666668
73
  0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-00,0.5796666666666667,0.6966500240864278,0.5796666666666667,0.5506370828782681,1.0
74
  5,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-05,0.639,0.7226431221398603,0.639,0.641568790114368,0.9973333333333332
75
  10,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-10,0.625,0.7164154004131771,0.625,0.6402584852791593,0.995
 
77
  30,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-30,0.475,0.6880994914236809,0.475,0.5310948082593374,0.904
78
  40,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-40,0.584,0.7065303262365236,0.584,0.6214992664375876,0.7173333333333334
79
  50,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/shots-50,0.6093333333333333,0.7120506480394511,0.6093333333333333,0.6451959368825358,0.574
80
+ 0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-00,0.644,0.7200261355300325,0.644,0.6101052277961244,1.0
81
+ 5,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-05,0.6346666666666667,0.7653343185471776,0.6346666666666667,0.6219419633691871,0.998
82
+ 10,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-10,0.678,0.7675951017673515,0.678,0.6790860659550377,0.9796666666666668
83
+ 20,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-20,0.7353333333333333,0.7702034737275962,0.7353333333333333,0.7278047438569933,0.807
84
+ 30,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-30,0.7646666666666667,0.7787918401418651,0.7646666666666667,0.7527649874769439,0.805
85
+ 40,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-40,0.759,0.7736852689131295,0.759,0.7472252604775926,0.8546666666666667
86
+ 50,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/shots-50,0.7586666666666667,0.7640431634617543,0.7586666666666667,0.7414332963557551,0.7563333333333333
87
  0,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/shots-00,0.7856666666666666,0.7942511546806512,0.7856666666666666,0.7699212943617263,0.994
88
  5,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/shots-05,0.8113333333333334,0.8112264644451684,0.8113333333333334,0.8039596846574816,0.9416666666666668
89
  10,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/shots-10,0.8103333333333333,0.8136844357537636,0.8103333333333333,0.8088046626262355,0.9123333333333332
data/fine-tuning_metrics.csv CHANGED
@@ -98,17 +98,17 @@ epoch,model,run,accuracy,precision,recall,f1,ratio_valid_classifications
98
  1.6,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-280_torch.bfloat16_lf,0.7156666666666667,0.7724266286892245,0.7156666666666667,0.7356331945937126,1.0
99
  1.8,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-315_torch.bfloat16_lf,0.6986666666666667,0.7734046031514225,0.6986666666666667,0.7262724373234384,1.0
100
  2.0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-350_torch.bfloat16_lf,0.704,0.7725944595890188,0.704,0.7290337960305111,1.0
101
- 0.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct_torch.bfloat16_lf,0.6436666666666667,0.717651042027604,0.6436666666666667,0.6066932578767255,1.0
102
- 0.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-35_torch.bfloat16_lf,0.7473333333333333,0.759526705532232,0.7473333333333333,0.7480522291877509,0.998
103
- 0.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-70_torch.bfloat16_lf,0.752,0.7774114736945115,0.752,0.7611191332452362,0.9996666666666668
104
- 0.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-105_torch.bfloat16_lf,0.7623333333333333,0.7987495507688677,0.7623333333333333,0.7754658001873385,0.9996666666666668
105
- 0.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-140_torch.bfloat16_lf,0.7596666666666667,0.7923123268836624,0.7596666666666667,0.7724543387690386,1.0
106
- 1.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-175_torch.bfloat16_lf,0.782,0.8023938029436536,0.782,0.7888740758699296,0.9993333333333332
107
- 1.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-210_torch.bfloat16_lf,0.7563333333333333,0.7997527018417315,0.7563333333333333,0.7728761539215637,1.0
108
- 1.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-245_torch.bfloat16_lf,0.762,0.7997301280029149,0.762,0.7743858484379207,0.9993333333333332
109
- 1.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-280_torch.bfloat16_lf,0.7726666666666666,0.8006851113573145,0.7726666666666666,0.7813968284378919,0.9996666666666668
110
- 1.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-315_torch.bfloat16_lf,0.7696666666666667,0.799287702962426,0.7696666666666667,0.7792120245789584,0.9993333333333332
111
- 2.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-350_torch.bfloat16_lf,0.769,0.8010881984531473,0.769,0.7793801070552965,0.9996666666666668
112
  0.0,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct_torch.bfloat16_4bit_lf,0.7856666666666666,0.7942511546806512,0.7856666666666666,0.7699212943617263,0.994
113
  0.2,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-35_torch.bfloat16_4bit_lf,0.7736666666666666,0.8102875293385203,0.7736666666666666,0.7874095844134584,1.0
114
  0.4,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-70_torch.bfloat16_4bit_lf,0.748,0.8094861650366822,0.748,0.7718522396481117,1.0
 
98
  1.6,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-280_torch.bfloat16_lf,0.7156666666666667,0.7724266286892245,0.7156666666666667,0.7356331945937126,1.0
99
  1.8,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-315_torch.bfloat16_lf,0.6986666666666667,0.7734046031514225,0.6986666666666667,0.7262724373234384,1.0
100
  2.0,Qwen2.5-3B-Instruct,Qwen/Qwen2.5-3B-Instruct/checkpoint-350_torch.bfloat16_lf,0.704,0.7725944595890188,0.704,0.7290337960305111,1.0
101
+ 0.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct_torch.bfloat16_lf,0.644,0.7200261355300325,0.644,0.6101052277961244,1.0
102
+ 0.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-35_torch.bfloat16_lf,0.745,0.7643041174791825,0.745,0.7482828029872421,0.998
103
+ 0.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-70_torch.bfloat16_lf,0.7446666666666667,0.7800215227839997,0.7446666666666667,0.7576550061479678,0.9996666666666668
104
+ 0.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-105_torch.bfloat16_lf,0.7513333333333333,0.7996792149630704,0.7513333333333333,0.7693730206330721,0.9996666666666668
105
+ 0.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-140_torch.bfloat16_lf,0.75,0.7923028105975739,0.75,0.7665531868559959,1.0
106
+ 1.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-175_torch.bfloat16_lf,0.771,0.8005814962709542,0.771,0.7814602739241332,0.9993333333333332
107
+ 1.2,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-210_torch.bfloat16_lf,0.7443333333333333,0.79978900243777,0.7443333333333333,0.7660506505481828,1.0
108
+ 1.4,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-245_torch.bfloat16_lf,0.7486666666666667,0.7974562319123832,0.7486666666666667,0.7655275916268014,0.9993333333333332
109
+ 1.6,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-280_torch.bfloat16_lf,0.7566666666666667,0.7939852407869384,0.7566666666666667,0.7689495073735431,0.9996666666666668
110
+ 1.8,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-315_torch.bfloat16_lf,0.755,0.7940575522966016,0.755,0.7681326415137147,0.9993333333333332
111
+ 2.0,Qwen2.5-7B-Instruct,Qwen/Qwen2.5-7B-Instruct/checkpoint-350_torch.bfloat16_lf,0.756,0.7982464722401461,0.756,0.7704035278260453,0.9996666666666668
112
  0.0,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct_torch.bfloat16_4bit_lf,0.7856666666666666,0.7942511546806512,0.7856666666666666,0.7699212943617263,0.994
113
  0.2,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-35_torch.bfloat16_4bit_lf,0.7736666666666666,0.8102875293385203,0.7736666666666666,0.7874095844134584,1.0
114
  0.4,Qwen2.5-72B-Instruct,Qwen/Qwen2.5-72B-Instruct/checkpoint-70_torch.bfloat16_4bit_lf,0.748,0.8094861650366822,0.748,0.7718522396481117,1.0
llm_toolkit/logical_reasoning_utils.py CHANGED
@@ -518,27 +518,16 @@ def plot_metrics(perf_df, model_name, variant="epoch", offset=0.01):
518
  )
519
  perf_df = perf_df.iloc[:min_length]
520
 
521
- # Plot accuracy and f1 on the same chart with different markers
522
  ax.plot(
523
- perf_df[variant], perf_df["accuracy"], marker="o", label="Accuracy", color="r"
524
- )
525
- ax.plot(
526
- perf_df[variant], perf_df["f1"], marker="s", label="F1 Score", color="b"
527
  ) # Square marker for F1 Score
 
 
 
528
 
529
  # Add values on top of points
530
  for i in range(min_length):
531
  print(f"{perf_df[variant].iloc[i]}: {perf_df['run'].iloc[i]}")
532
- ax.annotate(
533
- f"{perf_df['accuracy'].iloc[i]*100:.2f}%",
534
- (perf_df[variant].iloc[i], perf_df["accuracy"].iloc[i]),
535
- ha="center",
536
- va="bottom", # Move accuracy numbers below the points
537
- xytext=(0, -15),
538
- textcoords="offset points",
539
- fontsize=10,
540
- color="r",
541
- )
542
  ax.annotate(
543
  f"{perf_df['f1'].iloc[i]*100:.2f}%",
544
  (perf_df[variant].iloc[i], perf_df["f1"].iloc[i]),
@@ -549,6 +538,16 @@ def plot_metrics(perf_df, model_name, variant="epoch", offset=0.01):
549
  fontsize=10,
550
  color="b",
551
  )
 
 
 
 
 
 
 
 
 
 
552
 
553
  # Set y-axis limit
554
  ylimits = ax.get_ylim()
@@ -560,7 +559,7 @@ def plot_metrics(perf_df, model_name, variant="epoch", offset=0.01):
560
  if variant == "epoch"
561
  else "Number of Shots"
562
  )
563
- ax.set_ylabel("Accuracy and F1 Score")
564
 
565
  ax.xaxis.set_major_locator(MultipleLocator(0.2 if variant == "epoch" else 5))
566
  ax.set_title(f"Performance Analysis Across {'Checkpoints' if variant == 'epoch' else 'Shots'} for the {model_name} Model")
 
518
  )
519
  perf_df = perf_df.iloc[:min_length]
520
 
 
521
  ax.plot(
522
+ perf_df[variant], perf_df["f1"], marker="s", label="F1", color="b"
 
 
 
523
  ) # Square marker for F1 Score
524
+ ax.plot(
525
+ perf_df[variant], perf_df["ratio_valid_classifications"], marker="o", label="VCR", color="r"
526
+ )
527
 
528
  # Add values on top of points
529
  for i in range(min_length):
530
  print(f"{perf_df[variant].iloc[i]}: {perf_df['run'].iloc[i]}")
 
 
 
 
 
 
 
 
 
 
531
  ax.annotate(
532
  f"{perf_df['f1'].iloc[i]*100:.2f}%",
533
  (perf_df[variant].iloc[i], perf_df["f1"].iloc[i]),
 
538
  fontsize=10,
539
  color="b",
540
  )
541
+ ax.annotate(
542
+ f"{perf_df['ratio_valid_classifications'].iloc[i]*100:.2f}%",
543
+ (perf_df[variant].iloc[i], perf_df["ratio_valid_classifications"].iloc[i]),
544
+ ha="center",
545
+ va="bottom", # Move accuracy numbers below the points
546
+ xytext=(0, 5),
547
+ textcoords="offset points",
548
+ fontsize=10,
549
+ color="r",
550
+ )
551
 
552
  # Set y-axis limit
553
  ylimits = ax.get_ylim()
 
559
  if variant == "epoch"
560
  else "Number of Shots"
561
  )
562
+ ax.set_ylabel("Valid Classification Ratio (VCR) and F1 Score")
563
 
564
  ax.xaxis.set_major_locator(MultipleLocator(0.2 if variant == "epoch" else 5))
565
  ax.set_title(f"Performance Analysis Across {'Checkpoints' if variant == 'epoch' else 'Shots'} for the {model_name} Model")
logs/{Qwen2.5-72B-shots.txt → Qwen2-72B-shots.txt} RENAMED
File without changes
notebooks/00_Data Analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c27a6be2814f5be1f48aa22ad45c264d24556d09cb5347629dab1cf0755ab97
3
- size 1148416
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41c186e8985991dc385de77805508e5e588926f797cf30047be750fec3eef330
3
+ size 1240375
notebooks/01a_internlm2_5-20b-chat_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:734e3db8a49e84a606f2d38ca30e4d0e8ff3cdb2c67684d5825356c023cf08fc
3
- size 6385778
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0cea5cdad0ca0abe4b744b55b5eb5d578f7e517323228aeb37281265a535d9d
3
+ size 6378310
notebooks/01a_internlm2_5-7b-chat-1m_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f887698f08bdb2f767c6edbb0329171670ab1ae3e53e4eb1b065ea8c175d864
3
- size 2385934
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e227a94216b212a5cfd4a4a9b71f546d3f1002669264a1a82205d09070a91945
3
+ size 2366870
notebooks/01a_internlm2_5-7b-chat_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef39d306780d8c6717de34adbaa5a7180854019439fa151016007eb8d52f6a05
3
- size 6122973
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:160bb2fe298b5f9dfd5f63f2e70e5f95849df97feb00a8da00fd1ab3e320c529
3
+ size 6092005
notebooks/01b_Mistral-7B-v0.3-Chinese-Chat_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71fbd8541e14161d6ff7a5668ce60021cda2bd703819dab6bb405cc6c25553e3
3
- size 14778268
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f6addfbd7ca2686025b0a37ca511f29c4b6373d19965e5450edac30e62616ee
3
+ size 14736032
notebooks/02a_Qwen2-7B-Instruct_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9cda076bb798749bdd0a680484547091497d7bd4dc67be75fd40bc75e0651b27
3
- size 1789053
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f344c1583e389861aa33f01884078e2a1779e36e41b775505c3a669da84040b
3
+ size 3426960
notebooks/02b_Qwen2-72B-Instruct_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:223ce2a464dbad51d4f2cb02e50bcc656c2e1774366f30051e7a4a10505c3c0a
3
- size 2080974
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22e99daa352a629d6d8b591032b94f12a0d66c11e3b04f6744423e7d72fbf039
3
+ size 2425374
notebooks/02c_Qwen2.5-3B-Instruct_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:119be6f4500034ba44e2a26d945219790661b0e3b0b3a51abf4538aa4ebb3ce0
3
- size 7235492
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb7915eae3b9005684a4ecc9c60fcd6880eee21ee4f21123ef8f2d43c4c1612a
3
+ size 7185712
notebooks/02d_Qwen2.5-7B-Instruct_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0dff7d4d0e788b1d9c954a3ae1bfefbc972c618dcb3868ceab6868b1ca66611
3
- size 7864021
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa59b9e345312b8ba34a7379c49c82e0f8d05c34f322b65cc754c320007748fa
3
+ size 7862529
notebooks/02g_Qwen2.5-72B-Instruct_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6475da89bebb0df2859c6de9e7c27ddb624ed98e4fb21a45aa164d0c96feb12
3
- size 1666060
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41817d8ed0faae7016ee54185d22b1a0ba9f766373ea2ea589306ef19bd25513
3
+ size 1637565
notebooks/03a_Llama3.1-8B-Chinese-Chat_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:117d77b6b10f6eb1046514ce09508d766a94f78245991c715f18108e2a3f65ba
3
- size 6562505
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14fd3f1c5211d9c791f276f760c1171e29e2110a965c64eeb34ad4e5c680aa43
3
+ size 6542205
notebooks/03b_Llama3.1-70B-Chinese-Chat_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9174bf54d1fc0f59d172dde9c72a8caa7e0cac9d0a86ecc33b6c501356aa64d1
3
- size 2063794
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:163c79dd0f5dd1e1112fe00626f3bee46bd0a1df12ecc551579c01ac3c33d929
3
+ size 2035718
notebooks/04b_OpenAI-Models_analysis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4355f25f8b4139956c99ecd85460f3bd52833101da012e7fee45f806f28326b5
3
- size 4866244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fae89e9c9fdd3d2c744c5aee8f4b06d303473cd439c050c660f0bdc9a5d6d62c
3
+ size 4815051