diff --git a/results/cross_lingual/few_shot/cross_logiqa.csv b/results/cross_lingual/few_shot/cross_logiqa.csv index c70b781c2b64d574dce967ddf17d8399f2a2a234..0583fe8723e2bc420b72c71ce0be88a88765df9b 100644 --- a/results/cross_lingual/few_shot/cross_logiqa.csv +++ b/results/cross_lingual/few_shot/cross_logiqa.csv @@ -1 +1,5 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Meta-Llama-3.1-70B,0.6241883116883117,0.5274350649350649,0.5717473426837402,0.7386363636363636,0.6306818181818182,0.6193181818181818,0.6079545454545454,0.6022727272727273,0.625,0.5454545454545454 +Meta-Llama-3-8B,0.4569805194805195,0.25519480519480514,0.32750096946546897,0.5340909090909091,0.5284090909090909,0.44886363636363635,0.4602272727272727,0.38636363636363635,0.44886363636363635,0.39204545454545453 +llama3-8b-cpt-sea-lionv2-base,0.4553571428571429,0.2600649350649351,0.33105611198328916,0.45454545454545453,0.48295454545454547,0.48863636363636365,0.48295454545454547,0.4431818181818182,0.45454545454545453,0.3806818181818182 +Meta-Llama-3.1-8B,0.4586038961038961,0.28230519480519484,0.3494794808137755,0.5056818181818182,0.5170454545454546,0.4602272727272727,0.4943181818181818,0.39204545454545453,0.45454545454545453,0.38636363636363635 diff --git a/results/cross_lingual/few_shot/cross_mmlu.csv b/results/cross_lingual/few_shot/cross_mmlu.csv index c70b781c2b64d574dce967ddf17d8399f2a2a234..1522e86d89224e2320884ab5035a42884a90b772 100644 --- a/results/cross_lingual/few_shot/cross_mmlu.csv +++ b/results/cross_lingual/few_shot/cross_mmlu.csv @@ -1 +1,5 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Meta-Llama-3.1-70B,0.74,0.6445714285714286,0.688997110969635,0.7933333333333333,0.72,0.7466666666666667,0.7333333333333333,0.7466666666666667,0.7466666666666667,0.6933333333333334 +Meta-Llama-3-8B,0.5295238095238095,0.29771428571428576,0.381140094029779,0.6133333333333333,0.4666666666666667,0.58,0.5,0.5333333333333333,0.47333333333333333,0.54 +llama3-8b-cpt-sea-lionv2-base,0.5114285714285715,0.33390476190476204,0.404026266610288,0.6333333333333333,0.47333333333333333,0.52,0.47333333333333333,0.52,0.49333333333333335,0.4666666666666667 +Meta-Llama-3.1-8B,0.5304761904761904,0.34419047619047627,0.4174958519516044,0.68,0.5,0.5533333333333333,0.4866666666666667,0.5333333333333333,0.5133333333333333,0.44666666666666666 diff --git a/results/cross_lingual/few_shot/cross_xquad.csv b/results/cross_lingual/few_shot/cross_xquad.csv index c70b781c2b64d574dce967ddf17d8399f2a2a234..e188a55123ec6263e655d85cc81159ae8580808e 100644 --- a/results/cross_lingual/few_shot/cross_xquad.csv +++ b/results/cross_lingual/few_shot/cross_xquad.csv @@ -1 +1,5 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Meta-Llama-3.1-70B,0.9588235294117647,0.9382352941176471,0.9484176926084648,0.9647058823529412,0.9571428571428572,0.9554621848739496,0.957983193277311,,, +Meta-Llama-3-8B,0.8951680672268908,0.8123949579831933,0.851775323760744,0.9277310924369748,0.8756302521008403,0.8907563025210085,0.8865546218487395,,, +llama3-8b-cpt-sea-lionv2-base,0.9054621848739495,0.8464285714285714,0.8749507477272823,0.9193277310924369,0.8899159663865546,0.9126050420168067,0.9,,, +Meta-Llama-3.1-8B,0.9063025210084034,0.8340336134453781,0.8686675538861947,0.9319327731092437,0.8890756302521008,0.9117647058823529,0.892436974789916,,, diff --git a/results/cross_lingual/zero_shot/cross_logiqa.csv b/results/cross_lingual/zero_shot/cross_logiqa.csv index 2bf883c68970ac2b7722e6b9ca8a8aa78e4ce466..c675e8c48d16616b487f680fcaddc9397dd88314 100644 --- a/results/cross_lingual/zero_shot/cross_logiqa.csv +++ b/results/cross_lingual/zero_shot/cross_logiqa.csv @@ -1,7 +1,14 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Qwen2-7B-Instruct,0.564935064935065,0.48279220779220783,0.5206435955861558,0.6590909090909091,0.7045454545454546,0.5340909090909091,0.5738636363636364,0.5397727272727273,0.5113636363636364,0.4318181818181818 Meta-Llama-3.1-8B-Instruct,0.4472402597402597,0.43717532467532455,0.44215052105151864,0.5227272727272727,0.4602272727272727,0.4715909090909091,0.4715909090909091,0.4147727272727273,0.3977272727272727,0.39204545454545453 +Qwen2-72B-Instruct,0.6728896103896104,0.6762987012987012,0.6745898487968579,0.75,0.8068181818181818,0.6534090909090909,0.6193181818181818,0.625,0.6534090909090909,0.6022727272727273 +Meta-Llama-3-8B-Instruct,0.4610389610389611,0.45097402597402597,0.4559509553669637,0.5965909090909091,0.48295454545454547,0.5,0.4602272727272727,0.42045454545454547,0.4034090909090909,0.36363636363636365 Meta-Llama-3.1-70B-Instruct,0.6566558441558442,0.598051948051948,0.6259852839118454,0.7443181818181818,0.7215909090909091,0.6647727272727273,0.6534090909090909,0.6193181818181818,0.625,0.5681818181818182 +SeaLLMs-v3-7B-Chat,0.5551948051948051,0.5142857142857142,0.5339578453833284,0.6022727272727273,0.6647727272727273,0.5738636363636364,0.5454545454545454,0.5170454545454546,0.5,0.48295454545454547 gemma-2-9b-it,0.6185064935064934,0.5592532467532466,0.5873893507784849,0.6647727272727273,0.6761363636363636,0.5625,0.6193181818181818,0.5795454545454546,0.6420454545454546,0.5852272727272727 Meta-Llama-3-70B-Instruct,0.6306818181818182,0.6186688311688312,0.6246175698800746,0.7102272727272727,0.6875,0.6420454545454546,0.6193181818181818,0.6022727272727273,0.6136363636363636,0.5397727272727273 sg_llama3_70b_inst,0.6217532467532468,0.5629870129870129,0.590912649920049,0.7272727272727273,0.6590909090909091,0.6477272727272727,0.6079545454545454,0.6136363636363636,0.5795454545454546,0.5170454545454546 +gemma-2-2b-it,0.4780844155844156,0.4448051948051948,0.46084478401384643,0.5568181818181818,0.5,0.5,0.48863636363636365,0.4375,0.4602272727272727,0.4034090909090909 +llama3-8b-cpt-sea-lionv2-instruct,0.48538961038961037,0.4472402597402597,0.46553468284769084,0.5284090909090909,0.5113636363636364,0.5227272727272727,0.5227272727272727,0.48863636363636365,0.44886363636363635,0.375 GPT4o_0513,0.7159090909090909,0.6941558441558444,0.7048646724637749,0.7613636363636364,0.7670454545454546,0.6988636363636364,0.6988636363636364,0.7045454545454546,0.6761363636363636,0.7045454545454546 +Meta-Llama-3.1-8B,0.29464285714285715,0.07857142857142858,0.12406015034269886,0.32954545454545453,0.32386363636363635,0.2840909090909091,0.2727272727272727,0.2840909090909091,0.3125,0.2556818181818182 diff --git a/results/cross_lingual/zero_shot/cross_mmlu.csv b/results/cross_lingual/zero_shot/cross_mmlu.csv index 7523ef0b0cf94591cb583dbbc1f82c6784ff1ed6..0472cb618c1465582ff8a5957ea9dec9e4a867bc 100644 --- a/results/cross_lingual/zero_shot/cross_mmlu.csv +++ b/results/cross_lingual/zero_shot/cross_mmlu.csv @@ -1,7 +1,14 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Qwen2-7B-Instruct,0.6628571428571428,0.5257142857142858,0.5863736263242921,0.76,0.6666666666666666,0.72,0.5933333333333334,0.7066666666666667,0.6133333333333333,0.58 Meta-Llama-3.1-8B-Instruct,0.5619047619047618,0.5020952380952383,0.5303189947159841,0.66,0.5266666666666666,0.5733333333333334,0.5266666666666666,0.5533333333333333,0.5533333333333333,0.54 +Qwen2-72B-Instruct,0.779047619047619,0.7611428571428573,0.7699911663398871,0.8133333333333334,0.7933333333333333,0.7933333333333333,0.7333333333333333,0.7666666666666667,0.78,0.7733333333333333 +Meta-Llama-3-8B-Instruct,0.5733333333333334,0.4742857142857144,0.5191272726777197,0.7133333333333334,0.5866666666666667,0.5733333333333334,0.5866666666666667,0.5066666666666667,0.5333333333333333,0.5133333333333333 Meta-Llama-3.1-70B-Instruct,0.7638095238095238,0.7716190476190474,0.7676944251955988,0.8,0.74,0.7666666666666667,0.7666666666666667,0.76,0.7666666666666667,0.7466666666666667 +SeaLLMs-v3-7B-Chat,0.6628571428571429,0.6135238095238095,0.6372370860992635,0.74,0.6933333333333334,0.6933333333333334,0.6466666666666666,0.68,0.6,0.5866666666666667 gemma-2-9b-it,0.7161904761904762,0.7163809523809525,0.7162857015727578,0.7733333333333333,0.74,0.7066666666666667,0.64,0.7266666666666667,0.6933333333333334,0.7333333333333333 Meta-Llama-3-70B-Instruct,0.758095238095238,0.7316190476190477,0.7446218665971989,0.7933333333333333,0.7466666666666667,0.7733333333333333,0.7466666666666667,0.7733333333333333,0.7333333333333333,0.74 sg_llama3_70b_inst,0.7342857142857142,0.7079999999999999,0.7209033280007295,0.82,0.6866666666666666,0.7333333333333333,0.6933333333333334,0.78,0.7266666666666667,0.7 +gemma-2-2b-it,0.5780952380952381,0.5480000000000002,0.5626454667971265,0.7,0.5866666666666667,0.5866666666666667,0.5333333333333333,0.5666666666666667,0.5333333333333333,0.54 +llama3-8b-cpt-sea-lionv2-instruct,0.6104761904761905,0.5685714285714286,0.5887791368067445,0.72,0.6,0.6133333333333333,0.58,0.6333333333333333,0.5933333333333334,0.5333333333333333 GPT4o_0513,0.8038095238095239,0.8506666666666668,0.8265745643832277,0.8266666666666667,0.7933333333333333,0.8,0.7666666666666667,0.7933333333333333,0.8266666666666667,0.82 +Meta-Llama-3.1-8B,0.42000000000000004,0.1535238095238095,0.22485552968513808,0.4866666666666667,0.43333333333333335,0.44,0.38666666666666666,0.47333333333333333,0.3333333333333333,0.38666666666666666 diff --git a/results/cross_lingual/zero_shot/cross_xquad.csv b/results/cross_lingual/zero_shot/cross_xquad.csv index d6801b0207b18e384511462765bf61ba82c93537..1f99af80081cabcac8a227571661405cdb75a359 100644 --- a/results/cross_lingual/zero_shot/cross_xquad.csv +++ b/results/cross_lingual/zero_shot/cross_xquad.csv @@ -1,7 +1,14 @@ Model,Accuracy,Cross-Lingual Consistency,AC3,English,Chinese,Spanish,Vietnamese,Indonesian,Malay,Filipino +Qwen2-7B-Instruct,0.9418067226890756,0.9046218487394958,0.9228398561109394,0.957983193277311,0.9336134453781513,0.9436974789915966,0.9319327731092437,,, Meta-Llama-3.1-8B-Instruct,0.9287815126050419,0.8867647058823529,0.9072869161050563,0.9420168067226891,0.9193277310924369,0.9361344537815126,0.9176470588235294,,, +Qwen2-72B-Instruct,0.9613445378151261,0.9516806722689075,0.956488195931227,0.9638655462184874,0.9596638655462185,0.9596638655462185,0.9621848739495799,,, +Meta-Llama-3-8B-Instruct,0.9210084033613445,0.880672268907563,0.9003888121913395,0.9411764705882353,0.9033613445378151,0.9260504201680673,0.9134453781512605,,, Meta-Llama-3.1-70B-Instruct,0.9615546218487395,0.9512605042016806,0.9563798632627071,0.9647058823529412,0.9512605042016806,0.9647058823529412,0.965546218487395,,, +SeaLLMs-v3-7B-Chat,0.9403361344537815,0.917016806722689,0.9285300818164836,0.9537815126050421,0.9378151260504202,0.9394957983193277,0.9302521008403362,,, gemma-2-9b-it,0.9567226890756303,0.9350840336134454,0.9457796088507574,0.9663865546218487,0.9411764705882353,0.9588235294117647,0.9605042016806723,,, Meta-Llama-3-70B-Instruct,0.9592436974789916,0.9422268907563025,0.9506591499208973,0.9714285714285714,0.9403361344537815,0.9596638655462185,0.965546218487395,,, sg_llama3_70b_inst,0.9552521008403361,0.9453781512605042,0.9502894779607259,0.9663865546218487,0.9436974789915966,0.957983193277311,0.9529411764705882,,, +gemma-2-2b-it,0.917016806722689,0.8665966386554622,0.8910940700869288,0.934453781512605,0.9025210084033614,0.9193277310924369,0.9117647058823529,,, +llama3-8b-cpt-sea-lionv2-instruct,0.9365546218487395,0.9086134453781513,0.9223724784871395,0.9420168067226891,0.926890756302521,0.9436974789915966,0.9336134453781513,,, GPT4o_0513,0.9605042016806723,0.951890756302521,0.9561780814209724,0.965546218487395,0.9537815126050421,0.9630252100840336,0.9596638655462185,,, +Meta-Llama-3.1-8B,0.5619747899159664,0.21176470588235294,0.307613678067924,0.4756302521008403,0.6579831932773109,0.5571428571428572,0.5571428571428572,,, diff --git a/results/cultural_reasoning/few_shot/cn_eval.csv b/results/cultural_reasoning/few_shot/cn_eval.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..f3dcf0c7f2d4babc2267e052420979fd9e233e3e 100644 --- a/results/cultural_reasoning/few_shot/cn_eval.csv +++ b/results/cultural_reasoning/few_shot/cn_eval.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.5904761904761905 +Meta-Llama-3-8B,0.42857142857142855 +llama3-8b-cpt-sea-lionv2-base,0.38095238095238093 +Meta-Llama-3.1-8B,0.4380952380952381 diff --git a/results/cultural_reasoning/few_shot/ph_eval.csv b/results/cultural_reasoning/few_shot/ph_eval.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..ccaf1883b6a596625a21dfe98283d873626911a7 100644 --- a/results/cultural_reasoning/few_shot/ph_eval.csv +++ b/results/cultural_reasoning/few_shot/ph_eval.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.66 +Meta-Llama-3-8B,0.5 +llama3-8b-cpt-sea-lionv2-base,0.54 +Meta-Llama-3.1-8B,0.52 diff --git a/results/cultural_reasoning/few_shot/sg_eval.csv b/results/cultural_reasoning/few_shot/sg_eval.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..b0974f54dfc062e9cb88e83b300fcaf7722ede5b 100644 --- a/results/cultural_reasoning/few_shot/sg_eval.csv +++ b/results/cultural_reasoning/few_shot/sg_eval.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.7475728155339806 +Meta-Llama-3-8B,0.6601941747572816 +llama3-8b-cpt-sea-lionv2-base,0.6310679611650486 +Meta-Llama-3.1-8B,0.6504854368932039 diff --git a/results/cultural_reasoning/few_shot/sg_eval_v1_cleaned.csv b/results/cultural_reasoning/few_shot/sg_eval_v1_cleaned.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..469db5d46a426b2e000f3abb02b68fcffd7f0d68 100644 --- a/results/cultural_reasoning/few_shot/sg_eval_v1_cleaned.csv +++ b/results/cultural_reasoning/few_shot/sg_eval_v1_cleaned.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.7794117647058824 +Meta-Llama-3-8B,0.6470588235294118 +llama3-8b-cpt-sea-lionv2-base,0.6470588235294118 +Meta-Llama-3.1-8B,0.6029411764705882 diff --git a/results/cultural_reasoning/few_shot/us_eval.csv b/results/cultural_reasoning/few_shot/us_eval.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..30cdc7aec6ec17aec63de2e473b6f90bb5e48ccb 100644 --- a/results/cultural_reasoning/few_shot/us_eval.csv +++ b/results/cultural_reasoning/few_shot/us_eval.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.8691588785046729 +Meta-Llama-3-8B,0.6542056074766355 +llama3-8b-cpt-sea-lionv2-base,0.6542056074766355 +Meta-Llama-3.1-8B,0.7009345794392523 diff --git a/results/cultural_reasoning/zero_shot/cn_eval.csv b/results/cultural_reasoning/zero_shot/cn_eval.csv index 878a142b608130c7367c979ddf2c9dc3952b83f4..ad97e49c46c92b41eb192b4d9cb2a8aad61cddb8 100644 --- a/results/cultural_reasoning/zero_shot/cn_eval.csv +++ b/results/cultural_reasoning/zero_shot/cn_eval.csv @@ -1,7 +1,14 @@ Model,Accuracy +Qwen2-7B-Instruct,0.8285714285714286 Meta-Llama-3.1-8B-Instruct,0.4857142857142857 +Qwen2-72B-Instruct,0.8285714285714286 +Meta-Llama-3-8B-Instruct,0.4666666666666667 Meta-Llama-3.1-70B-Instruct,0.5428571428571428 +SeaLLMs-v3-7B-Chat,0.819047619047619 gemma-2-9b-it,0.580952380952381 Meta-Llama-3-70B-Instruct,0.5333333333333333 sg_llama3_70b_inst,0.5523809523809524 +gemma-2-2b-it,0.3619047619047619 +llama3-8b-cpt-sea-lionv2-instruct,0.49523809523809526 GPT4o_0513,0.8095238095238095 +Meta-Llama-3.1-8B,0.3904761904761905 diff --git a/results/cultural_reasoning/zero_shot/ph_eval.csv b/results/cultural_reasoning/zero_shot/ph_eval.csv index c051412313e7d44b9a7fbd0612124ddb648901ba..4c327f0897926d66b380fd92602faaf2e1bd16fb 100644 --- a/results/cultural_reasoning/zero_shot/ph_eval.csv +++ b/results/cultural_reasoning/zero_shot/ph_eval.csv @@ -1,7 +1,14 @@ Model,Accuracy +Qwen2-7B-Instruct,0.52 Meta-Llama-3.1-8B-Instruct,0.6 +Qwen2-72B-Instruct,0.62 +Meta-Llama-3-8B-Instruct,0.58 Meta-Llama-3.1-70B-Instruct,0.68 +SeaLLMs-v3-7B-Chat,0.47 gemma-2-9b-it,0.58 Meta-Llama-3-70B-Instruct,0.63 sg_llama3_70b_inst,0.69 +gemma-2-2b-it,0.4 +llama3-8b-cpt-sea-lionv2-instruct,0.56 GPT4o_0513,0.77 +Meta-Llama-3.1-8B,0.43 diff --git a/results/cultural_reasoning/zero_shot/sg_eval.csv b/results/cultural_reasoning/zero_shot/sg_eval.csv index 1f4645571f198f8145b1f08d621fff3517395a79..fc7a27fcc38a0dc50300133be678300c5aa7a249 100644 --- a/results/cultural_reasoning/zero_shot/sg_eval.csv +++ b/results/cultural_reasoning/zero_shot/sg_eval.csv @@ -1,7 +1,14 @@ Model,Accuracy +Qwen2-7B-Instruct,0.6796116504854369 Meta-Llama-3.1-8B-Instruct,0.5728155339805825 +Qwen2-72B-Instruct,0.7378640776699029 +Meta-Llama-3-8B-Instruct,0.6504854368932039 Meta-Llama-3.1-70B-Instruct,0.7184466019417476 +SeaLLMs-v3-7B-Chat,0.7184466019417476 gemma-2-9b-it,0.6699029126213593 Meta-Llama-3-70B-Instruct,0.7087378640776699 sg_llama3_70b_inst,0.6699029126213593 +gemma-2-2b-it,0.5533980582524272 +llama3-8b-cpt-sea-lionv2-instruct,0.6504854368932039 GPT4o_0513,0.8446601941747572 +Meta-Llama-3.1-8B,0.39805825242718446 diff --git a/results/cultural_reasoning/zero_shot/sg_eval_v1_cleaned.csv b/results/cultural_reasoning/zero_shot/sg_eval_v1_cleaned.csv index 4b7f6d9864221196469e842963c69e0ffc149537..17e070e68d0c5f6280be75d747084208f30ece22 100644 --- a/results/cultural_reasoning/zero_shot/sg_eval_v1_cleaned.csv +++ b/results/cultural_reasoning/zero_shot/sg_eval_v1_cleaned.csv @@ -1,7 +1,14 @@ Model,Accuracy +Qwen2-7B-Instruct,0.6323529411764706 Meta-Llama-3.1-8B-Instruct,0.5294117647058824 +Qwen2-72B-Instruct,0.6764705882352942 +Meta-Llama-3-8B-Instruct,0.5882352941176471 Meta-Llama-3.1-70B-Instruct,0.6617647058823529 +SeaLLMs-v3-7B-Chat,0.5882352941176471 gemma-2-9b-it,0.6029411764705882 Meta-Llama-3-70B-Instruct,0.6617647058823529 sg_llama3_70b_inst,0.6176470588235294 +gemma-2-2b-it,0.4852941176470588 +llama3-8b-cpt-sea-lionv2-instruct,0.6617647058823529 GPT4o_0513,0.8088235294117647 +Meta-Llama-3.1-8B,0.4117647058823529 diff --git a/results/cultural_reasoning/zero_shot/us_eval.csv b/results/cultural_reasoning/zero_shot/us_eval.csv index 25657547d33be96ec2ebb978f93c9945f63efbb0..31d9113fc0092b0b8a287a4e86f09d8140f92f45 100644 --- a/results/cultural_reasoning/zero_shot/us_eval.csv +++ b/results/cultural_reasoning/zero_shot/us_eval.csv @@ -1,7 +1,14 @@ Model,Accuracy +Qwen2-7B-Instruct,0.7289719626168224 Meta-Llama-3.1-8B-Instruct,0.7289719626168224 +Qwen2-72B-Instruct,0.8785046728971962 +Meta-Llama-3-8B-Instruct,0.7009345794392523 Meta-Llama-3.1-70B-Instruct,0.8411214953271028 +SeaLLMs-v3-7B-Chat,0.6915887850467289 gemma-2-9b-it,0.8130841121495327 Meta-Llama-3-70B-Instruct,0.8691588785046729 sg_llama3_70b_inst,0.8598130841121495 +gemma-2-2b-it,0.6915887850467289 +llama3-8b-cpt-sea-lionv2-instruct,0.7009345794392523 GPT4o_0513,0.8691588785046729 +Meta-Llama-3.1-8B,0.3644859813084112 diff --git a/results/dialogue/few_shot/dream.csv b/results/dialogue/few_shot/dream.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..04add5f0b2e02d84e5ac5b5a73fe96db40411a6a 100644 --- a/results/dialogue/few_shot/dream.csv +++ b/results/dialogue/few_shot/dream.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.9495345418912298 +Meta-Llama-3-8B,0.8280254777070064 +llama3-8b-cpt-sea-lionv2-base,0.8520333170014699 +Meta-Llama-3.1-8B,0.8569328760411563 diff --git a/results/dialogue/zero_shot/dialogsum.csv b/results/dialogue/zero_shot/dialogsum.csv index c59ee36fafe3ae3f2c7e02dda33d58751fd4e674..1e986c201a348ca848a63d8106009392bcb3f0a2 100644 --- a/results/dialogue/zero_shot/dialogsum.csv +++ b/results/dialogue/zero_shot/dialogsum.csv @@ -1,5 +1,13 @@ Model,Average,ROUGE-1,ROUGE-2,ROUGE-L +Qwen2-7B-Instruct,0.2092663759873139,0.30486100228371826,0.09413830506038247,0.22879982061784096 Meta-Llama-3.1-8B-Instruct,0.24990743661648132,0.3515557454075673,0.12563120411564133,0.2725353603262354 +Qwen2-72B-Instruct,0.2183280630214023,0.316174552903144,0.10156543495268992,0.23724420120837297 +Meta-Llama-3-8B-Instruct,0.23978455271183616,0.33971099717559883,0.1203340311564728,0.2593086298034369 +Meta-Llama-3.1-70B-Instruct,0.2526239717396146,0.35714386898604744,0.1258832921736473,0.27484475405914904 +SeaLLMs-v3-7B-Chat,0.24891094210680076,0.35393482223136147,0.12172072639345373,0.27107727769558715 +gemma-2-9b-it,0.2560682231168516,0.36247455000865003,0.12571639767749476,0.2800137216644101 Meta-Llama-3-70B-Instruct,0.2557065499979308,0.36058417323628,0.12758087337786866,0.2789546033796438 sg_llama3_70b_inst,0.26633840691332344,0.3692028513115729,0.1412505883866801,0.2885617810417173 +gemma-2-2b-it,0.2597323674875989,0.36848124762381895,0.12622684440269072,0.2844890104362872 +llama3-8b-cpt-sea-lionv2-instruct,0.25777587511641403,0.35911990072292727,0.13269121463917308,0.2815165099871418 GPT4o_0513,0.2375730297294346,0.3364674648846549,0.11718194476069822,0.25906967954295057 diff --git a/results/dialogue/zero_shot/dream.csv b/results/dialogue/zero_shot/dream.csv index f0f5aa50d0888b236c60253bd0573d4b5d661a52..3728588db24bc0177be9a15a1a95b9ed2eb1e396 100644 --- a/results/dialogue/zero_shot/dream.csv +++ b/results/dialogue/zero_shot/dream.csv @@ -1,5 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.9353258206761391 Meta-Llama-3.1-8B-Instruct,0.9039686428221461 +Qwen2-72B-Instruct,0.9612934835864773 +Meta-Llama-3-8B-Instruct,0.8946594806467418 +Meta-Llama-3.1-70B-Instruct,0.9559039686428221 +SeaLLMs-v3-7B-Chat,0.9265066144047036 +gemma-2-9b-it,0.9416952474277315 Meta-Llama-3-70B-Instruct,0.9480646741793238 sg_llama3_70b_inst,0.9524742773150416 +gemma-2-2b-it,0.8510534051935326 +llama3-8b-cpt-sea-lionv2-instruct,0.8858402743753062 GPT4o_0513,0.9583537481626654 diff --git a/results/dialogue/zero_shot/samsum.csv b/results/dialogue/zero_shot/samsum.csv index d24e9ef8f7ee050854f8bf4e89b78cb01bd52429..d07481830dfd286f0423b4296d6771dae09bd0c7 100644 --- a/results/dialogue/zero_shot/samsum.csv +++ b/results/dialogue/zero_shot/samsum.csv @@ -1,5 +1,13 @@ Model,Average,ROUGE-1,ROUGE-2,ROUGE-L +Qwen2-7B-Instruct,0.25668781132950264,0.36375948458827556,0.12939804942125302,0.27690589997897935 Meta-Llama-3.1-8B-Instruct,0.2891505262763006,0.4001228010515775,0.15677431231732958,0.31055446545999466 +Qwen2-72B-Instruct,0.2800906719573321,0.3887231369098802,0.15237661526996754,0.29917226369214855 +Meta-Llama-3-8B-Instruct,0.2846315092346869,0.39397110152251813,0.154320846916639,0.30560257926490364 +Meta-Llama-3.1-70B-Instruct,0.28934874612070227,0.4036295731242805,0.15211190810296196,0.31230475713486433 +SeaLLMs-v3-7B-Chat,0.2959981719045788,0.4078820748825196,0.16338306782652476,0.316729373004692 +gemma-2-9b-it,0.3100514077180449,0.4289412957792292,0.16727050182456474,0.3339424255503407 Meta-Llama-3-70B-Instruct,0.2893525314227379,0.4030746211134018,0.15236139065578,0.3126215824990321 sg_llama3_70b_inst,0.3146051103643872,0.4271361513564755,0.18238925099430264,0.33428992874238356 +gemma-2-2b-it,0.31118787136959813,0.4324251755711466,0.16441328335793207,0.33672515517971563 +llama3-8b-cpt-sea-lionv2-instruct,0.306997595680581,0.4214048099551701,0.1709790451938523,0.3286089318927205 GPT4o_0513,0.27736679291505306,0.386750207633093,0.14889081847621596,0.2964593526358502 diff --git a/results/emotion/few_shot/ind_emotion.csv b/results/emotion/few_shot/ind_emotion.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..30a3511380b03883e1df7de609a9bf42d50a253d 100644 --- a/results/emotion/few_shot/ind_emotion.csv +++ b/results/emotion/few_shot/ind_emotion.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.7204545454545455 +Meta-Llama-3-8B,0.4681818181818182 +llama3-8b-cpt-sea-lionv2-base,0.5727272727272728 +Meta-Llama-3.1-8B,0.5318181818181819 diff --git a/results/emotion/few_shot/sst2.csv b/results/emotion/few_shot/sst2.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..30646975a4d1f914d116550aa103e73b386504d4 100644 --- a/results/emotion/few_shot/sst2.csv +++ b/results/emotion/few_shot/sst2.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.9036697247706422 +Meta-Llama-3-8B,0.7201834862385321 +llama3-8b-cpt-sea-lionv2-base,0.7282110091743119 +Meta-Llama-3.1-8B,0.8222477064220184 diff --git a/results/emotion/zero_shot/ind_emotion.csv b/results/emotion/zero_shot/ind_emotion.csv index 9224a76d89b65ff915d68e04d87172159a64016f..a76ebbf0a99b2aaf126566d92259332dce7e8e54 100644 --- a/results/emotion/zero_shot/ind_emotion.csv +++ b/results/emotion/zero_shot/ind_emotion.csv @@ -1,5 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.6545454545454545 Meta-Llama-3.1-8B-Instruct,0.6545454545454545 +Qwen2-72B-Instruct,0.675 +Meta-Llama-3-8B-Instruct,0.6522727272727272 +Meta-Llama-3.1-70B-Instruct,0.7159090909090909 +SeaLLMs-v3-7B-Chat,0.6454545454545455 +gemma-2-9b-it,0.7477272727272727 Meta-Llama-3-70B-Instruct,0.6909090909090909 sg_llama3_70b_inst,0.7 +gemma-2-2b-it,0.6636363636363637 +llama3-8b-cpt-sea-lionv2-instruct,0.6613636363636364 GPT4o_0513,0.7068181818181818 diff --git a/results/emotion/zero_shot/sst2.csv b/results/emotion/zero_shot/sst2.csv index db41f703d526c95d1d0df1ab37b533abaae5345f..859f467c1db0dca2596cf3d2499f227358695900 100644 --- a/results/emotion/zero_shot/sst2.csv +++ b/results/emotion/zero_shot/sst2.csv @@ -1,5 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.9346330275229358 Meta-Llama-3.1-8B-Instruct,0.8646788990825688 +Qwen2-72B-Instruct,0.9346330275229358 +Meta-Llama-3-8B-Instruct,0.8784403669724771 +Meta-Llama-3.1-70B-Instruct,0.9529816513761468 +SeaLLMs-v3-7B-Chat,0.9403669724770642 +gemma-2-9b-it,0.9311926605504587 Meta-Llama-3-70B-Instruct,0.9495412844036697 sg_llama3_70b_inst,0.9334862385321101 +gemma-2-2b-it,0.9243119266055045 +llama3-8b-cpt-sea-lionv2-instruct,0.9128440366972477 GPT4o_0513,0.9415137614678899 diff --git a/results/flores_translation/few_shot/ind2eng.csv b/results/flores_translation/few_shot/ind2eng.csv index c92cc58bffcf5c69f0b19b68264727e785b847ae..899d1370b188a217e1cdcdc055a8190596c3f321 100644 --- a/results/flores_translation/few_shot/ind2eng.csv +++ b/results/flores_translation/few_shot/ind2eng.csv @@ -1 +1,5 @@ Model,BLEU +Meta-Llama-3.1-70B,0.42145684080212753 +Meta-Llama-3-8B,0.37684086636912956 +llama3-8b-cpt-sea-lionv2-base,0.38065942591799257 +Meta-Llama-3.1-8B,0.38181303557840174 diff --git a/results/flores_translation/few_shot/vie2eng.csv b/results/flores_translation/few_shot/vie2eng.csv index c92cc58bffcf5c69f0b19b68264727e785b847ae..7a823c975f043edecb0a6bc482360ac96d5d6529 100644 --- a/results/flores_translation/few_shot/vie2eng.csv +++ b/results/flores_translation/few_shot/vie2eng.csv @@ -1 +1,5 @@ Model,BLEU +Meta-Llama-3.1-70B,0.3486043252859807 +Meta-Llama-3-8B,0.3088281924097908 +llama3-8b-cpt-sea-lionv2-base,0.3101352718812011 +Meta-Llama-3.1-8B,0.31860377848723964 diff --git a/results/flores_translation/few_shot/zho2eng.csv b/results/flores_translation/few_shot/zho2eng.csv index c92cc58bffcf5c69f0b19b68264727e785b847ae..a02431bc8e6cd248193119fbf668f0a06de9f889 100644 --- a/results/flores_translation/few_shot/zho2eng.csv +++ b/results/flores_translation/few_shot/zho2eng.csv @@ -1 +1,5 @@ Model,BLEU +Meta-Llama-3.1-70B,0.2784128355061452 +Meta-Llama-3-8B,0.24157503759807666 +llama3-8b-cpt-sea-lionv2-base,0.2196548010627023 +Meta-Llama-3.1-8B,0.23636236548065317 diff --git a/results/flores_translation/few_shot/zsm2eng.csv b/results/flores_translation/few_shot/zsm2eng.csv index c92cc58bffcf5c69f0b19b68264727e785b847ae..0d22724602fd412c930906e44f9e9fda8ce19733 100644 --- a/results/flores_translation/few_shot/zsm2eng.csv +++ b/results/flores_translation/few_shot/zsm2eng.csv @@ -1 +1,5 @@ Model,BLEU +Meta-Llama-3.1-70B,0.4419951682556223 +Meta-Llama-3-8B,0.38778379180318306 +llama3-8b-cpt-sea-lionv2-base,0.373752985045955 +Meta-Llama-3.1-8B,0.39297234157214134 diff --git a/results/flores_translation/zero_shot/ind2eng.csv b/results/flores_translation/zero_shot/ind2eng.csv index e597ae87754ef555add807f5ef248617396226ba..764fab6dda0ea66b6de19b8961ad0b09e80601f1 100644 --- a/results/flores_translation/zero_shot/ind2eng.csv +++ b/results/flores_translation/zero_shot/ind2eng.csv @@ -1,7 +1,14 @@ Model,BLEU +Qwen2-7B-Instruct,0.29408553325533265 Meta-Llama-3.1-8B-Instruct,0.3765752579792989 +Qwen2-72B-Instruct,0.4043588265556185 +Meta-Llama-3-8B-Instruct,0.33079891679041123 Meta-Llama-3.1-70B-Instruct,0.43366494500251235 +SeaLLMs-v3-7B-Chat,0.3594829412574955 gemma-2-9b-it,0.40786563079141763 Meta-Llama-3-70B-Instruct,0.3830092775167675 sg_llama3_70b_inst,0.4086440304524362 +gemma-2-2b-it,0.3482500758113138 +llama3-8b-cpt-sea-lionv2-instruct,0.3916108972514423 GPT4o_0513,0.42589589086974855 +Meta-Llama-3.1-8B,0.008893689222008793 diff --git a/results/flores_translation/zero_shot/vie2eng.csv b/results/flores_translation/zero_shot/vie2eng.csv index 0b58cfeb6e212f3d3c147ff569f26be934861152..add5d76f111d336f1ac2434d3392585321d2c308 100644 --- a/results/flores_translation/zero_shot/vie2eng.csv +++ b/results/flores_translation/zero_shot/vie2eng.csv @@ -1,7 +1,14 @@ Model,BLEU +Qwen2-7B-Instruct,0.24106736560355876 Meta-Llama-3.1-8B-Instruct,0.31019605539004524 +Qwen2-72B-Instruct,0.33005323227052946 +Meta-Llama-3-8B-Instruct,0.2647448190950291 Meta-Llama-3.1-70B-Instruct,0.37244508311079816 +SeaLLMs-v3-7B-Chat,0.30981028289420137 gemma-2-9b-it,0.3367700653885 Meta-Llama-3-70B-Instruct,0.3230140263371192 sg_llama3_70b_inst,0.34258533717783785 +gemma-2-2b-it,0.27518909199172303 +llama3-8b-cpt-sea-lionv2-instruct,0.327781936019637 GPT4o_0513,0.36219303373759176 +Meta-Llama-3.1-8B,0.0064729173628987014 diff --git a/results/flores_translation/zero_shot/zho2eng.csv b/results/flores_translation/zero_shot/zho2eng.csv index 972870dd07087720ba19a225a6c0bcf55f1c0293..911b3ea7e8b9d8fb66a2f8f5255a7fb724eb6729 100644 --- a/results/flores_translation/zero_shot/zho2eng.csv +++ b/results/flores_translation/zero_shot/zho2eng.csv @@ -1,7 +1,14 @@ Model,BLEU +Qwen2-7B-Instruct,0.2113761361724575 Meta-Llama-3.1-8B-Instruct,0.23889886925287113 +Qwen2-72B-Instruct,0.23893268538329387 +Meta-Llama-3-8B-Instruct,0.199495011482748 Meta-Llama-3.1-70B-Instruct,0.2832594176173152 +SeaLLMs-v3-7B-Chat,0.2516593644617717 gemma-2-9b-it,0.267527968123433 Meta-Llama-3-70B-Instruct,0.24397819518058994 sg_llama3_70b_inst,0.26000707510414633 +gemma-2-2b-it,0.21164036008441425 +llama3-8b-cpt-sea-lionv2-instruct,0.2381535278220489 GPT4o_0513,0.27722306559544163 +Meta-Llama-3.1-8B,0.0030426517414972854 diff --git a/results/flores_translation/zero_shot/zsm2eng.csv b/results/flores_translation/zero_shot/zsm2eng.csv index 6e25f198004ab0832cf0fb58060ffe40072c6a52..f9728db7132c28a7725bc0d383b1dafea05b6ee9 100644 --- a/results/flores_translation/zero_shot/zsm2eng.csv +++ b/results/flores_translation/zero_shot/zsm2eng.csv @@ -1,7 +1,14 @@ Model,BLEU +Qwen2-7B-Instruct,0.28031997065822994 Meta-Llama-3.1-8B-Instruct,0.3700921225177551 +Qwen2-72B-Instruct,0.40796892621611885 +Meta-Llama-3-8B-Instruct,0.31625368345049 Meta-Llama-3.1-70B-Instruct,0.4462132282683508 +SeaLLMs-v3-7B-Chat,0.3484133510670942 gemma-2-9b-it,0.4234100394581857 Meta-Llama-3-70B-Instruct,0.3957287030176054 sg_llama3_70b_inst,0.4163761508073963 +gemma-2-2b-it,0.33737270487369614 +llama3-8b-cpt-sea-lionv2-instruct,0.38799258214381604 GPT4o_0513,0.451496635720668 +Meta-Llama-3.1-8B,0.00798239824596684 diff --git a/results/fundamental_nlp_tasks/few_shot/c3.csv b/results/fundamental_nlp_tasks/few_shot/c3.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..1d8edfe3686f4eced58da04abea262934d211e2c 100644 --- a/results/fundamental_nlp_tasks/few_shot/c3.csv +++ b/results/fundamental_nlp_tasks/few_shot/c3.csv @@ -1 +1,4 @@ Model,Accuracy +Meta-Llama-3-8B,0.7655198204936425 +llama3-8b-cpt-sea-lionv2-base,0.7995512341062079 +Meta-Llama-3.1-8B,0.8103964098728497 diff --git a/results/fundamental_nlp_tasks/few_shot/cola.csv b/results/fundamental_nlp_tasks/few_shot/cola.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..f6a4095a2633c334a3299bb38081b5458fc7d0c8 100644 --- a/results/fundamental_nlp_tasks/few_shot/cola.csv +++ b/results/fundamental_nlp_tasks/few_shot/cola.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.7248322147651006 +Meta-Llama-3-8B,0.5934803451581975 +llama3-8b-cpt-sea-lionv2-base,0.6203259827420902 +Meta-Llama-3.1-8B,0.6471716203259827 diff --git a/results/fundamental_nlp_tasks/few_shot/mnli.csv b/results/fundamental_nlp_tasks/few_shot/mnli.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..4dd57d0a9e9aabb2df8dda802a752667efa328ea 100644 --- a/results/fundamental_nlp_tasks/few_shot/mnli.csv +++ b/results/fundamental_nlp_tasks/few_shot/mnli.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.7395 +Meta-Llama-3-8B,0.442 +llama3-8b-cpt-sea-lionv2-base,0.456 +Meta-Llama-3.1-8B,0.465 diff --git a/results/fundamental_nlp_tasks/few_shot/mrpc.csv b/results/fundamental_nlp_tasks/few_shot/mrpc.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..2481c0c4f7879200d8f14520e0e37597cf0a7008 100644 --- a/results/fundamental_nlp_tasks/few_shot/mrpc.csv +++ b/results/fundamental_nlp_tasks/few_shot/mrpc.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.6397058823529411 +Meta-Llama-3-8B,0.5906862745098039 +llama3-8b-cpt-sea-lionv2-base,0.5686274509803921 +Meta-Llama-3.1-8B,0.571078431372549 diff --git a/results/fundamental_nlp_tasks/few_shot/ocnli.csv b/results/fundamental_nlp_tasks/few_shot/ocnli.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..eb484ed4d465e6fde0a619e2f178e009155ba854 100644 --- a/results/fundamental_nlp_tasks/few_shot/ocnli.csv +++ b/results/fundamental_nlp_tasks/few_shot/ocnli.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.6786440677966101 +Meta-Llama-3-8B,0.38101694915254236 +llama3-8b-cpt-sea-lionv2-base,0.3871186440677966 +Meta-Llama-3.1-8B,0.4067796610169492 diff --git a/results/fundamental_nlp_tasks/few_shot/qnli.csv b/results/fundamental_nlp_tasks/few_shot/qnli.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..44a262af8c3e3b98b147607c39910afdca0bb326 100644 --- a/results/fundamental_nlp_tasks/few_shot/qnli.csv +++ b/results/fundamental_nlp_tasks/few_shot/qnli.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.5777045579352005 +Meta-Llama-3-8B,0.5028372688998719 +llama3-8b-cpt-sea-lionv2-base,0.500274574409665 +Meta-Llama-3.1-8B,0.500274574409665 diff --git a/results/fundamental_nlp_tasks/few_shot/qqp.csv b/results/fundamental_nlp_tasks/few_shot/qqp.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..9e51863e34df728d9bf5a0bc7f6a47d50c9ef0bb 100644 --- a/results/fundamental_nlp_tasks/few_shot/qqp.csv +++ b/results/fundamental_nlp_tasks/few_shot/qqp.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.7365 +Meta-Llama-3-8B,0.54 +llama3-8b-cpt-sea-lionv2-base,0.5295 +Meta-Llama-3.1-8B,0.557 diff --git a/results/fundamental_nlp_tasks/few_shot/rte.csv b/results/fundamental_nlp_tasks/few_shot/rte.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..eb2060cdc0862caee2c43a2cb93d6472027166c7 100644 --- a/results/fundamental_nlp_tasks/few_shot/rte.csv +++ b/results/fundamental_nlp_tasks/few_shot/rte.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.7725631768953068 +Meta-Llama-3-8B,0.5848375451263538 +llama3-8b-cpt-sea-lionv2-base,0.6570397111913358 +Meta-Llama-3.1-8B,0.6642599277978339 diff --git a/results/fundamental_nlp_tasks/few_shot/wnli.csv b/results/fundamental_nlp_tasks/few_shot/wnli.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..6a308b29dc8d4dbdd28946a716cdbc379c762045 100644 --- a/results/fundamental_nlp_tasks/few_shot/wnli.csv +++ b/results/fundamental_nlp_tasks/few_shot/wnli.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.7323943661971831 +Meta-Llama-3-8B,0.49295774647887325 +llama3-8b-cpt-sea-lionv2-base,0.43661971830985913 +Meta-Llama-3.1-8B,0.5774647887323944 diff --git a/results/fundamental_nlp_tasks/zero_shot/c3.csv b/results/fundamental_nlp_tasks/zero_shot/c3.csv index 137108a993a2c6f9e63bdcb4d596c79a86b88e12..3912f6cb511dcacfd20d84b4b82ed36bbbe34106 100644 --- a/results/fundamental_nlp_tasks/zero_shot/c3.csv +++ b/results/fundamental_nlp_tasks/zero_shot/c3.csv @@ -1,5 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.9244577412116679 Meta-Llama-3.1-8B-Instruct,0.8672400897531788 +Qwen2-72B-Instruct,0.9611069558713538 +Meta-Llama-3-8B-Instruct,0.8515332834704562 +Meta-Llama-3.1-70B-Instruct,0.9603590127150337 +SeaLLMs-v3-7B-Chat,0.9143605086013463 +gemma-2-9b-it,0.9222139117427075 Meta-Llama-3-70B-Instruct,0.9521316379955124 sg_llama3_70b_inst,0.9289454001495886 +gemma-2-2b-it,0.7700074794315632 +llama3-8b-cpt-sea-lionv2-instruct,0.8672400897531788 GPT4o_0513,0.9648466716529543 diff --git a/results/fundamental_nlp_tasks/zero_shot/cola.csv b/results/fundamental_nlp_tasks/zero_shot/cola.csv index 3644b505f47d43e0908c1da367c60467e32e4a6c..91530b13f3b5d7608cb20e59e95035c7ec8fea6f 100644 --- a/results/fundamental_nlp_tasks/zero_shot/cola.csv +++ b/results/fundamental_nlp_tasks/zero_shot/cola.csv @@ -1,5 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.7871524448705657 Meta-Llama-3.1-8B-Instruct,0.6673058485139022 +Qwen2-72B-Instruct,0.8341323106423778 +Meta-Llama-3-8B-Instruct,0.6548418024928092 +Meta-Llama-3.1-70B-Instruct,0.850431447746884 +SeaLLMs-v3-7B-Chat,0.785234899328859 +gemma-2-9b-it,0.7938638542665388 Meta-Llama-3-70B-Instruct,0.835091083413231 sg_llama3_70b_inst,0.8696069031639502 +gemma-2-2b-it,0.6749760306807286 +llama3-8b-cpt-sea-lionv2-instruct,0.6078619367209971 GPT4o_0513,0.8398849472674976 diff --git a/results/fundamental_nlp_tasks/zero_shot/mnli.csv b/results/fundamental_nlp_tasks/zero_shot/mnli.csv index fc88d658f8c3b4e1dc5fc50128c64e3f9472b6f8..7fcaf3c5ba58e34c48ba55d4f3b613f34270b6e0 100644 --- a/results/fundamental_nlp_tasks/zero_shot/mnli.csv +++ b/results/fundamental_nlp_tasks/zero_shot/mnli.csv @@ -1,5 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.7295 Meta-Llama-3.1-8B-Instruct,0.4825 +Qwen2-72B-Instruct,0.7925 +Meta-Llama-3-8B-Instruct,0.546 +Meta-Llama-3.1-70B-Instruct,0.7015 +SeaLLMs-v3-7B-Chat,0.653 +gemma-2-9b-it,0.716 Meta-Llama-3-70B-Instruct,0.6709421285692472 sg_llama3_70b_inst,0.7685 +gemma-2-2b-it,0.6185 +llama3-8b-cpt-sea-lionv2-instruct,0.5765 GPT4o_0513,0.8335 diff --git a/results/fundamental_nlp_tasks/zero_shot/mrpc.csv b/results/fundamental_nlp_tasks/zero_shot/mrpc.csv index 5ae21c996fe72a897d291f24a1f2b9534f7f962c..bdcac6e3168a55aaaff4bbc1d32ebe49b9b7c34d 100644 --- a/results/fundamental_nlp_tasks/zero_shot/mrpc.csv +++ b/results/fundamental_nlp_tasks/zero_shot/mrpc.csv @@ -1,4 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.7867647058823529 Meta-Llama-3.1-8B-Instruct,0.6740196078431373 +Qwen2-72B-Instruct,0.8063725490196079 +Meta-Llama-3-8B-Instruct,0.678921568627451 +Meta-Llama-3.1-70B-Instruct,0.7696078431372549 +SeaLLMs-v3-7B-Chat,0.7475490196078431 +gemma-2-9b-it,0.7401960784313726 Meta-Llama-3-70B-Instruct,0.7598039215686274 sg_llama3_70b_inst,0.7892156862745098 +gemma-2-2b-it,0.7083333333333334 +llama3-8b-cpt-sea-lionv2-instruct,0.5833333333333334 +GPT4o_0513,0.7377450980392157 diff --git a/results/fundamental_nlp_tasks/zero_shot/ocnli.csv b/results/fundamental_nlp_tasks/zero_shot/ocnli.csv index bcf0d209d9364f847e5697468df38acc8321d167..8b0506f90e4755f99bddf286093c5909554e9bdb 100644 --- a/results/fundamental_nlp_tasks/zero_shot/ocnli.csv +++ b/results/fundamental_nlp_tasks/zero_shot/ocnli.csv @@ -1,5 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.6542372881355932 Meta-Llama-3.1-8B-Instruct,0.40983050847457625 +Qwen2-72B-Instruct,0.7820338983050847 +Meta-Llama-3-8B-Instruct,0.44033898305084745 +Meta-Llama-3.1-70B-Instruct,0.6423728813559322 +SeaLLMs-v3-7B-Chat,0.5698305084745763 +gemma-2-9b-it,0.6189830508474576 Meta-Llama-3-70B-Instruct,0.5928813559322034 sg_llama3_70b_inst,0.6420338983050847 +gemma-2-2b-it,0.43322033898305085 +llama3-8b-cpt-sea-lionv2-instruct,0.45559322033898303 GPT4o_0513,0.7308474576271187 diff --git a/results/fundamental_nlp_tasks/zero_shot/qnli.csv b/results/fundamental_nlp_tasks/zero_shot/qnli.csv index 98cf8e83605d465b77ff668e450ffabd50bd365c..19bde4db8a99843c96db6d63389578fc91467942 100644 --- a/results/fundamental_nlp_tasks/zero_shot/qnli.csv +++ b/results/fundamental_nlp_tasks/zero_shot/qnli.csv @@ -1,4 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.8154859967051071 Meta-Llama-3.1-8B-Instruct,0.5777045579352005 +Qwen2-72B-Instruct,0.8887058392824455 +Meta-Llama-3-8B-Instruct,0.6025993044114956 +Meta-Llama-3.1-70B-Instruct,0.9026176093721399 +SeaLLMs-v3-7B-Chat,0.7159070107999268 +gemma-2-9b-it,0.9070107999267801 Meta-Llama-3-70B-Instruct,0.876807614863628 sg_llama3_70b_inst,0.9004210140948197 +gemma-2-2b-it,0.7792421746293245 +llama3-8b-cpt-sea-lionv2-instruct,0.6101043382756727 +GPT4o_0513,0.9304411495515285 diff --git a/results/fundamental_nlp_tasks/zero_shot/qqp.csv b/results/fundamental_nlp_tasks/zero_shot/qqp.csv index 68c3c6e80b40664974a05bed855f291234ef07a3..411bc4bcaed534b93dfda6262101845649f10440 100644 --- a/results/fundamental_nlp_tasks/zero_shot/qqp.csv +++ b/results/fundamental_nlp_tasks/zero_shot/qqp.csv @@ -1,5 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.781 Meta-Llama-3.1-8B-Instruct,0.5645 +Qwen2-72B-Instruct,0.8065 +Meta-Llama-3-8B-Instruct,0.563 +Meta-Llama-3.1-70B-Instruct,0.815 +SeaLLMs-v3-7B-Chat,0.7625 +gemma-2-9b-it,0.7775 Meta-Llama-3-70B-Instruct,0.7876082117239673 sg_llama3_70b_inst,0.804 +gemma-2-2b-it,0.761 +llama3-8b-cpt-sea-lionv2-instruct,0.6225 GPT4o_0513,0.8085 diff --git a/results/fundamental_nlp_tasks/zero_shot/rte.csv b/results/fundamental_nlp_tasks/zero_shot/rte.csv index 071898f539e3babde849027aaddb09d69fd2f41d..5d0708c7dca165b9c2e6a71c49d36d6f0d97c22b 100644 --- a/results/fundamental_nlp_tasks/zero_shot/rte.csv +++ b/results/fundamental_nlp_tasks/zero_shot/rte.csv @@ -1,4 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.8231046931407943 Meta-Llama-3.1-8B-Instruct,0.6750902527075813 +Qwen2-72B-Instruct,0.8447653429602888 +Meta-Llama-3-8B-Instruct,0.6173285198555957 +Meta-Llama-3.1-70B-Instruct,0.8483754512635379 +SeaLLMs-v3-7B-Chat,0.7870036101083032 +gemma-2-9b-it,0.7472924187725631 Meta-Llama-3-70B-Instruct,0.8086642599277978 sg_llama3_70b_inst,0.8916967509025271 +gemma-2-2b-it,0.7292418772563177 +llama3-8b-cpt-sea-lionv2-instruct,0.6859205776173285 +GPT4o_0513,0.8700361010830325 diff --git a/results/fundamental_nlp_tasks/zero_shot/wnli.csv b/results/fundamental_nlp_tasks/zero_shot/wnli.csv index 36f474487f3a471256051a88b24800caa6d789e4..78e05337a70211bcdf546010a780ab9c53e60086 100644 --- a/results/fundamental_nlp_tasks/zero_shot/wnli.csv +++ b/results/fundamental_nlp_tasks/zero_shot/wnli.csv @@ -1,4 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.7183098591549296 Meta-Llama-3.1-8B-Instruct,0.49295774647887325 +Qwen2-72B-Instruct,0.8873239436619719 +Meta-Llama-3-8B-Instruct,0.4788732394366197 +Meta-Llama-3.1-70B-Instruct,0.8450704225352113 +SeaLLMs-v3-7B-Chat,0.5915492957746479 +gemma-2-9b-it,0.7746478873239436 Meta-Llama-3-70B-Instruct,0.7887323943661971 sg_llama3_70b_inst,0.8309859154929577 +gemma-2-2b-it,0.43661971830985913 +llama3-8b-cpt-sea-lionv2-instruct,0.5774647887323944 +GPT4o_0513,0.9295774647887324 diff --git a/results/general_reasoning/few_shot/c_eval.csv b/results/general_reasoning/few_shot/c_eval.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..12e0768b6b102f57b50dc039c72623d2fd5f29a7 100644 --- a/results/general_reasoning/few_shot/c_eval.csv +++ b/results/general_reasoning/few_shot/c_eval.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.6363636363636364 +Meta-Llama-3-8B,0.43212951432129515 +llama3-8b-cpt-sea-lionv2-base,0.40099626400996263 +Meta-Llama-3.1-8B,0.43711083437110837 diff --git a/results/general_reasoning/few_shot/cmmlu.csv b/results/general_reasoning/few_shot/cmmlu.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..94b0cf4c8eab647bb20a0b0a063554def9c7a64d 100644 --- a/results/general_reasoning/few_shot/cmmlu.csv +++ b/results/general_reasoning/few_shot/cmmlu.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.6496287342427906 +Meta-Llama-3-8B,0.4393023657399413 +llama3-8b-cpt-sea-lionv2-base,0.4429286824382663 +Meta-Llama-3.1-8B,0.45467104127093766 diff --git a/results/general_reasoning/few_shot/indommlu.csv b/results/general_reasoning/few_shot/indommlu.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..4c3c2db7f3249e40a459319d5a493b201909eadc 100644 --- a/results/general_reasoning/few_shot/indommlu.csv +++ b/results/general_reasoning/few_shot/indommlu.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.6336204018959877 +Meta-Llama-3-8B,0.4515655250684291 +llama3-8b-cpt-sea-lionv2-base,0.5042392683089659 +Meta-Llama-3.1-8B,0.47393016890313105 diff --git a/results/general_reasoning/few_shot/mmlu.csv b/results/general_reasoning/few_shot/mmlu.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..e01b2482f1e6c2f778e47dd1bca1619de54f1992 100644 --- a/results/general_reasoning/few_shot/mmlu.csv +++ b/results/general_reasoning/few_shot/mmlu.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.7457275652484805 +Meta-Llama-3-8B,0.5681086878798713 +llama3-8b-cpt-sea-lionv2-base,0.5608866642831605 +Meta-Llama-3.1-8B,0.5723274937432964 diff --git a/results/general_reasoning/few_shot/zbench.csv b/results/general_reasoning/few_shot/zbench.csv index 1dd9db3171b0ad2e4edc84dcebb54f8d76c675ab..54824c3ad624bf80a4e574d55e19ccbc29c8b25c 100644 --- a/results/general_reasoning/few_shot/zbench.csv +++ b/results/general_reasoning/few_shot/zbench.csv @@ -1 +1,5 @@ Model,Accuracy +Meta-Llama-3.1-70B,0.5757575757575758 +Meta-Llama-3-8B,0.24242424242424243 +llama3-8b-cpt-sea-lionv2-base,0.2727272727272727 +Meta-Llama-3.1-8B,0.3939393939393939 diff --git a/results/general_reasoning/zero_shot/c_eval.csv b/results/general_reasoning/zero_shot/c_eval.csv index 77527bf9e184071a9942ec09f49cadcf30e88cda..c988ee3650de213e459ddd2a5d733ea19e304e17 100644 --- a/results/general_reasoning/zero_shot/c_eval.csv +++ b/results/general_reasoning/zero_shot/c_eval.csv @@ -1,7 +1,14 @@ Model,Accuracy +Qwen2-7B-Instruct,0.7615193026151931 Meta-Llama-3.1-8B-Instruct,0.5149439601494396 +Qwen2-72B-Instruct,0.8312577833125778 +Meta-Llama-3-8B-Instruct,0.4775840597758406 Meta-Llama-3.1-70B-Instruct,0.6612702366127023 +SeaLLMs-v3-7B-Chat,0.7658779576587795 gemma-2-9b-it,0.5523038605230386 Meta-Llama-3-70B-Instruct,0.6220423412204235 sg_llama3_70b_inst,0.5722291407222914 +gemma-2-2b-it,0.4352428393524284 +llama3-8b-cpt-sea-lionv2-instruct,0.49813200498132004 GPT4o_0513,0.7073474470734745 +Meta-Llama-3.1-8B,0.3742216687422167 diff --git a/results/general_reasoning/zero_shot/cmmlu.csv b/results/general_reasoning/zero_shot/cmmlu.csv index 4073d9705d65c1f04fadf0a5d7cd09f771696eea..82da3092cbb8e29cf48dd704e7381cb28fc598a9 100644 --- a/results/general_reasoning/zero_shot/cmmlu.csv +++ b/results/general_reasoning/zero_shot/cmmlu.csv @@ -1,7 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.7727508202383008 Meta-Llama-3.1-8B-Instruct,0.5246934898981178 +Qwen2-72B-Instruct,0.8293904334311863 +Meta-Llama-3-8B-Instruct,0.4839405974788465 Meta-Llama-3.1-70B-Instruct,0.6814885166637886 +SeaLLMs-v3-7B-Chat,0.7684337765498187 gemma-2-9b-it,0.5700224486271801 Meta-Llama-3-70B-Instruct,0.6494560524952513 sg_llama3_70b_inst,0.6044724572612675 +gemma-2-2b-it,0.4412882058366431 +llama3-8b-cpt-sea-lionv2-instruct,0.48929373165256435 GPT4o_0513,0.7414954239336902 diff --git a/results/general_reasoning/zero_shot/indommlu.csv b/results/general_reasoning/zero_shot/indommlu.csv index b5086cb81ec88cb92a53b871b7c4200d1836d697..ee7a98bf72f688db603ac4cdbde952a8ffa2fd31 100644 --- a/results/general_reasoning/zero_shot/indommlu.csv +++ b/results/general_reasoning/zero_shot/indommlu.csv @@ -1,5 +1,13 @@ Model,Accuracy +Qwen2-7B-Instruct,0.5385539755657921 Meta-Llama-3.1-8B-Instruct,0.5252687095266707 +Qwen2-72B-Instruct,0.6385606515788771 +Meta-Llama-3-8B-Instruct,0.5264703918819681 +Meta-Llama-3.1-70B-Instruct,0.6740770411910008 +SeaLLMs-v3-7B-Chat,0.5267374324053675 +gemma-2-9b-it,0.606983109686895 Meta-Llama-3-70B-Instruct,0.6323519594098405 sg_llama3_70b_inst,0.6394285332799252 +gemma-2-2b-it,0.48220842512851325 +llama3-8b-cpt-sea-lionv2-instruct,0.5252687095266707 GPT4o_0513,0.7584618465852193 diff --git a/results/general_reasoning/zero_shot/mmlu.csv b/results/general_reasoning/zero_shot/mmlu.csv index d4832543c4760ceedfb23cc1d803b6982788ae90..a3fe9fe8bf0d5570ce177c289d30ca84f56a8c2c 100644 --- a/results/general_reasoning/zero_shot/mmlu.csv +++ b/results/general_reasoning/zero_shot/mmlu.csv @@ -1,7 +1,14 @@ Model,Accuracy +Qwen2-7B-Instruct,0.672506256703611 Meta-Llama-3.1-8B-Instruct,0.6037182695745441 +Qwen2-72B-Instruct,0.7922774401144083 +Meta-Llama-3-8B-Instruct,0.6005720414730068 Meta-Llama-3.1-70B-Instruct,0.8058634250983197 +SeaLLMs-v3-7B-Chat,0.6670003575259207 gemma-2-9b-it,0.7100464783696818 Meta-Llama-3-70B-Instruct,0.7649624597783339 sg_llama3_70b_inst,0.7407937075437969 +gemma-2-2b-it,0.5706828745084018 +llama3-8b-cpt-sea-lionv2-instruct,0.6130854486950303 GPT4o_0513,0.8308187343582409 +Meta-Llama-3.1-8B,0.386271004647837 diff --git a/results/general_reasoning/zero_shot/zbench.csv b/results/general_reasoning/zero_shot/zbench.csv index bb02673b5b03a43abc10422e445bab35fce0aefb..6e37e528b9f277a6353b919230c0ada35a9017ab 100644 --- a/results/general_reasoning/zero_shot/zbench.csv +++ b/results/general_reasoning/zero_shot/zbench.csv @@ -1,6 +1,12 @@ Model,Accuracy +Qwen2-7B-Instruct,0.7272727272727273 Meta-Llama-3.1-8B-Instruct,0.42424242424242425 +Qwen2-72B-Instruct,0.5757575757575758 +Meta-Llama-3-8B-Instruct,0.3333333333333333 Meta-Llama-3.1-70B-Instruct,0.48484848484848486 +SeaLLMs-v3-7B-Chat,0.5454545454545454 gemma-2-9b-it,0.48484848484848486 Meta-Llama-3-70B-Instruct,0.5151515151515151 sg_llama3_70b_inst,0.42424242424242425 +gemma-2-2b-it,0.24242424242424243 +llama3-8b-cpt-sea-lionv2-instruct,0.30303030303030304